port actor_critic_cartpole to keras core (#542)
This commit is contained in:
parent 99d4a75f1b
commit 2c2962158e
189 examples/keras_io/tensorflow/rl/actor_critic_cartpole.py Normal file
@@ -0,0 +1,189 @@
"""
Title: Actor Critic Method
Author: [Apoorv Nandan](https://twitter.com/NandanApoorv)
Converted to Keras Core by: [Muhammad Anas Raza](https://anasrz.com)
Date created: 2020/05/13
Last modified: 2023/07/19
Description: Implement Actor Critic Method in CartPole environment.
Accelerator: NONE
"""
"""
## Introduction

This script shows an implementation of the Actor Critic method on the CartPole-v1
environment.

### Actor Critic Method

As an agent takes actions and moves through an environment, it learns to map
the observed state of the environment to two possible outputs:

1. Recommended action: A probability value for each action in the action space.
The part of the agent responsible for this output is called the **actor**.
2. Estimated rewards in the future: Sum of all rewards it expects to receive in the
future. The part of the agent responsible for this output is the **critic**.

The actor and the critic learn to perform their tasks, such that the recommended
actions from the actor maximize the rewards.
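
In terms of the quantities computed in the training loop below, each timestep
contributes roughly two loss terms (this is a summary of what the code does, not
extra machinery): an actor loss of `-log_prob * (ret - value)`, which pushes the
policy toward actions that earned more than the critic expected, and a critic loss
of `huber_loss(value, ret)`, which pushes the value estimate toward the observed
discounted return.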

### CartPole-V1

A pole is attached to a cart placed on a frictionless track. The agent has to apply
force to move the cart. It is rewarded for every time step the pole
remains upright. The agent, therefore, must learn to keep the pole from falling over.

### References

- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
"""

"""
## Setup
"""

import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras_core as keras
from keras_core import layers

import gym
import numpy as np
import tensorflow as tf


# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v1", new_step_api=True)  # Create the environment
env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
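
# Optional sanity check (illustrative): CartPole-v1 observations are 4-dimensional
# (cart position, cart velocity, pole angle, pole angular velocity) and there are
# two discrete actions (push left, push right), which is why `num_inputs = 4` and
# `num_actions = 2` are used below.
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)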

"""
## Implement Actor Critic network

This network learns two functions:

1. Actor: This takes as input the state of our environment and returns a
probability value for each action in its action space.
2. Critic: This takes as input the state of our environment and returns
an estimate of total rewards in the future.

In our implementation, they share the initial layer.
"""

num_inputs = 4
num_actions = 2
num_hidden = 128

inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])
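
# Quick, optional shape check (illustrative): a batch with a single state should
# map to a (1, num_actions) action-probability vector and a (1, 1) value estimate.
dummy_probs, dummy_value = model(tf.zeros((1, num_inputs)))
print(dummy_probs.shape, dummy_value.shape)  # expected: (1, 2) and (1, 1)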

"""
## Train
"""

optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:  # Run until solved
    state = env.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.

            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done, _, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > 195:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break
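
# Optional follow-up (illustrative sketch): once the loop above reports "Solved",
# the weights can be saved and the greedy policy rolled out for a single episode
# to inspect the learned behaviour. The filename below is arbitrary.
model.save("actor_critic_cartpole.keras")

state = env.reset()
terminated = truncated = False
while not (terminated or truncated):
    state_tensor = tf.expand_dims(tf.convert_to_tensor(state), 0)
    action_probs, _ = model(state_tensor)
    greedy_action = int(np.argmax(np.squeeze(action_probs)))
    state, reward, terminated, truncated, _ = env.step(greedy_action)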

"""
## Visualizations

In early stages of training:
![Imgur](https://i.imgur.com/5gCs5kH.gif)

In later stages of training:
![Imgur](https://i.imgur.com/5ziiZUD.gif)
"""