Commit

status reporting tools updated

CUN-bjy committed Nov 21, 2020
1 parent c7af08c commit da41852
Showing 6 changed files with 32 additions and 29 deletions.
6 changes: 1 addition & 5 deletions README.md
@@ -2,16 +2,12 @@

Keras Implementation of DDPG (Deep Deterministic Policy Gradient) on the OpenAI Gym framework

**This project is not fully implemented yet.**
#### Status: IMPLEMENTING!

[project_link](https://github.com/CUN-bjy/gym-ddpg-keras/projects/1)

<br/>

![](https://raw.githubusercontent.com/CUN-bjy/WalkYTo-rl-gym/master/img/ant_v1.png)

<br/>

## Experiment Details

### **from paper**
1 change: 0 additions & 1 deletion agent/actor.py
@@ -84,7 +84,6 @@ def train(self, obs, critic, q_grads):
actions = self.network(obs)
# actor_loss = -tf.reduce_mean(critic([obs,actions]))
actor_grad = tape.gradient(self.network(obs), self.network.trainable_variables,-q_grads)
# tf.print("actor loss :",actor_loss)
# actor_grad = tape.gradient(actor_loss,self.network.trainable_variables)
self.optimizer.apply_gradients(zip(actor_grad,self.network.trainable_variables))

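The hunk above keeps the `output_gradients` form of the deterministic policy gradient: the critic supplies dQ/da and the tape chains it through the actor. A minimal sketch of that update, assuming a hypothetical standalone helper (`actor_update` is not part of the repo) and using the `actions` tensor recorded inside the tape as the gradient target:

```python
import tensorflow as tf

def actor_update(actor_net, optimizer, obs, q_grads):
    """One DDPG actor step: ascend Q by chaining -dQ/da through the policy."""
    with tf.GradientTape() as tape:
        actions = actor_net(obs)                      # a = mu(s), recorded on the tape
    # output_gradients weights d(actions)/d(theta) by -dQ/da,
    # so applying the result with a minimizing optimizer ascends Q.
    actor_grad = tape.gradient(actions, actor_net.trainable_variables,
                               output_gradients=-q_grads)
    optimizer.apply_gradients(zip(actor_grad, actor_net.trainable_variables))
```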
15 changes: 3 additions & 12 deletions agent/critic.py
@@ -50,6 +50,8 @@ def __init__(self, in_dim, out_dim, lr_, tau_, discount_factor):
weights_ = self.network.get_weights()
self.target_network.set_weights(weights_)

self.critic_loss = None

def create_network(self):
""" Create a Critic Network Model using Keras
as a Q-value approximator function
@@ -77,18 +79,6 @@ def create_network(self):

return Model(inputs,output)

def bellman(self, rewards, q_vals, dones, idx):
""" Bellman Equation for q value iteration
"""
critic_target = np.asarray(q_vals)
for i in range(q_vals.shape[0]):
if dones[i]:
critic_target[i] = rewards[i]
else:
critic_target[i] = self.discount_factor * q_vals[i] + rewards[i]

return critic_target

def Qgradient(self, obs, acts):
acts = tf.convert_to_tensor(acts)
with tf.GradientTape() as tape:
@@ -105,6 +95,7 @@ def train(self, obs, acts, target):
td_error = q_values - target
critic_loss = tf.reduce_mean(tf.math.square(td_error))
tf.print("critic loss :",critic_loss)
self.critic_loss = float(critic_loss)

critic_grad = tape.gradient(critic_loss, self.network.trainable_variables) # compute critic gradient
self.optimizer.apply_gradients(zip(critic_grad, self.network.trainable_variables))
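The critic diff does two things: it drops the `bellman` helper (now inlined in `agent/ddpg.py`, see below) and stores the latest TD loss on the instance so the training script can plot it. A sketch of that training step as a free-standing function, assuming a hypothetical `critic_update` helper; the repo's version lives in `CriticNet.train`:

```python
import tensorflow as tf

def critic_update(critic_net, optimizer, obs, acts, target):
    """One TD step: minimize the mean squared TD error and report the loss."""
    acts = tf.convert_to_tensor(acts)
    with tf.GradientTape() as tape:
        q_values = critic_net([obs, acts], training=True)    # Q(s, a)
        td_error = q_values - target                         # Q(s, a) - y
        critic_loss = tf.reduce_mean(tf.math.square(td_error))
    grads = tape.gradient(critic_loss, critic_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, critic_net.trainable_variables))
    return float(critic_loss)   # kept around (self.critic_loss) for status plots
```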
1 change: 0 additions & 1 deletion agent/ddpg.py
@@ -88,7 +88,6 @@ def replay(self, replay_num_):
q_vals = self.critic.target_predict([new_states,self.actor.target_predict(new_states)])

# bellman iteration for target critic value
# critic_target = self.critic.bellman(rewards, q_vals, dones, idx)
critic_target = np.asarray(q_vals)
for i in range(q_vals.shape[0]):
if dones[i]:
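The hunk is truncated here, but the inlined loop mirrors the removed `critic.bellman` helper: the target is the reward alone at terminal transitions, otherwise the reward plus the discounted target-network Q-value. A self-contained sketch of that computation (the function name and the explicit copy are assumptions):

```python
import numpy as np

def bellman_targets(rewards, q_vals, dones, discount_factor):
    """y_i = r_i                             if the episode ended at step i
       y_i = r_i + gamma * Q'(s'_i, mu'(s'_i)) otherwise"""
    critic_target = np.array(q_vals, copy=True)
    for i in range(critic_target.shape[0]):
        if dones[i]:
            critic_target[i] = rewards[i]
        else:
            critic_target[i] = rewards[i] + discount_factor * q_vals[i]
    return critic_target
```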
4 changes: 3 additions & 1 deletion requirements.txt
@@ -5,4 +5,6 @@ keras==2.4.3
scipy==1.4.1
tqdm
numpy==1.16.0
matplotlib
matplotlib
seaborn
pandas
34 changes: 25 additions & 9 deletions train.py
@@ -30,12 +30,14 @@
import numpy as np, time, os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import argparse

from agent.ddpg import ddpgAgent

NUM_EPISODES_ = 3000
NUM_EPISODES_ = 5000

def model_train(pretrained_):
# Create Environments
@@ -62,12 +64,16 @@ def model_train(pretrained_):
print("======================================")


logger = dict()
logger = dict(episode=[],reward=[],critic_loss=[])
plt.ion()
fig1 = plt.figure(1); fig2 = plt.figure(2)
ax1 = fig1.add_subplot(111)
ax2 = fig2.add_subplot(111)


try:
act_range = env.action_space.high
rewards = []
rewards = []; critic_losses = []
for epi in range(NUM_EPISODES_):
print("=========EPISODE # %d =========="%epi)
obs = env.reset()
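This hunk switches the script to live status plots: interactive mode plus one figure each for episode reward and critic loss. A minimal sketch of that scaffold, assuming hypothetical names (`fig_reward`, `ax_loss`, `refresh`) and a `plt.pause` call that the diff itself does not show:

```python
import matplotlib.pyplot as plt

plt.ion()                               # interactive mode: figures update without blocking
fig_reward, fig_loss = plt.figure(1), plt.figure(2)
ax_reward = fig_reward.add_subplot(111)
ax_loss = fig_loss.add_subplot(111)

def refresh():
    """Flush both canvases inside a long-running training loop."""
    fig_reward.canvas.draw_idle()
    fig_loss.canvas.draw_idle()
    plt.pause(0.001)                    # yield briefly to the GUI event loop
```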
@@ -92,26 +98,36 @@ def model_train(pretrained_):
obs = new_obs
epi_reward = epi_reward + reward


if t%50 == 0: agent.replay(1)

# check if the episode is finished
if done or (t == steps-1):
# Replay
agent.replay(1)
print("Episode#%d, steps:%d, rewards:%f"%(epi,t,epi_reward))
rewards.append(epi_reward)
agent.replay(1)

# save weights at every 50 iters
if epi%50 == 0:
dir_path = "%s/weights"%os.getcwd()
if not os.path.isdir(dir_path):
os.mkdir(dir_path)
path = dir_path+'/'+'gym_ddpg_'
agent.save_weights(path + 'ep%d'%epi)


# save reward logs
ax1.cla(); ax2.cla();
logger['episode'] = range(epi+1)
logger['reward'].append(epi_reward)
logger['critic_loss'].append(agent.critic.critic_loss)

df = pd.DataFrame(logger)
sns.lineplot(ax=ax1,x='episode',y='reward', data=df)
sns.lineplot(ax=ax2,x='episode',y='critic_loss', data=df)
break;

except KeyboardInterrupt as e:
print(e)
except KeyboardInterrupt as e: print(e)
finally:
# weight saver
dir_path = "%s/weights"%os.getcwd()
if not os.path.isdir(dir_path):
os.mkdir(dir_path)
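At the end of each episode the logger dict is converted to a `pandas.DataFrame` and both curves are redrawn with seaborn. A hedged sketch of that status-reporting step as a helper function (the function name and the trailing `plt.pause` are assumptions; the diff clears and redraws the axes inline):

```python
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def log_and_plot(logger, epi, epi_reward, critic_loss, ax_reward, ax_loss):
    """Append this episode's stats and redraw the reward / critic-loss plots."""
    logger['episode'] = list(range(epi + 1))       # x-axis: 0 .. current episode
    logger['reward'].append(epi_reward)
    logger['critic_loss'].append(critic_loss)

    df = pd.DataFrame(logger)                      # columns: episode, reward, critic_loss
    ax_reward.cla(); ax_loss.cla()                 # drop the previous curves
    sns.lineplot(ax=ax_reward, x='episode', y='reward', data=df)
    sns.lineplot(ax=ax_loss, x='episode', y='critic_loss', data=df)
    plt.pause(0.001)                               # let the interactive figures update
```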
