Merge pull request #22 from CUN-bjy/rebase

discrete mode added
CUN-bjy · Dec 30, 2020 · 0725db5
2 parents bf6356d + 27f0528
commit 0725db5
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 36 deletions.
diff --git a/agent/__init__.py b/agent/__init__.py
@@ -1,24 +0,0 @@
-'''
-MIT License
-
-Copyright (c) 2020 Junyoeb Baek
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-'''
-#DDPG Agent module Initialized

diff --git a/agent/ddpg.py b/agent/ddpg.py
@@ -37,7 +37,6 @@ class ddpgAgent():
 	def __init__(self, env_, is_discrete=False, batch_size=100, w_per=True):
 		# gym environments
 		self.env = env_
-
 		self.discrete = is_discrete
 		self.obs_dim = env_.observation_space.shape[0]
 		self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[0]
@@ -63,10 +62,9 @@ def __init__(self, env_, is_discrete=False, batch_size=100, w_per=True):
 	def make_action(self, obs, t, noise=True):
 		""" predict next action from Actor's Policy
 		"""
-		# obs = np.expand_dims(obs, axis=0).astype(np.float32)
 		action_ = self.actor.predict(obs)[0]
 		a = np.clip(action_ + self.noise.generate(t) if noise else 0, -self.action_bound, self.action_bound)
-		return np.argmax(a) if self.discrete else a
+		return a
 
 	def update_networks(self, obs, acts, critic_target):
 		""" Train actor & critic from sampled experience
@@ -93,9 +91,7 @@ def replay(self, replay_num_):
 			states, actions, rewards, dones, new_states, idx = self.sample_batch(self.batch_size)
 
 			# get target q-value using target network
-			a = self.actor.target_predict(new_states)
-			print(a)
-			q_vals = self.critic.target_predict([new_states,np.argmax(a) if self.discrete else a])
+			q_vals = self.critic.target_predict([new_states,self.actor.target_predict(new_states)])
 
 			# bellman iteration for target critic value
 			critic_target = np.asarray(q_vals)
@@ -120,8 +116,7 @@ def memorize(self,obs,act,reward,done,new_obs):
 		"""store experience in the buffer
 		"""
 		if self.with_per:
-			a = self.actor.predict(obs)[0]
-			q_val = self.critic.network([np.expand_dims(obs,axis=0),np.argmax(a) if self.discrete else a])
+			q_val = self.critic.network([np.expand_dims(obs,axis=0),self.actor.predict(obs)])
 			next_action = self.actor.target_network.predict(np.expand_dims(new_obs, axis=0))
 			q_val_t = self.critic.target_predict([np.expand_dims(new_obs,axis=0), next_action])
 			new_val = reward + self.discount_factor * q_val_t

diff --git a/train.py b/train.py
@@ -45,7 +45,7 @@ def model_train(pretrained_):
 				'cheetah':"RoboschoolHalfCheetah-v1",
 				'walker':"RoboschoolWalker2d-v1",
 				'hopper':"RoboschoolHopper-v1"}
-
+	
 	# env = gym.make(models['pendulum'])
 	env = gym.make("CartPole-v1")
 
@@ -97,14 +97,14 @@ def model_train(pretrained_):
 				env.render()
 
 				# Make action from the current policy
-				action = agent.make_action(obs, t)#env.action_space.sample()#
-				# action = 
+				a = agent.make_action(obs, t)#env.action_space.sample()#
+				action = np.argmax(a) if is_discrete else a[0]
 
 				# do step on gym at t-time
 				new_obs, reward, done, info = env.step(action) 
 
 				# store the results to buffer	
-				agent.memorize(obs, action, reward, done, new_obs)
+				agent.memorize(obs, a, reward, done, new_obs)
 
 				# grace finish and go to t+1 time
 				obs = new_obs