"""
Based on:
https://towardsdatascience.com/reinforcement-learning-w-keras-openai-dqns-1eed3a5338c
Not sure if this is the right algorithm... action space is too big.
I think this won't work for a large, MultiDiscrete action space.
https://www.reddit.com/r/reinforcementlearning/comments/hp95c6/multi_discrete_action_spaces_for_dqn/
"""
import random
from collections import deque

import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam_v2
from gym.envs.registration import register

# Register our custom environment
register(
    id="wordle-v0",
    entry_point="wordle:WordleEnv",
)


class DQN:
    "Deep Q Network"

    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=2000)  # replay buffer
        # Hyperparameters
        self.gamma = 0.85           # discount factor
        self.epsilon = 1.0          # initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.005
        self.tau = 0.125            # soft-update rate for the target network
        self.model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        "Create the neural network using Keras"
        model = Sequential()
        state_shape = self.env.observation_space.shape
        model.add(Dense(24, input_dim=state_shape[0], activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        # I think this is why DQN may not be the right algorithm:
        # the action space is huge.
        # model.add(Dense(self.env.action_size()))
        model.add(Dense(5))  # Is this the entire action space or the length of the shape?
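        # If the action space really is MultiDiscrete (an assumption about
        # wordle.WordleEnv), one common sizing for a flat DQN head would be
        # the product of nvec -- a sketch, not what this script currently does:
        #
        #     n_actions = int(np.prod(self.env.action_space.nvec))
        #     model.add(Dense(n_actions))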
        opt = adam_v2.Adam(learning_rate=self.learning_rate)  # , decay=lr/epochs
        model.compile(loss="mean_squared_error", optimizer=opt)
        return model

    def act(self, state):
        "Decide if we're going to do something random, or exploit what we know"
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        # Breaks here. Probably because we aren't reshaping in main().
        #
        # ValueError: Exception encountered when calling layer "sequential" (type Sequential).
        #   Input 0 of layer "dense" is incompatible with the layer:
        #   expected min_ndim=2, found ndim=1.
        #   Full shape received: (None,)
        # Call arguments received:
        #   • inputs=tf.Tensor(shape=(None,), dtype=int64)
        #   • training=False
        #   • mask=None
        #
        # After fixing the reshape, the action returned is an int64, which fails.
        print("act state", state)
        prediction = self.model.predict(state)
        print("prediction", prediction)
        return prediction[0]
        # return np.argmax(self.model.predict(state)[0])
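        # If the output head were sized to the flattened action space (see the
        # sketch in create_model), the usual DQN choice would be the argmax
        # index, mapped back to one value per slot -- a sketch under that same
        # assumption about the env's MultiDiscrete space:
        #
        #     flat = np.argmax(self.model.predict(state)[0])
        #     return np.array(np.unravel_index(flat, self.env.action_space.nvec))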

    def remember(self, state, action, reward, new_state, done):
        "Store the results of each step"
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        "Learn from what we saw in the past"
        batch_size = 32
        if len(self.memory) < batch_size:
            return
        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            print("sample", sample)
            # pylint:disable=unpacking-non-sequence
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            print("target", target)
            if done:
                target[0][action] = reward
            else:
                # Bellman target: immediate reward plus discounted best future Q-value
                q_future = max(self.target_model.predict(new_state)[0])
                target[0][action] = reward + q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        "Blend weights from the main model into the target (soft update by tau)"
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            # target = tau * online + (1 - tau) * target
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, filename):
        "Save the model"
        self.model.save(filename)


def main():
    "Train the model"
    env = gym.make("wordle-v0")
    trials = 1
    trial_len = 500
    dqn_agent = DQN(env=env)
    for trial in range(trials):
        print("Starting trial", trial)
        # cur_state = np.reshape(env.reset(), (1, 2))
        # cur_state = env.reset().reshape(1, 2)
        cur_state = np.reshape(env.reset(), (1, 5))
        print("cur_state", cur_state)
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            print("action", action)
            new_state, reward, done, _ = env.step(action)
            # if reward > 0:
            env.render()
            # new_state = np.reshape(new_state, (1, 2))
            # new_state = new_state.reshape(1, 2)
            new_state = np.reshape(new_state, (1, 5))
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.replay()        # internally iterates default (prediction) model
            dqn_agent.target_train()  # iterates target model
            cur_state = new_state
            if done:
                break
        if step >= 199:
            print(f"Failed to complete in trial {trial}")
            if step % 10 == 0:
                dqn_agent.save_model(f"trial-{trial}.model")
        else:
            print(f"Completed in {trial} trials")
            dqn_agent.save_model("success.model")
            break


if __name__ == "__main__":
    main()