-
Notifications
You must be signed in to change notification settings - Fork 2
/
model.py
113 lines (83 loc) · 4.27 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import tensorflow as tf
import numpy as np
from collections import deque
class DQNetwork:
    """Convolutional Deep Q-Network built with the TensorFlow 1.x graph API.

    The graph maps a batch of states of shape ``(30, 100, 1)`` to one
    Q-value per action (3 actions) and exposes a mean-squared-error loss
    together with an Adam training op.

    Args:
        learning_rate: Step size handed to ``tf.train.AdamOptimizer``.
        name: Variable scope under which every tensor/variable is created.

    Attributes:
        inputs_: Float placeholder, shape ``(None, 30, 100, 1)``.
        actions_: Float placeholder, shape ``(None, action_size)``.
        target_Q: Float placeholder, shape ``(None,)``.
        output: Q-values for every action, shape ``(None, action_size)``.
        Q: Per-sample Q-value of the action given in ``actions_``,
            shape ``(None,)``.
        loss: Scalar mean squared error between ``target_Q`` and ``Q``.
        optimizer: Op that performs one Adam step on ``loss``.
    """

    def __init__(self, learning_rate, name='DQNetwork'):
        self.state_size = (30, 100, 1)
        self.action_size = 3
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # Placeholders. The leading None is the (variable) batch dimension;
            # the state placeholder is therefore [None, 30, 100, 1].
            self.inputs_ = tf.placeholder(
                tf.float32, (None,) + self.state_size, name="inputs")
            # NOTE(review): the Q selection below only picks a single action's
            # value if each row here is one-hot - confirm against the caller.
            self.actions_ = tf.placeholder(
                tf.float32, [None, self.action_size], name="actions_")
            # target_Q is R(s, a) + gamma * max_a' Qhat(s', a'), one per sample.
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # First conv layer + ReLU: 30x100x1 -> 7x25x32
            # (4x4 kernel, stride 4, VALID padding).
            self.conv1 = tf.layers.conv2d(
                inputs=self.inputs_,
                filters=32,
                kernel_size=[4, 4],
                strides=[4, 4],
                padding="VALID",
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                name="conv1")
            self.conv1_out = tf.nn.relu(self.conv1, name="conv1_out")

            # Second conv layer + ReLU: 7x25x32 -> 3x12x64
            # (3x3 kernel, stride 2, VALID padding).
            self.conv2 = tf.layers.conv2d(
                inputs=self.conv1_out,
                filters=64,
                kernel_size=[3, 3],
                strides=[2, 2],
                padding="VALID",
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                name="conv2")
            self.conv2_out = tf.nn.relu(self.conv2, name="conv2_out")

            # Third conv layer + ReLU: 1x1 kernel, stride 1, so the spatial
            # size is unchanged (3x12x64).
            self.conv3 = tf.layers.conv2d(
                inputs=self.conv2_out,
                filters=64,
                kernel_size=[1, 1],
                strides=[1, 1],
                padding="VALID",
                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                name="conv3")
            self.conv3_out = tf.nn.relu(self.conv3, name="conv3_out")

            # Flatten to [None, 3 * 12 * 64] before the dense head.
            self.flatten = tf.contrib.layers.flatten(self.conv3_out)

            self.fc = tf.layers.dense(
                inputs=self.flatten,
                units=512,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="fc1")

            # Linear output layer: one Q-value per action.
            self.output = tf.layers.dense(
                inputs=self.fc,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                units=self.action_size,
                activation=None)

            # Predicted Q-value of the supplied action, one per sample.
            # axis=1 is essential: without it the sum collapses the whole
            # batch into a single scalar, which is then broadcast against
            # every entry of target_Q and corrupts the loss for batch > 1.
            self.Q = tf.reduce_sum(
                tf.multiply(self.output, self.actions_), axis=1)

            # Loss: mean over the batch of (target_Q - Q)^2.
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            self.optimizer = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.loss)
class ReplayMemory:
    """Bounded experience store backed by a ``collections.deque``.

    Once ``max_size`` items are held, appending a new one drops the oldest
    (this is the behavior of ``deque(maxlen=...)``).
    """

    def __init__(self, max_size):
        """Create an empty buffer that keeps at most ``max_size`` items."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append ``experience`` to the right end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` stored items drawn without replacement.

        Positions are drawn with ``np.random.choice(..., replace=False)``,
        so no stored position is returned twice within one call, and NumPy
        raises ``ValueError`` when ``batch_size`` exceeds the number of
        stored items.
        """
        positions = np.arange(len(self.buffer))
        picked = np.random.choice(positions, size=batch_size, replace=False)
        return [self.buffer[pos] for pos in picked]