-
Notifications
You must be signed in to change notification settings - Fork 0
/
nfq.py
72 lines (61 loc) · 1.91 KB
/
nfq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from RobotHoop import RobotHoop
import random, time
import numpy as np
import matplotlib.pyplot as plt
import torch
# Number of random-policy episodes to roll out when collecting transitions.
iterations = 200
# Hard cap on steps per episode (episodes also end early on a terminal state).
maxsteps = 100
# Passes over the recorded transition set during fitted-Q training.
epochs = 50
# Discrete action set passed to RobotHoop.step(*action).
# NOTE(review): each tuple appears to be (left_motor, right_motor, release?) with
# 'f' = forward, 's' = ? (presumably slow/stop/backward) and '' = idle, and the
# final ('', '', True) a release/terminate action — confirm against RobotHoop.
allactions = [('f', 'f', False), ('f', 's', False), ('f', '', False), ('s', 'f', False), ('s', 's', False), ('s', '', False), ('', 'f', False), ('', 's', False), ('','',False), ('', '', True)]
class NeuralNet(torch.nn.Module):
    """Small value-function approximator: maps a 5-dim state to one scalar.

    BUG FIX: the original stacked two ``Linear`` layers with no activation
    between them, which collapses algebraically into a single linear map —
    the hidden layer added nothing. A ``ReLU`` is inserted so the network
    can represent nonlinear value functions. The external interface is
    unchanged: input ``(..., 5)`` -> output ``(..., 1)``.

    NOTE(review): the output is a single value, yet the script argmaxes
    over it to pick among 10 actions — a per-action output head may have
    been intended; confirm the design.
    """

    def __init__(self):
        super().__init__()
        # 5 state features -> 10 hidden units (ReLU) -> 1 value estimate.
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(5, 10),
            torch.nn.ReLU(),
            torch.nn.Linear(10, 1),
        )

    def forward(self, x):
        """Return the value estimate for state tensor ``x`` (last dim = 5)."""
        return self.layers(x)
# Roll out episodes under a uniformly random policy and record every
# (state, action_index, reward, next_state) transition for the offline
# fitted-Q training pass below.
states = []
actions = []
rewards = []
next_states = []
for episode in range(iterations):
    env = RobotHoop(-2, .5, .5, False)
    for _step in range(maxsteps):
        state = env.state()
        # Draw a uniform action index directly (equivalent to
        # random.choice(allactions) followed by .index()).
        act_idx = random.randrange(len(allactions))
        outcome = env.step(*allactions[act_idx])
        states.append(state)
        actions.append(act_idx)
        rewards.append(outcome['reward'])
        next_states.append(env.state())
        # Stop the episode as soon as the environment reports a terminal state.
        if outcome['end']:
            break
# ----- Fitted-Q training over the recorded transitions -----
discount_factor = .95
Q = NeuralNet()
optimizer = torch.optim.Rprop(Q.parameters())
for epoch in range(epochs):
    # BUG FIX: the original rebuilt ``Q = NeuralNet()`` here every epoch,
    # discarding all learning and leaving ``optimizer`` bound to the first
    # (stale) network's parameters; Q is now constructed once, above.
    print(epoch)  # progress indicator
    for j in range(len(states)):
        st = torch.tensor(states[j]).float()
        re = torch.tensor(rewards[j]).float()
        ns = torch.tensor(next_states[j]).float()
        # BUG FIX: gradients previously accumulated across every step;
        # clear them before each backward pass.
        optimizer.zero_grad()
        # Temporal-difference target r + gamma * Q(s'), detached so the
        # bootstrap target is treated as a constant during backprop.
        target = re + discount_factor * Q(ns).detach()
        # BUG FIX: minimize the squared TD error (the intent shown by the
        # original's commented-out loss line), not the target value itself.
        loss = (Q(st) - target) ** 2
        loss.sum().backward()
        optimizer.step()
        # NOTE(review): ``actions[j]`` is never used because Q outputs a
        # single value, so this learns a state value rather than per-action
        # Q-values; a proper NFQ head would emit one value per action —
        # confirm the intended design.
# ----- Greedy evaluation of the trained network, with visualization -----
env = RobotHoop(-2, .5, .5, True)
finished = False
while not finished:
    # Pick the action whose index maximizes the network output for the
    # current state.
    q_values = Q(torch.tensor(env.state()).float())
    best = int(torch.argmax(q_values))
    outcome = env.step(*allactions[best])
    if outcome['end']:
        finished = True
        # A terminal reward of 100 marks a successful run.
        print('success' if outcome['reward'] == 100 else 'failure')
    else:
        env.vis()
        time.sleep(.1)