train.py
from reversi_env import Reversi
from agant import Agent_PG, Agent_DQN
import numpy as np
import matplotlib.pyplot as plt
import copy
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
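
# Self-play training script for Reversi: the White agent is the one being trained
# (with either the policy-gradient or the DQN agent from agant.py), while the Black
# agent acts as the opponent and is periodically refreshed with White's weights once
# White wins consistently.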
if __name__ == "__main__":
    model = "DQN"  # "PG", "DQN"
    if model == "PG":
        white_check_point = "agent_White_PG2000"
        black_check_point = "agent_Black_PG2000"
        agent_White = Agent_PG("White", device=device).to(device)
        agent_Black = Agent_PG("Black", device=device).to(device)
        if white_check_point:
            agent_White.load_model(white_check_point)
        if black_check_point:
            agent_Black.load_model(black_check_point)
        env = Reversi(human_VS_machine=False)
        reward_history, winning_rate = [], []
        is_White = []
        max_epoch = 10000
        RENDER = False
        for ep in range(1, max_epoch + 1):
            ep_reward = []
            obs, info = env.reset()
            done = False
            if RENDER: env.render()
            while True:
                next_player = info["next_player"]
                next_possible_actions = info["next_possible_actions"]
                if next_player == "White":  # we train the white player
                    action = agent_White.choose_action(obs, next_possible_actions)
                    obs_, reward, done, info = env.step(action)
                    ep_reward.append(reward)
                    agent_White.store_transition(obs, action, reward, next_possible_actions)
                else:  # the black player acts randomly in the PG setting
                    # action = agent_Black.choose_action(obs, next_possible_actions)
                    action = env.get_random_action()
                    obs_, reward, done, info = env.step(action)
                    if done:
                        # when Black takes the last turn and the game ends,
                        # the white player's final reward is updated here
                        if info["winner"] == "Black":
                            agent_White.ep_rs[-1] -= 10
                            ep_reward[-1] -= 10
                        elif info["winner"] == "White":
                            agent_White.ep_rs[-1] += 10
                            ep_reward[-1] += 10
                        else:  # "Tie"
                            agent_White.ep_rs[-1] += 2
                            ep_reward[-1] += 2
                obs = copy.deepcopy(obs_)
                if RENDER: env.render()
                if done:  # game over
                    loss = agent_White.learn()
                    print("ep: {:d}/{:d}, white player training loss value: {:.4f}".format(ep, max_epoch, loss))
                    is_White.append(info["winner"] == "White")
                    break
            reward_history.append(np.sum(ep_reward))
            if ep % 20 == 0:  # every 20 episodes, log the winning rate and possibly update the black player
                winning_rate.append(np.mean(is_White))
                is_White = []
                print("ep: {:d}/{:d}, white player winning rate in latest 20 rounds: {:.2%}.".format(ep, max_epoch, winning_rate[-1]))
                if len(winning_rate) >= 3 and all(w >= 0.65 for w in winning_rate[-3:]):
                    agent_Black.weights_assign(agent_White.brain)
                    print("ep: {:d}/{:d}, black player updated.".format(ep, max_epoch))
        # end of training
        agent_White.save_model("agent_PG")
        # plot the white player's winning rate over training
        plt.figure("White winning rate")
        plt.plot(range(20, max_epoch + 1, 20), winning_rate)
        plt.show()
    elif model == "DQN":
        white_check_point = None
        black_check_point = None
        agent_White = Agent_DQN("White", device=device).to(device)
        agent_Black = Agent_DQN("Black", device=device).to(device)
        if white_check_point:
            agent_White.load_model(white_check_point)
        if black_check_point:
            agent_Black.load_model(black_check_point)
        env = Reversi(human_VS_machine=False)
        reward_history, winning_rate = [], []
        best_model, best_winning_rate = None, 0.  # the model that obtained the highest winning rate, regardless of opponent
        is_White = []
        max_epoch = 20000
        dominant_counter_white = 0
        RENDER = False
        for ep in range(1, max_epoch + 1):
            ep_reward = []
            obs, info = env.reset()
            done = False
            if RENDER: env.render()
            while True:
                next_player = info["next_player"]
                next_possible_actions = info["next_possible_actions"]
                if next_player == "White":  # we train the white player
                    action = agent_White.choose_action(obs, next_possible_actions)
                    obs_, reward, done, info = env.step(action)
                    ep_reward.append(reward)
                    agent_White.store_transition(obs, action, reward, done, obs_, next_possible_actions)
                else:  # the black player is the (periodically refreshed) opponent
                    action = agent_Black.choose_action(obs, next_possible_actions)
                    # action = env.get_random_action()
                    obs_, reward, done, info = env.step(action)
                    if done:
                        # when Black takes the last turn and the game ends,
                        # the white player's final reward is updated here
                        if info["winner"] == "Black":
                            agent_White.reward_transition_update(-10.)
                        elif info["winner"] == "White":
                            agent_White.reward_transition_update(10.)
                        else:  # "Tie"
                            agent_White.reward_transition_update(2.)
                obs = copy.deepcopy(obs_)
                if RENDER: env.render()
                if done:  # game over
                    loss = agent_White.learn()
                    print("ep: {:d}/{:d}, white player training loss value: {:.4f}".format(ep, max_epoch, loss))
                    is_White.append(info["winner"] == "White")
                    break
            reward_history.append(np.sum(ep_reward))
            if ep % 20 == 0:  # log the winning rate every 20 episodes
                winning_rate.append(np.mean(is_White))
                is_White = []
                print("ep: {:d}/{:d}, white player winning rate in latest 20 rounds: {:.2%}.".format(ep, max_epoch, winning_rate[-1]))
                if best_winning_rate <= winning_rate[-1]:
                    best_model = copy.deepcopy(agent_White)
                    best_winning_rate = winning_rate[-1]
                if winning_rate[-1] >= 0.60:
                    dominant_counter_white += 1
                else:
                    dominant_counter_white = 0
                if dominant_counter_white >= 3:  # white has dominated three consecutive checks: refresh the black player
                    dominant_counter_white = 0
                    agent_Black.weights_assign(agent_White.brain_evl)
                    print("ep: {:d}/{:d}, black player updated.".format(ep, max_epoch))
        # end of training
        agent_White.save_model("Brain_DQN_prioritized_White20000")
        agent_Black.save_model("Brain_DQN_prioritized_Black20000")
        best_model.save_model("Brain_DQN_prioritized_Best20000")
        # plot the white player's winning rate over training
        plt.figure("White winning rate")
        plt.plot(range(20, max_epoch + 1, 20), winning_rate)
        plt.show()