-
Notifications
You must be signed in to change notification settings - Fork 1
/
TCGame_Env.py
126 lines (95 loc) · 4.99 KB
/
TCGame_Env.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from gym import spaces
import numpy as np
import random
from itertools import groupby
from itertools import product
class TicTacToe():
    def __init__(self):
        """Initialise the numerical tic-tac-toe board.

        The board is a flat list of 9 cells; np.nan marks an empty cell.
        The agent plays odd numbers (1, 3, 5, 7, 9), the environment plays
        even numbers (2, 4, 6, 8); completing a row/column/diagonal that
        sums to 15 wins.
        """
        # Board positions 0..8, all empty at the start.
        self.state = [np.nan for _ in range(9)]
        # The numbers 1..9 that may be placed over a full game.
        self.all_possible_numbers = [i for i in range(1, len(self.state) + 1)]
        self.reset()

    def is_winning(self, curr_state):
        """Return True iff any row, column or diagonal of curr_state sums to 15.

        Example: [1, 2, 3, 4, nan, nan, nan, nan, nan] -> False
        """
        # Board layout (indices):
        #   0 1 2
        #   3 4 5
        #   6 7 8
        lines = [
            (curr_state[0], curr_state[1], curr_state[2]),  # rows
            (curr_state[3], curr_state[4], curr_state[5]),
            (curr_state[6], curr_state[7], curr_state[8]),
            (curr_state[0], curr_state[3], curr_state[6]),  # columns
            (curr_state[1], curr_state[4], curr_state[7]),
            (curr_state[2], curr_state[5], curr_state[8]),
            (curr_state[0], curr_state[4], curr_state[8]),  # diagonals
            (curr_state[2], curr_state[4], curr_state[6]),
        ]
        # np.sum over a line containing nan yields nan, which never equals 15,
        # so incomplete lines are rejected automatically.
        return any(np.sum(line) == 15 for line in lines)

    def is_terminal(self, curr_state):
        """Return (is_terminal, status); status is 'Win', 'Tie' or 'Resume'."""
        if self.is_winning(curr_state):
            return True, 'Win'
        if not self.allowed_positions(curr_state):
            # No blank cell left and nobody has won.
            return True, 'Tie'
        return False, 'Resume'

    def allowed_positions(self, curr_state):
        """Return the indices of all blank (nan) cells in curr_state."""
        return [i for i, val in enumerate(curr_state) if np.isnan(val)]

    def allowed_values(self, curr_state):
        """Return (agent_values, env_values): the unused odd and even numbers."""
        used_values = [val for val in curr_state if not np.isnan(val)]
        agent_values = [val for val in self.all_possible_numbers
                        if val not in used_values and val % 2 != 0]
        env_values = [val for val in self.all_possible_numbers
                      if val not in used_values and val % 2 == 0]
        return (agent_values, env_values)

    def action_space(self, curr_state):
        """Return (agent_actions, env_actions), each a list of (position, value) pairs."""
        positions = self.allowed_positions(curr_state)
        agent_values, env_values = self.allowed_values(curr_state)
        agent_actions = list(product(positions, agent_values))
        env_actions = list(product(positions, env_values))
        return (agent_actions, env_actions)

    def state_transition(self, curr_state, curr_action):
        """Return the board position after applying curr_action = (position, value).

        Bug fix: works on a copy, so the caller's curr_state list is NOT
        mutated in place. The original mutated its argument, which corrupted
        any state the agent had retained (e.g. states used as Q-table keys).

        Example: state [1, 2, 3, 4, nan, ...], action (7, 9)
                 -> [1, 2, 3, 4, nan, nan, nan, 9, nan]
        """
        next_state = list(curr_state)
        next_state[curr_action[0]] = curr_action[1]
        return next_state

    def step(self, curr_state, curr_action):
        """Apply the agent's action, then (if the game continues) a random
        environment action; return (next_state, reward, is_terminal).

        Rewards: +10 if the agent's move wins, -10 if the environment's move
        wins, 0 for a tie, -1 for every non-terminal step.
        """
        next_state = self.state_transition(curr_state, curr_action)
        terminal, status = self.is_terminal(next_state)
        if terminal:
            # 'Win' here means the agent's move completed a 15-line.
            return next_state, (10 if status == 'Win' else 0), True
        # Environment replies with a uniformly random legal even-number move.
        env_action = random.choice(self.action_space(next_state)[1])
        next_next_state = self.state_transition(next_state, env_action)
        terminal, status = self.is_terminal(next_next_state)
        if terminal:
            # 'Win' here means the environment's move completed a 15-line.
            return next_next_state, (-10 if status == 'Win' else 0), True
        return next_next_state, -1, False

    def reset(self):
        """Clear the board to all-empty and return the fresh state.

        Bug fix: the original returned self.state without clearing it, so a
        new episode would start on the previous (possibly finished) board.
        """
        self.state = [np.nan for _ in range(9)]
        return self.state