todo: implement q learning
This commit is contained in:
parent
dd3eb11ea8
commit
a228356319
25
Player.py
25
Player.py
@ -58,21 +58,8 @@ class ComputerPlayer(Player):
|
||||
'''
|
||||
|
||||
def choose_action(self, positions: list, current_board: np.ndarray = None, symbol: int = -1) -> tuple:
    """Pick a move with an epsilon-greedy policy.

    With probability ``self.exp_rate`` a random legal position is explored;
    otherwise each candidate move is simulated on a copy of the board and the
    move whose resulting state has the highest learned value in
    ``self.states_value`` is exploited (unseen states count as value 0).

    Args:
        positions: non-empty list of available board positions (index tuples).
        current_board: current board array; required for the greedy branch.
        symbol: this player's mark to place on the trial board.

    Returns:
        The chosen position, taken from ``positions``.

    Raises:
        ValueError: if ``positions`` is empty (the original code would have
            died with an opaque numpy error or an unbound ``action``).
    """
    if not positions:
        raise ValueError("positions must be non-empty")
    if np.random.uniform(0, 1) <= self.exp_rate:
        # Explore: uniformly random legal move.
        return positions[np.random.choice(len(positions))]
    # Exploit: -inf (not a magic -999) guarantees the first candidate is
    # accepted; `>=` keeps the original behavior of later ties winning.
    value_max = -float("inf")
    action = positions[0]
    for p in positions:
        next_board = current_board.copy()
        next_board[p] = symbol
        # Single dict lookup; unseen states default to value 0.
        value = self.states_value.get(self.get_hash(next_board), 0)
        if value >= value_max:
            value_max = value
            action = p
    return action
|
||||
|
||||
def add_state(self, state: np.array) -> None:
    """Record a visited board state so the end-of-game reward pass can
    walk back over the trajectory and update each state's value."""
    # In-place extend: same effect and aliasing behavior as append().
    self.states += [state]
|
||||
@ -82,12 +69,8 @@ class ComputerPlayer(Player):
|
||||
'''
|
||||
|
||||
def feed_reward(self, reward: float) -> None:
    """Back-propagate a game-end reward through the visited states.

    Walks ``self.states`` newest-to-oldest and nudges each state's value
    toward the discounted reward with a temporal-difference update:

        V(s) += lr * (gamma * reward - V(s))

    The updated value then becomes the reward fed to the preceding state,
    so earlier moves receive geometrically discounted credit.

    Args:
        reward: terminal reward for the finished game (e.g. 1 for a win).
    """
    for st in reversed(self.states):
        # Lazily initialize unseen states at value 0.
        if self.states_value.get(st) is None:
            self.states_value[st] = 0
        # Bellman / TD-style update.
        self.states_value[st] += self.lr * (self.decay_gamma * reward - self.states_value[st])
        # Propagate the updated value backwards as the next reward.
        reward = self.states_value[st]
|
||||
|
||||
def reset(self) -> None:
    """Forget the trajectory recorded for the previous game so a fresh
    one can be accumulated. Rebinds (does not mutate) ``self.states``."""
    self.states = list()
|
||||
|
Loading…
Reference in New Issue
Block a user