from abc import ABC, abstractmethod
import pickle

import numpy as np


class Player(ABC):
    """Abstract interface for a game player (computer or human)."""

    def __init__(self, name: str):
        self.name = name

    @abstractmethod
    def get_hash(self, board: np.ndarray):
        """Return a hashable representation of *board*."""

    @abstractmethod
    def choose_action(self, positions: list, current_board: np.ndarray = None,
                      symbol: int = -1) -> tuple:
        """Pick one move from *positions* (list of (row, col) tuples)."""

    @abstractmethod
    def feed_reward(self, reward: float) -> None:
        """Propagate the end-of-game reward back through visited states."""

    @abstractmethod
    def reset(self) -> None:
        """Clear per-game state before a new game."""

    @abstractmethod
    def add_state(self, state: str) -> None:
        """Record a visited board state (its hash string)."""


class ComputerPlayer(Player):
    """Reinforcement-learning player.

    Uses epsilon-greedy action selection over a learned state-value table
    and a temporal-difference update at the end of each game.
    """

    def __init__(self, name: str, board_cols: int, board_rows: int,
                 exp_rate: float = 0.3):
        super().__init__(name)
        self.states = []            # hashes of board states visited this game
        self.lr = 0.2               # learning rate
        self.exp_rate = exp_rate    # epsilon: probability of a random move
        self.decay_gamma = 0.9      # discount factor
        self.states_value = {}      # board hash -> estimated state value
        self.board_cols = board_cols
        self.board_rows = board_rows

    def get_hash(self, board: np.ndarray) -> str:
        """Return the board flattened to 1-D and stringified, as a dict key."""
        return str(board.reshape(self.board_cols * self.board_rows))

    def choose_action(self, positions: list, current_board: np.ndarray = None,
                      symbol: int = -1) -> tuple:
        """Return the move to play from *positions*.

        With probability ``exp_rate`` a uniformly random position is
        explored; otherwise the position whose resulting board has the
        highest learned value is exploited (ties broken by the last
        position scanned, matching the original ``>=`` comparison).
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            # Explore: pick a random available position.
            idx = np.random.choice(len(positions))
            return positions[idx]

        # Exploit: simulate each move and pick the highest-valued result.
        value_max = -float('inf')
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            # Unknown states default to a value of 0.
            value = self.states_value.get(self.get_hash(next_board), 0)
            if value >= value_max:
                value_max = value
                action = p
        return action

    def add_state(self, state: str) -> None:
        """Append a visited board hash to this game's trajectory."""
        self.states.append(state)

    def feed_reward(self, reward: float) -> None:
        """Backpropagate *reward* through the visited states (latest first).

        Applies the TD update
        ``V(s) += lr * (gamma * reward - V(s))`` and then feeds the
        updated value backwards as the reward for the preceding state.
        """
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.lr * (
                self.decay_gamma * reward - self.states_value[st])
            reward = self.states_value[st]

    def reset(self) -> None:
        """Discard the current game's trajectory."""
        self.states = []

    def save_policy(self) -> None:
        """Pickle the value table to ``policy_<name>`` in the CWD."""
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file) -> None:
        """Load a pickled value table from *file*.

        NOTE(security): pickle deserialization executes arbitrary code —
        only load policy files from trusted sources.
        """
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)


class HumanPlayer(Player):
    """Interactive player that reads moves from standard input."""

    def __init__(self, name):
        super().__init__(name)

    def choose_action(self, positions: list, current_board: np.ndarray = None,
                      symbol: int = -1) -> tuple:
        """Prompt until the user enters a (row, col) that is in *positions*."""
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # Non-integer input: re-prompt instead of crashing.
                continue
            action = (row, col)
            if action in positions:
                return action

    def add_state(self, state) -> None:
        """No-op: a human player keeps no trajectory."""

    def feed_reward(self, reward: float) -> None:
        """No-op: a human player learns nothing."""

    def get_hash(self, board: np.ndarray):
        """No-op: a human player does not hash boards."""

    def reset(self) -> None:
        """No-op: a human player has no per-game state to clear."""