# tictactoe/Player.py
from abc import ABC, abstractmethod
import numpy as np
import pickle
"""Player abstractions for the tic-tac-toe game."""


class Player(ABC):
    """Abstract base class every tic-tac-toe player must implement."""

    def __init__(self, name: str):
        self.name = name  # display name; also used by subclasses for file names

    @abstractmethod
    def get_hash(self, board: np.array):
        """Return a hashable representation of *board*."""

    @abstractmethod
    def choose_action(self, positions: list, current_board: np.array = None, symbol: int = -1) -> tuple:
        """Pick one of the free *positions* to play on *current_board* as *symbol*."""

    @abstractmethod
    def feed_reward(self, reward: float) -> None:
        """Propagate the end-of-game *reward* back through the visited states."""

    @abstractmethod
    def reset(self) -> None:
        """Clear per-game state so the player can start a fresh game."""

    @abstractmethod
    def add_state(self, state: np.array) -> None:
        """Record *state* as visited during the current game."""
class ComputerPlayer(Player):
    """Reinforcement-learning player that learns a state-value table via TD updates."""

    def __init__(self, name: str, board_cols: int, board_rows: int, exp_rate: float = 0.3):
        super().__init__(name)
        self.states = []          # board hashes visited during the current game
        self.lr = 0.2             # learning rate for the value update
        self.exp_rate = exp_rate  # probability of exploring (random move)
        self.decay_gamma = 0.9    # discount factor for future rewards
        self.states_value = {}    # board hash -> estimated state value
        self.board_cols = board_cols
        self.board_rows = board_rows

    def get_hash(self, board: np.array) -> str:
        """Return a string key for *board*, flattened to a single row."""
        return str(board.reshape(self.board_cols * self.board_rows))

    def choose_action(self, positions: list, current_board: np.array = None, symbol: int = -1) -> tuple:
        """Epsilon-greedy selection among the free *positions*.

        With probability ``exp_rate`` a random position is returned;
        otherwise the position whose resulting board has the highest
        learned value is returned (unknown states count as value 0).
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            # explore: pick a uniformly random free position
            idx = np.random.choice(len(positions))
            return positions[idx]
        # exploit: simulate each move and keep the best-valued successor.
        # -inf (was -999) is safe for any learned value; action=None avoids
        # an UnboundLocalError if positions is empty.
        value_max = -float('inf')
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            next_board_hash = self.get_hash(next_board)
            value = self.states_value.get(next_board_hash, 0)  # single lookup, default 0
            if value >= value_max:
                value_max = value
                action = p
        return action

    def add_state(self, state: np.array) -> None:
        """Record a visited board hash for the end-of-game value update."""
        self.states.append(state)

    def feed_reward(self, reward: float) -> None:
        """Backpropagate *reward* through the visited states, most recent first."""
        for st in reversed(self.states):
            if st not in self.states_value:
                self.states_value[st] = 0
            # Bellman-style TD update toward the discounted reward
            self.states_value[st] += self.lr * (self.decay_gamma * reward - self.states_value[st])
            reward = self.states_value[st]

    def reset(self) -> None:
        """Forget the states visited in the finished game (keeps learned values)."""
        self.states = []

    def save_policy(self) -> None:
        """Persist the learned state values to ``policy_<name>``."""
        # 'with' guarantees the handle is closed even if pickling raises
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file) -> None:
        """Load previously saved state values from *file*.

        NOTE(review): ``pickle.load`` can execute arbitrary code from a
        malicious file — only load policy files you created yourself.
        """
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)
class HumanPlayer(Player):
    """Interactive player that reads its moves from standard input."""

    def __init__(self, name):
        super().__init__(name)

    def choose_action(self, positions: list, current_board: np.array = None, symbol: int = -1) -> tuple:
        """Prompt until the user enters a (row, col) contained in *positions*."""
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # non-numeric input: re-prompt instead of crashing the game
                continue
            action = (row, col)
            if action in positions:
                return action

    def add_state(self, state) -> None:
        """No-op: a human does not track visited states."""

    def feed_reward(self, reward: float) -> None:
        """No-op: a human does not learn from rewards."""

    def get_hash(self, board: np.array):
        """No-op: board hashing is only needed by learning players."""

    def reset(self) -> None:
        """No-op: nothing to reset for a human player."""