initial commit

2022-07-22 13:08:35 +02:00
commit dd3eb11ea8
5 changed files with 412 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,25 @@
 # ---> macOS
 .DS_Store
 .AppleDouble
 .LSOverride
 # Icon must end with two \r
 Icon
 # Thumbnails
 ._*
 __pycache__
 # Files that might appear in the root of a volume
 .DocumentRevisions-V100
 .fseventsd
 .Spotlight-V100
 .TemporaryItems
 .Trashes
 .VolumeIcon.icns
 # Directories potentially created on remote AFP share
 .AppleDB
 .AppleDesktop
 Network Trash Folder
 Temporary Items
 .apdisk
 run_old.py
 .idea
 venv
--- a/Player.py
+++ b/Player.py
@ -0,0 +1,136 @@
 from abc import ABC, abstractmethod
 import numpy as np
 import pickle
 class Player(ABC):
    """Abstract base class for every participant of the game.

    Concrete players (computer or human) must implement all methods below.
    The only state kept here is the display ``name`` of the player.
    """

    def __init__(self, name: str):
        self.name = name

    @abstractmethod
    def get_hash(self, board: np.ndarray):
        """Return an identifier for the given ``board`` array."""

    @abstractmethod
    def choose_action(self, positions: list, current_board: np.ndarray = None, symbol: int = -1) -> tuple:
        """Pick one entry of ``positions`` (``(row, col)`` tuples) to play.

        ``current_board`` and ``symbol`` describe the board before the move
        and the token value the player would place; implementations may
        ignore them.
        """

    @abstractmethod
    def feed_reward(self, reward: float) -> None:
        """Receive the reward handed out at the end of a game."""

    @abstractmethod
    def reset(self) -> None:
        """Clear any per-game bookkeeping before the next game starts."""

    @abstractmethod
    def add_state(self, state: np.ndarray) -> None:
        """Record a board state that was reached during the current game."""
 class ComputerPlayer(Player):
    """Computer player that learns a value for every board state it visits.

    ``states_value`` maps the string produced by ``get_hash`` to a learned
    value. ``states`` collects what ``add_state`` receives during the current
    game; ``feed_reward`` walks them backwards to update the table.
    """

    def __init__(self, name: str, board_cols: int, board_rows: int, exp_rate: float = 0.3,
                 lr: float = 0.2, decay_gamma: float = 0.9):
        """Create a player.

        :param name: player name, also used for the policy file name
        :param board_cols: number of board columns
        :param board_rows: number of board rows
        :param exp_rate: probability of playing a random move instead of the best known one
        :param lr: learning rate used by ``feed_reward``
        :param decay_gamma: discount applied to the reward in ``feed_reward``
        """
        super().__init__(name)
        self.states = []
        self.lr = lr
        self.exp_rate = exp_rate
        self.decay_gamma = decay_gamma
        self.states_value = {}
        self.board_cols = board_cols
        self.board_rows = board_rows

    def get_hash(self, board: np.ndarray) -> str:
        """Return the string form of the flattened ``board``; used as table key."""
        board_hash = str(board.reshape(self.board_cols * self.board_rows))
        return board_hash

    def choose_action(self, positions: list, current_board: np.ndarray = None, symbol: int = -1) -> tuple:
        """Choose the next move out of ``positions``.

        With probability ``exp_rate`` a random position is returned. Otherwise
        every candidate is tried on a copy of ``current_board`` and the one
        leading to the highest known state value wins (unknown states count
        as 0, ties go to the later candidate).

        :raises ValueError: if ``positions`` is empty, or if a greedy move is
            requested without ``current_board``
        """
        if not positions:
            raise ValueError("no available positions to choose from")
        if np.random.uniform(0, 1) <= self.exp_rate:
            # take random action
            idx = np.random.choice(len(positions))
            return positions[idx]
        if current_board is None:
            raise ValueError("current_board is required to pick a greedy action")
        # -inf instead of a magic floor, so some candidate is always selected
        value_max = float("-inf")
        action = positions[0]
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.get_hash(next_board), 0)
            if value >= value_max:
                value_max = value
                action = p
        return action

    def add_state(self, state: np.ndarray) -> None:
        """Remember ``state`` as visited during the current game."""
        self.states.append(state)

    def feed_reward(self, reward: float) -> None:
        """Propagate ``reward`` backwards through the states of this game.

        Each visited state moves towards the discounted reward by ``lr``; the
        updated value then serves as the reward for the state before it.
        """
        for st in reversed(self.states):
            value = self.states_value.get(st, 0)
            value += self.lr * (self.decay_gamma * reward - value)
            self.states_value[st] = value
            reward = value

    def reset(self) -> None:
        """Forget the states of the finished game (learned values are kept)."""
        self.states = []

    def save_policy(self) -> None:
        """Pickle the value table into the file ``policy_<name>``."""
        # context manager closes the file even if pickling fails
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file) -> None:
        """Replace the value table with the one pickled in ``file``."""
        # NOTE: unpickling can execute arbitrary code; only load policy files
        # that come from a trusted source (e.g. written by save_policy).
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)
 class HumanPlayer(Player):
    """Player whose moves are typed in on the console.

    A human does not learn, so every bookkeeping method of the ``Player``
    interface is implemented as a no-op.
    """

    def __init__(self, name):
        super().__init__(name)

    def choose_action(self, positions: list, current_board: np.ndarray = None, symbol: int = -1) -> tuple:
        """Ask for a row and a column until the answer is one of ``positions``.

        ``current_board`` and ``symbol`` are accepted for interface
        compatibility and not used.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # non-numeric input: ask again instead of crashing the game
                print("Please enter whole numbers for row and col.")
                continue
            action = (row, col)
            if action in positions:
                return action
            print("Position", action, "is not available, try again.")

    def add_state(self, state) -> None:
        """No-op: a human player does not record states."""
        pass

    def feed_reward(self, reward: float) -> None:
        """No-op: a human player does not learn from rewards."""
        pass

    def get_hash(self, board: np.ndarray):
        """No-op: a human player does not need board hashes (returns None)."""
        pass

    def reset(self) -> None:
        """No-op: there is no per-game state to clear."""
        pass
--- a/State.py
+++ b/State.py
@ -0,0 +1,215 @@
 import numpy as np
 from Player import Player
 class State:
    """Game board, winning rules and the two game loops.

    The board is a ``board_rows`` x ``board_cols`` array holding ``1`` for
    player 1, ``-1`` for player 2 and ``0`` for an empty cell. ``play`` runs
    training games between two players; ``play2`` runs a single game that
    prints the board after every move.
    """

    def __init__(self, p1: "Player", p2: "Player", board_rows: int, board_cols: int):
        self.board = np.zeros((board_rows, board_cols))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.board_hash = None
        # symbol of the player whose turn it is; player 1 starts
        self.player_symbol = 1
        self.board_cols = board_cols
        self.board_rows = board_rows

    def get_hash(self) -> str:
        """Return (and cache in ``board_hash``) the string form of the flattened board."""
        self.board_hash = str(self.board.reshape(self.board_cols * self.board_rows))
        return self.board_hash

    def winner(self):
        """Evaluate the board and update ``isEnd``.

        A player wins by filling a complete row, column or (on square boards)
        diagonal; on the 3x3 board this is the usual three in a row.

        :return: ``1`` if player 1 won, ``-1`` if player 2 won, ``0`` for a
            tie (board full), ``None`` if the game is still running
        """
        rows, cols = self.board_rows, self.board_cols
        # (sum of the line, number of cells in the line), rows first, then columns
        line_sums = [(self.board[i, :].sum(), cols) for i in range(rows)]
        line_sums += [(self.board[:, j].sum(), rows) for j in range(cols)]
        for total, length in line_sums:
            for symbol in (1, -1):
                if total == symbol * length:
                    self.isEnd = True
                    return symbol
        # diagonals only exist as full lines on a square board
        if rows == cols:
            diag_sum1 = sum(self.board[i, i] for i in range(rows))
            diag_sum2 = sum(self.board[i, cols - i - 1] for i in range(rows))
            for symbol in (1, -1):
                if symbol * rows in (diag_sum1, diag_sum2):
                    self.isEnd = True
                    return symbol
        # tie: no available positions
        if not self.available_positions():
            self.isEnd = True
            return 0
        # not end
        self.isEnd = False
        return None

    def available_positions(self) -> list:
        """Return all empty cells as ``(row, col)`` tuples in row-major order."""
        return [
            (i, j)
            for i in range(self.board_rows)
            for j in range(self.board_cols)
            if self.board[i, j] == 0
        ]

    def update_state(self, position) -> None:
        """Place the current player's token on ``position`` and switch turns."""
        self.board[position] = self.player_symbol
        self.player_symbol = -1 if self.player_symbol == 1 else 1

    def give_reward(self) -> None:
        """Hand out rewards to both players according to ``winner()``.

        The winner receives 1 and the loser 0. Any other result (a tie, or a
        game that is not finished) gives 0.1 to player 1 and 0.5 to player 2.
        """
        result = self.winner()
        if result == 1:
            # P1 won
            self.p1.feed_reward(1)
            self.p2.feed_reward(0)
        elif result == -1:
            # P2 won
            self.p1.feed_reward(0)
            self.p2.feed_reward(1)
        else:
            # Tie
            self.p1.feed_reward(0.1)
            self.p2.feed_reward(0.5)

    def reset(self) -> None:
        """Reset the board to the start position with player 1 to move."""
        self.board = np.zeros((self.board_rows, self.board_cols))
        self.board_hash = None
        self.isEnd = False
        self.player_symbol = 1

    def _training_turn(self, player) -> bool:
        """Let ``player`` move in a training game; return True if the game ended.

        When the game ends, rewards are given and both players and the board
        are reset for the next round.
        """
        positions = self.available_positions()
        action = player.choose_action(positions, self.board, self.player_symbol)
        self.update_state(action)
        player.add_state(self.get_hash())
        if self.winner() is None:
            return False
        self.give_reward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    def play(self, rounds=100) -> None:
        """Play ``rounds`` training games between ``p1`` and ``p2``.

        Progress is printed every 1000 rounds.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            while not self.isEnd:
                # player 1 moves first; player 2 only moves if the game goes on
                if self._training_turn(self.p1) or self._training_turn(self.p2):
                    break

    def _shown_turn(self, player, winning_symbol: int) -> bool:
        """Let ``player`` move, print the board; return True if the game ended.

        On game end the result is announced (``winning_symbol`` is the
        ``winner()`` value that means ``player`` won) and the board is reset.
        """
        positions = self.available_positions()
        action = player.choose_action(positions, self.board, self.player_symbol)
        self.update_state(action)
        self.show_board()
        win = self.winner()
        if win is None:
            return False
        if win == winning_symbol:
            print(player.name, "wins!")
        else:
            print("tie!")
        self.reset()
        return True

    def play2(self) -> None:
        """Play one game with the board printed after each move (game with a human)."""
        while not self.isEnd:
            if self._shown_turn(self.p1, 1) or self._shown_turn(self.p2, -1):
                break

    def show_board(self):
        """Print the board; player 1 is shown as ``x``, player 2 as ``o``."""
        # one "| t " group per column plus the closing bar: 13 dashes for 3 columns
        separator = '-' * (4 * self.board_cols + 1)
        tokens = {1: 'x', -1: 'o', 0: ' '}
        for i in range(self.board_rows):
            print(separator)
            out = '| '
            for j in range(self.board_cols):
                # '?' marks an unexpected cell value instead of failing
                out += tokens.get(self.board[i, j], '?') + ' | '
            print(out)
        print(separator)
--- a/main.py
+++ b/main.py
@ -0,0 +1,29 @@
 from Player import ComputerPlayer, HumanPlayer
 from State import State
 # Board dimensions passed to every player and game state created below.
 BOARD_ROWS = 3
 BOARD_COLS = 3
 def tic_tac_toe():
    """Train two computer players against each other, then play against a human.

    The policy learned by the first trained player is written to disk and
    loaded again by a non-exploring computer player that faces the human.
    """
    dims = dict(board_cols=BOARD_COLS, board_rows=BOARD_ROWS)

    # training phase: two computer players
    trainee_one = ComputerPlayer("p1", **dims)
    trainee_two = ComputerPlayer("p2", **dims)
    training_game = State(trainee_one, trainee_two, **dims)
    print("training...")
    training_game.play(50000)
    trainee_one.save_policy()

    # match phase: trained computer (no exploration) versus human
    computer = ComputerPlayer("computer", exp_rate=0, **dims)
    computer.load_policy("policy_p1")
    human = HumanPlayer("human")
    match = State(computer, human, **dims)
    match.play2()
 # Run training and the human match only when executed as a script.
 if __name__ == "__main__":
    tic_tac_toe()
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,7 @@
 # Tic Tac Toe
 A simple game of tic tac toe
 Q-Learning example as part of the asim reinforcement learning tutorial.
 The agent (player) learns not to lose the game, given a sufficiently long training period.