commit dd3eb11ea827ad1aaaa79c067455131c15764075
Author: Dominik Brunmeir
Date:   Fri Jul 22 13:08:35 2022 +0200

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..82f4808
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+# ---> macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+__pycache__
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+run_old.py
+.idea
+venv
diff --git a/Player.py b/Player.py
new file mode 100644
index 0000000..ea1bcbe
--- /dev/null
+++ b/Player.py
@@ -0,0 +1,136 @@
+from abc import ABC, abstractmethod
+
+import numpy as np
+import pickle
+
+'''
+    Class defines the computer player
+'''
+
+
+class Player(ABC):
+
+    def __init__(self, name: str):
+        self.name = name
+
+    @abstractmethod
+    def get_hash(self, board: np.array):
+        pass
+
+    @abstractmethod
+    def choose_action(self, positions: list, current_board: np.array = None, symbol: int = -1) -> tuple:
+        pass
+
+    @abstractmethod
+    def feed_reward(self, reward: float) -> None:
+        pass
+
+    @abstractmethod
+    def reset(self) -> None:
+        pass
+
+    @abstractmethod
+    def add_state(self, state: np.array) -> None:
+        pass
+
+
+class ComputerPlayer(Player):
+    def __init__(self, name: str, board_cols: int, board_rows: int, exp_rate: float = 0.3):
+        super().__init__(name)
+        self.states = []
+        self.lr = 0.2
+        self.exp_rate = exp_rate
+        self.decay_gamma = 0.9
+        self.states_value = {}
+        self.board_cols = board_cols
+        self.board_rows = board_rows
+
+    '''
+        Get board hash
+    '''
+
+    def get_hash(self, board: np.array) -> str:
+        board_hash = str(board.reshape(self.board_cols * self.board_rows))
+        return board_hash
+
+    '''
+        Get best action for current state
+    '''
+
+    def choose_action(self, positions: list, current_board: np.array = None, symbol: int = -1) -> tuple:
+        if np.random.uniform(0, 1) <= self.exp_rate:
+            # take random action
+            idx = np.random.choice(len(positions))
+            action = positions[idx]
+        else:
+            value_max = -999
+            for p in positions:
+                next_board = current_board.copy()
+                next_board[p] = symbol
+                next_board_hash = self.get_hash(next_board)
+                value = 0 if self.states_value.get(next_board_hash) is None else self.states_value.get(next_board_hash)
+                if value >= value_max:
+                    value_max = value
+                    action = p
+        return action
+
+    def add_state(self, state: np.array) -> None:
+        self.states.append(state)
+
+    '''
+        At the end of the game, backpropagate and update state values
+    '''
+
+    def feed_reward(self, reward: float) -> None:
+        for st in reversed(self.states):
+            if self.states_value.get(st) is None:
+                self.states_value[st] = 0
+            # Bellman equation
+            self.states_value[st] += self.lr * (self.decay_gamma * reward - self.states_value[st])
+            reward = self.states_value[st]
+
+    def reset(self) -> None:
+        self.states = []
+
+    def save_policy(self) -> None:
+        fw = open('policy_' + str(self.name), 'wb')
+        pickle.dump(self.states_value, fw)
+        fw.close()
+
+    def load_policy(self, file) -> None:
+        fr = open(file, 'rb')
+        self.states_value = pickle.load(fr)
+        fr.close()
+
+
+'''
+    Class for the human player
+'''
+
+
+class HumanPlayer(Player):
+
+    def __init__(self, name):
+        super().__init__(name)
+
+    def choose_action(self, positions: list, current_board: np.array = None, symbol: int = -1) -> tuple:
+        while True:
+            row = int(input("Input your action row:"))
+            col = int(input("Input your action col:"))
+            action = (row, col)
+            if action in positions:
+                return action
+
+    # append a hash state
+    def add_state(self, state) -> None:
+        pass
+
+    # at the end of the game, backpropagate and update state values
+    def feed_reward(self, reward: float) -> None:
+        pass
+
+    def get_hash(self, board: np.array):
+        pass
+
+    def reset(self) -> None:
+        pass
diff --git a/State.py b/State.py
new file mode 100644
index 0000000..b4848f6
--- /dev/null
+++ b/State.py
@@ -0,0 +1,215 @@
+import numpy as np
+
+from Player import Player
+
+'''
+    Class defines board states and winning rules, and distinguishes between a pure computer game and a game against a human
+'''
+
+
+class State:
+    def __init__(self, p1: Player, p2: Player, board_rows: int, board_cols: int):
+        self.board = np.zeros((board_rows, board_cols))
+        self.p1 = p1
+        self.p2 = p2
+        self.isEnd = False
+        self.board_hash = None
+        self.player_symbol = 1
+        self.board_cols = board_cols
+        self.board_rows = board_rows
+
+    '''
+        Get unique hash of current board state
+    '''
+
+    def get_hash(self) -> str:
+        self.board_hash = str(self.board.reshape(self.board_cols * self.board_rows))
+        return self.board_hash
+
+    '''
+        Define winning rules
+    '''
+
+    def winner(self):
+        # 3 in a row
+        for i in range(self.board_rows):
+            if sum(self.board[i, :]) == 3:
+                self.isEnd = True
+                return 1
+            if sum(self.board[i, :]) == -3:
+                self.isEnd = True
+                return -1
+        # 3 in a column
+        for i in range(self.board_cols):
+            if sum(self.board[:, i]) == 3:
+                self.isEnd = True
+                return 1
+            if sum(self.board[:, i]) == -3:
+                self.isEnd = True
+                return -1
+        # diagonal
+        diag_sum1 = sum([self.board[i, i] for i in range(self.board_cols)])
+        diag_sum2 = sum([self.board[i, self.board_cols - i - 1] for i in range(self.board_cols)])
+        diag_sum = max(abs(diag_sum1), abs(diag_sum2))
+        if diag_sum == 3:
+            self.isEnd = True
+            if diag_sum1 == 3 or diag_sum2 == 3:
+                return 1
+            else:
+                return -1
+
+        # tie
+        # no available positions
+        if len(self.available_positions()) == 0:
+            self.isEnd = True
+            return 0
+        # not end
+        self.isEnd = False
+        return None
+
+    '''
+        Returns all available positions in current state
+    '''
+
+    def available_positions(self) -> list:
+        positions = []
+        for i in range(self.board_rows):
+            for j in range(self.board_cols):
+                if self.board[i, j] == 0:
+                    positions.append((i, j))  # needs to be a tuple
+        return positions
+
+    '''
+        Set a token on a position and switch to the other player
+    '''
+
+    def update_state(self, position) -> None:
+        self.board[position] = self.player_symbol
+        self.player_symbol = -1 if self.player_symbol == 1 else 1
+
+    '''
+        If the game ends, backpropagate the reward
+    '''
+
+    def give_reward(self) -> None:
+        result = self.winner()
+        # P1 won
+        if result == 1:
+            self.p1.feed_reward(1)
+            self.p2.feed_reward(0)
+        # P2 won
+        elif result == -1:
+            self.p1.feed_reward(0)
+            self.p2.feed_reward(1)
+        # Tie
+        else:
+            self.p1.feed_reward(0.1)
+            self.p2.feed_reward(0.5)
+
+    '''
+        Reset board to starting position
+    '''
+
+    def reset(self) -> None:
+        self.board = np.zeros((self.board_rows, self.board_cols))
+        self.board_hash = None
+        self.isEnd = False
+        self.player_symbol = 1
+
+    '''
+        Game with two computer players
+    '''
+
+    def play(self, rounds=100) -> None:
+        for i in range(rounds):
+            if i % 1000 == 0:
+                print("Rounds {}".format(i))
+            while not self.isEnd:
+                # Player 1
+                positions = self.available_positions()
+                p1_action = self.p1.choose_action(positions, self.board, self.player_symbol)
+                self.update_state(p1_action)
+                board_hash = self.get_hash()
+                self.p1.add_state(board_hash)
+
+                # Did P1 win or is it a tie?
+                win = self.winner()
+                if win is not None:
+                    self.give_reward()
+                    self.p1.reset()
+                    self.p2.reset()
+                    self.reset()
+                    break
+
+                else:
+                    # Player 2
+                    positions = self.available_positions()
+                    p2_action = self.p2.choose_action(positions, self.board, self.player_symbol)
+                    self.update_state(p2_action)
+                    board_hash = self.get_hash()
+                    self.p2.add_state(board_hash)
+
+                    # Did P2 win or is it a tie?
+                    win = self.winner()
+                    if win is not None:
+                        self.give_reward()
+                        self.p1.reset()
+                        self.p2.reset()
+                        self.reset()
+                        break
+
+    # Game against a human
+    def play2(self) -> None:
+        while not self.isEnd:
+            # Player 1
+            positions = self.available_positions()
+            p1_action = self.p1.choose_action(positions, self.board, self.player_symbol)
+            self.update_state(p1_action)
+            self.show_board()
+
+            # Did P1 win or is it a tie?
+            win = self.winner()
+            if win is not None:
+                if win == 1:
+                    print(self.p1.name, "wins!")
+                else:
+                    print("tie!")
+                self.reset()
+                break
+
+            else:
+                # Player 2 (Human)
+                positions = self.available_positions()
+                p2_action = self.p2.choose_action(positions)
+                self.update_state(p2_action)
+                self.show_board()
+
+                # Did P2 win or is it a tie?
+                win = self.winner()
+                if win is not None:
+                    if win == -1:
+                        print(self.p2.name, "wins!")
+                    else:
+                        print("tie!")
+                    self.reset()
+                    break
+
+    '''
+        Prints the current state of the board
+    '''
+
+    def show_board(self):
+        # P1: x  P2: o
+        for i in range(0, self.board_rows):
+            print('-------------')
+            out = '| '
+            for j in range(0, self.board_cols):
+                if self.board[i, j] == 1:
+                    token = 'x'
+                if self.board[i, j] == -1:
+                    token = 'o'
+                if self.board[i, j] == 0:
+                    token = ' '
+                out += token + ' | '
+            print(out)
+        print('-------------')
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..6aa2bee
--- /dev/null
+++ b/main.py
@@ -0,0 +1,29 @@
+from Player import ComputerPlayer, HumanPlayer
+from State import State
+
+BOARD_ROWS = 3
+BOARD_COLS = 3
+
+
+def tic_tac_toe():
+    # train with two computer players
+    p1 = ComputerPlayer("p1", board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
+    p2 = ComputerPlayer("p2", board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
+
+    st = State(p1, p2, board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
+    print("training...")
+    st.play(50000)
+    p1.save_policy()
+
+    # play against a human
+    p1 = ComputerPlayer("computer", board_cols=BOARD_COLS, board_rows=BOARD_ROWS, exp_rate=0)
+    p1.load_policy("policy_p1")
+
+    p2 = HumanPlayer("human")
+
+    st = State(p1, p2, board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
+    st.play2()
+
+
+if __name__ == "__main__":
+    tic_tac_toe()
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..c5c1411
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,7 @@
+# Tic Tac Toe
+
+A simple game of tic tac toe.
+
+Q-Learning example as part of the asim reinforcement learning tutorial.
+The agent (player) learns not to lose the game, given a sufficiently long training period.
+
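
Note on the value update in this commit: ComputerPlayer.feed_reward walks the recorded board hashes backwards and applies the temporal-difference rule V(s) <- V(s) + lr * (decay_gamma * reward - V(s)), then feeds each updated value back as the reward for the preceding state. The following is a minimal standalone sketch of that backward pass, using the same hyperparameters as Player.py; the visited state hashes and the final reward of 1.0 are made-up illustration values, not taken from the repository.

    # Sketch of the backward value update performed by ComputerPlayer.feed_reward.
    # "s0".."s2" and the reward of 1.0 are hypothetical example values.
    lr = 0.2
    decay_gamma = 0.9
    states_value = {}                    # board hash -> estimated value
    visited = ["s0", "s1", "s2"]         # hypothetical states, first to last move

    reward = 1.0                         # e.g. this player won the game
    for st in reversed(visited):
        value = states_value.setdefault(st, 0.0)
        # V(s) <- V(s) + lr * (decay_gamma * reward - V(s))
        states_value[st] = value + lr * (decay_gamma * reward - value)
        reward = states_value[st]        # propagate the updated value to earlier states

    print(states_value)                  # states closer to the win receive higher values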