initial commit

Dominik Brunmeir 2022-07-22 13:08:35 +02:00
commit dd3eb11ea8
5 changed files with 412 additions and 0 deletions

.gitignore (vendored), new file

@@ -0,0 +1,25 @@
# ---> macOS
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
__pycache__
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
run_old.py
.idea
venv

Player.py, new file

@@ -0,0 +1,136 @@
from abc import ABC, abstractmethod
import pickle

import numpy as np


class Player(ABC):
    """Common interface for computer and human players."""

    def __init__(self, name: str):
        self.name = name

    @abstractmethod
    def get_hash(self, board: np.ndarray):
        pass

    @abstractmethod
    def choose_action(self, positions: list, current_board: np.ndarray = None, symbol: int = -1) -> tuple:
        pass

    @abstractmethod
    def feed_reward(self, reward: float) -> None:
        pass

    @abstractmethod
    def reset(self) -> None:
        pass

    @abstractmethod
    def add_state(self, state: str) -> None:
        pass


class ComputerPlayer(Player):
    """Computer player that learns a tabular value estimate for board states."""

    def __init__(self, name: str, board_cols: int, board_rows: int, exp_rate: float = 0.3):
        super().__init__(name)
        self.states = []           # board hashes visited during the current game
        self.lr = 0.2              # learning rate
        self.exp_rate = exp_rate   # exploration rate (epsilon)
        self.decay_gamma = 0.9     # discount factor
        self.states_value = {}     # board hash -> estimated value
        self.board_cols = board_cols
        self.board_rows = board_rows

    def get_hash(self, board: np.ndarray) -> str:
        """Return a hashable string representation of the board."""
        return str(board.reshape(self.board_cols * self.board_rows))

    def choose_action(self, positions: list, current_board: np.ndarray = None, symbol: int = -1) -> tuple:
        """Choose an action for the current state (epsilon-greedy)."""
        if np.random.uniform(0, 1) <= self.exp_rate:
            # take a random action (exploration)
            idx = np.random.choice(len(positions))
            action = positions[idx]
        else:
            # pick the move whose resulting board has the highest learned value
            value_max = -999
            for p in positions:
                next_board = current_board.copy()
                next_board[p] = symbol
                next_board_hash = self.get_hash(next_board)
                value = self.states_value.get(next_board_hash, 0)
                if value >= value_max:
                    value_max = value
                    action = p
        return action

    def add_state(self, state: str) -> None:
        self.states.append(state)

    def feed_reward(self, reward: float) -> None:
        """At the end of a game, propagate the reward back through the visited states."""
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            # value update: V(s) <- V(s) + lr * (gamma * reward - V(s))
            self.states_value[st] += self.lr * (self.decay_gamma * reward - self.states_value[st])
            reward = self.states_value[st]

    def reset(self) -> None:
        self.states = []

    def save_policy(self) -> None:
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file) -> None:
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)


class HumanPlayer(Player):
    """Human player that reads its moves from standard input."""

    def __init__(self, name):
        super().__init__(name)

    def choose_action(self, positions: list, current_board: np.ndarray = None, symbol: int = -1) -> tuple:
        while True:
            row = int(input("Input your action row:"))
            col = int(input("Input your action col:"))
            action = (row, col)
            if action in positions:
                return action

    def add_state(self, state) -> None:
        # a human player does not track visited states
        pass

    def feed_reward(self, reward: float) -> None:
        # a human player does not learn from rewards
        pass

    def get_hash(self, board: np.ndarray):
        pass

    def reset(self) -> None:
        pass

State.py, new file

@@ -0,0 +1,215 @@
import numpy as np

from Player import Player


class State:
    """Holds the board state and the winning rules, and runs both the
    computer-vs-computer training loop and the game against a human."""

    def __init__(self, p1: Player, p2: Player, board_rows: int, board_cols: int):
        self.board = np.zeros((board_rows, board_cols))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.board_hash = None
        self.player_symbol = 1   # p1 plays 1, p2 plays -1
        self.board_cols = board_cols
        self.board_rows = board_rows

    def get_hash(self) -> str:
        """Return a unique hash of the current board state."""
        self.board_hash = str(self.board.reshape(self.board_cols * self.board_rows))
        return self.board_hash

    def winner(self):
        """Apply the winning rules: 1 if p1 won, -1 if p2 won, 0 for a tie,
        None if the game is not over yet."""
        # 3 in a row
        for i in range(self.board_rows):
            if sum(self.board[i, :]) == 3:
                self.isEnd = True
                return 1
            if sum(self.board[i, :]) == -3:
                self.isEnd = True
                return -1
        # 3 in a column
        for i in range(self.board_cols):
            if sum(self.board[:, i]) == 3:
                self.isEnd = True
                return 1
            if sum(self.board[:, i]) == -3:
                self.isEnd = True
                return -1
        # 3 on a diagonal
        diag_sum1 = sum([self.board[i, i] for i in range(self.board_cols)])
        diag_sum2 = sum([self.board[i, self.board_cols - i - 1] for i in range(self.board_cols)])
        diag_sum = max(abs(diag_sum1), abs(diag_sum2))
        if diag_sum == 3:
            self.isEnd = True
            if diag_sum1 == 3 or diag_sum2 == 3:
                return 1
            else:
                return -1
        # tie: no available positions left
        if len(self.available_positions()) == 0:
            self.isEnd = True
            return 0
        # game not over yet
        self.isEnd = False
        return None

    def available_positions(self) -> list:
        """Return all empty positions of the current board."""
        positions = []
        for i in range(self.board_rows):
            for j in range(self.board_cols):
                if self.board[i, j] == 0:
                    positions.append((i, j))  # needs to be a tuple
        return positions

    def update_state(self, position) -> None:
        """Place the current player's token and switch to the other player."""
        self.board[position] = self.player_symbol
        self.player_symbol = -1 if self.player_symbol == 1 else 1

    def give_reward(self) -> None:
        """When the game ends, propagate the reward back to both players."""
        result = self.winner()
        if result == 1:
            # P1 won
            self.p1.feed_reward(1)
            self.p2.feed_reward(0)
        elif result == -1:
            # P2 won
            self.p1.feed_reward(0)
            self.p2.feed_reward(1)
        else:
            # tie
            self.p1.feed_reward(0.1)
            self.p2.feed_reward(0.5)

    def reset(self) -> None:
        """Reset the board to the start position."""
        self.board = np.zeros((self.board_rows, self.board_cols))
        self.board_hash = None
        self.isEnd = False
        self.player_symbol = 1

    def play(self, rounds=100) -> None:
        """Training loop: two computer players play against each other."""
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            while not self.isEnd:
                # Player 1
                positions = self.available_positions()
                p1_action = self.p1.choose_action(positions, self.board, self.player_symbol)
                self.update_state(p1_action)
                board_hash = self.get_hash()
                self.p1.add_state(board_hash)
                # Did P1 win, or is it a tie?
                win = self.winner()
                if win is not None:
                    self.give_reward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    break
                else:
                    # Player 2
                    positions = self.available_positions()
                    p2_action = self.p2.choose_action(positions, self.board, self.player_symbol)
                    self.update_state(p2_action)
                    board_hash = self.get_hash()
                    self.p2.add_state(board_hash)
                    # Did P2 win, or is it a tie?
                    win = self.winner()
                    if win is not None:
                        self.give_reward()
                        self.p1.reset()
                        self.p2.reset()
                        self.reset()
                        break

    def play2(self) -> None:
        """Game loop: computer player (p1) against a human (p2)."""
        while not self.isEnd:
            # Player 1 (computer)
            positions = self.available_positions()
            p1_action = self.p1.choose_action(positions, self.board, self.player_symbol)
            self.update_state(p1_action)
            self.show_board()
            # Did P1 win, or is it a tie?
            win = self.winner()
            if win is not None:
                if win == 1:
                    print(self.p1.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break
            else:
                # Player 2 (human)
                positions = self.available_positions()
                p2_action = self.p2.choose_action(positions)
                self.update_state(p2_action)
                self.show_board()
                # Did P2 win, or is it a tie?
                win = self.winner()
                if win is not None:
                    if win == -1:
                        print(self.p2.name, "wins!")
                    else:
                        print("tie!")
                    self.reset()
                    break

    def show_board(self):
        """Print the current state of the board (P1: x, P2: o)."""
        for i in range(0, self.board_rows):
            print('-------------')
            out = '| '
            for j in range(0, self.board_cols):
                if self.board[i, j] == 1:
                    token = 'x'
                if self.board[i, j] == -1:
                    token = 'o'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('-------------')

main.py, new file

@@ -0,0 +1,29 @@
from Player import ComputerPlayer, HumanPlayer
from State import State

BOARD_ROWS = 3
BOARD_COLS = 3


def tic_tac_toe():
    # train with two computer players
    p1 = ComputerPlayer("p1", board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
    p2 = ComputerPlayer("p2", board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
    st = State(p1, p2, board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
    print("training...")
    st.play(50000)
    p1.save_policy()

    # play against a human with the trained policy (exploration disabled)
    p1 = ComputerPlayer("computer", board_cols=BOARD_COLS, board_rows=BOARD_ROWS, exp_rate=0)
    p1.load_policy("policy_p1")
    p2 = HumanPlayer("human")
    st = State(p1, p2, board_cols=BOARD_COLS, board_rows=BOARD_ROWS)
    st.play2()


if __name__ == "__main__":
    tic_tac_toe()

readme.md, new file

@@ -0,0 +1,7 @@
# Tic Tac Toe

A simple game of tic tac toe.

A Q-learning example, part of the asim reinforcement learning tutorial.
The agent (player) learns not to lose the game, given a sufficiently long training period.
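
Training and play are wired together in `main.py`. A minimal usage sketch of the same flow, assuming the classes from this commit (the 50000 training rounds mirror `main.py` and are just an example):

```python
from Player import ComputerPlayer, HumanPlayer
from State import State

# train two computer players against each other, then keep p1's learned policy
p1 = ComputerPlayer("p1", board_cols=3, board_rows=3)
p2 = ComputerPlayer("p2", board_cols=3, board_rows=3)
State(p1, p2, board_rows=3, board_cols=3).play(50000)
p1.save_policy()  # writes the value table to the file "policy_p1"

# play against the trained policy (exploration disabled)
computer = ComputerPlayer("computer", board_cols=3, board_rows=3, exp_rate=0)
computer.load_policy("policy_p1")
State(computer, HumanPlayer("human"), board_rows=3, board_cols=3).play2()
```

At the end of each training game the reward is propagated back through the visited states with the update `V(s) += lr * (gamma * reward - V(s))`, so the learned policy is simply a dictionary mapping board hashes to values, pickled to `policy_p1`.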