import numpy as np
from Environment import Env

np.random.seed(1)


class DynaQ:
    def __init__(self, env: Env, episodes: int, epsilon: float, alpha: float, gamma: float, n_steps: int):
        # Initialize parameters
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.episodes = episodes
        self.n_steps = n_steps

        self.time_step = 0
        self.state = self.env.start
        self.steps_per_episode = []
        self.state_actions = []
        self.step_in_episode = 0

        # Initialize Q-matrix and model
        self.Q = {}
        self.model = {}

        for state in list(self.env.G):
            self.Q[state] = {}
            self.model[state] = {}
            # An action is a move to a neighbouring node; staying on the current node is also allowed
            for action in list(self.env.G.neighbors(state)) + [state]:
                self.Q[state][action] = 0
                # Model entry: (reward, next state, time step of the last update)
                self.model[state][action] = (-1, action, 0)

    '''
    Resets the agent to the start state for a new episode
    '''

    def reset(self) -> None:
        self.state = self.env.start
        self.state_actions = []
        self.step_in_episode = 0
        self.env.reset()

    '''
    Learning method for the agent:
    the Dyna-Q algorithm, adapted for graph environments
    '''

    def learn(self, epsilon_decay: float, epsilon_min: float, run: int) -> None:
        # todo: implement learning
        pass
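
    # A minimal sketch of what learn() could look like once get_action() and planning()
    # are implemented; it is illustrative only and relies on two assumptions that are not
    # taken from this file: the environment exposes a Gym-style step(action) returning
    # (next_state, reward, done), and self.state_actions stores the (state, action) pairs
    # taken so far. The `run` argument is treated as bookkeeping only.
    def _learn_sketch(self, epsilon_decay: float, epsilon_min: float, run: int) -> None:
        eps = self.epsilon
        for _ in range(self.episodes):
            self.reset()
            done = False
            while not done:
                action = self._get_action_sketch(eps)
                next_state, reward, done = self.env.step(action)  # assumed Env API
                # One-step Q-learning update on the real experience
                target = reward if done else reward + self.gamma * max(self.Q[next_state].values())
                self.Q[self.state][action] += self.alpha * (target - self.Q[self.state][action])
                # Remember the transition, feed it to the model and plan with simulated experience
                self.state_actions.append((self.state, action))
                self.update_model(self.state, action, reward, next_state)
                self._planning_sketch()
                self.state = next_state
                self.time_step += 1
                self.step_in_episode += 1
            self.steps_per_episode.append(self.step_in_episode)
            # Decay exploration per episode, but never below epsilon_min
            eps = max(epsilon_min, eps * epsilon_decay)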

    '''
    Returns an epsilon-greedy action for the current state
    '''

    def get_action(self, eps: float) -> int:
        # todo: implement epsilon-greedy action selection
        pass
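
    # A minimal sketch of epsilon-greedy selection over the actions available in the
    # current state (its neighbours plus staying put, as initialised in __init__);
    # it assumes nothing beyond the Q-table built there, and breaks ties at random.
    def _get_action_sketch(self, eps: float) -> int:
        actions = list(self.Q[self.state].keys())
        if np.random.random() < eps:
            # Explore: pick a random available action
            return actions[np.random.randint(len(actions))]
        # Exploit: pick an action with the highest Q-value, breaking ties randomly
        q_values = np.array([self.Q[self.state][a] for a in actions])
        best = np.flatnonzero(q_values == q_values.max())
        return actions[np.random.choice(best)]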

    '''
    Stores the reward, the next state and the current time step for a state-action pair in the model
    '''

    def update_model(self, state: int, action: int, reward: float, next_state: int) -> None:
        self.model[state][action] = (reward, next_state, self.time_step)

    '''
    Planning phase: repeated Q-learning (Bellman) updates on state-action pairs
    that have already been taken, using the transitions stored in the model
    '''

    def planning(self) -> None:
        # todo: implement planning
        pass
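
    # A minimal sketch of the planning phase: n_steps simulated one-step Q-learning
    # (Bellman) backups on previously taken state-action pairs, replayed from the model.
    # It assumes self.state_actions holds the (state, action) pairs taken so far, which
    # is not spelled out elsewhere in this file.
    def _planning_sketch(self) -> None:
        if not self.state_actions:
            return
        for _ in range(self.n_steps):
            # Sample a previously taken state-action pair at random
            state, action = self.state_actions[np.random.randint(len(self.state_actions))]
            reward, next_state, _ = self.model[state][action]
            # One-step Q-learning backup on the simulated experience
            self.Q[state][action] += self.alpha * (
                reward + self.gamma * max(self.Q[next_state].values()) - self.Q[state][action]
            )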