import numpy as np

from Environment import Env

np.random.seed(1)


class DynaQ:
    def __init__(self, env: Env, episodes: int, epsilon: float, alpha: float, gamma: float, n_steps: int):
        # Initialize parameters
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.episodes = episodes
        self.n_steps = n_steps
        self.time_step = 0
        self.state = self.env.start
        self.steps_per_episode = []
        self.state_actions = []
        self.step_in_episode = 0

        # Initialize Q-values and model: every node of the graph is a state,
        # its neighbours (plus staying on the node) are the available actions
        self.Q = {}
        self.model = {}
        for state in list(self.env.G):
            self.Q[state] = {}
            self.model[state] = {}
            for action in list(self.env.G.neighbors(state)) + [state]:
                self.Q[state][action] = 0
                self.model[state][action] = (-1, action, 0)

    def reset(self) -> None:
        """Resets the agent's episode bookkeeping and the environment (the learned model is kept)."""
        self.state = self.env.start
        self.state_actions = []
        self.step_in_episode = 0
        self.env.reset()

    def learn(self, epsilon_decay: float, epsilon_min: float, run: int) -> None:
        """
        Learning method for the agent: the tabular Dyna-Q algorithm adapted to graphs.
        Act in the real environment, update Q from the real transition, update the
        model, then run the planning phase on simulated transitions.

        Note: the step interface of Env is not shown in this file; this sketch
        assumes env.step(action) returns (next_state, reward, done) -- adjust to
        the actual Env API. `run` is unused here and presumably identifies the
        current run for callers that average over several runs.
        """
        for episode in range(self.episodes):
            self.reset()
            done = False
            while not done:
                action = self.get_action(self.epsilon)
                next_state, reward, done = self.env.step(action)  # assumed Env API
                self.state_actions.append((self.state, action))

                # Direct RL update from the real transition (Q-learning backup)
                self.Q[self.state][action] += self.alpha * (
                    reward + self.gamma * max(self.Q[next_state].values()) - self.Q[self.state][action]
                )

                # Model learning and planning
                self.update_model(self.state, action, reward, next_state)
                self.planning()

                self.state = next_state
                self.time_step += 1
                self.step_in_episode += 1

            self.steps_per_episode.append(self.step_in_episode)
            # Decay the exploration rate after each episode
            self.epsilon = max(self.epsilon * epsilon_decay, epsilon_min)

    def get_action(self, eps: float) -> int:
        """Returns an epsilon-greedy action (a neighbouring node, or the current node itself)."""
        actions = list(self.Q[self.state].keys())
        if np.random.uniform(0, 1) < eps:
            # Explore: random neighbour (or stay)
            return actions[np.random.randint(len(actions))]
        # Exploit: action with the highest Q-value for the current state
        return max(actions, key=lambda a: self.Q[self.state][a])

    def update_model(self, state: int, action: int, reward: float, next_state) -> None:
        """Stores reward, next state and current time step for the state-action pair in the model."""
        self.model[state][action] = (reward, next_state, self.time_step)

    def planning(self) -> None:
        """
        Planning phase: Bellman (Q-learning) backups on state-action pairs that
        were already taken in the real environment, sampled from the model.
        """
        if not self.state_actions:
            return
        for _ in range(self.n_steps):
            state, action = self.state_actions[np.random.randint(len(self.state_actions))]
            reward, next_state, _ = self.model[state][action]
            self.Q[state][action] += self.alpha * (
                reward + self.gamma * max(self.Q[next_state].values()) - self.Q[state][action]
            )
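

# Minimal usage sketch (illustrative only). How Env is constructed is not shown
# in this file, so the calls below are left commented out; the DynaQ arguments
# and the learn() signature match the definitions above.
if __name__ == "__main__":
    # env = Env(...)  # build the graph environment here
    # agent = DynaQ(env, episodes=50, epsilon=0.1, alpha=0.5, gamma=0.95, n_steps=10)
    # agent.learn(epsilon_decay=0.99, epsilon_min=0.01, run=0)
    # print(agent.steps_per_episode)  # real steps needed per episode
    pass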