diff --git a/DynaQ.py b/DynaQ.py
index c69e98c..1a43c18 100644
--- a/DynaQ.py
+++ b/DynaQ.py
@@ -48,67 +48,16 @@ class DynaQ:
     '''
     def learn(self, epsilon_decay: float, epsilon_min: float, run: int) -> None:
-        self.steps_per_episode = []
-        eps = self.epsilon
-        for episode in range(self.episodes):
-            done = False
-            self.reset()
-            if episode == 70:
-                self.env.block_node(1)
-
-            # Episodes last until the goal is reached
-            while not done:
-                print("Run: " + str(run), "n_steps: " + str(self.n_steps), "Episode: " + str(episode),
-                      "State: " + str(self.state))
-
-                # Get action, reward and next state
-                action = self.get_action(eps)
-                self.state_actions.append((self.state, action))
-                (done, reward, next_state) = self.env.get_state_reward(self.state, action)
-
-                # Bellmann equation
-                q_current = self.Q[self.state][action]
-                q_max = np.max(list(self.Q[next_state].values()))
-                self.Q[self.state][action] = q_current + self.alpha * (reward + self.gamma * q_max) - q_current
-
-                # Update model
-                self.time_step += 1
-                self.step_in_episode += 1
-                self.update_model(self.state, action, reward, next_state)
-
-                # Planning phase
-                self.planning()
-                self.state = next_state
-
-            self.steps_per_episode.append(len(self.state_actions))
-            self.reset()
-            print("Goal")
-            eps = max(epsilon_min, self.epsilon * np.exp(-epsilon_decay * episode))
+        # todo: implement learning
+        pass
 
     '''
     Returns epsilon-greedy action
     '''
     def get_action(self, eps: float) -> int:
-        random = np.random.uniform(0, 1)
-        q = float('-inf')
-        action_list = list(self.env.G.neighbors(self.state)) + [self.state]
-
-        # greedy or not
-        if random < eps:
-            action = np.random.choice(action_list)
-        else:
-            # if all q-values have the same values
-            if len(set(self.Q[self.state].values())) == 1:
-                action = np.random.choice(action_list)
-            else:
-                # get action with highest q-value
-                for a in action_list:
-                    tmp_q = self.Q[self.state][a]
-                    if tmp_q >= q:
-                        q = tmp_q
-                        action = a
-        return action
+        # todo: implement eval
+        pass
 
     '''
     Add Reward, next state and current time step to state-action pair in model
@@ -122,12 +71,5 @@ class DynaQ:
     '''
     def planning(self) -> None:
-        for step in range(self.n_steps):
-            state_rnd = np.random.choice(list(self.model.keys()))
-            action_rnd = np.random.choice(list(self.env.G.neighbors(state_rnd)) + [state_rnd])
-            (reward_rnd, next_state_rnd, time_step_rnd) = self.model[state_rnd][action_rnd]
-
-            q_rnd = self.Q[state_rnd][action_rnd]
-            q_max = np.max(list(self.Q[next_state_rnd].values()))
-
-            self.Q[state_rnd][action_rnd] = q_rnd + self.alpha * (reward_rnd + self.gamma * q_max) - q_rnd
+        # todo: implement planning
+        pass
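
For reference, below is a minimal standalone sketch of what the three stubs are meant to implement: the textbook tabular Q-learning update, epsilon-greedy action selection, and Dyna-Q planning over a learned model. It is not the repo's implementation; the names (q_update, epsilon_greedy, planning) and the dict-of-dicts Q table and model are illustrative assumptions, and it simplifies the removed code, whose model also stores a time step and whose actions are restricted to graph neighbours.

import random

def q_update(Q, state, action, reward, next_state, alpha, gamma):
    # Textbook tabular Q-learning update:
    #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    q_current = Q[state][action]
    q_max = max(Q[next_state].values())
    Q[state][action] = q_current + alpha * (reward + gamma * q_max - q_current)

def epsilon_greedy(Q, state, actions, eps):
    # With probability eps explore uniformly; otherwise pick the action with
    # the highest Q-value in the current state (ties broken by max()).
    if random.random() < eps:
        return random.choice(actions)
    return max(actions, key=lambda a: Q[state][a])

def planning(Q, model, n_steps, alpha, gamma):
    # Dyna-Q planning: replay n_steps transitions sampled from previously
    # observed (state, action) pairs stored in the model.
    for _ in range(n_steps):
        state = random.choice(list(model))
        action = random.choice(list(model[state]))
        reward, next_state = model[state][action]
        q_update(Q, state, action, reward, next_state, alpha, gamma)

# Tiny usage example on a two-node chain: taking action 1 in state 0 reaches the goal state 1.
Q = {0: {0: 0.0, 1: 0.0}, 1: {1: 0.0}}
model = {0: {1: (1.0, 1)}}  # (reward, next_state) for the observed (state, action) pair
q_update(Q, 0, 1, 1.0, 1, alpha=0.1, gamma=0.95)
planning(Q, model, n_steps=5, alpha=0.1, gamma=0.95)
print(epsilon_greedy(Q, 0, [0, 1], eps=0.1))  # usually prints 1 once Q[0][1] > Q[0][0]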