todo: implement dynaq

2022-07-22 13:46:42 +02:00
parent 077c635934
commit e6c4910141
1 changed files with 6 additions and 64 deletions
@@ -48,67 +48,16 @@ class DynaQ:
    '''

    def learn(self, epsilon_decay: float, epsilon_min: float, run: int) -> None:
-        self.steps_per_episode = []
-        eps = self.epsilon
-        for episode in range(self.episodes):
-            done = False
-            self.reset()
-            if episode == 70:
-                self.env.block_node(1)
-
-            # Episodes last until the goal is reached
-            while not done:
-                print("Run: " + str(run), "n_steps: " + str(self.n_steps), "Episode: " + str(episode),
-                      "State: " + str(self.state))
-
-                # Get action, reward and next state
-                action = self.get_action(eps)
-                self.state_actions.append((self.state, action))
-                (done, reward, next_state) = self.env.get_state_reward(self.state, action)
-
-                # Bellmann equation
-                q_current = self.Q[self.state][action]
-                q_max = np.max(list(self.Q[next_state].values()))
-                self.Q[self.state][action] = q_current + self.alpha * (reward + self.gamma * q_max) - q_current
-
-                # Update model
-                self.time_step += 1
-                self.step_in_episode += 1
-                self.update_model(self.state, action, reward, next_state)
-
-                # Planning phase
-                self.planning()
-                self.state = next_state
-
-            self.steps_per_episode.append(len(self.state_actions))
-            self.reset()
-            print("Goal")
-            eps = max(epsilon_min, self.epsilon * np.exp(-epsilon_decay * episode))
+        # todo: implement learning
+        pass

    '''
        Returns epsilon-greedy action
    '''

    def get_action(self, eps: float) -> int:
-        random = np.random.uniform(0, 1)
-        q = float('-inf')
-        action_list = list(self.env.G.neighbors(self.state)) + [self.state]
-
-        # greedy or not
-        if random < eps:
-            action = np.random.choice(action_list)
-        else:
-            # if all q-values have the same values
-            if len(set(self.Q[self.state].values())) == 1:
-                action = np.random.choice(action_list)
-            else:
-                # get action with highest q-value
-                for a in action_list:
-                    tmp_q = self.Q[self.state][a]
-                    if tmp_q >= q:
-                        q = tmp_q
-                        action = a
-        return action
+        # todo: implement eval
+        pass

    '''
        Add Reward, next state and current time step to state-action pair in model
@@ -122,12 +71,5 @@ class DynaQ:
    '''

    def planning(self) -> None:
-        for step in range(self.n_steps):
-            state_rnd = np.random.choice(list(self.model.keys()))
-            action_rnd = np.random.choice(list(self.env.G.neighbors(state_rnd)) + [state_rnd])
-            (reward_rnd, next_state_rnd, time_step_rnd) = self.model[state_rnd][action_rnd]
-
-            q_rnd = self.Q[state_rnd][action_rnd]
-            q_max = np.max(list(self.Q[next_state_rnd].values()))
-
-            self.Q[state_rnd][action_rnd] = q_rnd + self.alpha * (reward_rnd + self.gamma * q_max) - q_rnd
+        # todo: implement planning
+        pass