diff --git a/model.py b/model.py
index e6b294d..4d9fece 100644
--- a/model.py
+++ b/model.py
@@ -8,50 +8,21 @@ class ActorCritic(nn.Module):
     def __init__(self, seed: int = 12345):
         super(ActorCritic, self).__init__()
         torch.random.manual_seed(seed)
-        self.affine = nn.Linear(8, 128)
-
-        self.action_layer = nn.Linear(128, 4)
-        self.value_layer = nn.Linear(128, 1)
+        # here we need the AC Network
 
         self.logprobs = []
         self.state_values = []
         self.rewards = []
+        pass
 
     def forward(self, state):
-        state = torch.from_numpy(state).float()
-        state = F.relu(self.affine(state))
-
-        state_value = self.value_layer(state)
-
-        action_probs = F.softmax(self.action_layer(state), dim=0)
-        action_distribution = Categorical(action_probs)
-        action = action_distribution.sample()
-
-        self.logprobs.append(action_distribution.log_prob(action))
-        self.state_values.append(state_value)
-
-        return action.item()
+        # Here we need to evaluate the AC Network
+        pass
 
     def calculate_loss(self, gamma: float = 0.99):
+        # calculating discounted rewards
-        # calculating discounted rewards:
-        rewards = []
-        dis_reward = 0
-        for reward in self.rewards[::-1]:
-            dis_reward = reward + gamma * dis_reward
-            rewards.insert(0, dis_reward)
-
-        # normalizing the rewards:
-        rewards = torch.tensor(rewards)
-        rewards = (rewards - rewards.mean()) / (rewards.std(dim=0))
-
-        loss = 0
-        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
-            advantage = reward - value.item()
-            action_loss = -logprob * advantage
-            value_loss = F.smooth_l1_loss(value, reward)
-            loss += (action_loss + value_loss)
-        return loss
+        pass
 
     def clear_memory(self):
         del self.logprobs[:]
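
For reference, a minimal sketch of one way the blanked-out methods could be filled back in. The layer sizes (8-dimensional observation, 128 hidden units, a 4-way policy head and a scalar value head), the Categorical policy, and the normalized discounted-reward loss all mirror the removed `-` lines of the patch; the imports and the small epsilon added to the normalization are assumptions made to keep the example self-contained and runnable, not part of the patch itself.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    def __init__(self, seed: int = 12345):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(seed)
        # Shared hidden layer with separate policy (4 actions) and value heads,
        # matching the layers removed in the patch above.
        self.affine = nn.Linear(8, 128)
        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        # state arrives as a NumPy array from the environment
        state = F.relu(self.affine(torch.from_numpy(state).float()))
        state_value = self.value_layer(state)

        action_probs = F.softmax(self.action_layer(state), dim=0)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        # cache log-probability and value estimate for the loss computation
        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)
        return action.item()

    def calculate_loss(self, gamma: float = 0.99):
        # discounted returns, accumulated backwards over the episode
        returns = []
        dis_reward = 0.0
        for reward in reversed(self.rewards):
            dis_reward = reward + gamma * dis_reward
            returns.insert(0, dis_reward)

        # normalize the returns; the epsilon is an added safeguard against a
        # zero standard deviation (not present in the removed code)
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # policy-gradient term plus a smooth-L1 value loss per time step
        loss = 0
        for logprob, value, ret in zip(self.logprobs, self.state_values, returns):
            advantage = ret - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value.squeeze(), ret)
            loss = loss + action_loss + value_loss
        return loss

Sampling the action inside forward and caching its log-probability and value estimate lets calculate_loss rebuild the whole episode's loss from the buffers that clear_memory later resets.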