import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    def __init__(self, seed: int = 12345):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(seed)

        # Shared feature layer followed by separate actor (policy) and critic (value) heads.
        self.affine = nn.Linear(8, 128)
        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        # Rollout buffers filled in forward() and consumed by calculate_loss().
        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))

        state_value = self.value_layer(state)

        action_probs = F.softmax(self.action_layer(state), dim=0)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)

        return action.item()

    def calculate_loss(self, gamma: float = 0.99):
        # Calculate discounted returns by iterating over the episode rewards in reverse.
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # Normalize the returns; the small epsilon guards against division by zero
        # when every reward in the episode is identical.
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-8)

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            # Using value.item() detaches the advantage, so the policy gradient
            # does not backpropagate through the critic.
            advantage = reward - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value.squeeze(), reward)
            loss += (action_loss + value_loss)
        return loss

    def clear_memory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]
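

# Minimal usage sketch (not part of the original module): it assumes the classic
# OpenAI Gym API for LunarLander-v2, whose 8-dimensional observations and 4 discrete
# actions match the network above, plus hypothetical hyperparameters (learning rate,
# episode count). Treat it as an illustration of how the rollout buffers are used,
# not a tuned training script.
if __name__ == "__main__":
    import gym

    env = gym.make("LunarLander-v2")
    policy = ActorCritic()
    optimizer = torch.optim.Adam(policy.parameters(), lr=3e-3)

    for episode in range(1000):
        state = env.reset()
        done = False
        while not done:
            # forward() samples an action and records its log-prob and state value.
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)

        # One Monte-Carlo update per episode, then clear the rollout buffers.
        optimizer.zero_grad()
        loss = policy.calculate_loss()
        loss.backward()
        optimizer.step()
        policy.clear_memory()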