initial commit
commit 491d5feafd
.gitignore (vendored, Normal file, 25 lines)
@@ -0,0 +1,25 @@
# ---> macOS
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
__pycache__
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
run_old.py
.idea
venv
main.py (Normal file, 26 lines)
@@ -0,0 +1,26 @@
"""
Lunar Lander example
"""

import gym

from model import ActorCritic
from run import run
from train import train

seed = 12345

if __name__ == "__main__":
    env = gym.make('LunarLander-v2')
    env.seed(seed)

    policy = ActorCritic()

    training = True

    if training:
        train(env, policy)

    run(env, policy)
model.py (Normal file, 59 lines)
@@ -0,0 +1,59 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    def __init__(self, seed: int = 12345):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(seed)
        self.affine = nn.Linear(8, 128)

        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))

        state_value = self.value_layer(state)

        action_probs = F.softmax(self.action_layer(state), dim=0)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)

        return action.item()

    def calculate_loss(self, gamma: float = 0.99):

        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std(dim=0))

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            advantage = reward - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss

    def clear_memory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]
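For reference, the discounted-return step in calculate_loss implements the recursion G_t = r_t + gamma * G_(t+1), walking the episode backwards. A minimal standalone sketch of just that step, using a hypothetical toy reward list:

# Illustrative sketch only -- mirrors the backward loop in ActorCritic.calculate_loss.
toy_rewards = [1.0, 0.0, 2.0]       # hypothetical per-step rewards for a 3-step episode
gamma = 0.99
returns = []
dis_reward = 0.0
for reward in toy_rewards[::-1]:    # walk the episode from the last step backwards
    dis_reward = reward + gamma * dis_reward
    returns.insert(0, dis_reward)   # prepend so returns line up with time steps
# returns -> [2.9602, 1.98, 2.0]; each entry equals r_t + gamma * G_(t+1)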
readme.md (Normal file, 34 lines)
@@ -0,0 +1,34 @@
# Lunar Lander

This is an example of an Actor-Critic learning agent, built as part of the ASIM RL Tutorial.

It uses gym for the environment and torch for the Actor-Critic network.

### Action Space
1) do nothing
2) fire left engine
3) fire bottom (main) engine
4) fire right engine

### Observation Space
1) x coordinate
2) y coordinate
3) linear velocity in x
4) linear velocity in y
5) angle
6) angular velocity
7) ground contact, leg 1
8) ground contact, leg 2

### Rewards
1) landing at the landing pad: +100 to +140 points
2) crashing: -100 points
3) coming to rest (landing): +100 points
4) each leg with ground contact: +10 points
5) firing the main engine: -0.3 points per frame (side engines: -0.03 per frame)

An episode counts as solved at 200 points.

### Starting State
The lander starts at the top center of the viewport with
a random initial force applied to its center of mass.
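The observation and action spaces listed in the readme map directly onto the network in model.py (8 inputs, 4 discrete actions). A quick sketch to confirm the shapes, assuming the pinned gym==0.25.0 from requirements.txt below:

import gym

env = gym.make('LunarLander-v2')
print(env.observation_space.shape)  # (8,) -> matches nn.Linear(8, 128) in model.py
print(env.action_space.n)           # 4    -> matches nn.Linear(128, 4) in model.py

state = env.reset()                 # gym 0.25 returns just the observation here
print(state.shape)                  # (8,)
env.close()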
requirements.txt (Normal file, 10 lines)
@@ -0,0 +1,10 @@
box2d-py==2.3.5
cloudpickle==2.1.0
gym==0.25.0
gym-notices==0.0.7
numpy==1.23.1
Pillow==9.2.0
pygame==2.1.0
swig==4.0.2
torch==1.12.0
typing_extensions==4.3.0
run.py (Normal file, 33 lines)
@@ -0,0 +1,33 @@
import torch
from PIL import Image
from gym import Env

from model import ActorCritic


def run(env: Env, policy: ActorCritic = None, n_episodes=5, name='LunarLander.pth'):
    if policy is None:
        policy = ActorCritic()
        policy.load_state_dict(torch.load('./{}'.format(name)))

    render = True
    save_gif = False

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()
train.py (Normal file, 43 lines)
@@ -0,0 +1,43 @@
import torch
from gym import Env
from torch import optim

from model import ActorCritic

# Parameters for learning
gamma = 0.99
lr = 0.02
betas = (0.9, 0.999)


def train(environment: Env, policy: ActorCritic):
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    running_reward = 0
    for i_episode in range(0, 5000):
        state = environment.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = environment.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if done:
                break
        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculate_loss(gamma)
        loss.backward()
        optimizer.step()
        policy.clear_memory()

        if running_reward > 4000:
            torch.save(policy.state_dict(), './LunarLander.pth')
            print("########## Solved! ##########")
            break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

    # We save a checkpoint anyway so there is always something for run.py to load.
    torch.save(policy.state_dict(), './LunarLander.pth')