From 491d5feafd4d488d5db8ecb29ff06e98cdc8916e Mon Sep 17 00:00:00 2001
From: Dominik Brunmeir
Date: Fri, 22 Jul 2022 11:49:30 +0200
Subject: [PATCH] initial commit

---
 .gitignore       | 25 ++++++++++++++++++++
 main.py          | 26 +++++++++++++++++++++
 model.py         | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 readme.md        | 34 ++++++++++++++++++++++++++++
 requirements.txt | 10 ++++++++
 run.py           | 33 +++++++++++++++++++++++++++
 train.py         | 43 +++++++++++++++++++++++++++++++++++
 7 files changed, 230 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 main.py
 create mode 100644 model.py
 create mode 100644 readme.md
 create mode 100644 requirements.txt
 create mode 100644 run.py
 create mode 100644 train.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..82f4808
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+# ---> macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+__pycache__
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+run_old.py
+.idea
+venv
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..21b2e75
--- /dev/null
+++ b/main.py
@@ -0,0 +1,26 @@
+"""
+Lunar Lander example
+"""
+
+import gym
+
+from model import ActorCritic
+from run import run
+from train import train
+
+seed = 12345
+
+if __name__ == "__main__":
+    env = gym.make('LunarLander-v2')
+    env.seed(seed)
+
+    policy = ActorCritic()
+
+    training = True
+
+    if training:
+        train(env, policy)
+
+    run(env, policy)
+
+
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..e6b294d
--- /dev/null
+++ b/model.py
@@ -0,0 +1,59 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions import Categorical
+
+
+class ActorCritic(nn.Module):
+    def __init__(self, seed: int = 12345):
+        super(ActorCritic, self).__init__()
+        torch.random.manual_seed(seed)
+        self.affine = nn.Linear(8, 128)
+
+        self.action_layer = nn.Linear(128, 4)
+        self.value_layer = nn.Linear(128, 1)
+
+        self.logprobs = []
+        self.state_values = []
+        self.rewards = []
+
+    def forward(self, state):
+        state = torch.from_numpy(state).float()
+        state = F.relu(self.affine(state))
+
+        state_value = self.value_layer(state)
+
+        action_probs = F.softmax(self.action_layer(state), dim=0)
+        action_distribution = Categorical(action_probs)
+        action = action_distribution.sample()
+
+        self.logprobs.append(action_distribution.log_prob(action))
+        self.state_values.append(state_value)
+
+        return action.item()
+
+    def calculate_loss(self, gamma: float = 0.99):
+
+        # calculating discounted rewards:
+        rewards = []
+        dis_reward = 0
+        for reward in self.rewards[::-1]:
+            dis_reward = reward + gamma * dis_reward
+            rewards.insert(0, dis_reward)
+
+        # normalizing the rewards:
+        rewards = torch.tensor(rewards)
+        rewards = (rewards - rewards.mean()) / (rewards.std(dim=0))
+
+        loss = 0
+        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
+            advantage = reward - value.item()
+            action_loss = -logprob * advantage
+            value_loss = F.smooth_l1_loss(value, reward)
+            loss += (action_loss + value_loss)
+        return loss
+
+    def clear_memory(self):
+        del self.logprobs[:]
+        del self.state_values[:]
+        del self.rewards[:]
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..b5f7b8c
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,34 @@
+# Lunar Lander
+
+This is an example of an actor-critic learning agent, built as part of the ASIM RL Tutorial.
+
+It uses gym for the environment and torch as the basis for the actor-critic network.
+
+### Action Space
+ 1) do nothing,
+ 2) fire left engine,
+ 3) fire main (bottom) engine,
+ 4) fire right engine
+
+### Observation Space
+ 1) x position,
+ 2) y position,
+ 3) linear velocity in x,
+ 4) linear velocity in y,
+ 5) angle,
+ 6) angular velocity,
+ 7) ground contact of leg 1,
+ 8) ground contact of leg 2
+
+### Rewards
+ 1) landing at the landing pad: +100 to +140 points
+ 2) crashing: -100 points
+ 3) coming to rest: +100 points
+ 4) ground contact: +10 points per leg
+ 5) firing the main engine: -0.3 points per frame (side engines: -0.03)
+
+The environment counts as solved at 200 points.
+
+### Starting State
+The lander starts at the top center of the viewport with
+a random initial force applied to its center of mass.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ddda0a0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+box2d-py==2.3.5
+cloudpickle==2.1.0
+gym==0.25.0
+gym-notices==0.0.7
+numpy==1.23.1
+Pillow==9.2.0
+pygame==2.1.0
+swig==4.0.2
+torch==1.12.0
+typing_extensions==4.3.0
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..ec676d1
--- /dev/null
+++ b/run.py
@@ -0,0 +1,33 @@
+import torch
+from PIL import Image
+from gym import Env
+
+from model import ActorCritic
+
+
+def run(env: Env, policy: ActorCritic = None, n_episodes=5, name='LunarLander.pth'):
+    if policy is None:
+        policy = ActorCritic()
+        policy.load_state_dict(torch.load('./{}'.format(name)))
+
+    render = True
+    save_gif = False
+
+    for i_episode in range(1, n_episodes + 1):
+        state = env.reset()
+        running_reward = 0
+        for t in range(10000):
+            action = policy(state)
+            state, reward, done, _ = env.step(action)
+            running_reward += reward
+            if render:
+                env.render()
+            if save_gif:
+                img = env.render(mode='rgb_array')
+                img = Image.fromarray(img)
+                img.save('./{}.jpg'.format(t))
+            if done:
+                break
+        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
+    env.close()
+
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..e6c3733
--- /dev/null
+++ b/train.py
@@ -0,0 +1,43 @@
+import torch
+from gym import Env
+from torch import optim
+
+from model import ActorCritic
+
+# Parameters for learning
+gamma = 0.99
+lr = 0.02
+betas = (0.9, 0.999)
+
+
+def train(environment: Env, policy: ActorCritic):
+    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
+    running_reward = 0
+    for i_episode in range(0, 5000):
+        state = environment.reset()
+        for t in range(10000):
+            action = policy(state)
+            state, reward, done, _ = environment.step(action)
+            policy.rewards.append(reward)
+            running_reward += reward
+            if done:
+                break
+        # Updating the policy:
+        optimizer.zero_grad()
+        loss = policy.calculate_loss(gamma)
+        loss.backward()
+        optimizer.step()
+        policy.clear_memory()
+
+        if running_reward > 4000:
+            torch.save(policy.state_dict(), './LunarLander.pth')
+            print("########## Solved! ##########")
+            break
+
+        if i_episode % 20 == 0:
+            running_reward = running_reward / 20
+            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
+            running_reward = 0
+
+    # We save a checkpoint anyway, so there is always something to load.
+    torch.save(policy.state_dict(), './LunarLander.pth')
\ No newline at end of file
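
Usage note: training is toggled via the `training` flag in `main.py`; once `train()` has written `LunarLander.pth`, the agent can be evaluated without retraining. Below is a minimal sketch of such an evaluation-only entry point, assuming the checkpoint file already exists in the working directory (the `evaluate.py` file itself is hypothetical and not part of this patch):

```python
# evaluate.py -- hypothetical helper, not part of the patch above.
# Assumes train.py has already produced ./LunarLander.pth.
import gym

from run import run

env = gym.make('LunarLander-v2')

# Passing policy=None makes run() build a fresh ActorCritic and
# restore its weights from the named checkpoint via load_state_dict().
run(env, policy=None, n_episodes=5, name='LunarLander.pth')
```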