
initial commit

Dominik Brunmeir 2022-07-22 11:49:30 +02:00
commit 491d5feafd
7 changed files with 230 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,25 @@
# ---> macOS
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
__pycache__
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
run_old.py
.idea
venv

main.py Normal file

@@ -0,0 +1,26 @@
"""
Lunar Lander example
"""
import gym
from model import ActorCritic
from run import run
from train import train
seed = 12345
if __name__ == "__main__":
env = gym.make('LunarLander-v2')
env.seed(seed)
policy = ActorCritic()
training = True
if training:
train(env, policy)
run(env, policy)

model.py Normal file

@@ -0,0 +1,59 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    def __init__(self, seed: int = 12345):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(seed)
        # shared body followed by an actor head (4 action logits) and a critic head (state value)
        self.affine = nn.Linear(8, 128)
        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)
        # per-episode buffers, filled in forward() and consumed by calculate_loss()
        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))
        state_value = self.value_layer(state)
        action_probs = F.softmax(self.action_layer(state), dim=0)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()
        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)
        return action.item()

    def calculate_loss(self, gamma: float = 0.99):
        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)
        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std(dim=0))
        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            # advantage: discounted return minus the critic's value estimate
            advantage = reward - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss

    def clear_memory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]

readme.md Normal file

@@ -0,0 +1,34 @@
# Lunar Lander

This is an example of an Actor-Critic learning agent, built as part of the ASIM RL Tutorial.
It uses gym for the environment and torch as the basis for the Actor-Critic network.

### Action Space
1) do nothing,
2) fire left engine,
3) fire main (bottom) engine,
4) fire right engine

### Observation Space
1) x position,
2) y position,
3) linear velocity in x,
4) linear velocity in y,
5) angle,
6) angular velocity,
7) ground contact of leg 1,
8) ground contact of leg 2

### Rewards
1) landing at the landing pad: +[100-140] points,
2) crashing: -100 points,
3) coming to rest: +100 points,
4) ground contact: +10 points per leg,
5) firing the main engine: -0.3 points per frame (side engines: -0.03)

The environment counts as solved at 200 points.

### Starting State
The lander starts at the top center of the viewport with
a random initial force applied to its center of mass.
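
For reference, here is a minimal random-agent rollout (an illustrative sketch, not part of this commit) that exercises the observation, action, and reward interface described above, assuming the gym==0.25.0 step API pinned in requirements.txt:

```python
import gym

env = gym.make('LunarLander-v2')
state = env.reset()                     # 8-dimensional observation vector
total_reward = 0
done = False
while not done:
    action = env.action_space.sample()  # uniform choice among the 4 discrete actions
    state, reward, done, _ = env.step(action)
    total_reward += reward
print('Random agent return: {:.1f} (solved threshold: 200)'.format(total_reward))
env.close()
```

A random agent typically scores far below the 200-point threshold; closing that gap is what the training loop in train.py is for.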

requirements.txt Normal file

@@ -0,0 +1,10 @@
box2d-py==2.3.5
cloudpickle==2.1.0
gym==0.25.0
gym-notices==0.0.7
numpy==1.23.1
Pillow==9.2.0
pygame==2.1.0
swig==4.0.2
torch==1.12.0
typing_extensions==4.3.0

run.py Normal file

@@ -0,0 +1,33 @@
import torch
from PIL import Image
from gym import Env

from model import ActorCritic


def run(env: Env, policy: ActorCritic = None, n_episodes=5, name='LunarLander.pth'):
    if policy is None:
        policy = ActorCritic()
        policy.load_state_dict(torch.load('./{}'.format(name)))
    render = True
    save_gif = False
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()

train.py Normal file

@@ -0,0 +1,43 @@
import torch
from gym import Env
from torch import optim

from model import ActorCritic

# Parameters for learning
gamma = 0.99
lr = 0.02
betas = (0.9, 0.999)


def train(environment: Env, policy: ActorCritic):
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    running_reward = 0
    for i_episode in range(0, 5000):
        state = environment.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = environment.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if done:
                break

        # Updating the policy (one gradient step per episode):
        optimizer.zero_grad()
        loss = policy.calculate_loss(gamma)
        loss.backward()
        optimizer.step()
        policy.clear_memory()

        # running_reward accumulates since the last 20-episode reset, so > 4000
        # corresponds to an average above the 200-point "solved" threshold
        if running_reward > 4000:
            torch.save(policy.state_dict(), './LunarLander.pth')
            print("########## Solved! ##########")
            break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

    # We save a checkpoint anyway to ensure we have something.
    torch.save(policy.state_dict(), './LunarLander.pth')