initial commit
commit 491d5feafd
.gitignore (vendored, Normal file, 25 lines)
@@ -0,0 +1,25 @@
# ---> macOS
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
__pycache__
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
run_old.py
.idea
venv
main.py (Normal file, 26 lines)
@@ -0,0 +1,26 @@
"""
Lunar Lander example
"""

import gym

from model import ActorCritic
from run import run
from train import train

seed = 12345

if __name__ == "__main__":
    env = gym.make('LunarLander-v2')
    env.seed(seed)

    policy = ActorCritic()

    training = True

    if training:
        train(env, policy)

    run(env, policy)
model.py (Normal file, 59 lines)
@@ -0,0 +1,59 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    def __init__(self, seed: int = 12345):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(seed)
        self.affine = nn.Linear(8, 128)

        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))

        state_value = self.value_layer(state)

        action_probs = F.softmax(self.action_layer(state), dim=0)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)

        return action.item()

    def calculate_loss(self, gamma: float = 0.99):

        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std(dim=0))

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            advantage = reward - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss

    def clear_memory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]
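For reference, the discounted-return step in calculate_loss implements the recursion G_t = r_t + gamma * G_(t+1), walking the episode backwards. A minimal standalone sketch of just that step, using a hypothetical toy reward list:

# Illustrative sketch only -- mirrors the backward loop in ActorCritic.calculate_loss.
toy_rewards = [1.0, 0.0, 2.0]       # hypothetical per-step rewards for a 3-step episode
gamma = 0.99
returns = []
dis_reward = 0.0
for reward in toy_rewards[::-1]:    # walk the episode from the last step backwards
    dis_reward = reward + gamma * dis_reward
    returns.insert(0, dis_reward)   # prepend so returns line up with time steps
# returns -> [2.9602, 1.98, 2.0]; each entry equals r_t + gamma * G_(t+1)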
readme.md (Normal file, 34 lines)
@@ -0,0 +1,34 @@
# Lunar Lander

This is an example of an Actor-Critic learning agent, built as part of the ASIM RL Tutorial.

It uses gym for the environment and torch for the Actor-Critic network.

### Action Space
1) do nothing
2) fire left engine
3) fire bottom (main) engine
4) fire right engine

### Observation Space
1) x coordinate
2) y coordinate
3) linear velocity in x
4) linear velocity in y
5) angle
6) angular velocity
7) ground contact, leg 1
8) ground contact, leg 2

### Rewards
1) landing at the landing pad: +100 to +140 points
2) crashing: -100 points
3) coming to rest (landing): +100 points
4) each leg with ground contact: +10 points
5) firing the main engine: -0.3 points per frame (side engines: -0.03 per frame)

An episode counts as solved at 200 points.

### Starting State
The lander starts at the top center of the viewport with
a random initial force applied to its center of mass.
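The observation and action spaces listed in the readme map directly onto the network in model.py (8 inputs, 4 discrete actions). A quick sketch to confirm the shapes, assuming the pinned gym==0.25.0 from requirements.txt below:

import gym

env = gym.make('LunarLander-v2')
print(env.observation_space.shape)  # (8,) -> matches nn.Linear(8, 128) in model.py
print(env.action_space.n)           # 4    -> matches nn.Linear(128, 4) in model.py

state = env.reset()                 # gym 0.25 returns just the observation here
print(state.shape)                  # (8,)
env.close()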
requirements.txt (Normal file, 10 lines)
@@ -0,0 +1,10 @@
box2d-py==2.3.5
cloudpickle==2.1.0
gym==0.25.0
gym-notices==0.0.7
numpy==1.23.1
Pillow==9.2.0
pygame==2.1.0
swig==4.0.2
torch==1.12.0
typing_extensions==4.3.0
run.py (Normal file, 33 lines)
@@ -0,0 +1,33 @@
import torch
from PIL import Image
from gym import Env

from model import ActorCritic


def run(env: Env, policy: ActorCritic = None, n_episodes=5, name='LunarLander.pth'):
    if policy is None:
        policy = ActorCritic()
        policy.load_state_dict(torch.load('./{}'.format(name)))

    render = True
    save_gif = False

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                env.render()
            if save_gif:
                img = env.render(mode='rgb_array')
                img = Image.fromarray(img)
                img.save('./{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()
train.py (Normal file, 43 lines)
@@ -0,0 +1,43 @@
import torch
from gym import Env
from torch import optim

from model import ActorCritic

# Parameters for learning
gamma = 0.99
lr = 0.02
betas = (0.9, 0.999)


def train(environment: Env, policy: ActorCritic):
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    running_reward = 0
    for i_episode in range(0, 5000):
        state = environment.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = environment.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if done:
                break
        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculate_loss(gamma)
        loss.backward()
        optimizer.step()
        policy.clear_memory()

        if running_reward > 4000:
            torch.save(policy.state_dict(), './LunarLander.pth')
            print("########## Solved! ##########")
            break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

    # We save a checkpoint anyway so there is always something for run.py to load.
    torch.save(policy.state_dict(), './LunarLander.pth')