initial commit
Commit 491d5feafd
.gitignore (vendored), new file: 25 lines
@@ -0,0 +1,25 @@
# ---> macOS
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
__pycache__
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
run_old.py
.idea
venv
main.py, new file: 26 lines
@@ -0,0 +1,26 @@
"""
Lunar Lander example
"""

import gym

from model import ActorCritic
from run import run
from train import train

seed = 12345

if __name__ == "__main__":
    env = gym.make('LunarLander-v2')
    env.seed(seed)

    policy = ActorCritic()

    training = True

    if training:
        train(env, policy)

    run(env, policy)
model.py, new file: 59 lines
@@ -0,0 +1,59 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    def __init__(self, seed: int = 12345):
        super(ActorCritic, self).__init__()
        torch.random.manual_seed(seed)
        self.affine = nn.Linear(8, 128)

        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)

        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).float()
        state = F.relu(self.affine(state))

        state_value = self.value_layer(state)

        action_probs = F.softmax(self.action_layer(state), dim=0)
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()

        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)

        return action.item()

    def calculate_loss(self, gamma: float = 0.99):

        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)

        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std(dim=0))

        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            advantage = reward - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)
        return loss

    def clear_memory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]
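For reference, the discounted-return loop inside `calculate_loss` can be read on its own: each return is the immediate reward plus `gamma` times the return of the following step, accumulated backwards over the episode before normalization. A minimal standalone sketch of that recursion, using made-up reward values that are not taken from the project:

```python
# Standalone sketch of the discounted-return recursion used in
# ActorCritic.calculate_loss (illustrative numbers only).
gamma = 0.99
episode_rewards = [1.0, 0.0, -1.0, 10.0]  # hypothetical per-step rewards

returns = []
discounted = 0.0
for reward in reversed(episode_rewards):
    discounted = reward + gamma * discounted  # G_t = r_t + gamma * G_{t+1}
    returns.insert(0, discounted)

# returns[0] expands to 1.0 + 0.99*(0.0 + 0.99*(-1.0 + 0.99*10.0))
print(returns)
```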
readme.md, new file: 34 lines
@@ -0,0 +1,34 @@
# Lunar Lander

This is an example of an actor-critic learning agent, built as part of the ASIM RL tutorial.

It uses gym for the environment and torch as the basis for the actor-critic network.

### Action Space
1) do nothing
2) fire left engine
3) fire bottom engine
4) fire right engine

### Observation Space
1) x
2) y
3) linear velocity in x
4) linear velocity in y
5) angle
6) angular velocity
7) ground contact, leg 1
8) ground contact, leg 2

### Rewards
1) landing at the landing pad: +100 to +140 points
2) crashing: -100 points
3) landing (coming to rest): +100 points
4) ground contact: +10 points per leg
5) firing an engine: -0.3 points per engine per frame

The environment counts as solved at 200 points.

### Starting State
The lander starts at the top center of the viewport with
a random initial force applied to its center of mass.
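
The spaces above can be checked directly against the environment; a minimal sketch, assuming the gym 0.25 API pinned in requirements.txt:

```python
import gym

env = gym.make('LunarLander-v2')

print(env.action_space)       # Discrete(4): the four actions listed above
print(env.observation_space)  # Box with shape (8,): the eight observation values above

state = env.reset()                                               # initial observation
state, reward, done, info = env.step(env.action_space.sample())  # one random step
env.close()
```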
requirements.txt, new file: 10 lines
@@ -0,0 +1,10 @@
box2d-py==2.3.5
cloudpickle==2.1.0
gym==0.25.0
gym-notices==0.0.7
numpy==1.23.1
Pillow==9.2.0
pygame==2.1.0
swig==4.0.2
torch==1.12.0
typing_extensions==4.3.0
run.py, new file: 33 lines
@@ -0,0 +1,33 @@
import torch
from PIL import Image
from gym import Env

from model import ActorCritic


def run(env: Env, policy: ActorCritic = None, n_episodes=5, name='LunarLander.pth'):
    if policy is None:
        policy = ActorCritic()
        policy.load_state_dict(torch.load('./{}'.format(name)))

    render = True
    save_gif = False

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                env.render()
                if save_gif:
                    img = env.render(mode='rgb_array')
                    img = Image.fromarray(img)
                    img.save('./{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()
train.py, new file: 43 lines
@@ -0,0 +1,43 @@
import torch
from gym import Env
from torch import optim

from model import ActorCritic

# Parameters for learning
gamma = 0.99
lr = 0.02
betas = (0.9, 0.999)


def train(environment: Env, policy: ActorCritic):
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    running_reward = 0
    for i_episode in range(0, 5000):
        state = environment.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = environment.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if done:
                break

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculate_loss(gamma)
        loss.backward()
        optimizer.step()
        policy.clear_memory()

        if running_reward > 4000:
            torch.save(policy.state_dict(), './LunarLander.pth')
            print("########## Solved! ##########")
            break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0

    # Save a checkpoint in any case, so we always have something to run.
    torch.save(policy.state_dict(), './LunarLander.pth')