# PG.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class PG(nn.Module):
    """Simple REINFORCE-style policy-gradient agent with an MLP policy."""

    def __init__(self, state_shape, n_actions):
        super().__init__()
        self.state_shape = state_shape
        self.n_actions = n_actions
        # Policy network: maps a state vector to unnormalized action logits.
        self.model = nn.Sequential(
            nn.Linear(in_features=state_shape[0], out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=self.n_actions),
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

    def forward(self, x):
        # Return raw (unnormalized) action logits for a batch of states.
        logits = self.model(x)
        return logits

    def predict_probs(self, states):
        # Convert states to a tensor, run the policy network, and return
        # action probabilities as a numpy array (no gradients needed here).
        states = torch.as_tensor(states, dtype=torch.float32)
        logits = self.model(states).detach()
        probs = F.softmax(logits, dim=-1).numpy()
        return probs
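
    # A rough illustration (hypothetical sizes, not from the original file):
    # with state_shape=(4,) and n_actions=2, calling predict_probs on a
    # (2, 4) batch of states returns a (2, 2) numpy array whose rows are
    # probability distributions that each sum to 1.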

    def generate_session(self, env, t_max=1000):
        # Play one episode with the current policy, sampling actions from the
        # predicted probabilities, and record the visited transitions.
        states, traj_probs, actions, rewards = [], [], [], []
        s = env.reset()
        q_t = 1.0  # running probability of the sampled trajectory
        for t in range(t_max):
            action_probs = self.predict_probs(np.array([s]))[0]
            a = np.random.choice(self.n_actions, p=action_probs)
            new_s, r, done, info = env.step(a)

            q_t *= action_probs[a]

            states.append(s)
            traj_probs.append(q_t)
            actions.append(a)
            rewards.append(r)

            s = new_s
            if done:
                break
        return states, actions, rewards

    def _get_cumulative_rewards(self, rewards, gamma=0.99):
        # Discounted returns, computed backwards: G[t] = r[t] + gamma * G[t+1].
        G = np.zeros_like(rewards, dtype=float)
        G[-1] = rewards[-1]
        for idx in range(-2, -len(rewards) - 1, -1):
            G[idx] = rewards[idx] + gamma * G[idx + 1]
        return G
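
    # A quick worked example of the recursion above (hypothetical rewards,
    # gamma = 0.99): for rewards [0, 0, 1] the returns come out back to front
    # as G = [0.9801, 0.99, 1.0].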

    def _to_one_hot(self, y_tensor, ndims):
        # One-hot encode a vector of action indices into shape (len, ndims).
        y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
        y_one_hot = torch.zeros(
            y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)
        return y_one_hot

    def train_on_env(self, env, gamma=0.99, entropy_coef=1e-2):
        # Run one episode and take a single REINFORCE gradient step on it.
        states, actions, rewards = self.generate_session(env)
        states = torch.tensor(np.array(states), dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.int32)
        cumulative_returns = np.array(self._get_cumulative_rewards(rewards, gamma))
        cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)

        logits = self.model(states)
        probs = nn.functional.softmax(logits, -1)
        log_probs = nn.functional.log_softmax(logits, -1)
        # log pi(a_t | s_t) for the actions that were actually taken.
        log_probs_for_actions = torch.sum(
            log_probs * self._to_one_hot(actions, env.action_space.n), dim=1)

        # Policy entropy at each visited state, averaged over the episode;
        # used as an exploration bonus.
        entropy = -torch.mean(torch.sum(probs * log_probs, dim=-1))
        # REINFORCE objective with entropy regularization, negated for minimization.
        loss = -torch.mean(log_probs_for_actions * cumulative_returns) - entropy_coef * entropy

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return np.sum(rewards)
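

# A minimal usage sketch, not part of the original file: it assumes a classic
# Gym environment (e.g. CartPole-v0) whose reset() returns an observation and
# whose step() returns the old 4-tuple (obs, reward, done, info), which is the
# API generate_session() expects.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v0")
    agent = PG(env.observation_space.shape, env.action_space.n)
    for i in range(100):
        episode_reward = agent.train_on_env(env)  # one REINFORCE update per episode
        if i % 10 == 0:
            print(f"episode {i}: total reward {episode_reward:.1f}")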