# Simple policy gradient RL: retrain the policy network on its own controls,
# shifted toward the exploration noise weighted by a discounted-reward advantage.
import numpy as np
import project

def retrain(input, model):
    config, nsamples, retrain_datasets = project.getDatasets(input)
    retrain_controlsin = input['controls']  # original policy outputs, without deviation
    retrain_reward = input['rewards']
    retrain_noise = input["sample_noise"]  # deviation added during sampling
    wl = 60  # window length: how far ahead to accumulate discounted future reward
    advantage = np.zeros_like(retrain_controlsin)
    discount_factor = 0.97
    for n in range(0, nsamples - 1):
        discount = 1.0
        rv = 0.0
        twl = min(wl, nsamples - n)  # truncate the window near the end of the run
        for m in range(0, twl - 1):
            rv += retrain_reward[n + m] * discount  # accumulate discounted future reward
            discount *= discount_factor
        advantage[n] = rv / twl  # average discounted reward over the window
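    # For reference, the loop above computes the equivalent closed form
    #   advantage[n] = (1/twl) * sum_{m=0}^{twl-2} retrain_reward[n+m] * discount_factor**m,
    # i.e. a truncated, discounted average of the upcoming rewards.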
    advantage = (advantage - np.mean(advantage)) / np.std(advantage)  # normalizing improves training, per Karpathy
    advantaged_controls = retrain_controlsin + retrain_noise * advantage
    # advantaged_controls[:, 1] = 0.7  # (disabled) override of the second control channel
    print("advantage={} noise={} product={}".format(
        np.mean(advantage, axis=0), np.mean(retrain_noise, axis=0),
        np.mean(retrain_noise * advantage, axis=0)))
    retrain_y = [advantaged_controls[:, 0], advantaged_controls[:, 1]]
    model.lr = 0.0001
    print("learning rate={}".format(model.lr))
    ninputs = len(model.input_shape)  # how many target arrays to pass, taken from the model's input_shape
    model.fit(retrain_datasets, retrain_y[:ninputs], verbose=2, validation_split=0.2,
              batch_size=100, epochs=3, shuffle="batch")
    # TODO: we should test whether the validation results improved before keeping this update
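    # A minimal sketch of that check, assuming a Keras-style model whose fit()
    # returns a History object (kept as comments so behavior is unchanged):
    #   history = model.fit(...)
    #   val_loss = history.history["val_loss"]
    #   if val_loss[-1] > val_loss[0]:
    #       print("validation loss got worse; consider discarding this update")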
print("Predict")
p=model.predict(retrain_datasets, 20)
print(p[:10])
return model
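
# Hypothetical usage sketch (not part of the original file): the real `input`
# dict and model come from the surrounding project, so the shapes and the
# `build_model` helper below are assumptions for illustration only.
if __name__ == "__main__":
    nsamples = 200
    demo_input = {
        "controls": np.zeros((nsamples, 2)),                 # policy outputs without noise
        "rewards": np.random.rand(nsamples),                 # per-step rewards
        "sample_noise": 0.1 * np.random.randn(nsamples, 2),  # exploration noise that was applied
    }
    # model = build_model()               # hypothetical: construct and compile the policy network
    # model = retrain(demo_input, model)  # one policy gradient retraining pass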