VERSION 1.0 CLASS
BEGIN
MultiUse = -1 'True
END
Attribute VB_Name = "cPlayer"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = False
Attribute VB_Exposed = False
Option Explicit
'===========================================================================
'2023-04-25
'This is the agent, which has a policy layer for actions and a
'separate value layer for learning the state value function. Several module-level
'variables store history as it interacts with the environment; these
'include: xt(), yt(), x_actiont(), probs(), vt(), px_rewards(), pEpisodeIdx(), pEpisodeStatus()
'Learn() learns from episodes saved in memory. PPO is implemented and
'ADAM is used in all optimizations.
'Interact() is the main interface for taking an action in an environment.
'collect_reward() is another interface for collecting a single reward from the
'environment, a scalar appended to the vector px_rewards().
'Init() is game-specific, as each game requires a specific input and output size and
'possibly a different network structure.
'PrintNetwork() and ReadNetwork() print and read both the policy and value networks to/from a worksheet.
'StateMatrix() is a case-by-case function that outputs a matrix of state vs. action
'and relevant data, though this is not always meaningful to present as a 2D matrix.
'===========================================================================
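'A minimal usage sketch from a driving module (an assumption, not from this project:
'env_GetState, env_Step, bGameOver, and the hyperparameters are hypothetical placeholders):
' Dim oPlayer As New cPlayer
' Call oPlayer.Init("MAZE")
' For i_episode = 1 To 500
'     Do While Not bGameOver
'         x = env_GetState() 'state of size (1 To n_input, 1 To 1)
'         i_action = oPlayer.Interact(x, storeHist:=True)
'         Call oPlayer.collect_reward(env_Step(i_action))
'     Loop
'     Call oPlayer.SaveEpisode(1) 'end status is game-specific
' Next i_episode
' Call oPlayer.Learn(0.001, 0.001, 0.9, 100, 4)
' Call oPlayer.ClearHist: Call oPlayer.reset_reward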
Private cNNPolicy As cNeuralNet
Private pn_input As Long, pn_hidden As Long, pn_output As Long
Private xt() As Double, yt() As Double, x_actiont() As Long, probs() As Double
Private px_rewards() As Double, pdiscounted_rewards() As Double
Private pstrGame As String
Private cNNValue As cNeuralNet
Private pValue_n_hidden As Long
Private vt() As Double
Private pn_episode As Long
Private pEpisodeIdx() As Long, pEpisodeStatus() As Long
Private pdiscount_rate As Double
Sub Init(strGame)
Dim i As Long, j As Long
pstrGame = UCase(Trim(strGame))
If UCase(Trim(strGame)) = "MAZE" Then
pdiscount_rate = 0.9
pn_input = 7 * 7
pn_hidden = 16
pn_output = 4
Set cNNPolicy = New cNeuralNet
With cNNPolicy
Call .Init(pn_input, pn_output, "SOFTMAX")
Call .AddLayer(pn_hidden, "RELU")
End With
pValue_n_hidden = 16
Set cNNValue = New cNeuralNet
With cNNValue
Call .Init(pn_input, 1, "LINEAR")
Call .AddLayer(pValue_n_hidden, "RELU")
End With
ElseIf UCase(Trim(strGame)) = "MAZEII" Then
pdiscount_rate = 0.9
pn_input = 20
pn_hidden = 16
pn_output = 4
Set cNNPolicy = New cNeuralNet
With cNNPolicy
Call .Init(pn_input, pn_output, "SOFTMAX")
Call .AddLayer(pn_hidden, "RELU")
End With
pValue_n_hidden = 16
Set cNNValue = New cNeuralNet
With cNNValue
Call .Init(pn_input, 1, "LINEAR")
Call .AddLayer(pValue_n_hidden, "RELU")
End With
ElseIf UCase(Trim(strGame)) = "STOMP" Then
pdiscount_rate = 0.9
pn_input = 5
pn_hidden = 16
pn_output = 1
Set cNNPolicy = New cNeuralNet
With cNNPolicy
Call .Init(pn_input, pn_output, "SIGMOID")
Call .AddLayer(pn_hidden, "RELU")
End With
pValue_n_hidden = 16
Set cNNValue = New cNeuralNet
With cNNValue
Call .Init(pn_input, 1, "LINEAR")
Call .AddLayer(pValue_n_hidden, "RELU")
End With
End If
End Sub
Sub PrintNetwork(mysht As Worksheet)
Dim i As Long, j As Long, m As Long
Call cNNPolicy.PrintNetwork(mysht)
Call cNNValue.PrintNetwork(mysht, cNNPolicy.n_row + 1)
End Sub
Sub ReadNetwork(mysht As Worksheet)
Dim i As Long, j As Long, m As Long
Set cNNPolicy = New cNeuralNet
Set cNNValue = New cNeuralNet
Call cNNPolicy.ReadNetwork(mysht)
Call cNNValue.ReadNetwork(mysht, cNNPolicy.n_row + 1)
pn_input = cNNPolicy.n_input
pn_output = cNNPolicy.n_output
End Sub
Property Let discount_rate(x As Double)
pdiscount_rate = x
End Property
'Total number of steps taken so far by policy network
Property Get n_hist() As Long
n_hist = cNNPolicy.n_hist
End Property
'Collect a single reward and append it to px_rewards()
Sub collect_reward(x As Double)
Dim i As Long
If ArrayIsEmpty(px_rewards) Then
i = 0
ReDim px_rewards(1 To 1)
Else
i = UBound(px_rewards, 1)
ReDim Preserve px_rewards(1 To i + 1)
End If
px_rewards(i + 1) = x
End Sub
'Return the total reward if m is not supplied; otherwise return the reward from the specified time step
Property Get x_reward(Optional m As Long = -1) As Double
Dim i As Long, tmp_x As Double
If m > 0 Then
x_reward = px_rewards(m)
Else
tmp_x = 0
For i = 1 To UBound(px_rewards)
tmp_x = tmp_x + px_rewards(i)
Next i
x_reward = tmp_x
End If
End Property
'clear all saved rewards
Sub reset_reward()
Erase px_rewards
End Sub
'clear the entire experience buffer
Sub ClearHist()
pn_episode = 0
Call cNNPolicy.ClearHist
Call cNNValue.ClearHist
Erase xt, yt, vt, x_actiont
Erase px_rewards, pdiscounted_rewards
Erase pEpisodeIdx, pEpisodeStatus
End Sub
'clears the experience buffer; I forget why this is here since it's the same as ClearHist, but anyway
'Private Sub Clear()
' Erase xt, yt, vt, x_actiont
' Erase px_rewards, pdiscounted_rewards
' Call cNNPolicy.ClearHist
' Call cNNValue.ClearHist
'End Sub
'Reset episode counter
Sub ClearEpisode()
pn_episode = 0
Erase pEpisodeIdx, pEpisodeStatus
End Sub
'record the end of an episode by saving its end status, plus an integer pointer to
'the time step in the full trajectory that marks its ending point
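'e.g. after two episodes of 5 and 3 steps, pEpisodeIdx() holds (5, 8): each entry is
'the index of that episode's last time step within xt()/px_rewards()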
Sub SaveEpisode(endStatus As Variant)
Dim i As Long, j As Long, k As Long, m As Long, n As Long
pn_episode = pn_episode + 1
If pn_episode = 1 Then
ReDim pEpisodeIdx(1 To pn_episode)
ReDim pEpisodeStatus(1 To pn_episode)
Else
ReDim Preserve pEpisodeIdx(1 To pn_episode)
ReDim Preserve pEpisodeStatus(1 To pn_episode)
End If
pEpisodeIdx(pn_episode) = UBound(xt, 2)
pEpisodeStatus(pn_episode) = endStatus
End Sub
'No longer using these
'Sub ADAM_Init()
' Call cNNPolicy.ADAM_Init
' Call cNNValue.ADAM_Init
'End Sub
'
'Sub ADAM_Clear()
' Call cNNPolicy.ADAM_Clear
' Call cNNValue.ADAM_Clear
'End Sub
'
'Sub NNWgtCache()
' Call cNNPolicy.CacheCurrentWgt
' Call cNNValue.CacheCurrentWgt
'End Sub
'
'Sub NNWgtCacheClear()
' Call cNNPolicy.ClearCacheWgt
' Call cNNValue.ClearCacheWgt
'End Sub
'
'Sub NNWgtRestore()
' Call cNNPolicy.RestoreWgt
' Call cNNValue.RestoreWgt
'End Sub
'
'Sub RMSClear()
' Call cNNPolicy.RMSProp_Clear
' Call cNNValue.RMSProp_Clear
'End Sub
'Learn from saved episodes
'The bread and butter of this exercise: implements PPO per Schulman et al. 2017 (https://arxiv.org/abs/1707.06347v2)
'implementation details are similar to https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/
'for simplicity a fixed value of 0.2 is hardcoded as the clipping parameter
'learn_rate learning rate for the policy network
'learn_rate_value learning rate for the value network
'discount_rate discount rate for reward allocation
'n_epoch number of epochs to run in optimization
'mini_batch number of mini-batches to divide the training data into
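'For reference, the clipped surrogate objective maximized below is (eq. 7 of the paper):
' L_CLIP(theta) = E_t[ min(r_t(theta)*A_t, clip(r_t(theta), 1-eps, 1+eps)*A_t) ]
'with ratio r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t), advantage A_t, and eps = 0.2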
Function Learn(learn_rate As Double, learn_rate_value As Double, discount_rate As Double, _
n_epoch As Long, mini_batch As Long, _
Optional entropy_coef As Double = 0.01)
Dim i As Long, j As Long, k As Long, m As Long, n As Long, n_T As Long
Dim i_epoch As Long, batch_start As Long, n_mb As Long, batch_size As Long, n_converge As Long, n_step As Long
Dim tmp_x As Double, tmp_y As Double, tmp_z As Double, step_size As Double
Dim grad_out() As Double, grad_value() As Double
Dim discounted_rewards() As Double
Dim x_err As Double, x_err_prv As Double, delta As Double, i_episode As Long, i_action As Long
Dim r_mean As Double, r_sd As Double
Dim batch_idx() As Long, mini_batch_idx As Variant
Dim x_ratio() As Double, x_loss() As Double, x_loss_mean As Double, x_obj As Double, x_obj_prv As Double
Dim xt_old() As Double, xt_old_sub As Variant
Dim vt_old() As Double
Dim x_advantages() As Double, x_advantages_sub As Variant
Dim probs_old() As Double, probs_old_sub As Variant
Dim x_actiont_old_sub As Variant, x_actiont_old() As Long
Dim v As Variant
Dim x_entropy() As Double, x_entropy_mean As Double
'estimate size of each batch based on number of mini-batches
n_T = UBound(yt, 2)
ReDim batch_idx(1 To n_T)
For i = 1 To n_T
batch_idx(i) = i
Next i
batch_size = Int(n_T / mini_batch)
'Save old outputs
ReDim xt_old(1 To pn_input, 1 To n_T)
ReDim vt_old(1 To 1, 1 To n_T)
ReDim probs_old(1 To n_T)
ReDim x_actiont_old(1 To n_T)
For i = 1 To n_T
probs_old(i) = probs(i)
x_actiont_old(i) = x_actiont(i)
vt_old(1, i) = vt(1, i)
For j = 1 To pn_input
xt_old(j, i) = xt(j, i)
Next j
Next i
'Reshape rewards - this is game-specific; see if this
'can be removed in the future with a better reward-shaping algorithm
If pstrGame = "STOMP" Then
m = 1
For i_episode = 1 To pn_episode
n = pEpisodeIdx(i_episode)
If pEpisodeStatus(i_episode) <> 0 Then
tmp_x = px_rewards(n)
px_rewards(n) = 0
For i = n - 1 To m Step -1
If xt(5, i) = 0 Then
px_rewards(i) = tmp_x
Exit For
End If
Next i
End If
m = n + 1
If m > n_T Then Exit For
Next i_episode
End If
'no longer needed, discounted rewards are calculated in LearnValueOnly() when needed
' 'calculate discounted rewards
' pdiscounted_rewards = Calc_discounted_rewards(px_rewards, pEpisodeIdx, discount_rate)
'calculate advantages
'according to the ICLR blog post, they also estimate the value of the next state when an episode is not finished,
'but in this case I assume all episodes are finished and no rollout is performed;
'this could be something to improve in the future
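'This loop is the generalized advantage estimation (GAE) backward recursion with lambda = 1:
' delta_t = r_t + gamma*V(s_t+1) - V(s_t)
' A_t = delta_t + gamma*lambda*A_t+1, starting from A_T = r_T - V(s_T) at each episode end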
ReDim x_advantages(1 To n_T)
m = 1
For i_episode = 1 To pn_episode
n = pEpisodeIdx(i_episode)
x_advantages(n) = px_rewards(n) - vt(1, n) '+ discount_rate * EstimateValue(xt_tmp, False)
For i = n - 1 To m Step -1
delta = px_rewards(i) - vt(1, i) + discount_rate * vt(1, i + 1)
x_advantages(i) = delta + discount_rate * 1 * x_advantages(i + 1)
Next i
m = n + 1
If m > n_T Then Exit For
Next i_episode
'temp code to output data for debugging
' With ActiveWorkbook.Sheets("Stomp")
' .Range("U2:AD200").Clear
' n = pEpisodeIdx(1)
' For i = 1 To n
' .Range("U" & 1 + i).Value = i
' .Range("V" & 1 + i).Value = xt(1, i)
' .Range("W" & 1 + i).Value = xt(2, i)
' .Range("X" & 1 + i).Value = xt(3, i)
' .Range("Y" & 1 + i).Value = xt(4, i)
' .Range("Z" & 1 + i).Value = x_actiont(i)
' .Range("AA" & 1 + i).Value = px_rewards(i)
' .Range("AB" & 1 + i).Value = pdiscounted_rewards(i)
' .Range("AC" & 1 + i).Value = x_advantages(i)
' .Range("AD" & 1 + i).Value = probs(i)
' Next i
' End With
' End
'Update value network
If learn_rate_value > 0 Then
x_err = LearnValueOnly(learn_rate_value, discount_rate, n_epoch, mini_batch)
End If
'mini batch gradient ascent to update policy network and maximize objective
n_converge = 0
x_obj_prv = -Exp(70)
Call cNNPolicy.ClearHist
Erase xt, vt, yt, probs, x_actiont
For i_epoch = 1 To n_epoch
'shuffle time steps
Call Shuffle(batch_idx)
x_obj = 0: n_step = 0
batch_start = 1
Do While batch_start < n_T
'Extract a segment of trajectories, note that the last segment could be shorter than batch_size
mini_batch_idx = SubsetIdx(batch_idx, batch_start, batch_start + batch_size - 1)
n_mb = UBound(mini_batch_idx)
batch_start = batch_start + batch_size 'start of next batch
xt_old_sub = SubsetIdx(xt_old, x_idx:=mini_batch_idx)
x_actiont_old_sub = SubsetIdx(x_actiont_old, x_idx:=mini_batch_idx)
x_advantages_sub = SubsetIdx(x_advantages, x_idx:=mini_batch_idx)
probs_old_sub = SubsetIdx(probs_old, x_idx:=mini_batch_idx)
'Normalize advantages; note this is done at the mini-batch level, not over the full trajectory
r_mean = 0: r_sd = 0
For i = 1 To n_mb
r_mean = r_mean + x_advantages_sub(i)
r_sd = r_sd + x_advantages_sub(i) ^ 2
Next i
r_mean = r_mean / n_mb
r_sd = (r_sd - n_mb * r_mean * r_mean) / (n_mb - 1 + 0.000001) 'unbiased sample variance: (sum of squares - n*mean^2)/(n-1)
If r_sd > 0 Then
r_sd = Sqr(r_sd)
Else
r_sd = 1 'in the rare case when the variance is zero, simply de-mean
End If
For i = 1 To n_mb
x_advantages_sub(i) = (x_advantages_sub(i) - r_mean) / (r_sd + 0.000000001)
Next i
'Find probabilities of performing the same actions under new network
v = Interact(xt_old_sub, storeHist:=True, isGreedy:=False, fix_action:=x_actiont_old_sub)
ReDim x_ratio(1 To n_mb)
For i = 1 To n_mb
x_ratio(i) = probs(i) / probs_old_sub(i)
Next i
'calculate clipped surrogate objective and its gradient
'I did this by hand, hope it's right!
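'For the record: in the unclipped branch the objective term is r_t*A_t = (y_a/p_old)*A_t,
'so with the softmax Jacobian dy_a/dz_j = y_a*(1{j=a} - y_j) the gradient w.r.t. logit z_j is
' d(r_t*A_t)/dz_j = (A_t/p_old) * y_a * (1{j=a} - y_j)
'which is what tmp_z and grad_out() implement (the single sigmoid output case is analogous);
'in the clipped branch the objective is constant in theta, so grad_out stays zero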
x_loss_mean = 0
ReDim x_loss(1 To n_mb)
ReDim grad_out(1 To pn_output, 1 To n_mb)
For i = 1 To n_mb
i_action = x_actiont_old_sub(i)
tmp_z = x_advantages_sub(i) / probs_old_sub(i)
tmp_x = x_ratio(i) * x_advantages_sub(i)
If x_ratio(i) > 1.2 Then
tmp_y = 1.2 * x_advantages_sub(i)
ElseIf x_ratio(i) < 0.8 Then
tmp_y = 0.8 * x_advantages_sub(i)
Else
tmp_y = tmp_x
End If
If tmp_x <= tmp_y Then
x_loss(i) = tmp_x
If pn_output = 1 Then
If i_action = 1 Then
grad_out(1, i) = yt(1, i) * (1 - yt(1, i)) * tmp_z
Else
grad_out(1, i) = -yt(1, i) * (1 - yt(1, i)) * tmp_z
End If
Else
For j = 1 To pn_output
If j = i_action Then
grad_out(j, i) = yt(i_action, i) * (1 - yt(j, i)) * tmp_z
Else
grad_out(j, i) = -yt(i_action, i) * yt(j, i) * tmp_z
End If
Next j
End If
Else
x_loss(i) = tmp_y
End If
x_loss_mean = x_loss_mean + x_loss(i)
Next i
'x_loss_mean holds the mini-batch sum of the surrogate objective; the per-step mean is taken after the epoch via n_step
'Calculate Entropy bonus and its gradient
'again it's done by hand, hope it's right!
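'The entropy bonus is H_t = -Sum_j y_j*Log(y_j); using the same softmax Jacobian,
' dH_t/dz_j = -y_j*(Log(y_j) + H_t)
'so the subtraction below adds entropy_coef*dH_t/dz_j to the gradient being ascended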
If entropy_coef > 0 Then
x_entropy = calcProbEntropy(yt)
x_entropy_mean = 0
For i = 1 To n_mb
x_entropy_mean = x_entropy_mean + x_entropy(i)
Next i
'x_entropy_mean likewise holds the mini-batch sum of the entropy bonus
For i = 1 To n_mb
For j = 1 To pn_output
grad_out(j, i) = grad_out(j, i) - entropy_coef * yt(j, i) * (Log(yt(j, i)) + x_entropy(i))
Next j
Next i
End If
'combined objective
x_obj = x_obj + x_loss_mean + entropy_coef * x_entropy_mean
n_step = n_step + n_mb
'Update policy
With cNNPolicy
Call .Backward(yt, grad_out, , isdEdx:=True)
Call .UpdateWgt(-learn_rate, useSpeedUp:="ADAM") 'negative for ascent instead of descent
Call .ResetWgtChg
Call .ClearHist
End With
Erase xt, vt, yt, probs, x_actiont
Loop
'Evaluate this epoch and test for convergence
x_obj = x_obj / n_step
If (i_epoch - 1) Mod 10 = 0 Then
DoEvents
Debug.Print "cPlayer: Learn: PolicyLayer: Epoch " & i_epoch & "/" & n_epoch & ", Objective=" & Round(x_obj, 4)
End If
If x_obj >= x_obj_prv And (x_obj - x_obj_prv) < (Abs(x_obj_prv) * 0.001) Then
n_converge = n_converge + 1
Else
n_converge = 0
End If
If n_converge >= 5 Then
Exit For
End If
x_obj_prv = x_obj
Next i_epoch
Call cNNPolicy.ADAM_Clear
'return the error from the value layer, for no particular reason
Learn = x_err
End Function
'Update the value network from saved episodes
'inputs are defined as in Learn()
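'The value layer is a half-mean-squared-error regression of V(s_t) on the discounted
'return G_t: err = 1/(2N) * Sum_t (V(s_t) - G_t)^2, minimized by mini-batch gradient descent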
Function LearnValueOnly(learn_rate As Double, discount_rate As Double, n_epoch As Long, mini_batch As Long)
Dim i As Long, j As Long, k As Long, m As Long, n As Long, n_T As Long, total_steps As Long, n_converge As Long
Dim i_epoch As Long, batch_start As Long, n_mb As Long, batch_size As Long
Dim tmp_x As Double, tmp_y As Double, tmp_z As Double, step_size As Double
Dim grad_value() As Double
Dim discounted_rewards() As Double
Dim x_err As Double, x_err_prv As Double, delta As Double, i_episode As Long
Dim r_mean As Double, r_sd As Double
Dim batch_idx() As Long, mini_batch_idx As Variant
Dim xt_old() As Double, xt_old_sub As Variant
Dim vt_old() As Double
Dim v As Variant
Dim discounted_rewards_old() As Double, discounted_rewards_old_sub As Variant
n_T = UBound(vt, 2)
ReDim batch_idx(1 To n_T)
For i = 1 To n_T
batch_idx(i) = i
Next i
batch_size = Int(n_T / mini_batch)
'calculate discounted rewards
pdiscounted_rewards = Calc_discounted_rewards(px_rewards, pEpisodeIdx, discount_rate)
'Compute starting error
x_err_prv = 0
For i = 1 To n_T
x_err_prv = x_err_prv + (vt(1, i) - pdiscounted_rewards(i)) ^ 2
Next i
x_err_prv = x_err_prv / (2 * n_T)
'Save old outputs
ReDim xt_old(1 To pn_input, 1 To n_T)
ReDim vt_old(1 To 1, 1 To n_T)
ReDim discounted_rewards_old(1 To n_T)
For i = 1 To n_T
vt_old(1, i) = vt(1, i)
discounted_rewards_old(i) = pdiscounted_rewards(i)
For j = 1 To pn_input
xt_old(j, i) = xt(j, i)
Next j
Next i
'mini batch SGD to update value layer
Call cNNValue.ClearHist
Erase vt
n_converge = 0
For i_epoch = 1 To n_epoch
Call Shuffle(batch_idx)
x_err = 0: total_steps = 0
batch_start = 1
Do While batch_start < n_T
'Extract a segment of trajectories
mini_batch_idx = SubsetIdx(batch_idx, batch_start, batch_start + batch_size - 1)
n_mb = UBound(mini_batch_idx)
batch_start = batch_start + batch_size
xt_old_sub = SubsetIdx(xt_old, x_idx:=mini_batch_idx)
discounted_rewards_old_sub = SubsetIdx(discounted_rewards_old, x_idx:=mini_batch_idx)
'Estimate value using current value layer
v = EstimateValue(xt_old_sub, True)
'calculate error in value estimates and its gradient
ReDim grad_value(1 To 1, 1 To n_mb)
For i = 1 To n_mb
grad_value(1, i) = vt(1, i) - discounted_rewards_old_sub(i)
x_err = x_err + (vt(1, i) - discounted_rewards_old_sub(i)) ^ 2
Next i
total_steps = total_steps + n_mb
'Update weights
With cNNValue
Call .Backward(vt, grad_value)
Call .UpdateWgt(learn_rate, useSpeedUp:="ADAM")
Call .ResetWgtChg
Call .ClearHist
End With
Erase vt
Loop
'Evaluate current epoch and test for convergence
x_err = x_err / (2 * total_steps)
If Abs(x_err_prv - x_err) < (Abs(x_err_prv) * 0.001) Then 'use a lenient criterion since the value layer is less important
n_converge = n_converge + 1
Else
n_converge = 0
End If
If n_converge >= 5 Then
Exit For
End If
x_err_prv = x_err
If (i_epoch - 1) Mod 10 = 0 Then
DoEvents
Debug.Print "cPlayer: LearnValueOnly: Epoch " & i_epoch & "/" & n_epoch & ", err=" & Round(x_err, 4) & ", converge count=" & n_converge
End If
Next i_epoch
Call cNNValue.ADAM_Clear
'Output error in value estimates
LearnValueOnly = x_err
End Function
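'Discounted returns per episode via the backward recursion
' G_T = r_T, G_t = r_t + discount_rate*G_t+1
'where each EpisodeIdx() entry marks the last time step of an episode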
Private Function Calc_discounted_rewards(x_rewards() As Double, EpisodeIdx() As Long, discount_rate As Double)
Dim i As Long, j As Long, k As Long, m As Long, n As Long, n_episode As Long
Dim n_T As Long, i_episode As Long
Dim discounted_rewards() As Double
n_T = UBound(x_rewards, 1)
n_episode = UBound(EpisodeIdx, 1)
ReDim discounted_rewards(1 To n_T)
m = 1
For i_episode = 1 To n_episode
n = EpisodeIdx(i_episode)
discounted_rewards(n) = x_rewards(n)
For i = n - 1 To m Step -1
discounted_rewards(i) = x_rewards(i) + discount_rate * discounted_rewards(i + 1)
Next i
m = n + 1
If m > n_T Then Exit For
Next i_episode
Calc_discounted_rewards = discounted_rewards
End Function
Function EvaluateValueError(discount_rate As Double) As Double
Dim i As Long, j As Long, k As Long, m As Long, n As Long, n_T As Long
Dim tmp_x As Double
Dim y() As Double, v() As Double
Dim discounted_rewards() As Double
Dim x_err As Double, delta As Double, i_episode As Long
n_T = UBound(xt, 2)
v = cNNValue.FwdPass(xt, storeOutput:=True)
discounted_rewards = Calc_discounted_rewards(px_rewards, pEpisodeIdx, discount_rate)
x_err = 0
For k = 1 To n_T
x_err = x_err + (v(1, k) - discounted_rewards(k)) ^ 2
Next k
x_err = x_err / (2 * n_T)
EvaluateValueError = x_err
End Function
'feed in state variables x and output a single integer that represents the choice of action
'in the learning phase, when storeHist is True, the state value is also estimated by the value layer
'x, input state variables of size (1 To pn_input, 1 To n)
'storeHist, save all intermediate outputs for training purposes when set to True
'isGreedy, greedy action when set to True; otherwise the action is sampled from the output discrete distribution
'fix_action, used only in PPO; by supplying a series of actions together with state variables, it
' calculates the probability of performing those actions (under the current policy) and stores them in memory
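'A minimal single-step call from a driver (hedged sketch; oPlayer and the state values
'are illustrative placeholders):
' Dim x2() As Double, i_action As Long
' ReDim x2(1 To 5, 1 To 1) 'n_input = 5 for "STOMP"
' x2(1, 1) = 0.5 'fill in the current state
' i_action = oPlayer.Interact(x2, storeHist:=True) 'in 1..n_output, or 0/1 when n_output = 1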
Function Interact(x As Variant, Optional storeHist As Boolean = False, Optional isGreedy As Boolean = False, Optional fix_action As Variant = Null) As Long
Dim i As Long, j As Long, k As Long, m As Long, n As Long, n_T As Long, iterate As Long, n_T_old As Long
Dim tmp_x As Double
Dim y() As Double, v As Variant
Dim x_action() As Long
n_T = UBound(x, 2)
y = cNNPolicy.FwdPass(x, storeOutput:=storeHist)
If IsArray(fix_action) Then
ReDim x_action(1 To n_T)
For i = 1 To n_T
x_action(i) = fix_action(i)
Next i
Else
x_action = SampleAction(y, isGreedy)
End If
If storeHist Then
If ArrayIsEmpty(xt) Then
n_T_old = 0
ReDim xt(1 To pn_input, 1 To n_T)
ReDim yt(1 To pn_output, 1 To n_T)
ReDim probs(1 To n_T)
ReDim x_actiont(1 To n_T)
Else
n_T_old = UBound(xt, 2)
ReDim Preserve xt(1 To pn_input, 1 To n_T_old + n_T)
ReDim Preserve yt(1 To pn_output, 1 To n_T_old + n_T)
ReDim Preserve probs(1 To n_T_old + n_T)
ReDim Preserve x_actiont(1 To n_T_old + n_T)
End If
For iterate = 1 To n_T
k = n_T_old + iterate
For i = 1 To pn_input
xt(i, k) = x(i, iterate)
Next i
For i = 1 To pn_output
yt(i, k) = y(i, iterate)
Next i
If pn_output = 1 Then
If x_action(iterate) = 1 Then
probs(k) = y(1, iterate)
Else
probs(k) = 1 - y(1, iterate)
End If
ElseIf pn_output > 1 Then
probs(k) = y(x_action(iterate), iterate)
End If
x_actiont(k) = x_action(iterate)
Next iterate
End If
Interact = x_action(n_T)
'Evaluate value
If Not storeHist Then Exit Function
v = EstimateValue(x, storeHist:=storeHist)
End Function
Function EstimateValue(x As Variant, Optional storeHist As Boolean = False)
Dim i As Long, j As Long, k As Long, m As Long, n As Long, n_T As Long, iterate As Long, n_T_old As Long
Dim tmp_x As Double
Dim v() As Double
n_T = UBound(x, 2)
v = cNNValue.FwdPass(x, storeOutput:=storeHist)
If storeHist Then
If ArrayIsEmpty(vt) Then
n_T_old = 0
ReDim vt(1 To 1, 1 To n_T)
Else
n_T_old = UBound(vt, 2)
ReDim Preserve vt(1 To 1, 1 To n_T_old + n_T)
End If
For iterate = 1 To n_T
vt(1, n_T_old + iterate) = v(1, iterate)
Next iterate
End If
EstimateValue = v(1, n_T)
End Function
Function StateMatrix(Optional strOutType As String = "ACTION")
Dim i As Long, j As Long, k As Long, m As Long, n As Long
Dim ii As Long, jj As Long
Dim x() As Double
Dim y() As Double, v As Variant
Dim v_out() As Double
Dim tmp_x As Double, tmp_y As Double
If UCase(pstrGame) = "MAZE" Then
ReDim v_out(1 To 7 + 1, 1 To 7 + 1)
For ii = 1 To 7
tmp_y = ii
v_out(1 + (7 - ii + 1), 1) = tmp_y
For jj = 1 To 7
tmp_x = jj
v_out(1, 1 + jj) = tmp_x
ReDim x(1 To pn_input, 1 To 1)
x((tmp_x - 1) * 7 + tmp_y, 1) = 1
If strOutType = "ACTION" Then
y = cNNPolicy.FwdPass(x, storeOutput:=False)
v = SampleAction(y, True)
v_out(1 + (7 - ii + 1), 1 + jj) = v(1)
ElseIf strOutType = "VALUE" Then
v_out(1 + (7 - ii + 1), 1 + jj) = cNNValue.FwdPass(x, storeOutput:=False)(1, 1)
End If
Next jj
Next ii
ElseIf UCase(pstrGame) = "MAZEII" Then
ReDim v_out(1 To 7 + 1, 1 To 13 + 1)
For ii = 1 To 7
tmp_y = ii
v_out(1 + (7 - ii + 1), 1) = tmp_y
For jj = 1 To 13
tmp_x = jj
v_out(1, 1 + jj) = tmp_x
ReDim x(1 To pn_input, 1 To 1)
x(1, 1) = tmp_x / 13
x(2, 1) = tmp_y / 7
x(3, 1) = tmp_x / 13
x(4, 1) = tmp_y / 7
If strOutType = "ACTION" Then
y = cNNPolicy.FwdPass(x, storeOutput:=False)
v = SampleAction(y, True)
v_out(1 + (7 - ii + 1), 1 + jj) = v(1)
ElseIf strOutType = "VALUE" Then
v_out(1 + (7 - ii + 1), 1 + jj) = cNNValue.FwdPass(x, storeOutput:=False)(1, 1)
End If
Next jj
Next ii
ElseIf UCase(pstrGame) = "STOMP" Then
ReDim x(1 To pn_input, 1 To 1)
ReDim v_out(1 To 30 + 1, 1 To 10 + 1)
For ii = 1 To 30
tmp_y = 20 - ii + 1
v_out(1 + ii, 1) = tmp_y
For jj = 1 To 10
tmp_x = -1.5 + (jj - 1) * 0.7 / 9
v_out(1, 1 + jj) = tmp_x
x(1, 1) = tmp_y / 20
x(2, 1) = tmp_x
If strOutType = "ACTION" Then
y = cNNPolicy.FwdPass(x, storeOutput:=False)
v_out(1 + ii, 1 + jj) = y(1, 1)
ElseIf strOutType = "VALUE" Then
v_out(1 + ii, 1 + jj) = cNNValue.FwdPass(x, storeOutput:=False)(1, 1)
End If
Next jj
Next ii
End If
StateMatrix = v_out
End Function
Private Function ArrayIsEmpty(x) As Boolean
Dim i As Long
If Not IsArray(x) Then
ArrayIsEmpty = True
Else
ArrayIsEmpty = False
On Error Resume Next
i = UBound(x, 1)
If Err.Number <> 0 Then
Err.Clear
ArrayIsEmpty = True
End If
End If
End Function
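'Sample one action per column of y(): multi-output softmax uses inverse-CDF sampling of the
'discrete distribution (argmax when isGreedy); a single sigmoid output is a Bernoulli
'draw (0.5 threshold when isGreedy), returning 0 or 1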
Private Function SampleAction(y() As Double, isGreedy As Boolean)
Dim i As Long, j As Long, k As Long, m As Long, n As Long, iterate As Long
Dim n_output As Long, n_T As Long, n_T_old As Long
Dim tmp_max As Double, cumprob As Double
Dim x_action() As Long
n_T = UBound(y, 2)
n_output = UBound(y, 1)
ReDim x_action(1 To n_T)
If Not isGreedy Then
If n_output = 1 Then
For iterate = 1 To n_T
If y(1, iterate) >= Rnd() Then
x_action(iterate) = 1
End If
Next iterate
Else
For iterate = 1 To n_T
tmp_max = Rnd()
cumprob = y(1, iterate)
If tmp_max < cumprob Then
x_action(iterate) = 1
Else
For i = 2 To n_output
cumprob = cumprob + y(i, iterate)
If tmp_max < cumprob Then
x_action(iterate) = i
Exit For
End If
Next i
End If
Next iterate
End If
Else
If n_output = 1 Then
For iterate = 1 To n_T
If y(1, iterate) > 0.5 Then
x_action(iterate) = 1
End If
Next iterate
Else
For iterate = 1 To n_T
tmp_max = y(1, iterate)
x_action(iterate) = 1
For i = 2 To n_output
If y(i, iterate) > tmp_max Then
tmp_max = y(i, iterate)
x_action(iterate) = i
End If
Next i
Next iterate
End If
End If
SampleAction = x_action
End Function
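'Subset helper: when x_idx is supplied, gather the elements (1-D) or columns (2-D) of x
'at those indices; otherwise return elements m..n of the 1-D array x, clipped at UBound(x)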
Private Function SubsetIdx(x As Variant, Optional m As Long, Optional n As Long, Optional x_idx As Variant = Null)
Dim i As Long, j As Long, k As Long
Dim y As Variant, y_idx() As Long
If IsArray(x_idx) Then
k = UBound(x_idx, 1)
If getDimension(x) = 1 Then
ReDim y(1 To k)
For i = 1 To k
y(i) = x(x_idx(i))
Next i
ElseIf getDimension(x) = 2 Then
m = UBound(x, 1)
ReDim y(1 To m, 1 To k)
For i = 1 To k
For j = 1 To m
y(j, i) = x(j, x_idx(i))
Next j
Next i
End If
SubsetIdx = y
Exit Function
End If
k = n
If n > UBound(x, 1) Then
k = UBound(x, 1)
End If
ReDim y_idx(1 To k - m + 1)
For i = 1 To (k - m + 1)
y_idx(i) = x(m + i - 1)
Next i
SubsetIdx = y_idx
End Function
Private Function calcProbEntropy(x_probs() As Double) As Double()
Dim i As Long, j As Long, k As Long, m As Long, n As Long, iterate As Long
Dim n_output As Long, n_T As Long
Dim tmp_x As Double
Dim x_entropy() As Double, x_entropy_mean As Double
n_output = UBound(x_probs, 1)
n_T = UBound(x_probs, 2)
ReDim x_entropy(1 To n_T)
If n_output = 1 Then
For iterate = 1 To n_T
tmp_x = x_probs(1, iterate) 'single (Bernoulli) output: row index 1, not the unset loop variable i
tmp_x = -tmp_x * Log(tmp_x) - (1 - tmp_x) * Log(1 - tmp_x)
x_entropy(iterate) = tmp_x
Next iterate