actor.py (forked from andrew-j-levy/Hierarchical-Actor-Critc-HAC-)

import tensorflow as tf
import numpy as np
from utils import layer
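

# Actor network for one level of the Hierarchical Actor-Critic (HAC) agent.
# The bottom level (layer 0) outputs primitive actions within the environment's
# action bounds; every higher level outputs a subgoal for the level below it,
# bounded by the symmetric subgoal range.  Training follows a DDPG-style
# deterministic policy gradient: the critic's gradient of Q with respect to the
# action is passed in through `action_derivs` and chained through the actor
# weights (see `update` below).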
class Actor():

    def __init__(self,
                 sess,
                 env,
                 batch_size,
                 layer_number,
                 FLAGS,
                 learning_rate=0.001,
                 tau=0.05):

        self.sess = sess

        # Determine the range of actor network outputs.  This will be used to configure the outer layer of the network
        if layer_number == 0:
            self.action_space_bounds = env.action_bounds
            self.action_offset = env.action_offset
        else:
            # Determine symmetric range of subgoal space and offset
            self.action_space_bounds = env.subgoal_bounds_symmetric
            self.action_offset = env.subgoal_bounds_offset

        # Dimensions of action will depend on layer level
        if layer_number == 0:
            self.action_space_size = env.action_dim
        else:
            self.action_space_size = env.subgoal_dim

        self.actor_name = 'actor_' + str(layer_number)

        # Dimensions of goal placeholder will differ depending on layer level
        if layer_number == FLAGS.layers - 1:
            self.goal_dim = env.end_goal_dim
        else:
            self.goal_dim = env.subgoal_dim

        self.state_dim = env.state_dim

        self.learning_rate = learning_rate
        # self.exploration_policies = exploration_policies
        self.tau = tau
        self.batch_size = batch_size

        self.state_ph = tf.placeholder(tf.float32, shape=(None, self.state_dim))
        self.goal_ph = tf.placeholder(tf.float32, shape=(None, self.goal_dim))
        self.features_ph = tf.concat([self.state_ph, self.goal_ph], axis=1)

        # Create actor network
        self.infer = self.create_nn(self.features_ph)

        # Target network code "repurposed" from Patrick Emani :^)
        self.weights = [v for v in tf.trainable_variables() if self.actor_name in v.op.name]
        # self.num_weights = len(self.weights)

        # Create target actor network
        self.target = self.create_nn(self.features_ph, name=self.actor_name + '_target')
        self.target_weights = [v for v in tf.trainable_variables() if self.actor_name in v.op.name][len(self.weights):]

        # Soft (Polyak) update: move each target weight toward the main weight at rate tau
        self.update_target_weights = \
            [self.target_weights[i].assign(tf.multiply(self.weights[i], self.tau) +
                                           tf.multiply(self.target_weights[i], 1. - self.tau))
             for i in range(len(self.target_weights))]

        # Deterministic policy gradient: the critic's dQ/da arrives via action_derivs,
        # is chained through the actor weights, and is averaged over the batch
        self.action_derivs = tf.placeholder(tf.float32, shape=(None, self.action_space_size))
        self.unnormalized_actor_gradients = tf.gradients(self.infer, self.weights, -self.action_derivs)
        self.policy_gradient = list(map(lambda x: tf.div(x, self.batch_size), self.unnormalized_actor_gradients))
        # self.policy_gradient = tf.gradients(self.infer, self.weights, -self.action_derivs)

        self.train = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(self.policy_gradient, self.weights))

    def get_action(self, state, goal):
        actions = self.sess.run(self.infer,
                                feed_dict={
                                    self.state_ph: state,
                                    self.goal_ph: goal
                                })

        return actions

    def get_target_action(self, state, goal):
        actions = self.sess.run(self.target,
                                feed_dict={
                                    self.state_ph: state,
                                    self.goal_ph: goal
                                })

        return actions

    def update(self, state, goal, action_derivs):
        weights, policy_grad, _ = self.sess.run([self.weights, self.policy_gradient, self.train],
                                                feed_dict={
                                                    self.state_ph: state,
                                                    self.goal_ph: goal,
                                                    self.action_derivs: action_derivs
                                                })

        return len(weights)

        # self.sess.run(self.update_target_weights)

    # def create_nn(self, state, goal, name='actor'):
    def create_nn(self, features, name=None):

        if name is None:
            name = self.actor_name

        with tf.variable_scope(name + '_fc_1'):
            fc1 = layer(features, 64)

        with tf.variable_scope(name + '_fc_2'):
            fc2 = layer(fc1, 64)

        with tf.variable_scope(name + '_fc_3'):
            fc3 = layer(fc2, 64)

        with tf.variable_scope(name + '_fc_4'):
            fc4 = layer(fc3, self.action_space_size, is_output=True)

        # Squash the output to the symmetric action/subgoal bounds, then shift by the offset
        output = tf.tanh(fc4) * self.action_space_bounds + self.action_offset

        return output
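

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the training code).  It assumes
# TensorFlow 1.x graph mode and the `layer` helper imported from utils above.
# The SimpleNamespace `env` and `FLAGS` objects are hypothetical stand-ins that
# expose just the attributes read in __init__, with arbitrary example sizes;
# in the full project the Actor is constructed elsewhere with the real
# environment and command-line FLAGS.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from types import SimpleNamespace

    env = SimpleNamespace(
        state_dim=10,
        action_dim=4,
        action_bounds=np.ones(4, dtype=np.float32),
        action_offset=np.zeros(4, dtype=np.float32),
        subgoal_dim=3,
        subgoal_bounds_symmetric=np.ones(3, dtype=np.float32),
        subgoal_bounds_offset=np.zeros(3, dtype=np.float32),
        end_goal_dim=3)
    FLAGS = SimpleNamespace(layers=2)

    with tf.Session() as sess:
        # Level-0 actor: outputs primitive actions conditioned on (state, goal)
        actor = Actor(sess, env, batch_size=1024, layer_number=0, FLAGS=FLAGS)
        sess.run(tf.global_variables_initializer())

        action = actor.get_action(np.zeros((1, env.state_dim)),
                                  np.zeros((1, env.subgoal_dim)))
        print(action.shape)  # expected: (1, 4)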