From dbec19bbb81b69f072bf0be30dcac90dbbdffc4a Mon Sep 17 00:00:00 2001
From: Bob
Date: Mon, 29 Apr 2024 03:29:43 -0400
Subject: [PATCH] update

---
 rl3/td3.py | 343 -----------------------------------------------------
 1 file changed, 343 deletions(-)
 delete mode 100755 rl3/td3.py

diff --git a/rl3/td3.py b/rl3/td3.py
deleted file mode 100755
index fbec1095..00000000
--- a/rl3/td3.py
+++ /dev/null
@@ -1,343 +0,0 @@
-import numpy as np
-import tensorflow as tf
-import gym
-import matplotlib.pyplot as plt
-from datetime import datetime
-
-gym_minor_version = int(gym.__version__.split('.')[1])
-if gym_minor_version > 19:
-  exit("Please install OpenAI Gym 0.19.0 or earlier")
-
-if tf.__version__.startswith('2'):
-  exit("Please install Tensorflow 1.x")
-
-
-### avoid crashing on Mac
-# doesn't seem to work
-from sys import platform as sys_pf
-if sys_pf == 'darwin':
-  import matplotlib
-  matplotlib.use("TkAgg")
-
-
-# simple feedforward neural net
-def ANN(x, layer_sizes, hidden_activation=tf.nn.relu, output_activation=None):
-  for h in layer_sizes[:-1]:
-    x = tf.layers.dense(x, units=h, activation=hidden_activation)
-  return tf.layers.dense(x, units=layer_sizes[-1], activation=output_activation)
-
-
-# get all variables within a scope
-def get_vars(scope):
-  return [x for x in tf.global_variables() if scope in x.name]
-
-
-### Create both the actor and critic networks at once ###
-### Q(s, mu(s)) returns the maximum Q for a given state s ###
-def CreateNetworks(
-    s, a,
-    num_actions,
-    action_max,
-    hidden_sizes=(300,),
-    hidden_activation=tf.nn.relu,
-    output_activation=tf.tanh):
-
-  with tf.variable_scope('mu'):
-    mu = action_max * ANN(s, list(hidden_sizes)+[num_actions], hidden_activation, output_activation)
-  with tf.variable_scope('q1'):
-    input_ = tf.concat([s, a], axis=-1) # (state, action)
-    q1 = tf.squeeze(ANN(input_, list(hidden_sizes)+[1], hidden_activation, None), axis=1)
-  with tf.variable_scope('q2'):
-    input_ = tf.concat([s, a], axis=-1) # (state, action)
-    q2 = tf.squeeze(ANN(input_, list(hidden_sizes)+[1], hidden_activation, None), axis=1)
-  with tf.variable_scope('q1', reuse=True):
-    # reuse is True, so it reuses the weights from the previously defined Q network
-    input_ = tf.concat([s, mu], axis=-1) # (state, mu(state))
-    q1_mu = tf.squeeze(ANN(input_, list(hidden_sizes)+[1], hidden_activation, None), axis=1)
-  return mu, q1, q2, q1_mu
-
-
-### The experience replay memory ###
-class ReplayBuffer:
-  def __init__(self, obs_dim, act_dim, size):
-    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
-    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
-    self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
-    self.rews_buf = np.zeros(size, dtype=np.float32)
-    self.done_buf = np.zeros(size, dtype=np.float32)
-    self.ptr, self.size, self.max_size = 0, 0, size
-
-  def store(self, obs, act, rew, next_obs, done):
-    self.obs1_buf[self.ptr] = obs
-    self.obs2_buf[self.ptr] = next_obs
-    self.acts_buf[self.ptr] = act
-    self.rews_buf[self.ptr] = rew
-    self.done_buf[self.ptr] = done
-    self.ptr = (self.ptr+1) % self.max_size
-    self.size = min(self.size+1, self.max_size)
-
-  def sample_batch(self, batch_size=32):
-    idxs = np.random.randint(0, self.size, size=batch_size)
-    return dict(s=self.obs1_buf[idxs],
-                s2=self.obs2_buf[idxs],
-                a=self.acts_buf[idxs],
-                r=self.rews_buf[idxs],
-                d=self.done_buf[idxs])
-
-
-### Implement the TD3 algorithm ###
-def td3(
-    env_fn,
-    ac_kwargs=dict(),
-    seed=0,
-    save_folder=None,
-    num_train_episodes=100,
-    test_agent_every=25,
-    replay_size=int(1e6),
-    gamma=0.99,
-    decay=0.995,
-    mu_lr=1e-3,
-    q_lr=1e-3,
-    batch_size=100,
-    start_steps=10000,
-    action_noise=0.1,
-    target_noise=0.2,
-    noise_clip=0.5,
-    policy_delay=2,
-    max_episode_length=1000):
-
-  tf.set_random_seed(seed)
-  np.random.seed(seed)
-
-  env, test_env = env_fn(), env_fn()
-
-  # comment out this line if you don't want to record a video of the agent
-  if save_folder is not None:
-    test_env = gym.wrappers.Monitor(test_env, save_folder)
-
-  # get size of state space and action space
-  num_states = env.observation_space.shape[0]
-  num_actions = env.action_space.shape[0]
-
-  # Maximum value of action
-  # Assumes both low and high values are the same
-  # Assumes all actions have the same bounds
-  # May NOT be the case for all environments
-  action_max = env.action_space.high[0]
-
-  # Create Tensorflow placeholders (neural network inputs)
-  X = tf.placeholder(dtype=tf.float32, shape=(None, num_states)) # state
-  A = tf.placeholder(dtype=tf.float32, shape=(None, num_actions)) # action
-  X2 = tf.placeholder(dtype=tf.float32, shape=(None, num_states)) # next state
-  R = tf.placeholder(dtype=tf.float32, shape=(None,)) # reward
-  D = tf.placeholder(dtype=tf.float32, shape=(None,)) # done
-
-  # Main network outputs
-  with tf.variable_scope('main'):
-    mu, q1, q2, q1_mu = CreateNetworks(X, A, num_actions, action_max, **ac_kwargs)
-
-  # Target networks
-  # First, get the output policy given next state X2
-  with tf.variable_scope('target'):
-    # Note: "A" placeholder is effectively ignored
-    # since mu is only a function of state (X2)
-    mu_targ, _, _, _ = CreateNetworks(X2, A, num_actions, action_max, **ac_kwargs)
-
-  # Next, add noise to mu_targ, before passing it through the target Q-networks
-  with tf.variable_scope('target', reuse=True):
-    # Add Gaussian noise and clip to valid action range
-    epsilon = tf.random_normal(tf.shape(mu_targ), stddev=target_noise)
-    epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
-    A2 = mu_targ + epsilon
-    A2 = tf.clip_by_value(A2, -action_max, action_max)
-
-    _, q1_targ, q2_targ, _ = CreateNetworks(X2, A2, num_actions, action_max, **ac_kwargs)
-
-  # Experience replay memory
-  replay_buffer = ReplayBuffer(obs_dim=num_states, act_dim=num_actions, size=replay_size)
-
-
-  # Target value for the Q-network loss
-  # We use stop_gradient to tell Tensorflow not to differentiate
-  # Take the smaller of Q1 and Q2!
-  min_q_targ = tf.minimum(q1_targ, q2_targ)
-  q_target = tf.stop_gradient(R + gamma * (1 - D) * min_q_targ)
-
-  # TD3 losses
-  mu_loss = -tf.reduce_mean(q1_mu)
-  q1_loss = tf.reduce_mean((q1 - q_target)**2)
-  q2_loss = tf.reduce_mean((q2 - q_target)**2)
-  q_loss = q1_loss + q2_loss # minimize simultaneously
-
-  # Train policy and value separately
-  mu_optimizer = tf.train.AdamOptimizer(learning_rate=mu_lr)
-  q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
-  mu_train_op = mu_optimizer.minimize(mu_loss, var_list=get_vars('main/mu'))
-  q_train_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))
-
-  # Use soft updates to update the target networks
-  target_update = tf.group(
-    [tf.assign(v_targ, decay*v_targ + (1 - decay)*v_main)
-      for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
-    ]
-  )
-
-  # Copy main network params to target networks
-  target_init = tf.group(
-    [tf.assign(v_targ, v_main)
-      for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
-    ]
-  )
-
-  # boilerplate (and copy to the target networks!)
-  sess = tf.Session()
-  sess.run(tf.global_variables_initializer())
-  sess.run(target_init)
-
-  def get_action(s, noise_scale):
-    a = sess.run(mu, feed_dict={X: s.reshape(1,-1)})[0]
-    a += noise_scale * np.random.randn(num_actions)
-    return np.clip(a, -action_max, action_max)
-
-  test_returns = []
-  def test_agent(num_episodes=5):
-    t0 = datetime.now()
-    n_steps = 0
-    for j in range(num_episodes):
-      s, episode_return, episode_length, d = test_env.reset(), 0, 0, False
-      while not (d or (episode_length == max_episode_length)):
-        # Take deterministic actions at test time (noise_scale=0)
-        test_env.render()
-        s, r, d, _ = test_env.step(get_action(s, 0))
-        episode_return += r
-        episode_length += 1
-        n_steps += 1
-      print('test return:', episode_return, 'episode_length:', episode_length)
-      test_returns.append(episode_return)
-    # print("test steps per sec:", n_steps / (datetime.now() - t0).total_seconds())
-
-
-  # Main loop: play episode and train
-  returns = []
-  q_losses = []
-  mu_losses = []
-  num_steps = 0
-  for i_episode in range(num_train_episodes):
-
-    # reset env
-    s, episode_return, episode_length, d = env.reset(), 0, 0, False
-
-    while not (d or (episode_length == max_episode_length)):
-      # For the first `start_steps` steps, use randomly sampled actions
-      # in order to encourage exploration.
-      if num_steps > start_steps:
-        a = get_action(s, action_noise)
-      else:
-        a = env.action_space.sample()
-
-      # Keep track of the number of steps done
-      num_steps += 1
-      if num_steps == start_steps:
-        print("USING AGENT ACTIONS NOW")
-
-      # Step the env
-      s2, r, d, _ = env.step(a)
-      episode_return += r
-      episode_length += 1
-
-      # Ignore the "done" signal if it comes from hitting the time
-      # horizon (that is, when it's an artificial terminal signal
-      # that isn't based on the agent's state)
-      d_store = False if episode_length == max_episode_length else d
-
-      # Store experience to replay buffer
-      replay_buffer.store(s, a, r, s2, d_store)
-
-      # Assign next state to be the current state on the next round
-      s = s2
-
-    # Perform the updates
-    for j in range(episode_length):
-      batch = replay_buffer.sample_batch(batch_size)
-      feed_dict = {
-        X: batch['s'],
-        X2: batch['s2'],
-        A: batch['a'],
-        R: batch['r'],
-        D: batch['d']
-      }
-
-      # Q network update
-      # Note: plot the Q loss if you want
-      ql, _ = sess.run([q_loss, q_train_op], feed_dict)
-      q_losses.append(ql)
-
-      # Policy update
-      # (And target networks update)
-      # Note: plot the mu loss if you want
-      if j % policy_delay == 0:
-        mul, _, _ = sess.run([mu_loss, mu_train_op, target_update], feed_dict)
-        mu_losses.append(mul)
-
-    print("Episode:", i_episode + 1, "Return:", episode_return, 'episode_length:', episode_length)
-    returns.append(episode_return)
-
-    # Test the agent
-    if i_episode > 0 and i_episode % test_agent_every == 0:
-      test_agent()
-
-  # on Mac, plotting results in an error, so just save the results for later
-  # if you're not on Mac, feel free to uncomment the below lines
-  np.savez('td3_results.npz', train=returns, test=test_returns, q_losses=q_losses, mu_losses=mu_losses)
-
-  # plt.plot(returns)
-  # plt.plot(smooth(np.array(returns)))
-  # plt.title("Train returns")
-  # plt.show()
-
-  # plt.plot(test_returns)
-  # plt.plot(smooth(np.array(test_returns)))
-  # plt.title("Test returns")
-  # plt.show()
-
-  # plt.plot(q_losses)
-  # plt.title('q_losses')
-  # plt.show()
-
-  # plt.plot(mu_losses)
-  # plt.title('mu_losses')
-  # plt.show()
-
-
-def smooth(x):
-  # last 100
-  n = len(x)
-  y = np.zeros(n)
-  for i in range(n):
-    start = max(0, i - 99)
-    y[i] = float(x[start:(i+1)].sum()) / (i - start + 1)
-  return y
-
-
-if __name__ == '__main__':
-  import argparse
-  parser = argparse.ArgumentParser()
-  # parser.add_argument('--env', type=str, default='HalfCheetah-v2')
-  parser.add_argument('--env', type=str, default='Pendulum-v0')
-  parser.add_argument('--hidden_layer_sizes', type=int, default=300)
-  parser.add_argument('--num_layers', type=int, default=1)
-  parser.add_argument('--gamma', type=float, default=0.99)
-  parser.add_argument('--seed', type=int, default=0)
-  parser.add_argument('--num_train_episodes', type=int, default=200)
-  parser.add_argument('--save_folder', type=str, default='td3_monitor')
-  args = parser.parse_args()
-
-
-  td3(
-    lambda : gym.make(args.env),
-    ac_kwargs=dict(hidden_sizes=[args.hidden_layer_sizes]*args.num_layers),
-    gamma=args.gamma,
-    seed=args.seed,
-    save_folder=args.save_folder,
-    num_train_episodes=args.num_train_episodes,
-  )
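
For reference, the central quantity the deleted graph built symbolically (min_q_targ and q_target above) is TD3's target value: smooth the target policy's action with clipped Gaussian noise, bootstrap from the smaller of the two target critics, and mask the bootstrap at terminal transitions. Below is a minimal NumPy sketch of that one step only; the callables mu_targ, q1_targ and q2_targ are hypothetical stand-ins for the removed tensors, not a drop-in replacement for the TF1 graph.

import numpy as np

# Minimal sketch of the TD3 target value, assuming hypothetical callables:
#   mu_targ(s2)     -> (batch, act_dim) target-policy actions
#   q1_targ(s2, a2) -> (batch,) first target critic
#   q2_targ(s2, a2) -> (batch,) second target critic
def td3_target(r, d, s2, mu_targ, q1_targ, q2_targ,
               gamma=0.99, target_noise=0.2, noise_clip=0.5, action_max=1.0):
  # target policy smoothing: clipped Gaussian noise on the target action
  a2 = mu_targ(s2)
  eps = np.clip(target_noise * np.random.randn(*a2.shape), -noise_clip, noise_clip)
  a2 = np.clip(a2 + eps, -action_max, action_max)
  # clipped double-Q: bootstrap from the smaller of the two target critics
  min_q = np.minimum(q1_targ(s2, a2), q2_targ(s2, a2))
  # (1 - d) drops the bootstrap term at terminal transitions
  return r + gamma * (1.0 - d) * min_q

The third TD3 ingredient, the delayed policy update, appears in the deleted training loop as the `if j % policy_delay == 0` branch: the actor and the soft target updates run only once per policy_delay critic updates.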