diff --git a/environment.py b/environment.py
index c00969c..f720455 100644
--- a/environment.py
+++ b/environment.py
@@ -1,5 +1,7 @@
 import gym
 from wrappers.frame_skipper import FrameSkipper
+from wrappers.early_stop import EarlyStop
+from wrappers.green_penalty import GreenPenalty
 from gym.wrappers import FrameStack, GrayScaleObservation, Monitor
 
 
@@ -17,9 +19,12 @@ def __init__(self, device, seed, stack_frames=4, train=False):
         self.env.seed(seed)
         if not train:
             self.env = Monitor(self.env, './video', force=True)
+        self.env = GreenPenalty(self.env)
         self.env = GrayScaleObservation(self.env)
         self.env = FrameStack(self.env, stack_frames)
         self.env = FrameSkipper(self.env, 4)
+        self.env = EarlyStop(self.env, 100)
+        print(self.env.observation_space)
 
 
     def max_episode_steps(self):
diff --git a/main.py b/main.py
index d8769f8..cc7807e 100644
--- a/main.py
+++ b/main.py
@@ -31,19 +31,21 @@ def train(config):
     trainer.train()
 
     # Let's store a vid with one episode
+    config['train'] = False
     runner = Runner(env, config)
     runner.run()
+    config['train'] = True
 
 # for concurrent runs and logging
-experiment = 'ppo-nm-hp-tuning'
+experiment = 'ppo-nm-hp-tuning-2'
 
 if __name__ == "__main__":
     hyperparams = {
-        'num_epochs': 700,  # Number of training episodes
+        'num_epochs': 2000,  # Number of training episodes
         'num_ppo_epochs': tune.randint(4, 10),
         'mini_batch_size': 128,
         'memory_size': 2000,
-        'eps': 0.2,
+        'eps': tune.quniform(0.1, 0.2, 0.1),
         'c1': tune.quniform(0.5, 2.5, 0.25),  # Value Function coeff
         'c2': tune.quniform(0.01, 0.15, 0.01),  # Entropy coeff
         'lr': 1e-3,  # Learning rate
@@ -59,8 +61,8 @@ def train(config):
     analysis = tune.run(
         train,
         metric='running_reward',
-        mode='min',
-        num_samples=20,
-        resources_per_trial={"cpu": 0.5, "gpu": 0.3},
+        mode='max',
+        num_samples=15,
+        resources_per_trial={"cpu": 0.4, "gpu": 0.3},
         config=hyperparams,
     )
diff --git a/trainer.py b/trainer.py
index 673b17f..21676c7 100644
--- a/trainer.py
+++ b/trainer.py
@@ -1,3 +1,4 @@
+from ray import tune
 import numpy as np
 import torch
 import torch.nn as nn
@@ -136,6 +137,7 @@ def policy_update(self, transitions, v_targ, adv, iteration):
     def logging_episode(self, i_episode, ep_reward, running_reward):
         self.writer.add_scalar(f'reward', ep_reward, i_episode)
         self.writer.add_scalar(f'running reward', running_reward, i_episode)
+        tune.report(iterations=i_episode, running_reward=running_reward)
 
     def train(self):
         # Training loop
diff --git a/wrappers/early_stop.py b/wrappers/early_stop.py
index 2667033..3f6ab9b 100644
--- a/wrappers/early_stop.py
+++ b/wrappers/early_stop.py
@@ -21,7 +21,7 @@ def step(self, action):
         self.latest_rewards.append(reward)
         avg = 1
         if self.remaining_steps == 0:
-            avg = np.array(self.latest_rewards).sum() / self.steps
-            if avg > 0:
-                self.remaining_steps = self.steps
+            avg = np.array(self.latest_rewards).mean()
+            self.remaining_steps = self.steps
+            self.latest_rewards = []
         return state, reward, avg < 0, info
diff --git a/wrappers/green_penalty.py b/wrappers/green_penalty.py
new file mode 100644
index 0000000..fef709a
--- /dev/null
+++ b/wrappers/green_penalty.py
@@ -0,0 +1,15 @@
+from gym import Wrapper
+import numpy as np
+
+
+class GreenPenalty(Wrapper):
+    r"""Penalizes the agent when the frame is dominated by green, i.e. the car is driving on grass"""
+
+    def __init__(self, env):
+        super(GreenPenalty, self).__init__(env)
+
+    def step(self, action):
+        state, reward, done, info = self.env.step(action)
+        if np.mean(state[:, :, 1]) > 180.0:
+            reward -= 0.05
+        return state, reward, done, info
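
Review note: a minimal sketch of the wrapper stack this patch produces in `environment.py`. It assumes gym's 4-tuple `step()` API (as used throughout the repo) and `CarRacing-v0` as the underlying environment, which the diff does not name; the wrapper constructor signatures come from `environment.py`, while the random-action loop is purely illustrative. Note that `GreenPenalty` has to sit below `GrayScaleObservation`, because it reads the green channel of the raw RGB frame.

```python
# Sketch only -- assumes CarRacing-v0 and gym's 4-tuple step API.
import gym
from gym.wrappers import FrameStack, GrayScaleObservation

from wrappers.frame_skipper import FrameSkipper
from wrappers.early_stop import EarlyStop
from wrappers.green_penalty import GreenPenalty

env = gym.make('CarRacing-v0')   # assumed env id; 96x96x3 uint8 frames
env = GreenPenalty(env)          # needs raw RGB: state[:, :, 1] is the green channel
env = GrayScaleObservation(env)
env = FrameStack(env, 4)
env = FrameSkipper(env, 4)
env = EarlyStop(env, 100)        # ends the episode when the recent reward window goes negative

state = env.reset()
for _ in range(1000):
    # Random actions, just to exercise the stack; the real actions come from the PPO policy.
    state, reward, done, info = env.step(env.action_space.sample())
    if done:
        break
env.close()
```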
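
Review note on `wrappers/early_stop.py`: only the body of the `remaining_steps == 0` branch appears in the patch, so the sketch below is a hedged reconstruction of how the whole wrapper plausibly behaves after this change. The `__init__`/`reset` bookkeeping and the per-step countdown are assumptions, not code from the repo; only the lines marked "patched" and the final `return` are taken from the diff. The net effect of the patch: the wrapper now averages the rewards of the last window, always restarts the countdown, and clears the buffer, instead of restarting only when the average was positive.

```python
# Sketch only -- assumed shape of the EarlyStop wrapper around the patched step().
from gym import Wrapper
import numpy as np


class EarlyStop(Wrapper):
    r"""Ends the episode once the average reward over the last `steps` steps is negative."""

    def __init__(self, env, steps):
        super(EarlyStop, self).__init__(env)
        self.steps = steps                # size of the evaluation window (assumed)
        self.remaining_steps = steps      # countdown until the next check (assumed)
        self.latest_rewards = []

    def reset(self, **kwargs):
        # Assumed: start each episode with a fresh window.
        self.remaining_steps = self.steps
        self.latest_rewards = []
        return self.env.reset(**kwargs)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        self.remaining_steps -= 1         # assumed countdown, not shown in the diff
        self.latest_rewards.append(reward)
        avg = 1
        if self.remaining_steps == 0:
            avg = np.array(self.latest_rewards).mean()   # patched: mean of the window
            self.remaining_steps = self.steps            # patched: always restart the countdown
            self.latest_rewards = []                     # patched: clear the window
        # As in the repo: the env's own done flag is replaced by the early-stop signal.
        return state, reward, avg < 0, info
```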
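
Review note on the Ray Tune wiring: `logging_episode` now calls `tune.report(...)`, which is what `metric='running_reward', mode='max'` in `tune.run` selects on. Below is a self-contained stand-in for the `trainer.py`/`main.py` pieces using the legacy function-trainable API (ray[tune] 1.x); the dummy objective and the smoothing constant are illustrative only and not the project's trainer.

```python
# Sketch only -- shows how the per-episode tune.report(...) call pairs with
# metric='running_reward', mode='max' in tune.run. The objective is a dummy.
import random
from ray import tune


def train(config):
    running_reward = 0.0
    for i_episode in range(config['num_epochs']):
        ep_reward = config['eps'] + random.uniform(-0.5, 0.5)   # placeholder for a real PPO rollout
        running_reward = 0.05 * ep_reward + 0.95 * running_reward
        # Each report becomes one row of the trial's progress; Tune ranks trials
        # by the reported 'running_reward' because of metric/mode in tune.run.
        tune.report(iterations=i_episode, running_reward=running_reward)


if __name__ == "__main__":
    analysis = tune.run(
        train,
        metric='running_reward',
        mode='max',                  # maximize reward; the old mode='min' ranked trials backwards
        num_samples=15,
        resources_per_trial={"cpu": 0.4},
        config={
            'num_epochs': 20,
            'eps': tune.quniform(0.1, 0.2, 0.1),   # samples 0.1 or 0.2
        },
    )
    print(analysis.best_config)
```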