From 4ce9d7b4a28fe0a69e21aebdfb5f8b4553952bac Mon Sep 17 00:00:00 2001
From: SHILOVA Alena
Date: Fri, 21 Jan 2022 17:01:22 +0100
Subject: [PATCH 01/13] corrected bug in assertion: missing self.env

---
 rlberry/agents/torch/dqn/dqn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py
index 978fc9fd9..fec8f4b82 100644
--- a/rlberry/agents/torch/dqn/dqn.py
+++ b/rlberry/agents/torch/dqn/dqn.py
@@ -135,7 +135,7 @@ def __init__(
         self.double = double

         assert isinstance(
-            env.action_space, spaces.Discrete
+            self.env.action_space, spaces.Discrete
         ), "Only compatible with Discrete action spaces."

         self.prioritized_replay = prioritized_replay

From e2301521d9cc1bff400e0f87e2fe6d3b6db3fe8a Mon Sep 17 00:00:00 2001
From: riccardo
Date: Wed, 16 Feb 2022 16:15:23 +0100
Subject: [PATCH 02/13] blacked main

---
 docs/conf.py | 50 +--
 examples/demo_agents/video_plot_a2c.py | 7 +-
 examples/demo_agents/video_plot_dqn.py | 6 +-
 examples/demo_agents/video_plot_ppo.py | 8 +-
 .../demo_agents/video_plot_rs_kernel_ucbvi.py | 14 +-
 examples/demo_agents/video_plot_rsucbvi.py | 3 +-
 examples/demo_agents/video_plot_vi.py | 1 +
 examples/demo_env/video_plot_acrobot.py | 8 +-
 examples/demo_env/video_plot_gridworld.py | 1 -
 examples/demo_env/video_plot_pball.py | 31 +-
 examples/demo_env/video_plot_rooms.py | 12 +-
 examples/demo_examples/demo_adaptiveql.py | 48 ++-
 examples/demo_examples/demo_agent_manager.py | 36 +-
 .../demo_examples/demo_agent_manager_save.py | 21 +-
 .../demo_agent_manager_set_writer.py | 17 +-
 examples/demo_examples/demo_avecppo.py | 8 +-
 examples/demo_examples/demo_experiment/run.py | 6 +-
 .../demo_from_stable_baselines.py | 92 ++---
 .../demo_from_stable_baselines_atari.py | 78 ++--
 examples/demo_examples/demo_gym_wrapper.py | 5 +-
 .../demo_examples/demo_hyperparam_optim.py | 29 +-
 examples/demo_examples/demo_jax_dqn.py | 28 +-
 examples/demo_examples/demo_lsvi_ucb.py | 50 +--
 .../demo_examples/demo_network/run_client.py | 13 +-
 .../demo_network/run_remote_manager.py | 28 +-
 .../demo_examples/demo_network/run_server.py | 29 +-
 examples/demo_examples/demo_ppo_benchmark.py | 32 +-
 examples/demo_examples/demo_ppo_bonus.py | 71 ++--
 .../demo_examples/demo_ppo_partial_fit.py | 39 +-
 examples/demo_examples/demo_rnd.py | 6 +-
 examples/demo_examples/demo_seeding.py | 2 +-
 .../demo_examples/demo_ucbvi_and_opqtl.py | 45 ++-
 examples/demo_examples/demo_vis2d.py | 18 +-
 examples/plot_agent_manager.py | 52 ++-
 examples/plot_kernels.py | 4 +-
 rlberry/__init__.py | 2 +-
 rlberry/agents/adaptiveql/adaptiveql.py | 37 +-
 rlberry/agents/adaptiveql/tree.py | 36 +-
 rlberry/agents/agent.py | 52 +--
 rlberry/agents/dynprog/value_iteration.py | 18 +-
 rlberry/agents/jax/dqn/dqn.py | 192 +++++-----
 rlberry/agents/jax/nets/common.py | 5 +-
 rlberry/agents/jax/tests/old_test_tqn.py | 9 +-
 rlberry/agents/jax/utils/replay_buffer.py | 61 ++--
 rlberry/agents/kernel_based/common.py | 30 +-
 .../agents/kernel_based/rs_kernel_ucbvi.py | 186 ++++----
 rlberry/agents/kernel_based/rs_ucbvi.py | 111 +++---
 rlberry/agents/linear/lsvi_ucb.py | 123 ++++---
 rlberry/agents/mbqvi/mbqvi.py | 48 ++-
 rlberry/agents/optql/optql.py | 59 +--
 rlberry/agents/tests/test_dynprog.py | 68 ++--
 rlberry/agents/tests/test_kernel_based.py | 41 +--
 rlberry/agents/tests/test_lsvi_ucb.py | 66 ++--
 rlberry/agents/tests/test_optql.py | 5 +-
 rlberry/agents/tests/test_ucbvi.py | 38 +-
 rlberry/agents/torch/a2c/a2c.py | 110 +++---
 rlberry/agents/torch/avec/avec_ppo.py | 129 ++++---
rlberry/agents/torch/dqn/dqn.py | 295 ++++++++------- rlberry/agents/torch/dqn/exploration.py | 39 +- rlberry/agents/torch/ppo/ppo.py | 178 +++++---- rlberry/agents/torch/reinforce/reinforce.py | 70 ++-- .../torch/tests/test_actor_critic_algos.py | 129 +++---- rlberry/agents/torch/tests/test_dqn.py | 48 +-- rlberry/agents/torch/tests/test_reinforce.py | 36 +- .../agents/torch/tests/test_torch_models.py | 29 +- .../agents/torch/tests/test_torch_training.py | 31 +- .../agents/torch/utils/attention_models.py | 160 ++++---- rlberry/agents/torch/utils/models.py | 164 +++++---- rlberry/agents/torch/utils/training.py | 12 +- rlberry/agents/ucbvi/ucbvi.py | 72 ++-- rlberry/agents/ucbvi/utils.py | 18 +- rlberry/agents/utils/memories.py | 46 +-- rlberry/colab_utils/display_setup.py | 13 +- rlberry/envs/basewrapper.py | 11 +- .../benchmarks/ball_exploration/ball2d.py | 82 +++-- .../envs/benchmarks/ball_exploration/pball.py | 149 +++++--- .../benchmarks/generalization/twinrooms.py | 19 +- .../benchmarks/grid_exploration/apple_gold.py | 27 +- .../benchmarks/grid_exploration/four_room.py | 26 +- .../envs/benchmarks/grid_exploration/nroom.py | 89 +++-- .../benchmarks/grid_exploration/six_room.py | 21 +- .../envs/bullet3/pybullet_envs/__init__.py | 16 +- .../pybullet_envs/gym_pendulum_envs.py | 11 +- .../envs/bullet3/pybullet_envs/robot_bases.py | 118 ++++-- .../bullet3/pybullet_envs/robot_pendula.py | 15 +- rlberry/envs/classic_control/acrobot.py | 89 +++-- rlberry/envs/classic_control/mountain_car.py | 31 +- rlberry/envs/classic_control/pendulum.py | 44 ++- rlberry/envs/finite/finite_mdp.py | 14 +- rlberry/envs/finite/gridworld.py | 121 +++--- rlberry/envs/gym_make.py | 2 + rlberry/envs/interface/model.py | 17 +- rlberry/envs/tests/test_env_seeding.py | 6 +- rlberry/envs/tests/test_gym_env_seeding.py | 6 +- rlberry/envs/tests/test_instantiation.py | 99 ++--- rlberry/experiment/generator.py | 13 +- rlberry/experiment/load_results.py | 36 +- .../tests/old_test_experiment_generator.py | 25 +- rlberry/experiment/yaml_utils.py | 39 +- rlberry/exploration_tools/discrete_counter.py | 26 +- .../online_discretization_counter.py | 84 +++-- .../tests/test_discrete_counter.py | 29 +- rlberry/exploration_tools/torch/rnd.py | 144 +++++--- .../exploration_tools/torch/tests/test_rnd.py | 3 +- rlberry/exploration_tools/typing.py | 8 +- .../uncertainty_estimator.py | 7 +- rlberry/manager/agent_manager.py | 343 ++++++++++-------- rlberry/manager/evaluation.py | 66 ++-- rlberry/manager/multiple_managers.py | 8 +- rlberry/manager/remote_agent_manager.py | 62 ++-- rlberry/manager/tests/test_agent_manager.py | 103 ++++-- .../tests/test_agent_manager_seeding.py | 36 +- .../manager/tests/test_hyperparam_optim.py | 130 +++---- rlberry/manager/utils.py | 2 +- rlberry/metadata_utils.py | 7 +- rlberry/network/client.py | 11 +- rlberry/network/interface.py | 51 +-- rlberry/network/server.py | 76 ++-- rlberry/network/server_utils.py | 59 +-- rlberry/network/utils.py | 45 ++- rlberry/rendering/opengl_render2d.py | 30 +- rlberry/rendering/pygame_render2d.py | 22 +- rlberry/rendering/render_interface.py | 9 +- .../tests/test_rendering_interface.py | 10 +- rlberry/rendering/utils.py | 35 +- rlberry/seeding/tests/test_seeding.py | 8 +- rlberry/seeding/tests/test_threads.py | 4 +- rlberry/seeding/tests/test_threads_torch.py | 4 +- rlberry/spaces/box.py | 32 +- rlberry/spaces/from_gym.py | 37 +- rlberry/spaces/multi_binary.py | 3 +- rlberry/spaces/tests/test_from_gym.py | 97 ++--- rlberry/spaces/tests/test_spaces.py | 97 ++--- 
rlberry/utils/binsearch.py | 7 +- rlberry/utils/io.py | 12 +- rlberry/utils/jit_setup.py | 1 + rlberry/utils/logging.py | 32 +- rlberry/utils/math.py | 6 +- rlberry/utils/space_discretizer.py | 5 +- rlberry/utils/tests/test_binsearch.py | 10 +- rlberry/utils/tests/test_metrics.py | 8 +- rlberry/utils/torch.py | 44 ++- rlberry/utils/writers.py | 95 +++-- rlberry/wrappers/discretize_state.py | 29 +- rlberry/wrappers/gym_utils.py | 5 +- rlberry/wrappers/tests/test_basewrapper.py | 5 +- .../wrappers/tests/test_common_wrappers.py | 27 +- .../tests/test_gym_space_conversion.py | 94 ++--- .../wrappers/tests/test_wrapper_seeding.py | 13 +- .../wrappers/uncertainty_estimator_wrapper.py | 68 ++-- rlberry/wrappers/vis2d.py | 177 +++++---- setup.py | 68 ++-- 152 files changed, 4007 insertions(+), 3177 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 366d4ba5e..4893481f1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,52 +16,54 @@ import sphinx_gallery # noqa -sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../')) +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -project = 'rlberry' -copyright = '2021, rlberry team' -author = 'rlberry team' +project = "rlberry" +copyright = "2021, rlberry team" +author = "rlberry team" # The full version, including alpha/beta/rc tags -release = '0.1' +release = "0.1" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.doctest', - 'sphinx.ext.todo', - 'sphinx.ext.viewcode', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.mathjax', - 'sphinx.ext.autosectionlabel', - 'sphinxcontrib.video', - "numpydoc", - "sphinx_gallery.gen_gallery", - 'myst_parser',] +extensions = [ + "sphinx.ext.doctest", + "sphinx.ext.todo", + "sphinx.ext.viewcode", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.mathjax", + "sphinx.ext.autosectionlabel", + "sphinxcontrib.video", + "numpydoc", + "sphinx_gallery.gen_gallery", + "myst_parser", +] autodoc_default_flags = ["members", "inherited-members"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['templates'] +templates_path = ["templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'themes'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "themes"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # The master toctree document. 
-master_doc = 'index' +master_doc = "index" # Copied from scikit-learn: # For maths, use mathjax by default and svg if NO_MATHJAX env variable is set @@ -81,7 +83,7 @@ # html_theme = "scikit-learn-fork" -html_theme_options = { "mathjax_path": mathjax_path} +html_theme_options = {"mathjax_path": mathjax_path} html_theme_path = ["themes"] @@ -98,6 +100,6 @@ "doc_module": "rlberry", "backreferences_dir": os.path.join("generated"), "reference_url": {"rlberry": None}, - 'matplotlib_animations':True, - 'remove_config_comments': True, + "matplotlib_animations": True, + "remove_config_comments": True, } diff --git a/examples/demo_agents/video_plot_a2c.py b/examples/demo_agents/video_plot_a2c.py index 50a158aec..35a8ba0a4 100644 --- a/examples/demo_agents/video_plot_a2c.py +++ b/examples/demo_agents/video_plot_a2c.py @@ -18,12 +18,7 @@ env = PBall2D() n_episodes = 400 horizon = 256 -agent = A2CAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - k_epochs=4) +agent = A2CAgent(env, horizon=horizon, gamma=0.99, learning_rate=0.001, k_epochs=4) agent.fit(budget=n_episodes) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_dqn.py b/examples/demo_agents/video_plot_dqn.py index 828eeff0c..340c53b10 100644 --- a/examples/demo_agents/video_plot_dqn.py +++ b/examples/demo_agents/video_plot_dqn.py @@ -39,7 +39,11 @@ print(f"Running DQN on {env}") agent.fit(budget=50) -vid = video_recorder.VideoRecorder(env,path="_video/video_plot_dqn.mp4", enabled=True,) +vid = video_recorder.VideoRecorder( + env, + path="_video/video_plot_dqn.mp4", + enabled=True, +) for episode in range(3): done = False diff --git a/examples/demo_agents/video_plot_ppo.py b/examples/demo_agents/video_plot_ppo.py index 0dc2444f3..9ace752be 100644 --- a/examples/demo_agents/video_plot_ppo.py +++ b/examples/demo_agents/video_plot_ppo.py @@ -20,12 +20,8 @@ horizon = 256 agent = PPOAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4) + env, horizon=horizon, gamma=0.99, learning_rate=0.001, eps_clip=0.2, k_epochs=4 +) agent.fit(budget=n_episodes) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_rs_kernel_ucbvi.py b/examples/demo_agents/video_plot_rs_kernel_ucbvi.py index 6afa1c965..0d30d5d1f 100644 --- a/examples/demo_agents/video_plot_rs_kernel_ucbvi.py +++ b/examples/demo_agents/video_plot_rs_kernel_ucbvi.py @@ -19,10 +19,16 @@ # rescake rewards to [0, 1] env = RescaleRewardWrapper(env, (0.0, 1.0)) -agent = RSKernelUCBVIAgent(env, gamma=0.99, horizon=300, - bonus_scale_factor=0.01, - min_dist=0.2, bandwidth=0.05, beta=1.0, - kernel_type="gaussian") +agent = RSKernelUCBVIAgent( + env, + gamma=0.99, + horizon=300, + bonus_scale_factor=0.01, + min_dist=0.2, + bandwidth=0.05, + beta=1.0, + kernel_type="gaussian", +) agent.fit(budget=500) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_rsucbvi.py b/examples/demo_agents/video_plot_rsucbvi.py index 8878f8c83..44dce662d 100644 --- a/examples/demo_agents/video_plot_rsucbvi.py +++ b/examples/demo_agents/video_plot_rsucbvi.py @@ -17,8 +17,7 @@ env = MountainCar() horizon = 170 print("Running RS-UCBVI on %s" % env.name) -agent = RSUCBVIAgent(env, gamma=0.99, horizon=horizon, - bonus_scale_factor=0.1) +agent = RSUCBVIAgent(env, gamma=0.99, horizon=horizon, bonus_scale_factor=0.1) agent.fit(budget=500) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_vi.py b/examples/demo_agents/video_plot_vi.py index 2e7eb5ead..2065e9660 100644 --- 
a/examples/demo_agents/video_plot_vi.py +++ b/examples/demo_agents/video_plot_vi.py @@ -13,6 +13,7 @@ from rlberry.agents.dynprog import ValueIterationAgent from rlberry.envs.finite import Chain + env = Chain() agent = ValueIterationAgent(env, gamma=0.95) info = agent.fit() diff --git a/examples/demo_env/video_plot_acrobot.py b/examples/demo_env/video_plot_acrobot.py index d22c28b87..7a4129985 100644 --- a/examples/demo_env/video_plot_acrobot.py +++ b/examples/demo_env/video_plot_acrobot.py @@ -20,15 +20,13 @@ env = RescaleRewardWrapper(env, (0.0, 1.0)) n_episodes = 300 agent = RSUCBVIAgent( - env, - gamma=0.99, - horizon=300, - bonus_scale_factor=0.01, min_dist=0.25) + env, gamma=0.99, horizon=300, bonus_scale_factor=0.01, min_dist=0.25 +) agent.fit(budget=n_episodes) env.enable_rendering() state = env.reset() -for tt in range(2*agent.horizon): +for tt in range(2 * agent.horizon): action = agent.policy(state) next_state, reward, done, _ = env.step(action) state = next_state diff --git a/examples/demo_env/video_plot_gridworld.py b/examples/demo_env/video_plot_gridworld.py index c7d18e452..fe5e23c45 100644 --- a/examples/demo_env/video_plot_gridworld.py +++ b/examples/demo_env/video_plot_gridworld.py @@ -14,7 +14,6 @@ from rlberry.envs.finite import GridWorld - env = GridWorld(7, 10, walls=((2, 2), (3, 3))) agent = ValueIterationAgent(env, gamma=0.95) info = agent.fit() diff --git a/examples/demo_env/video_plot_pball.py b/examples/demo_env/video_plot_pball.py index cc0b85df2..af6c7c637 100644 --- a/examples/demo_env/video_plot_pball.py +++ b/examples/demo_env/video_plot_pball.py @@ -14,11 +14,7 @@ from rlberry.envs.benchmarks.ball_exploration import PBall2D p = 5 -A = np.array([ - [1.0, 0.1], - [-0.1, 1.0] -] -) +A = np.array([[1.0, 0.1], [-0.1, 1.0]]) reward_amplitudes = np.array([1.0, 0.5, 0.5]) reward_smoothness = np.array([0.25, 0.25, 0.25]) @@ -26,19 +22,24 @@ reward_centers = [ np.array([0.75 * np.cos(np.pi / 2), 0.75 * np.sin(np.pi / 2)]), np.array([0.75 * np.cos(np.pi / 6), 0.75 * np.sin(np.pi / 6)]), - np.array([0.75 * np.cos(5 * np.pi / 6), 0.75 * np.sin(5 * np.pi / 6)]) + np.array([0.75 * np.cos(5 * np.pi / 6), 0.75 * np.sin(5 * np.pi / 6)]), ] -action_list = [0.1 * np.array([1, 0]), - -0.1 * np.array([1, 0]), - 0.1 * np.array([0, 1]), - -0.1 * np.array([0, 1])] +action_list = [ + 0.1 * np.array([1, 0]), + -0.1 * np.array([1, 0]), + 0.1 * np.array([0, 1]), + -0.1 * np.array([0, 1]), +] -env = PBall2D(p=p, A=A, - reward_amplitudes=reward_amplitudes, - reward_centers=reward_centers, - reward_smoothness=reward_smoothness, - action_list=action_list) +env = PBall2D( + p=p, + A=A, + reward_amplitudes=reward_amplitudes, + reward_centers=reward_centers, + reward_smoothness=reward_smoothness, + action_list=action_list, +) env.enable_rendering() diff --git a/examples/demo_env/video_plot_rooms.py b/examples/demo_env/video_plot_rooms.py index 39e43d100..163ede1c6 100644 --- a/examples/demo_env/video_plot_rooms.py +++ b/examples/demo_env/video_plot_rooms.py @@ -13,11 +13,13 @@ from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom from rlberry.agents.dynprog import ValueIterationAgent -env = NRoom(nrooms=9, - remove_walls=False, - room_size=9, - initial_state_distribution='center', - include_traps=True) +env = NRoom( + nrooms=9, + remove_walls=False, + room_size=9, + initial_state_distribution="center", + include_traps=True, +) horizon = env.observation_space.n agent = ValueIterationAgent(env, gamma=0.999, horizon=horizon) diff --git a/examples/demo_examples/demo_adaptiveql.py 
b/examples/demo_examples/demo_adaptiveql.py index a314ff042..3493af262 100644 --- a/examples/demo_examples/demo_adaptiveql.py +++ b/examples/demo_examples/demo_adaptiveql.py @@ -7,29 +7,30 @@ from rlberry.agents import AdaptiveQLAgent from rlberry.agents import RSUCBVIAgent from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.manager import MultipleManagers, AgentManager, plot_writer_data, evaluate_agents +from rlberry.manager import ( + MultipleManagers, + AgentManager, + plot_writer_data, + evaluate_agents, +) import matplotlib.pyplot as plt -if __name__ == '__main__': +if __name__ == "__main__": env = (get_benchmark_env, dict(level=2)) N_EP = 1000 HORIZON = 30 params = {} - params['adaql'] = { - 'horizon': HORIZON, - 'gamma': 1.0, - 'bonus_scale_factor': 1.0 - } + params["adaql"] = {"horizon": HORIZON, "gamma": 1.0, "bonus_scale_factor": 1.0} - params['rsucbvi'] = { - 'horizon': HORIZON, - 'gamma': 1.0, - 'bonus_scale_factor': 1.0, - 'min_dist': 0.05, - 'max_repr': 800 + params["rsucbvi"] = { + "horizon": HORIZON, + "gamma": 1.0, + "bonus_scale_factor": 1.0, + "min_dist": 0.05, + "max_repr": 800, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -40,26 +41,33 @@ AdaptiveQLAgent, env, fit_budget=N_EP, - init_kwargs=params['adaql'], + init_kwargs=params["adaql"], eval_kwargs=eval_kwargs, n_fit=4, - output_dir='dev/examples/') + output_dir="dev/examples/", + ) ) multimanagers.append( AgentManager( RSUCBVIAgent, env, fit_budget=N_EP, - init_kwargs=params['rsucbvi'], n_fit=2, - output_dir='dev/examples/') + init_kwargs=params["rsucbvi"], + n_fit=2, + output_dir="dev/examples/", + ) ) multimanagers.run(save=False) evaluate_agents(multimanagers.managers) - plot_writer_data(multimanagers.managers, tag='episode_rewards', - preprocess_func=np.cumsum, title='Cumulative Rewards') + plot_writer_data( + multimanagers.managers, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + ) for stats in multimanagers.managers: agent = stats.get_agent_instances()[0] @@ -70,4 +78,4 @@ plt.show() for stats in multimanagers.managers: - print(f'Agent = {stats.agent_name}, Eval = {stats.eval_agents()}') + print(f"Agent = {stats.agent_name}, Eval = {stats.eval_agents()}") diff --git a/examples/demo_examples/demo_agent_manager.py b/examples/demo_examples/demo_agent_manager.py index 74fce9ea8..7d3ee9273 100644 --- a/examples/demo_examples/demo_agent_manager.py +++ b/examples/demo_examples/demo_agent_manager.py @@ -11,7 +11,7 @@ from rlberry.seeding import set_external_seed -if __name__ == '__main__': +if __name__ == "__main__": set_external_seed(123) # -------------------------------- @@ -46,9 +46,7 @@ "kernel_type": "gaussian", } - params_a2c = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params_a2c = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -67,7 +65,8 @@ default_writer_kwargs=dict( maxlen=N_EPISODES - 10, log_interval=5.0, - )) + ), + ) rskernel_stats = AgentManager( RSKernelUCBVIAgent, train_env, @@ -76,7 +75,8 @@ eval_kwargs=eval_kwargs, n_fit=4, seed=123, - enable_tensorboard=True) + enable_tensorboard=True, + ) a2c_stats = AgentManager( A2CAgent, train_env, @@ -85,7 +85,8 @@ eval_kwargs=eval_kwargs, n_fit=4, seed=123, - parallelization='process') + parallelization="process", + ) agent_manager_list = [rsucbvi_stats, rskernel_stats, a2c_stats] @@ -96,16 +97,17 @@ rsucbvi_stats.fit(budget=50) # learning curves - 
plot_writer_data(agent_manager_list, - tag='episode_rewards', - preprocess_func=np.cumsum, - title='cumulative rewards', - show=False) - - plot_writer_data(agent_manager_list, - tag='episode_rewards', - title='episode rewards', - show=False) + plot_writer_data( + agent_manager_list, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="cumulative rewards", + show=False, + ) + + plot_writer_data( + agent_manager_list, tag="episode_rewards", title="episode rewards", show=False + ) # compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_agent_manager_save.py b/examples/demo_examples/demo_agent_manager_save.py index c9eb49155..76439d43d 100644 --- a/examples/demo_examples/demo_agent_manager_save.py +++ b/examples/demo_examples/demo_agent_manager_save.py @@ -9,7 +9,7 @@ from rlberry.manager import AgentManager, plot_writer_data, evaluate_agents -if __name__ == '__main__': +if __name__ == "__main__": # -------------------------------- # Define train and evaluation envs # -------------------------------- @@ -25,9 +25,7 @@ BONUS_SCALE_FACTOR = 0.1 MIN_DIST = 0.1 - params_ppo = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -41,8 +39,9 @@ init_kwargs=params_ppo, eval_kwargs=eval_kwargs, n_fit=4, - output_dir='dev/', - parallelization='process') + output_dir="dev/", + parallelization="process", + ) ppo_stats.fit() # fit the 4 agents ppo_stats_fname = ppo_stats.save() del ppo_stats @@ -53,9 +52,13 @@ ppo_stats = AgentManager.load(ppo_stats_fname) # learning curves - plot_writer_data(ppo_stats, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) + plot_writer_data( + ppo_stats, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, + ) # compare final policies output = evaluate_agents([ppo_stats], n_simulations=15) diff --git a/examples/demo_examples/demo_agent_manager_set_writer.py b/examples/demo_examples/demo_agent_manager_set_writer.py index 017972976..8fea389b7 100644 --- a/examples/demo_examples/demo_agent_manager_set_writer.py +++ b/examples/demo_examples/demo_agent_manager_set_writer.py @@ -8,7 +8,7 @@ from rlberry.manager import AgentManager, evaluate_agents from torch.utils.tensorboard import SummaryWriter -if __name__ == '__main__': +if __name__ == "__main__": # -------------------------------- # Define training env # -------------------------------- @@ -21,9 +21,7 @@ GAMMA = 0.99 HORIZON = 50 - params_ppo = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -36,15 +34,18 @@ fit_budget=N_EPISODES, init_kwargs=params_ppo, eval_kwargs=eval_kwargs, - n_fit=4) + n_fit=4, + ) - ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={'comment': 'worker_0'}) - ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={'comment': 'worker_1'}) + ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={"comment": "worker_0"}) + ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={"comment": "worker_1"}) agent_manager_list = [ppo_stats] agent_manager_list[0].fit() - agent_manager_list[0].save() # after fit, writers are set to None to avoid pickle problems. + agent_manager_list[ + 0 + ].save() # after fit, writers are set to None to avoid pickle problems. 
# compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_avecppo.py b/examples/demo_examples/demo_avecppo.py index 15b4afca6..1a745b0fd 100644 --- a/examples/demo_examples/demo_avecppo.py +++ b/examples/demo_examples/demo_avecppo.py @@ -11,12 +11,8 @@ n_episodes = 400 horizon = 256 agent = AVECPPOAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.00025, - eps_clip=0.2, - k_epochs=4) + env, horizon=horizon, gamma=0.99, learning_rate=0.00025, eps_clip=0.2, k_epochs=4 +) agent.fit(budget=n_episodes) env.enable_rendering() diff --git a/examples/demo_examples/demo_experiment/run.py b/examples/demo_examples/demo_experiment/run.py index 5f2d38979..38457401f 100644 --- a/examples/demo_examples/demo_experiment/run.py +++ b/examples/demo_examples/demo_experiment/run.py @@ -18,7 +18,7 @@ from rlberry.manager.multiple_managers import MultipleManagers -if __name__ == '__main__': +if __name__ == "__main__": multimanagers = MultipleManagers() for agent_manager in experiment_generator(): @@ -30,10 +30,10 @@ # Reading the results del multimanagers - data = load_experiment_results('results', 'params_experiment') + data = load_experiment_results("results", "params_experiment") print(data) # Fit one of the managers for a few more episodes # If tensorboard is enabled, you should see more episodes ran for 'rsucbvi_alternative' - data['manager']['rsucbvi_alternative'].fit(50) + data["manager"]["rsucbvi_alternative"].fit(50) diff --git a/examples/demo_examples/demo_from_stable_baselines.py b/examples/demo_examples/demo_from_stable_baselines.py index 870371335..bdddde3b0 100644 --- a/examples/demo_examples/demo_from_stable_baselines.py +++ b/examples/demo_examples/demo_from_stable_baselines.py @@ -9,31 +9,33 @@ class A2CAgent(AgentWithSimplePolicy): - name = 'A2C' - - def __init__(self, - env, - policy, - learning_rate=7e-4, - n_steps: int = 200, - gamma: float = 0.99, - gae_lambda: float = 1.0, - ent_coef: float = 0.0, - vf_coef: float = 0.5, - max_grad_norm: float = 0.5, - rms_prop_eps: float = 1e-5, - use_rms_prop: bool = True, - use_sde: bool = False, - sde_sample_freq: int = -1, - normalize_advantage: bool = False, - tensorboard_log=None, - create_eval_env=False, - policy_kwargs=None, - verbose: int = 0, - seed=None, - device="auto", - _init_setup_model: bool = True, - **kwargs): + name = "A2C" + + def __init__( + self, + env, + policy, + learning_rate=7e-4, + n_steps: int = 200, + gamma: float = 0.99, + gae_lambda: float = 1.0, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + rms_prop_eps: float = 1e-5, + use_rms_prop: bool = True, + use_sde: bool = False, + sde_sample_freq: int = -1, + normalize_advantage: bool = False, + tensorboard_log=None, + create_eval_env=False, + policy_kwargs=None, + verbose: int = 0, + seed=None, + device="auto", + _init_setup_model: bool = True, + **kwargs + ): # init rlberry base class AgentWithSimplePolicy.__init__(self, env, **kwargs) # rlberry accepts tuples (env_constructor, env_kwargs) as env @@ -65,7 +67,8 @@ def __init__(self, verbose, seed, device, - _init_setup_model) + _init_setup_model, + ) def fit(self, budget, **kwargs): self.wrapped.learn(total_timesteps=budget, **kwargs) @@ -79,10 +82,12 @@ def policy(self, observation): # @classmethod def sample_parameters(cls, trial): - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 
0.1) vf_coef = trial.suggest_uniform("vf_coef", 0, 1) - normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True]) + normalize_advantage = trial.suggest_categorical( + "normalize_advantage", [False, True] + ) return dict( learning_rate=learning_rate, ent_coef=ent_coef, @@ -91,12 +96,12 @@ def sample_parameters(cls, trial): ) -if __name__ == '__main__': +if __name__ == "__main__": # # Training one agent # env_ctor = gym_make - env_kwargs = dict(id='CartPole-v1') + env_kwargs = dict(id="CartPole-v1") # env = env_ctor(**env_kwargs) # agent = A2CAgent(env, 'MlpPolicy', verbose=1) # agent.fit(budget=1000) @@ -109,36 +114,39 @@ def sample_parameters(cls, trial): stats = AgentManager( A2CAgent, (env_ctor, env_kwargs), - agent_name='A2C baseline', - init_kwargs=dict(policy='MlpPolicy', verbose=1), + agent_name="A2C baseline", + init_kwargs=dict(policy="MlpPolicy", verbose=1), fit_kwargs=dict(log_interval=1000), fit_budget=2500, eval_kwargs=dict(eval_horizon=400), n_fit=4, - parallelization='process', - output_dir='dev/stable_baselines', - seed=123) + parallelization="process", + output_dir="dev/stable_baselines", + seed=123, + ) stats_alternative = AgentManager( A2CAgent, (env_ctor, env_kwargs), - agent_name='A2C optimized', - init_kwargs=dict(policy='MlpPolicy', verbose=1), + agent_name="A2C optimized", + init_kwargs=dict(policy="MlpPolicy", verbose=1), fit_kwargs=dict(log_interval=1000), fit_budget=2500, eval_kwargs=dict(eval_horizon=400), n_fit=4, - parallelization='process', - output_dir='dev/stable_baselines', - seed=456) + parallelization="process", + output_dir="dev/stable_baselines", + seed=456, + ) # Optimize hyperparams (600 seconds) stats_alternative.optimize_hyperparams( timeout=600, n_optuna_workers=2, n_fit=2, - optuna_parallelization='process', - fit_fraction=1.0) + optuna_parallelization="process", + fit_fraction=1.0, + ) # Fit everything in parallel multimanagers = MultipleManagers() diff --git a/examples/demo_examples/demo_from_stable_baselines_atari.py b/examples/demo_examples/demo_from_stable_baselines_atari.py index a4095bbd1..946ff0351 100644 --- a/examples/demo_examples/demo_from_stable_baselines_atari.py +++ b/examples/demo_examples/demo_from_stable_baselines_atari.py @@ -13,31 +13,33 @@ class A2CAgent(AgentWithSimplePolicy): - name = 'A2C' - - def __init__(self, - env, - policy, - learning_rate=7e-4, - n_steps: int = 5, - gamma: float = 0.99, - gae_lambda: float = 1.0, - ent_coef: float = 0.0, - vf_coef: float = 0.5, - max_grad_norm: float = 0.5, - rms_prop_eps: float = 1e-5, - use_rms_prop: bool = True, - use_sde: bool = False, - sde_sample_freq: int = -1, - normalize_advantage: bool = False, - tensorboard_log=None, - create_eval_env=False, - policy_kwargs=None, - verbose: int = 0, - seed=None, - device="auto", - _init_setup_model: bool = True, - **kwargs): + name = "A2C" + + def __init__( + self, + env, + policy, + learning_rate=7e-4, + n_steps: int = 5, + gamma: float = 0.99, + gae_lambda: float = 1.0, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + rms_prop_eps: float = 1e-5, + use_rms_prop: bool = True, + use_sde: bool = False, + sde_sample_freq: int = -1, + normalize_advantage: bool = False, + tensorboard_log=None, + create_eval_env=False, + policy_kwargs=None, + verbose: int = 0, + seed=None, + device="auto", + _init_setup_model: bool = True, + **kwargs + ): # init rlberry base class AgentWithSimplePolicy.__init__(self, env, **kwargs) # rlberry accepts tuples (env_constructor, env_kwargs) as env @@ -69,7 
+71,8 @@ def __init__(self, verbose, seed, device, - _init_setup_model) + _init_setup_model, + ) def fit(self, budget): self.wrapped.learn(total_timesteps=budget) @@ -83,7 +86,7 @@ def policy(self, observation): # def save(self, filename): self.wrapped.save(filename) - return Path(filename).with_suffix('.zip') + return Path(filename).with_suffix(".zip") @classmethod def load(cls, filename, **kwargs): @@ -96,16 +99,16 @@ def load(cls, filename, **kwargs): # @classmethod def sample_parameters(cls, trial): - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - return {'learning_rate': learning_rate} + return {"learning_rate": learning_rate} # # Train and eval env constructors # def env_constructor(n_envs=4): - env = make_atari_env('MontezumaRevenge-v0', n_envs=n_envs) + env = make_atari_env("MontezumaRevenge-v0", n_envs=n_envs) env = VecFrameStack(env, n_stack=4) return env @@ -114,7 +117,7 @@ def eval_env_constructor(n_envs=1): """ Evaluation should be in a scalar environment. """ - env = make_atari_env('MontezumaRevenge-v0', n_envs=n_envs) + env = make_atari_env("MontezumaRevenge-v0", n_envs=n_envs) env = VecFrameStack(env, n_stack=4) env = ScalarizeEnvWrapper(env) return env @@ -125,7 +128,7 @@ def eval_env_constructor(n_envs=1): # -if __name__ == '__main__': +if __name__ == "__main__": # # Training several agents and comparing different hyperparams # @@ -135,13 +138,14 @@ def eval_env_constructor(n_envs=1): train_env=(env_constructor, None), eval_env=(eval_env_constructor, None), eval_kwargs=dict(eval_horizon=200), - agent_name='A2C baseline', + agent_name="A2C baseline", fit_budget=5000, - init_kwargs=dict(policy='CnnPolicy', verbose=10), + init_kwargs=dict(policy="CnnPolicy", verbose=10), n_fit=4, - parallelization='process', - output_dir='dev/stable_baselines_atari', - seed=123) + parallelization="process", + output_dir="dev/stable_baselines_atari", + seed=123, + ) stats.fit() stats.optimize_hyperparams(timeout=60, n_fit=2) diff --git a/examples/demo_examples/demo_gym_wrapper.py b/examples/demo_examples/demo_gym_wrapper.py index 8726e47e9..8dd4143ba 100644 --- a/examples/demo_examples/demo_gym_wrapper.py +++ b/examples/demo_examples/demo_gym_wrapper.py @@ -7,14 +7,13 @@ from rlberry.agents import RSUCBVIAgent from rlberry.wrappers import RescaleRewardWrapper -env = gym_make('Acrobot-v1') +env = gym_make("Acrobot-v1") env.reward_range = (-1.0, 0.0) # missing in gym implementation # rescake rewards to [0, 1] env = RescaleRewardWrapper(env, (0.0, 1.0)) -agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, - bonus_scale_factor=0.1, min_dist=0.2) +agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, bonus_scale_factor=0.1, min_dist=0.2) agent.fit(budget=10) state = env.reset() diff --git a/examples/demo_examples/demo_hyperparam_optim.py b/examples/demo_examples/demo_hyperparam_optim.py index 6f2c3d681..f00b2939c 100644 --- a/examples/demo_examples/demo_hyperparam_optim.py +++ b/examples/demo_examples/demo_hyperparam_optim.py @@ -7,7 +7,7 @@ from rlberry.agents.torch import REINFORCEAgent from rlberry.manager import AgentManager -if __name__ == '__main__': +if __name__ == "__main__": # -------------------------------- # Define train and evaluation envs # -------------------------------- @@ -22,9 +22,7 @@ BONUS_SCALE_FACTOR = 0.1 MIN_DIST = 0.1 - params = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = 
dict(eval_horizon=HORIZON, n_simulations=20) @@ -32,17 +30,22 @@ # Run AgentManager and save results # -------------------------------- manager = AgentManager( - REINFORCEAgent, train_env, fit_budget=N_EPISODES, + REINFORCEAgent, + train_env, + fit_budget=N_EPISODES, init_kwargs=params, eval_kwargs=eval_kwargs, - n_fit=4) + n_fit=4, + ) # hyperparam optim with multiple threads manager.optimize_hyperparams( - n_trials=5, timeout=None, + n_trials=5, + timeout=None, n_fit=2, - sampler_method='optuna_default', - optuna_parallelization='thread') + sampler_method="optuna_default", + optuna_parallelization="thread", + ) initial_n_trials = len(manager.optuna_study.trials) @@ -55,11 +58,13 @@ # continue previous optimization, now with 120s of timeout and multiprocessing manager.optimize_hyperparams( - n_trials=512, timeout=120, + n_trials=512, + timeout=120, n_fit=8, continue_previous=True, - optuna_parallelization='process', - n_optuna_workers=4) + optuna_parallelization="process", + n_optuna_workers=4, + ) print("number of initial trials = ", initial_n_trials) print("number of trials after continuing= ", len(manager.optuna_study.trials)) diff --git a/examples/demo_examples/demo_jax_dqn.py b/examples/demo_examples/demo_jax_dqn.py index 7afc33598..76a001650 100644 --- a/examples/demo_examples/demo_jax_dqn.py +++ b/examples/demo_examples/demo_jax_dqn.py @@ -8,13 +8,13 @@ from rlberry.envs import gym_make from rlberry.manager import AgentManager, MultipleManagers, plot_writer_data -if __name__ == '__main__': +if __name__ == "__main__": # global params fit_budget = 10000 n_fit = 2 # env and algorithm params - env = (gym_make, dict(id='CartPole-v0')) + env = (gym_make, dict(id="CartPole-v0")) params = dict( chunk_size=8, batch_size=64, @@ -25,17 +25,15 @@ learning_rate=0.0015, net_constructor=nets.MLPQNetwork, net_kwargs=dict( - num_actions=env[0](**env[1]).action_space.n, - hidden_sizes=(64, 64) - ) + num_actions=env[0](**env[1]).action_space.n, hidden_sizes=(64, 64) + ), ) params_alternative = params.copy() params_alternative.update( dict( net_kwargs=dict( - num_actions=env[0](**env[1]).action_space.n, - hidden_sizes=(16, 16) + num_actions=env[0](**env[1]).action_space.n, hidden_sizes=(16, 16) ) ) ) @@ -47,8 +45,8 @@ eval_env=env, init_kwargs=params, n_fit=n_fit, - parallelization='process', - agent_name='dqn', + parallelization="process", + agent_name="dqn", ) stats_alternative = AgentManager( @@ -58,8 +56,8 @@ eval_env=env, init_kwargs=params_alternative, n_fit=n_fit, - parallelization='process', - agent_name='dqn_smaller_net' + parallelization="process", + agent_name="dqn_smaller_net", ) # fit everything in parallel @@ -68,10 +66,10 @@ multimanagers.append(stats_alternative) multimanagers.run() - plot_writer_data(multimanagers.managers, tag='episode_rewards', show=False) - plot_writer_data(multimanagers.managers, tag='dw_time_elapsed', show=False) - plot_writer_data(multimanagers.managers, tag='eval_rewards', show=False) - plot_writer_data(multimanagers.managers, tag='q_loss') + plot_writer_data(multimanagers.managers, tag="episode_rewards", show=False) + plot_writer_data(multimanagers.managers, tag="dw_time_elapsed", show=False) + plot_writer_data(multimanagers.managers, tag="eval_rewards", show=False) + plot_writer_data(multimanagers.managers, tag="q_loss") stats.save() stats.clear_output_dir() diff --git a/examples/demo_examples/demo_lsvi_ucb.py b/examples/demo_examples/demo_lsvi_ucb.py index 43289df0c..5d988c659 100644 --- a/examples/demo_examples/demo_lsvi_ucb.py +++ 
b/examples/demo_examples/demo_lsvi_ucb.py @@ -16,7 +16,9 @@ class GridWorldFeatureMap(FeatureMap): - def __init__(self, n_states, n_actions, n_rows, n_cols, index2coord, dim=15, sigma=0.25): + def __init__( + self, n_states, n_actions, n_rows, n_cols, index2coord, dim=15, sigma=0.25 + ): self.index2coord = index2coord self.n_states = n_states self.n_actions = n_actions @@ -35,7 +37,7 @@ def __init__(self, n_states, n_actions, n_rows, n_cols, index2coord, dim=15, sig x_jj = row_jj / n_rows y_jj = col_jj / n_cols dist = np.sqrt((x_jj - x_ii) ** 2.0 + (y_jj - y_ii) ** 2.0) - sim_matrix[ii, jj] = np.exp(-(dist / sigma) ** 2.0) + sim_matrix[ii, jj] = np.exp(-((dist / sigma) ** 2.0)) # factorize similarity matrix to obtain features uu, ss, vh = np.linalg.svd(sim_matrix, hermitian=True) @@ -54,16 +56,17 @@ def feature_map_fn(env): env.action_space.n, env.nrows, env.ncols, - env.index2coord) + env.index2coord, + ) -if __name__ == '__main__': +if __name__ == "__main__": # Parameters n_episodes = 750 horizon = 10 gamma = 0.99 eval_kwargs = dict(eval_horizon=10) - parallelization = 'process' + parallelization = "process" # Define environment (constructor, kwargs) env = (GridWorld, dict(nrows=5, ncols=5, walls=(), success_probability=0.95)) @@ -72,7 +75,7 @@ def feature_map_fn(env): feature_map_fn=feature_map_fn, horizon=horizon, bonus_scale_factor=0.01, - gamma=gamma + gamma=gamma, ) params_ucbvi = dict( @@ -80,20 +83,17 @@ def feature_map_fn(env): gamma=gamma, real_time_dp=False, stage_dependent=False, - bonus_scale_factor=0.01 + bonus_scale_factor=0.01, ) params_greedy = dict( feature_map_fn=feature_map_fn, horizon=horizon, bonus_scale_factor=0.0, - gamma=gamma + gamma=gamma, ) - params_oracle = dict( - horizon=horizon, - gamma=gamma - ) + params_oracle = dict(horizon=horizon, gamma=gamma) stats = AgentManager( LSVIUCBAgent, @@ -102,7 +102,8 @@ def feature_map_fn(env): fit_budget=n_episodes, eval_kwargs=eval_kwargs, n_fit=4, - parallelization=parallelization) + parallelization=parallelization, + ) # UCBVI baseline stats_ucbvi = AgentManager( @@ -112,7 +113,8 @@ def feature_map_fn(env): fit_budget=n_episodes, eval_kwargs=eval_kwargs, n_fit=4, - parallelization=parallelization) + parallelization=parallelization, + ) # Random exploration baseline stats_random = AgentManager( @@ -122,8 +124,9 @@ def feature_map_fn(env): fit_budget=n_episodes, eval_kwargs=eval_kwargs, n_fit=4, - agent_name='LSVI (random exploration)', - parallelization=parallelization) + agent_name="LSVI (random exploration)", + parallelization=parallelization, + ) # Oracle (optimal policy) oracle_stats = AgentManager( @@ -132,7 +135,8 @@ def feature_map_fn(env): init_kwargs=params_oracle, fit_budget=n_episodes, eval_kwargs=eval_kwargs, - n_fit=1) + n_fit=1, + ) # fit stats.fit() @@ -143,12 +147,12 @@ def feature_map_fn(env): # visualize results plot_writer_data( [stats, stats_ucbvi, stats_random], - tag='episode_rewards', + tag="episode_rewards", preprocess_func=np.cumsum, - title='Cumulative Rewards', - show=False) + title="Cumulative Rewards", + show=False, + ) plot_writer_data( - [stats, stats_ucbvi, stats_random], - tag='dw_time_elapsed', - show=False) + [stats, stats_ucbvi, stats_random], tag="dw_time_elapsed", show=False + ) evaluate_agents([stats, stats_ucbvi, stats_random, oracle_stats], n_simulations=20) diff --git a/examples/demo_examples/demo_network/run_client.py b/examples/demo_examples/demo_network/run_client.py index 5aa5f391e..88bb6f3e6 100644 --- a/examples/demo_examples/demo_network/run_client.py +++ 
b/examples/demo_examples/demo_network/run_client.py @@ -15,30 +15,29 @@ Message.create( command=interface.Command.AGENT_MANAGER_CREATE_INSTANCE, params=dict( - agent_class=ResourceRequest(name='ValueIterationAgent'), - train_env=ResourceRequest(name='GridWorld', kwargs=dict(nrows=35)), + agent_class=ResourceRequest(name="ValueIterationAgent"), + train_env=ResourceRequest(name="GridWorld", kwargs=dict(nrows=35)), fit_budget=100, init_kwargs=dict(gamma=0.95), eval_kwargs=dict(eval_horizon=100, n_simulations=20), n_fit=2, - seed=10 + seed=10, ), data=None, ), Message.create( - command=interface.Command.LIST_RESOURCES, - params=dict(), - data=dict() + command=interface.Command.LIST_RESOURCES, params=dict(), data=dict() ), print_response=True, ) import numpy as np + client.send( Message.create( command=interface.Command.NONE, params=dict(), - data=dict(big_list=list(1.0 * np.arange(2**8))) + data=dict(big_list=list(1.0 * np.arange(2 ** 8))), ), print_response=True, ) diff --git a/examples/demo_examples/demo_network/run_remote_manager.py b/examples/demo_examples/demo_network/run_remote_manager.py index 427712b60..c4125b176 100644 --- a/examples/demo_examples/demo_network/run_remote_manager.py +++ b/examples/demo_examples/demo_network/run_remote_manager.py @@ -15,7 +15,7 @@ from rlberry.manager.evaluation import evaluate_agents, plot_writer_data -if __name__ == '__main__': +if __name__ == "__main__": port = int(input("Select server port: ")) client = BerryClient(port=port) @@ -23,39 +23,41 @@ local_manager = AgentManager( agent_class=REINFORCEAgent, - train_env=(gym_make, dict(id='CartPole-v0')), + train_env=(gym_make, dict(id="CartPole-v0")), fit_budget=FIT_BUDGET, init_kwargs=dict(gamma=0.99), eval_kwargs=dict(eval_horizon=200, n_simulations=20), n_fit=2, seed=10, - agent_name='REINFORCE(local)', - parallelization='process' + agent_name="REINFORCE(local)", + parallelization="process", ) remote_manager = RemoteAgentManager( client, - agent_class=ResourceRequest(name='REINFORCEAgent'), - train_env=ResourceRequest(name='gym_make', kwargs=dict(id='CartPole-v0')), + agent_class=ResourceRequest(name="REINFORCEAgent"), + train_env=ResourceRequest(name="gym_make", kwargs=dict(id="CartPole-v0")), fit_budget=FIT_BUDGET, init_kwargs=dict(gamma=0.99), eval_kwargs=dict(eval_horizon=200, n_simulations=20), n_fit=3, seed=10, - agent_name='REINFORCE(remote)', - parallelization='process', + agent_name="REINFORCE(remote)", + parallelization="process", enable_tensorboard=True, ) remote_manager.set_writer( idx=0, - writer_fn=ResourceRequest(name='DefaultWriter'), - writer_kwargs=dict(name='debug_reinforce_writer') + writer_fn=ResourceRequest(name="DefaultWriter"), + writer_kwargs=dict(name="debug_reinforce_writer"), ) # Optimize hyperparams of remote agent - best_params = remote_manager.optimize_hyperparams(timeout=60, optuna_parallelization='process') - print(f'best params = {best_params}') + best_params = remote_manager.optimize_hyperparams( + timeout=60, optuna_parallelization="process" + ) + print(f"best params = {best_params}") # Test save/load fname1 = remote_manager.save() @@ -72,7 +74,7 @@ remote_manager.fit(budget=100) # plot - plot_writer_data(mmanagers.managers, tag='episode_rewards', show=False) + plot_writer_data(mmanagers.managers, tag="episode_rewards", show=False) evaluate_agents(mmanagers.managers, n_simulations=10, show=True) # Test some methods diff --git a/examples/demo_examples/demo_network/run_server.py b/examples/demo_examples/demo_network/run_server.py index 368d4f127..2426eb5b0 100644 --- 
a/examples/demo_examples/demo_network/run_server.py +++ b/examples/demo_examples/demo_network/run_server.py @@ -10,33 +10,20 @@ from rlberry.envs import GridWorld, gym_make from rlberry.utils.writers import DefaultWriter -if __name__ == '__main__': +if __name__ == "__main__": port = int(input("Select server port: ")) resources = dict( - GridWorld=ResourceItem( - obj=GridWorld, - description='GridWorld constructor' - ), - gym_make=ResourceItem( - obj=gym_make, - description='gym_make' - ), - REINFORCEAgent=ResourceItem( - obj=REINFORCEAgent, - description='REINFORCEAgent' - ), - A2CAgent=ResourceItem( - obj=A2CAgent, - description='A2CAgent' - ), + GridWorld=ResourceItem(obj=GridWorld, description="GridWorld constructor"), + gym_make=ResourceItem(obj=gym_make, description="gym_make"), + REINFORCEAgent=ResourceItem(obj=REINFORCEAgent, description="REINFORCEAgent"), + A2CAgent=ResourceItem(obj=A2CAgent, description="A2CAgent"), ValueIterationAgent=ResourceItem( obj=ValueIterationAgent, - description='ValueIterationAgent constructor' + ValueIterationAgent.__doc__ + description="ValueIterationAgent constructor" + ValueIterationAgent.__doc__, ), DefaultWriter=ResourceItem( - obj=DefaultWriter, - description='rlberry default writer' - ) + obj=DefaultWriter, description="rlberry default writer" + ), ) server = BerryServer(resources=resources, port=port, client_socket_timeout=120.0) server.start() diff --git a/examples/demo_examples/demo_ppo_benchmark.py b/examples/demo_examples/demo_ppo_benchmark.py index 1765c9cb6..d16d1f750 100644 --- a/examples/demo_examples/demo_ppo_benchmark.py +++ b/examples/demo_examples/demo_ppo_benchmark.py @@ -30,12 +30,10 @@ params_oracle = { "n_samples": 20, # samples per state-action "gamma": GAMMA, - "horizon": HORIZON + "horizon": HORIZON, } -params_ppo = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} +params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -43,15 +41,23 @@ # Run AgentManager # ----------------------------- oracle_stats = AgentManager( - MBQVIAgent, d_train_env, fit_budget=0, + MBQVIAgent, + d_train_env, + fit_budget=0, init_kwargs=params_oracle, eval_kwargs=eval_kwargs, - n_fit=4, agent_name="Oracle") + n_fit=4, + agent_name="Oracle", +) ppo_stats = AgentManager( - PPOAgent, train_env, fit_budget=N_EPISODES, + PPOAgent, + train_env, + fit_budget=N_EPISODES, init_kwargs=params_ppo, eval_kwargs=eval_kwargs, - n_fit=4, agent_name="PPO") + n_fit=4, + agent_name="PPO", +) agent_manager_list = [oracle_stats, ppo_stats] @@ -59,9 +65,13 @@ manager.fit() # learning curves -plot_writer_data(agent_manager_list, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + agent_manager_list, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, +) # compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_ppo_bonus.py b/examples/demo_examples/demo_ppo_bonus.py index 264e71d33..3e16304fb 100644 --- a/examples/demo_examples/demo_ppo_bonus.py +++ b/examples/demo_examples/demo_ppo_bonus.py @@ -16,9 +16,7 @@ def uncertainty_estimator_fn(obs_space, act_space): - counter = DiscreteCounter(obs_space, - act_space, - n_bins_obs=20) + counter = DiscreteCounter(obs_space, act_space, n_bins_obs=20) return counter @@ -32,26 +30,27 @@ def uncertainty_estimator_fn(obs_space, act_space): MIN_DIST = 0.1 params_ppo = { - 'gamma': 
GAMMA, - 'horizon': HORIZON, - 'batch_size': 16, - 'entr_coef': 8e-7, - 'k_epochs': 10, - 'eps_clip': 0.2, - 'learning_rate': 0.03 + "gamma": GAMMA, + "horizon": HORIZON, + "batch_size": 16, + "entr_coef": 8e-7, + "k_epochs": 10, + "eps_clip": 0.2, + "learning_rate": 0.03, } params_ppo_bonus = { - 'gamma': GAMMA, - 'horizon': HORIZON, - 'batch_size': 16, - 'entr_coef': 8e-7, - 'k_epochs': 10, - 'eps_clip': 0.2, - 'learning_rate': 0.03, - 'use_bonus': True, - 'uncertainty_estimator_kwargs': { - 'uncertainty_estimator_fn': uncertainty_estimator_fn} + "gamma": GAMMA, + "horizon": HORIZON, + "batch_size": 16, + "entr_coef": 8e-7, + "k_epochs": 10, + "eps_clip": 0.2, + "learning_rate": 0.03, + "use_bonus": True, + "uncertainty_estimator_kwargs": { + "uncertainty_estimator_fn": uncertainty_estimator_fn + }, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -60,13 +59,23 @@ def uncertainty_estimator_fn(obs_space, act_space): # Run AgentManager # ----------------------------- ppo_stats = AgentManager( - PPOAgent, env, fit_budget=N_EPISODES, - init_kwargs=params_ppo, eval_kwargs=eval_kwargs, - n_fit=4, agent_name='PPO') + PPOAgent, + env, + fit_budget=N_EPISODES, + init_kwargs=params_ppo, + eval_kwargs=eval_kwargs, + n_fit=4, + agent_name="PPO", +) ppo_bonus_stats = AgentManager( - PPOAgent, env, fit_budget=N_EPISODES, - init_kwargs=params_ppo_bonus, eval_kwargs=eval_kwargs, - n_fit=4, agent_name='PPO-Bonus') + PPOAgent, + env, + fit_budget=N_EPISODES, + init_kwargs=params_ppo_bonus, + eval_kwargs=eval_kwargs, + n_fit=4, + agent_name="PPO-Bonus", +) agent_manager_list = [ppo_bonus_stats, ppo_stats] @@ -74,9 +83,13 @@ def uncertainty_estimator_fn(obs_space, act_space): manager.fit() # learning curves -plot_writer_data(agent_manager_list, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + agent_manager_list, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, +) # compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_ppo_partial_fit.py b/examples/demo_examples/demo_ppo_partial_fit.py index 412a578cb..0ad22e10a 100644 --- a/examples/demo_examples/demo_ppo_partial_fit.py +++ b/examples/demo_examples/demo_ppo_partial_fit.py @@ -13,24 +13,37 @@ horizon = 100 ppo_params = {} -ppo_params['horizon'] = 100 -ppo_params['gamma'] = 0.99 -ppo_params['learning_rate'] = 0.001 -ppo_params['eps_clip'] = 0.2 -ppo_params['k_epochs'] = 4 +ppo_params["horizon"] = 100 +ppo_params["gamma"] = 0.99 +ppo_params["learning_rate"] = 0.001 +ppo_params["eps_clip"] = 0.2 +ppo_params["k_epochs"] = 4 eval_kwargs = dict(eval_horizon=horizon, n_simulations=20) ppo_stats = AgentManager( - PPOAgent, env, fit_budget=n_episodes, eval_kwargs=eval_kwargs, - init_kwargs=ppo_params, n_fit=2) + PPOAgent, + env, + fit_budget=n_episodes, + eval_kwargs=eval_kwargs, + init_kwargs=ppo_params, + n_fit=2, +) ppo_stats.fit(n_episodes // 2) -plot_writer_data(ppo_stats, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + ppo_stats, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, +) evaluate_agents([ppo_stats], show=False) ppo_stats.fit(n_episodes // 4) -plot_writer_data(ppo_stats, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + ppo_stats, + tag="episode_rewards", + preprocess_func=np.cumsum, + 
title="Cumulative Rewards", + show=False, +) evaluate_agents([ppo_stats], show=True) diff --git a/examples/demo_examples/demo_rnd.py b/examples/demo_examples/demo_rnd.py index 0b2760f8c..ef6b3c58d 100644 --- a/examples/demo_examples/demo_rnd.py +++ b/examples/demo_examples/demo_rnd.py @@ -15,7 +15,8 @@ env.action_space, learning_rate=0.1, update_period=100, - embedding_dim=2) + embedding_dim=2, +) # Test state = env.reset() @@ -28,5 +29,4 @@ if ii % 500 == 0: state = env.reset() bonus = rnd.measure(state, action) - print("it = {}, bonus = {}, loss = {}" - .format(ii, bonus, rnd.loss.item())) + print("it = {}, bonus = {}, loss = {}".format(ii, bonus, rnd.loss.item())) diff --git a/examples/demo_examples/demo_seeding.py b/examples/demo_examples/demo_seeding.py index eb2d891e0..0f85a2010 100644 --- a/examples/demo_examples/demo_seeding.py +++ b/examples/demo_examples/demo_seeding.py @@ -20,7 +20,7 @@ from rlberry.envs import gym_make from rlberry.agents import RSUCBVIAgent -env = gym_make('MountainCar-v0') +env = gym_make("MountainCar-v0") env.reseed(seeder) agent = RSUCBVIAgent(env) diff --git a/examples/demo_examples/demo_ucbvi_and_opqtl.py b/examples/demo_examples/demo_ucbvi_and_opqtl.py index 8e928a7fa..2e73413a5 100644 --- a/examples/demo_examples/demo_ucbvi_and_opqtl.py +++ b/examples/demo_examples/demo_ucbvi_and_opqtl.py @@ -18,18 +18,18 @@ params = {} -params['ucbvi'] = { - 'horizon': HORIZON, - 'stage_dependent': True, - 'gamma': GAMMA, - 'real_time_dp': True, - 'bonus_scale_factor': 1.0, +params["ucbvi"] = { + "horizon": HORIZON, + "stage_dependent": True, + "gamma": GAMMA, + "real_time_dp": True, + "bonus_scale_factor": 1.0, } -params['optql'] = { - 'horizon': HORIZON, - 'gamma': GAMMA, - 'bonus_scale_factor': 1.0, +params["optql"] = { + "horizon": HORIZON, + "gamma": GAMMA, + "bonus_scale_factor": 1.0, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -37,15 +37,30 @@ multimanagers = MultipleManagers() multimanagers.append( - AgentManager(UCBVIAgent, env, fit_budget=N_EP, init_kwargs=params['ucbvi'], eval_kwargs=eval_kwargs) + AgentManager( + UCBVIAgent, + env, + fit_budget=N_EP, + init_kwargs=params["ucbvi"], + eval_kwargs=eval_kwargs, + ) ) multimanagers.append( - AgentManager(OptQLAgent, env, fit_budget=N_EP, init_kwargs=params['optql'], eval_kwargs=eval_kwargs) + AgentManager( + OptQLAgent, + env, + fit_budget=N_EP, + init_kwargs=params["optql"], + eval_kwargs=eval_kwargs, + ) ) multimanagers.run() -plot_writer_data(multimanagers.managers, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards') +plot_writer_data( + multimanagers.managers, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", +) diff --git a/examples/demo_examples/demo_vis2d.py b/examples/demo_examples/demo_vis2d.py index 0bcb03993..d90ad2f21 100644 --- a/examples/demo_examples/demo_vis2d.py +++ b/examples/demo_examples/demo_vis2d.py @@ -13,15 +13,23 @@ if CHOICE == 0: env = NRoom(nrooms=5, array_observation=False, reward_free=True) - env = Vis2dWrapper(env, n_bins_obs=20, memory_size=100, state_preprocess_fn=get_nroom_state_coord) + env = Vis2dWrapper( + env, n_bins_obs=20, memory_size=100, state_preprocess_fn=get_nroom_state_coord + ) agent = ValueIterationAgent(env.unwrapped, gamma=0.99, horizon=200, copy_env=False) else: env = MountainCar() env = Vis2dWrapper(env, n_bins_obs=20, memory_size=200) - agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, - bonus_scale_factor=0.1, copy_env=False, min_dist=0.1) + agent = RSUCBVIAgent( + env, + 
gamma=0.99, + horizon=200, + bonus_scale_factor=0.1, + copy_env=False, + min_dist=0.1, + ) agent.fit(budget=100) @@ -41,5 +49,7 @@ ylim = None # env.render() -env.plot_trajectories(n_skip=5, dot_scale_factor=15, xlim=xlim, ylim=ylim, dot_size_means='total_visits') +env.plot_trajectories( + n_skip=5, dot_scale_factor=15, xlim=xlim, ylim=ylim, dot_size_means="total_visits" +) env.plot_trajectory_actions(xlim=xlim, ylim=ylim) diff --git a/examples/plot_agent_manager.py b/examples/plot_agent_manager.py index 49e07ccb5..76f324d27 100644 --- a/examples/plot_agent_manager.py +++ b/examples/plot_agent_manager.py @@ -12,25 +12,33 @@ # -> The reward function can be accessed by: env.R[state, action] # -> And the transitions: env.P[state, action, next_state] env_ctor = GridWorld -env_kwargs =dict(nrows=3, ncols=10, - reward_at = {(1,1):0.1, (2, 9):1.0}, - walls=((1,4),(2,4), (1,5)), - success_probability=0.9) +env_kwargs = dict( + nrows=3, + ncols=10, + reward_at={(1, 1): 0.1, (2, 9): 1.0}, + walls=((1, 4), (2, 4), (1, 5)), + success_probability=0.9, +) env = env_ctor(**env_kwargs) - import numpy as np from rlberry.agents import AgentWithSimplePolicy + class ValueIterationAgent(AgentWithSimplePolicy): - name = 'ValueIterationAgent' - def __init__(self, env, gamma=0.99, epsilon=1e-5, **kwargs): # it's important to put **kwargs to ensure compatibility with the base class + name = "ValueIterationAgent" + + def __init__( + self, env, gamma=0.99, epsilon=1e-5, **kwargs + ): # it's important to put **kwargs to ensure compatibility with the base class """ gamma: discount factor episilon: precision of value iteration """ - AgentWithSimplePolicy.__init__(self, env, **kwargs) # self.env is initialized in the base class + AgentWithSimplePolicy.__init__( + self, env, **kwargs + ) # self.env is initialized in the base class self.gamma = gamma self.epsilon = epsilon @@ -48,10 +56,10 @@ def fit(self, budget=None, **kwargs): TQ = np.zeros((S, A)) for ss in range(S): for aa in range(A): - TQ[ss, aa] = env.R[ss, aa] + self.gamma*env.P[ss, aa, :].dot(V) + TQ[ss, aa] = env.R[ss, aa] + self.gamma * env.P[ss, aa, :].dot(V) V = TQ.max(axis=1) - if np.abs(TQ-Q).max() < self.epsilon: + if np.abs(TQ - Q).max() < self.epsilon: break Q = TQ self.Q = Q @@ -59,18 +67,19 @@ def fit(self, budget=None, **kwargs): def policy(self, observation): return self.Q[observation, :].argmax() - @classmethod def sample_parameters(cls, trial): - """ - Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/) - """ - gamma = trial.suggest_categorical('gamma', [0.1, 0.25, 0.5, 0.75, 0.99]) - return {'gamma':gamma} + """ + Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/) + """ + gamma = trial.suggest_categorical("gamma", [0.1, 0.25, 0.5, 0.75, 0.99]) + return {"gamma": gamma} + # Create random agent as a baseline class RandomAgent(AgentWithSimplePolicy): - name = 'RandomAgent' + name = "RandomAgent" + def __init__(self, env, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -80,10 +89,11 @@ def fit(self, budget=None, **kwargs): def policy(self, observation): return self.env.action_space.sample() + from rlberry.manager import AgentManager, evaluate_agents # Define parameters -vi_params = {'gamma':0.1, 'epsilon':1e-3} +vi_params = {"gamma": 0.1, "epsilon": 1e-3} # Create AgentManager to fit 4 agents using 1 job vi_stats = AgentManager( @@ -92,7 +102,8 @@ def policy(self, observation): fit_budget=0, eval_kwargs=dict(eval_horizon=20), init_kwargs=vi_params, - n_fit=4) + 
n_fit=4, +) vi_stats.fit() # Create AgentManager for baseline @@ -101,7 +112,8 @@ def policy(self, observation): (env_ctor, env_kwargs), fit_budget=0, eval_kwargs=dict(eval_horizon=20), - n_fit=1) + n_fit=1, +) baseline_stats.fit() # Compare policies using 10 Monte Carlo simulations diff --git a/examples/plot_kernels.py b/examples/plot_kernels.py index a13b824d3..84b2b2cfc 100644 --- a/examples/plot_kernels.py +++ b/examples/plot_kernels.py @@ -19,13 +19,13 @@ "triweight", "tricube", "cosine", - "exp-4" + "exp-4", ] z = np.linspace(-2, 2, 100) -fig, axes = plt.subplots(1, len(kernel_types),figsize=(15,5)) +fig, axes = plt.subplots(1, len(kernel_types), figsize=(15, 5)) for ii, k_type in enumerate(kernel_types): kernel_vals = kernel_func(z, k_type) axes[ii].plot(z, kernel_vals) diff --git a/rlberry/__init__.py b/rlberry/__init__.py index 8ce358f45..769508b35 100644 --- a/rlberry/__init__.py +++ b/rlberry/__init__.py @@ -1,4 +1,4 @@ -__path__ = __import__('pkgutil').extend_path(__path__, __name__) +__path__ = __import__("pkgutil").extend_path(__path__, __name__) # Initialize logging level from rlberry.utils.logging import configure_logging diff --git a/rlberry/agents/adaptiveql/adaptiveql.py b/rlberry/agents/adaptiveql/adaptiveql.py index 83b97f824..6bb87b8f7 100644 --- a/rlberry/agents/adaptiveql/adaptiveql.py +++ b/rlberry/agents/adaptiveql/adaptiveql.py @@ -43,15 +43,17 @@ class AdaptiveQLAgent(AgentWithSimplePolicy): Uses the metric induced by the l-infinity norm. """ - name = 'AdaptiveQLearning' - - def __init__(self, - env, - gamma=1.0, - horizon=50, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - **kwargs): + name = "AdaptiveQLearning" + + def __init__( + self, + env, + gamma=1.0, + horizon=50, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) assert isinstance(self.env.observation_space, spaces.Box) @@ -65,8 +67,10 @@ def __init__(self, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." 
+ ) r_range = 1.0 self.v_max = np.zeros(self.horizon) @@ -77,9 +81,9 @@ def __init__(self, self.reset() def reset(self): - self.Qtree = MDPTreePartition(self.env.observation_space, - self.env.action_space, - self.horizon) + self.Qtree = MDPTreePartition( + self.env.observation_space, self.env.action_space, self.horizon + ) # info self.episode = 0 @@ -106,7 +110,7 @@ def _update(self, node, state, action, next_state, reward, hh): if hh < self.horizon - 1: value_next_state = min( self.v_max[hh + 1], - self.Qtree.get_argmax_and_node(next_state, hh + 1)[1].qvalue + self.Qtree.get_argmax_and_node(next_state, hh + 1)[1].qvalue, ) # learning rate @@ -125,7 +129,8 @@ def _compute_bonus(self, n, hh): return bonus else: raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _run_episode(self): # interact for H steps diff --git a/rlberry/agents/adaptiveql/tree.py b/rlberry/agents/adaptiveql/tree.py index 7a1c2ec57..fd22c3ca5 100644 --- a/rlberry/agents/adaptiveql/tree.py +++ b/rlberry/agents/adaptiveql/tree.py @@ -122,18 +122,22 @@ def traverse(self, x, update=False): # return value at leaf return node - def plot(self, - fignum="tree plot", - colormap_name='cool', - max_value=10, - node=None, - root=True, ): + def plot( + self, + fignum="tree plot", + colormap_name="cool", + max_value=10, + node=None, + root=True, + ): """ Visualize the function (2d domain only). Shows the hierarchical partition. """ if root: - assert self.dim == 2, "TreePartition plot only available for 2-dimensional spaces." + assert ( + self.dim == 2 + ), "TreePartition plot only available for 2-dimensional spaces." node = self.root plt.figure(fignum) @@ -144,13 +148,20 @@ def plot(self, colormap_fn = plt.get_cmap(colormap_name) color = colormap_fn(node.qvalue / max_value) - rectangle = plt.Rectangle((x0, y0), x1 - x0, y1 - y0, ec="black", color=color) + rectangle = plt.Rectangle( + (x0, y0), x1 - x0, y1 - y0, ec="black", color=color + ) plt.gca().add_patch(rectangle) - plt.axis('scaled') + plt.axis("scaled") else: for cc in node.children: - self.plot(max_value=max_value, colormap_name=colormap_name, node=cc, root=False) + self.plot( + max_value=max_value, + colormap_name=colormap_name, + node=cc, + root=False, + ) class MDPTreePartition: @@ -167,8 +178,9 @@ def __init__(self, observation_space, action_space, horizon): for hh in range(horizon): self.trees.append({}) for aa in range(self.n_actions): - self.trees[hh][aa] = TreePartition(observation_space, - initial_value=horizon - hh) + self.trees[hh][aa] = TreePartition( + observation_space, initial_value=horizon - hh + ) self.dmax = self.trees[0][0].dmax diff --git a/rlberry/agents/agent.py b/rlberry/agents/agent.py index e5355debd..c8614e764 100644 --- a/rlberry/agents/agent.py +++ b/rlberry/agents/agent.py @@ -18,7 +18,7 @@ class Agent(ABC): - """ Basic interface for agents. + """Basic interface for agents. 
Parameters ---------- @@ -59,18 +59,19 @@ class Agent(ABC): name = "" - def __init__(self, - env: types.Env, - eval_env: Optional[types.Env] = None, - copy_env: bool = True, - seeder: Optional[types.Seed] = None, - output_dir: Optional[str] = None, - _execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, - _default_writer_kwargs: Optional[dict] = None, - **kwargs): + def __init__( + self, + env: types.Env, + eval_env: Optional[types.Env] = None, + copy_env: bool = True, + seeder: Optional[types.Seed] = None, + output_dir: Optional[str] = None, + _execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, + _default_writer_kwargs: Optional[dict] = None, + **kwargs, + ): # Check if wrong parameters have been sent to an agent. - assert kwargs == {}, \ - 'Unknown parameters sent to agent:' + str(kwargs.keys()) + assert kwargs == {}, "Unknown parameters sent to agent:" + str(kwargs.keys()) self.seeder = Seeder(seeder) self.env = process_env(env, self.seeder, copy_env=copy_env) @@ -80,14 +81,17 @@ def __init__(self, self.eval_env = process_env(eval_env, self.seeder, copy_env=True) # metadata - self._execution_metadata = _execution_metadata or metadata_utils.ExecutionMetadata() + self._execution_metadata = ( + _execution_metadata or metadata_utils.ExecutionMetadata() + ) self._unique_id = metadata_utils.get_unique_id(self) if self.name: - self._unique_id = self.name + '_' + self._unique_id + self._unique_id = self.name + "_" + self._unique_id # create writer _default_writer_kwargs = _default_writer_kwargs or dict( - name=self.name, execution_metadata=self._execution_metadata) + name=self.name, execution_metadata=self._execution_metadata + ) self._writer = DefaultWriter(**_default_writer_kwargs) # output directory for the agent instance @@ -174,7 +178,7 @@ def sample_parameters(cls, trial): @property def rng(self): - """ Random number generator. """ + """Random number generator.""" return self.seeder.rng def reseed(self, seed_seq=None): @@ -230,7 +234,7 @@ def save(self, filename): if not dill.pickles(self.writer): self.set_writer(None) # save - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") filename.parent.mkdir(parents=True, exist_ok=True) try: with filename.open("wb") as ff: @@ -256,14 +260,14 @@ def load(cls, filename, **kwargs): **kwargs: dict Arguments to required by the __init__ method of the Agent subclass. """ - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") obj = cls(**kwargs) try: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = pickle.load(ff) except Exception: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = dill.load(ff) obj.__dict__.clear() @@ -283,11 +287,7 @@ def policy(self, observation): """Returns an action, given an observation.""" pass - def eval(self, - eval_horizon=10 ** 5, - n_simulations=10, - gamma=1.0, - **kwargs): + def eval(self, eval_horizon=10 ** 5, n_simulations=10, gamma=1.0, **kwargs): """ Monte-Carlo policy evaluation [1]_ of an agent to estimate the value at the initial state. @@ -307,7 +307,7 @@ def eval(self, References ---------- .. 
[1] http://incompleteideas.net/book/first/ebook/node50.html - """ + """ del kwargs # unused episode_rewards = np.zeros(n_simulations) for sim in range(n_simulations): diff --git a/rlberry/agents/dynprog/value_iteration.py b/rlberry/agents/dynprog/value_iteration.py index c718f7774..0748f16d5 100644 --- a/rlberry/agents/dynprog/value_iteration.py +++ b/rlberry/agents/dynprog/value_iteration.py @@ -31,8 +31,9 @@ def __init__(self, env, gamma=0.95, horizon=None, epsilon=1e-6, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) # initialize base class - assert isinstance(self.env, FiniteMDP), \ - "Value iteration requires a FiniteMDP model." + assert isinstance( + self.env, FiniteMDP + ), "Value iteration requires a FiniteMDP model." # self.gamma = gamma @@ -50,15 +51,16 @@ def fit(self, budget=None, **kwargs): del kwargs info = {} if self.horizon is None: - assert self.gamma < 1.0, \ - "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration(self.env.R, self.env.P, - self.gamma, self.epsilon) + assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" + self.Q, self.V, n_it = value_iteration( + self.env.R, self.env.P, self.gamma, self.epsilon + ) info["n_iterations"] = n_it info["precision"] = self.epsilon else: - self.Q, self.V = backward_induction(self.env.R, self.env.P, - self.horizon, self.gamma) + self.Q, self.V = backward_induction( + self.env.R, self.env.P, self.horizon, self.gamma + ) info["n_iterations"] = self.horizon info["precision"] = 0.0 return info diff --git a/rlberry/agents/jax/dqn/dqn.py b/rlberry/agents/jax/dqn/dqn.py index 7f889f260..54be541f4 100644 --- a/rlberry/agents/jax/dqn/dqn.py +++ b/rlberry/agents/jax/dqn/dqn.py @@ -108,28 +108,29 @@ class DQNAgent(AgentWithSimplePolicy): max_gradient_norm : float, default: 100.0 Maximum gradient norm. 
""" + name = "JaxDqnAgent" def __init__( - self, - env: types.Env, - gamma: float = 0.99, - batch_size: int = 64, - chunk_size: int = 8, - online_update_interval: int = 1, - target_update_interval: int = 512, - learning_rate: float = 0.001, - epsilon_init: float = 1.0, - epsilon_end: float = 0.05, - epsilon_steps: int = 5000, - max_replay_size: int = 100000, - eval_interval: Optional[int] = None, - max_episode_length: Optional[int] = None, - lambda_: Optional[float] = None, - net_constructor: Optional[Callable[..., hk.Module]] = None, - net_kwargs: Optional[Mapping[str, Any]] = None, - max_gradient_norm: float = 100.0, - **kwargs + self, + env: types.Env, + gamma: float = 0.99, + batch_size: int = 64, + chunk_size: int = 8, + online_update_interval: int = 1, + target_update_interval: int = 512, + learning_rate: float = 0.001, + epsilon_init: float = 1.0, + epsilon_end: float = 0.05, + epsilon_steps: int = 5000, + max_replay_size: int = 100000, + eval_interval: Optional[int] = None, + max_episode_length: Optional[int] = None, + lambda_: Optional[float] = None, + net_constructor: Optional[Callable[..., hk.Module]] = None, + net_kwargs: Optional[Mapping[str, Any]] = None, + max_gradient_norm: float = 100.0, + **kwargs ): AgentWithSimplePolicy.__init__(self, env, **kwargs) env = self.env @@ -137,9 +138,9 @@ def __init__( # checks if not isinstance(self.env.observation_space, spaces.Box): - raise ValueError('DQN only implemented for Box observation spaces.') + raise ValueError("DQN only implemented for Box observation spaces.") if not isinstance(self.env.action_space, spaces.Discrete): - raise ValueError('DQN only implemented for Discrete action spaces.') + raise ValueError("DQN only implemented for Discrete action spaces.") # params self._gamma = gamma @@ -164,7 +165,10 @@ def __init__( try: obs_shape, obs_dtype = sample_obs.shape, sample_obs.dtype except AttributeError: # in case sample_obs has no .shape attribute - obs_shape, obs_dtype = env.observation_space.shape, env.observation_space.dtype + obs_shape, obs_dtype = ( + env.observation_space.shape, + env.observation_space.dtype, + ) action_shape, action_dtype = env.action_space.shape, env.action_space.dtype self._replay_buffer = ReplayBuffer( @@ -172,23 +176,20 @@ def __init__( self._chunk_size, self._max_replay_size, ) - self._replay_buffer.setup_entry('actions', action_shape, action_dtype) - self._replay_buffer.setup_entry('observations', obs_shape, obs_dtype) - self._replay_buffer.setup_entry('next_observations', obs_shape, obs_dtype) - self._replay_buffer.setup_entry('rewards', (), np.float32) - self._replay_buffer.setup_entry('discounts', (), np.float32) + self._replay_buffer.setup_entry("actions", action_shape, action_dtype) + self._replay_buffer.setup_entry("observations", obs_shape, obs_dtype) + self._replay_buffer.setup_entry("next_observations", obs_shape, obs_dtype) + self._replay_buffer.setup_entry("rewards", (), np.float32) + self._replay_buffer.setup_entry("discounts", (), np.float32) self._replay_buffer.build() # initialize network and params net_constructor = net_constructor or nets.MLPQNetwork net_kwargs = net_kwargs or dict( - num_actions=self.env.action_space.n, - hidden_sizes=(64, 64) + num_actions=self.env.action_space.n, hidden_sizes=(64, 64) ) net_ctor = functools.partial(net_constructor, **net_kwargs) - self._q_net = hk.without_apply_rng( - hk.transform(lambda x: net_ctor()(x)) - ) + self._q_net = hk.without_apply_rng(hk.transform(lambda x: net_ctor()(x))) self._dummy_obs = jnp.ones(self.env.observation_space.shape) 
@@ -197,13 +198,13 @@ def __init__( self._all_params = AllParams( online=self._q_net.init(subkey1, self._dummy_obs), - target=self._q_net.init(subkey2, self._dummy_obs) + target=self._q_net.init(subkey2, self._dummy_obs), ) # initialize optimizer and states self._optimizer = optax.chain( optax.clip_by_global_norm(self._max_gradient_norm), - optax.adam(learning_rate) + optax.adam(learning_rate), ) self._all_states = AllStates( optimizer=self._optimizer.init(self._all_params.online), @@ -236,11 +237,7 @@ def policy(self, observation): action = actor_out.actions.item() return action - def fit( - self, - budget: int, - **kwargs - ): + def fit(self, budget: int, **kwargs): """ Train DQN agent. @@ -273,11 +270,16 @@ def fit( # store data episode_rewards += reward buffer_writer.append( - {'actions': action, - 'observations': observation, - 'rewards': np.array(reward, dtype=np.float32), - 'discounts': np.array(self._gamma * (1.0 - done), dtype=np.float32), - 'next_observations': next_obs}) + { + "actions": action, + "observations": observation, + "rewards": np.array(reward, dtype=np.float32), + "discounts": np.array( + self._gamma * (1.0 - done), dtype=np.float32 + ), + "next_observations": next_obs, + } + ) # counters and next obs timesteps_counter += 1 @@ -291,44 +293,49 @@ def fit( if sample: batch = sample.data self._all_params, self._all_states, info = self.learner_step( - self._all_params, - self._all_states, - batch + self._all_params, self._all_states, batch ) if self.writer: - self.writer.add_scalar('q_loss', info['loss'].item(), total_timesteps) self.writer.add_scalar( - 'learner_steps', + "q_loss", info["loss"].item(), total_timesteps + ) + self.writer.add_scalar( + "learner_steps", self._all_states.learner_steps.item(), - total_timesteps) + total_timesteps, + ) # eval - if self._eval_interval is not None and total_timesteps % self._eval_interval == 0: + if ( + self._eval_interval is not None + and total_timesteps % self._eval_interval == 0 + ): eval_rewards = self.eval( eval_horizon=self._max_episode_length, n_simimulations=2, - gamma=1.0) + gamma=1.0, + ) self.writer.add_scalar( - 'eval_rewards', - eval_rewards, - total_timesteps + "eval_rewards", eval_rewards, total_timesteps ) # check if episode ended if done: if self.writer: - self.writer.add_scalar('episode_rewards', episode_rewards, total_timesteps) + self.writer.add_scalar( + "episode_rewards", episode_rewards, total_timesteps + ) buffer_writer.end_episode() episode_rewards = 0.0 episode_timesteps = 0 observation = self.env.reset() def _loss(self, all_params, batch): - obs_tm1 = batch['observations'] - a_tm1 = batch['actions'] - r_t = batch['rewards'] - discount_t = batch['discounts'] - obs_t = batch['next_observations'] + obs_tm1 = batch["observations"] + a_tm1 = batch["actions"] + r_t = batch["rewards"] + discount_t = batch["discounts"] + obs_t = batch["next_observations"] if self._lambda is None: # remove time dim (batch has shape [batch, chunk_size, ...]) @@ -348,13 +355,13 @@ def _loss(self, all_params, batch): else: batched_loss = jax.vmap(rlax.q_lambda) batch_lambda = self._lambda * jnp.ones(r_t.shape) - td_error = batched_loss(q_tm1, a_tm1, r_t, discount_t, q_t_val, batch_lambda) + td_error = batched_loss( + q_tm1, a_tm1, r_t, discount_t, q_t_val, batch_lambda + ) loss = jnp.mean(rlax.l2_loss(td_error)) - info = dict( - loss=loss - ) + info = dict(loss=loss) return loss, info def _actor_step(self, all_params, all_states, observation, rng_key, evaluation): @@ -365,13 +372,12 @@ def _actor_step(self, all_params, 
all_states, observation, rng_key, evaluation): eval_action = rlax.greedy().sample(rng_key, q_val) action = jax.lax.select(evaluation, eval_action, train_action) return ( - ActorOutput( - actions=action, - q_values=q_val), + ActorOutput(actions=action, q_values=q_val), AllStates( optimizer=all_states.optimizer, learner_steps=all_states.learner_steps, - actor_steps=all_states.actor_steps + 1), + actor_steps=all_states.actor_steps + 1, + ), ) def _learner_step(self, all_params, all_states, batch): @@ -379,30 +385,28 @@ def _learner_step(self, all_params, all_states, batch): all_params.online, all_params.target, all_states.learner_steps, - self._target_update_interval) - grad, info = jax.grad(self._loss, has_aux=True)( - all_params, - batch) + self._target_update_interval, + ) + grad, info = jax.grad(self._loss, has_aux=True)(all_params, batch) updates, optimizer_state = self._optimizer.update( - grad.online, - all_states.optimizer) + grad.online, all_states.optimizer + ) online_params = optax.apply_updates(all_params.online, updates) return ( - AllParams( - online=online_params, - target=target_params), + AllParams(online=online_params, target=target_params), AllStates( optimizer=optimizer_state, learner_steps=all_states.learner_steps + 1, - actor_steps=all_states.actor_steps), - info + actor_steps=all_states.actor_steps, + ), + info, ) # # Custom save/load methods. # def save(self, filename): - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") filename.parent.mkdir(parents=True, exist_ok=True) writer = None @@ -422,14 +426,14 @@ def save(self, filename): @classmethod def load(cls, filename, **kwargs): - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") agent = cls(**kwargs) - with filename.open('rb') as ff: + with filename.open("rb") as ff: agent_data = dill.load(ff) - agent.key = agent_data['rng_key'] - agent._all_params = agent_data['params'] - agent._all_states = agent_data['states'] - writer = agent_data['writer'] + agent.key = agent_data["rng_key"] + agent._all_params = agent_data["params"] + agent._all_states = agent_data["states"] + writer = agent_data["writer"] if writer: agent._writer = writer return agent @@ -439,13 +443,7 @@ def load(cls, filename, **kwargs): # @classmethod def sample_parameters(cls, trial): - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1) - gamma = trial.suggest_uniform('gamma', 0.95, 0.99) - lambda_ = trial.suggest_categorical( - 'lambda_', - [0.1, 0.5, 0.9, None]) - return dict( - learning_rate=learning_rate, - gamma=gamma, - lambda_=lambda_ - ) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-1) + gamma = trial.suggest_uniform("gamma", 0.95, 0.99) + lambda_ = trial.suggest_categorical("lambda_", [0.1, 0.5, 0.9, None]) + return dict(learning_rate=learning_rate, gamma=gamma, lambda_=lambda_) diff --git a/rlberry/agents/jax/nets/common.py b/rlberry/agents/jax/nets/common.py index 321a754d2..593000cdf 100644 --- a/rlberry/agents/jax/nets/common.py +++ b/rlberry/agents/jax/nets/common.py @@ -20,10 +20,7 @@ class MLPQNetwork(hk.Module): """ def __init__( - self, - num_actions: int, - hidden_sizes: Tuple[int, ...], - name: str = 'MLPQNetwork' + self, num_actions: int, hidden_sizes: Tuple[int, ...], name: str = "MLPQNetwork" ): super().__init__(name=name) self._mlp = hk.nets.MLP(output_sizes=hidden_sizes + (num_actions,)) diff --git a/rlberry/agents/jax/tests/old_test_tqn.py b/rlberry/agents/jax/tests/old_test_tqn.py index 
c48d4048b..8cbfc7ab8 100644 --- a/rlberry/agents/jax/tests/old_test_tqn.py +++ b/rlberry/agents/jax/tests/old_test_tqn.py @@ -18,12 +18,9 @@ def test_jax_dqn(lambda_): if not _IMPORT_SUCCESSFUL: return - env = (gym_make, dict(id='CartPole-v0')) + env = (gym_make, dict(id="CartPole-v0")) params = dict( - chunk_size=4, - batch_size=128, - target_update_interval=5, - lambda_=lambda_ + chunk_size=4, batch_size=128, target_update_interval=5, lambda_=lambda_ ) stats = AgentManager( @@ -33,7 +30,7 @@ def test_jax_dqn(lambda_): eval_env=env, init_kwargs=params, n_fit=1, - parallelization='thread', + parallelization="thread", ) stats.fit() stats.clear_output_dir() diff --git a/rlberry/agents/jax/utils/replay_buffer.py b/rlberry/agents/jax/utils/replay_buffer.py index 15aeb6a87..f49cf3584 100644 --- a/rlberry/agents/jax/utils/replay_buffer.py +++ b/rlberry/agents/jax/utils/replay_buffer.py @@ -14,13 +14,13 @@ import reverb except ImportError as ex: logger.error( - f'[replay_buffer] Could not import reverb: \n {ex} \n' - + ' >>> If you have issues with libpython3.7m.so.1.0, try running: \n' - + ' >>> $ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib \n' - + ' >>> in a conda environment, ' - + ' >>> or see https://github.com/deepmind/acme/issues/47 \n' - + ' >>> See also https://stackoverflow.com/a/46833531 for how to set \n' - + ' >>> LD_LIBRARY_PATH automatically when activating a conda environment.' + f"[replay_buffer] Could not import reverb: \n {ex} \n" + + " >>> If you have issues with libpython3.7m.so.1.0, try running: \n" + + " >>> $ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib \n" + + " >>> in a conda environment, " + + " >>> or see https://github.com/deepmind/acme/issues/47 \n" + + " >>> See also https://stackoverflow.com/a/46833531 for how to set \n" + + " >>> LD_LIBRARY_PATH automatically when activating a conda environment." 
) exit(1) @@ -55,13 +55,13 @@ def append(self, *args, **kwargs): for key in self.writer.history: if key not in self.entries: raise RuntimeError( - 'Cannot add to replay buffer an item that' - f' was not setup with setup_entry() method of ReplayBuffer: {key}') - trajectory[key] = self.writer.history[key][-self.chunk_size:] + "Cannot add to replay buffer an item that" + f" was not setup with setup_entry() method of ReplayBuffer: {key}" + ) + trajectory[key] = self.writer.history[key][-self.chunk_size :] self.writer.create_item( - table='replay_buffer', - priority=1.0, - trajectory=trajectory) + table="replay_buffer", priority=1.0, trajectory=trajectory + ) self.total_items += 1 @@ -77,13 +77,13 @@ class ReplayBuffer: """ def __init__( - self, - batch_size: int, - chunk_size: int, - max_replay_size: int, + self, + batch_size: int, + chunk_size: int, + max_replay_size: int, ): if chunk_size < 1: - raise ValueError('chunk_size needs to be >= 1') + raise ValueError("chunk_size needs to be >= 1") self._batch_size = batch_size self._chunk_size = chunk_size @@ -101,12 +101,14 @@ def dataset(self): return self._batched_dataset def get_writer(self): - self._chunk_writer = ChunkWriter(self._reverb_client, self._chunk_size, list(self._signature.keys())) + self._chunk_writer = ChunkWriter( + self._reverb_client, self._chunk_size, list(self._signature.keys()) + ) return self._chunk_writer def sample(self): if self._chunk_writer is None: - raise RuntimeError('Calling sample() without previous call to get_writer()') + raise RuntimeError("Calling sample() without previous call to get_writer()") if self._chunk_writer.total_items < self._batch_size: return None return next(self.dataset) @@ -125,7 +127,7 @@ def setup_entry(self, name, shape, dtype): Type of the data. Can be nested. 
""" if name in self._signature: - raise ValueError(f'Entry {name} already added to the replay buffer.') + raise ValueError(f"Entry {name} already added to the replay buffer.") self._signature[name] = tf.TensorSpec( shape=[self._chunk_size, *shape], @@ -137,7 +139,7 @@ def build(self): self._reverb_server = reverb.Server( tables=[ reverb.Table( - name='replay_buffer', + name="replay_buffer", sampler=reverb.selectors.Uniform(), remover=reverb.selectors.Fifo(), max_size=self._max_replay_size, @@ -145,12 +147,15 @@ def build(self): signature=self._signature, ), ], - port=None + port=None, ) - self._reverb_client = reverb.Client(f'localhost:{self._reverb_server.port}') + self._reverb_client = reverb.Client(f"localhost:{self._reverb_server.port}") self._reverb_dataset = reverb.TrajectoryDataset.from_table_signature( - server_address=f'localhost:{self._reverb_server.port}', - table='replay_buffer', - max_in_flight_samples_per_worker=2 * self._batch_size) - self._batched_dataset = self._reverb_dataset.batch(self._batch_size, drop_remainder=True).as_numpy_iterator() + server_address=f"localhost:{self._reverb_server.port}", + table="replay_buffer", + max_in_flight_samples_per_worker=2 * self._batch_size, + ) + self._batched_dataset = self._reverb_dataset.batch( + self._batch_size, drop_remainder=True + ).as_numpy_iterator() # logger.info(self._reverb_client.server_info()) diff --git a/rlberry/agents/kernel_based/common.py b/rlberry/agents/kernel_based/common.py index d052908d2..33757f66a 100644 --- a/rlberry/agents/kernel_based/common.py +++ b/rlberry/agents/kernel_based/common.py @@ -4,28 +4,30 @@ @numba_jit -def map_to_representative(state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr): - """Map state to representative state. 
""" +def map_to_representative( + state, + lp_metric, + representative_states, + n_representatives, + min_dist, + scaling, + accept_new_repr, +): + """Map state to representative state.""" dist_to_closest = np.inf argmin = -1 for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], - lp_metric, - scaling) + dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) if dist < dist_to_closest: dist_to_closest = dist argmin = ii max_representatives = representative_states.shape[0] - if (dist_to_closest > min_dist) \ - and (n_representatives < max_representatives) \ - and accept_new_repr: + if ( + (dist_to_closest > min_dist) + and (n_representatives < max_representatives) + and accept_new_repr + ): new_index = n_representatives representative_states[new_index, :] = state return new_index diff --git a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py b/rlberry/agents/kernel_based/rs_kernel_ucbvi.py index e376fdbab..1f573b362 100644 --- a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py +++ b/rlberry/agents/kernel_based/rs_kernel_ucbvi.py @@ -15,11 +15,26 @@ @numba_jit -def update_model(repr_state, action, repr_next_state, reward, - n_representatives, repr_states, - lp_metric, scaling, bandwidth, - bonus_scale_factor, beta, v_max, bonus_type, - kernel_type, N_sa, B_sa, P_hat, R_hat): +def update_model( + repr_state, + action, + repr_next_state, + reward, + n_representatives, + repr_states, + lp_metric, + scaling, + bandwidth, + bonus_scale_factor, + beta, + v_max, + bonus_type, + kernel_type, + N_sa, + B_sa, + P_hat, + R_hat, +): """ Model update function, lots of arguments so we can use JIT :) """ @@ -29,10 +44,9 @@ def update_model(repr_state, action, repr_next_state, reward, for u_repr_state in range(n_representatives): # compute weight - dist = metric_lp(repr_states[repr_state, :], - repr_states[u_repr_state, :], - lp_metric, - scaling) + dist = metric_lp( + repr_states[repr_state, :], repr_states[u_repr_state, :], lp_metric, scaling + ) weight = kernel_func(dist / bandwidth, kernel_type=kernel_type) # aux variables @@ -43,19 +57,22 @@ def update_model(repr_state, action, repr_next_state, reward, N_sa[u_repr_state, action] += weight # update transitions - P_hat[u_repr_state, action, :n_representatives] = \ - dirac_next_s * weight / current_N_sa + \ - (prev_N_sa / current_N_sa) * \ - P_hat[u_repr_state, action, :n_representatives] + P_hat[u_repr_state, action, :n_representatives] = ( + dirac_next_s * weight / current_N_sa + + (prev_N_sa / current_N_sa) + * P_hat[u_repr_state, action, :n_representatives] + ) # update rewards - R_hat[u_repr_state, action] = weight * reward / current_N_sa + \ - (prev_N_sa / current_N_sa) * R_hat[u_repr_state, action] + R_hat[u_repr_state, action] = ( + weight * reward / current_N_sa + + (prev_N_sa / current_N_sa) * R_hat[u_repr_state, action] + ) # update bonus - B_sa[u_repr_state, action] = compute_bonus(N_sa[u_repr_state, action], - beta, bonus_scale_factor, - v_max, bonus_type) + B_sa[u_repr_state, action] = compute_bonus( + N_sa[u_repr_state, action], beta, bonus_scale_factor, v_max, bonus_type + ) @numba_jit @@ -142,19 +159,22 @@ class RSKernelUCBVIAgent(AgentWithSimplePolicy): name = "RSKernelUCBVI" - def __init__(self, env, - gamma=0.99, - horizon=None, - lp_metric=2, - kernel_type="epanechnikov", - scaling=None, - bandwidth=0.05, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - beta=0.01, - bonus_type="simplified_bernstein", - **kwargs): + def __init__( + self, + env, + gamma=0.99, + horizon=None, 
+ lp_metric=2, + kernel_type="epanechnikov", + scaling=None, + bandwidth=0.05, + min_dist=0.1, + max_repr=1000, + bonus_scale_factor=1.0, + beta=0.01, + bonus_type="simplified_bernstein", + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -175,8 +195,7 @@ def __init__(self, env, # other checks assert gamma >= 0 and gamma <= 1.0 if self.horizon is None: - assert gamma < 1.0, \ - "If no horizon is given, gamma must be smaller than 1." + assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) # state dimension @@ -185,10 +204,12 @@ def __init__(self, env, # compute scaling, if it is None if scaling is None: # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 \ - and (self.env.observation_space.low == -np.inf).sum() == 0: - scaling = self.env.observation_space.high \ - - self.env.observation_space.low + if (self.env.observation_space.high == np.inf).sum() == 0 and ( + self.env.observation_space.low == -np.inf + ).sum() == 0: + scaling = ( + self.env.observation_space.high - self.env.observation_space.low + ) # if high or low are unbounded else: scaling = np.ones(self.state_dim) @@ -200,19 +221,28 @@ def __init__(self, env, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." + ) r_range = 1.0 if self.gamma == 1.0: self.v_max = r_range * horizon else: - self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon)) / (1.0 - self.gamma) + self.v_max = ( + r_range + * (1.0 - np.power(self.gamma, self.horizon)) + / (1.0 - self.gamma) + ) # number of representative states and number of actions if max_repr is None: - max_repr = int(np.ceil((1.0 * np.sqrt(self.state_dim) - / self.min_dist) ** self.state_dim)) + max_repr = int( + np.ceil( + (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim + ) + ) self.max_repr = max_repr # current number of representative states @@ -261,18 +291,23 @@ def fit(self, budget: int, **kwargs): self._run_episode() # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma) + self.Q_policy, _ = backward_induction( + self.R_hat[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + ) def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative(state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr) + repr_state = map_to_representative( + state, + self.lp_metric, + self.representative_states, + self.M, + self.min_dist, + self.scaling, + accept_new_repr, + ) # check if new representative state if repr_state == self.M: self.M += 1 @@ -282,21 +317,26 @@ def _update(self, state, action, next_state, reward): repr_state = self._map_to_repr(state) repr_next_state = self._map_to_repr(next_state) - update_model(repr_state, action, repr_next_state, reward, - self.M, - self.representative_states, - self.lp_metric, - self.scaling, - self.bandwidth, - self.bonus_scale_factor, - self.beta, - self.v_max, - self.bonus_type, - self.kernel_type, - self.N_sa, - self.B_sa, - self.P_hat, - self.R_hat) + update_model( + repr_state, + action, + 
repr_next_state, + reward, + self.M, + self.representative_states, + self.lp_metric, + self.scaling, + self.bandwidth, + self.bonus_scale_factor, + self.beta, + self.v_max, + self.bonus_type, + self.kernel_type, + self.N_sa, + self.B_sa, + self.P_hat, + self.R_hat, + ) def _get_action(self, state, hh=0): assert self.Q is not None @@ -319,10 +359,14 @@ def _run_episode(self): # run backward induction backward_induction_in_place( - self.Q[:, :self.M, :], self.V[:, :self.M], - self.R_hat[:self.M, :] + self.B_sa[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma, self.v_max) + self.Q[:, : self.M, :], + self.V[:, : self.M], + self.R_hat[: self.M, :] + self.B_sa[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + self.v_max, + ) self.episode += 1 # diff --git a/rlberry/agents/kernel_based/rs_ucbvi.py b/rlberry/agents/kernel_based/rs_ucbvi.py index a9908c0e1..ec7bfbb96 100644 --- a/rlberry/agents/kernel_based/rs_ucbvi.py +++ b/rlberry/agents/kernel_based/rs_ucbvi.py @@ -84,17 +84,20 @@ class RSUCBVIAgent(AgentWithSimplePolicy): name = "RSUCBVI" - def __init__(self, env, - gamma=0.99, - horizon=100, - lp_metric=2, - scaling=None, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - **kwargs): + def __init__( + self, + env, + gamma=0.99, + horizon=100, + lp_metric=2, + scaling=None, + min_dist=0.1, + max_repr=1000, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + reward_free=False, + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -113,8 +116,7 @@ def __init__(self, env, # other checks assert gamma >= 0 and gamma <= 1.0 if self.horizon is None: - assert gamma < 1.0, \ - "If no horizon is given, gamma must be smaller than 1." + assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) # state dimension @@ -123,10 +125,12 @@ def __init__(self, env, # compute scaling, if it is None if scaling is None: # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 \ - and (self.env.observation_space.low == -np.inf).sum() == 0: - scaling = self.env.observation_space.high \ - - self.env.observation_space.low + if (self.env.observation_space.high == np.inf).sum() == 0 and ( + self.env.observation_space.low == -np.inf + ).sum() == 0: + scaling = ( + self.env.observation_space.high - self.env.observation_space.low + ) # if high or low are unbounded else: scaling = np.ones(self.state_dim) @@ -138,20 +142,28 @@ def __init__(self, env, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." 
+ ) r_range = 1.0 if self.gamma == 1.0: self.v_max = r_range * horizon else: - self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon)) \ + self.v_max = ( + r_range + * (1.0 - np.power(self.gamma, self.horizon)) / (1.0 - self.gamma) + ) # number of representative states and number of actions if max_repr is None: - max_repr = int(np.ceil((1.0 * np.sqrt(self.state_dim) / - self.min_dist) ** self.state_dim)) + max_repr = int( + np.ceil( + (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim + ) + ) self.max_repr = max_repr # current number of representative states @@ -205,18 +217,23 @@ def fit(self, budget: int, **kwargs): count += 1 # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma) + self.Q_policy, _ = backward_induction( + self.R_hat[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + ) def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative(state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr) + repr_state = map_to_representative( + state, + self.lp_metric, + self.representative_states, + self.M, + self.min_dist, + self.scaling, + accept_new_repr, + ) # check if new representative state if repr_state == self.M: self.M += 1 @@ -230,12 +247,15 @@ def _update(self, state, action, next_state, reward): self.N_sas[repr_state, action, repr_next_state] += 1 self.S_sa[repr_state, action] += reward - self.R_hat[repr_state, action] = self.S_sa[repr_state, action] \ - / self.N_sa[repr_state, action] - self.P_hat[repr_state, action, :] = self.N_sas[repr_state, action, :] \ - / self.N_sa[repr_state, action] - self.B_sa[repr_state, action] = \ - self._compute_bonus(self.N_sa[repr_state, action]) + self.R_hat[repr_state, action] = ( + self.S_sa[repr_state, action] / self.N_sa[repr_state, action] + ) + self.P_hat[repr_state, action, :] = ( + self.N_sas[repr_state, action, :] / self.N_sa[repr_state, action] + ) + self.B_sa[repr_state, action] = self._compute_bonus( + self.N_sa[repr_state, action] + ) def _compute_bonus(self, n): # reward-free @@ -250,7 +270,8 @@ def _compute_bonus(self, n): return bonus else: raise NotImplementedError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _get_action(self, state, hh=0): assert self.Q is not None @@ -277,10 +298,14 @@ def _run_episode(self): # run backward induction backward_induction_in_place( - self.Q[:, :self.M, :], self.V[:, :self.M], - self.R_hat[:self.M, :] + self.B_sa[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma, self.v_max) + self.Q[:, : self.M, :], + self.V[:, : self.M], + self.R_hat[: self.M, :] + self.B_sa[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + self.v_max, + ) self.episode += 1 # diff --git a/rlberry/agents/linear/lsvi_ucb.py b/rlberry/agents/linear/lsvi_ucb.py index 438b39e3c..36706116f 100644 --- a/rlberry/agents/linear/lsvi_ucb.py +++ b/rlberry/agents/linear/lsvi_ucb.py @@ -9,16 +9,18 @@ @numba_jit def run_lsvi_jit( - dim, horizon, - bonus_factor, - lambda_mat_inv, - reward_hist, - gamma, - feat_hist, - n_actions, - feat_ns_all_actions, - v_max, - total_time_steps): + dim, + horizon, + bonus_factor, + lambda_mat_inv, + reward_hist, + gamma, + feat_hist, + n_actions, + feat_ns_all_actions, + v_max, + total_time_steps, 
+): """ Jit version of Least-Squares Value Iteration. @@ -61,10 +63,10 @@ def run_lsvi_jit( for aa in range(n_actions): # feat_ns_aa = feat_ns_all_actions[tt, aa, :] - inverse_counts = \ - feat_ns_aa.dot(lambda_mat_inv.T.dot(feat_ns_aa)) - bonus = bonus_factor * np.sqrt(inverse_counts) \ - + v_max * inverse_counts * (bonus_factor > 0.0) + inverse_counts = feat_ns_aa.dot(lambda_mat_inv.T.dot(feat_ns_aa)) + bonus = bonus_factor * np.sqrt( + inverse_counts + ) + v_max * inverse_counts * (bonus_factor > 0.0) # q_ns[aa] = feat_ns_aa.dot(q_w[hh + 1, :]) + bonus q_ns[aa] = min(q_ns[aa], v_max) @@ -119,17 +121,19 @@ class LSVIUCBAgent(AgentWithSimplePolicy): function approximation. In Conference on Learning Theory (pp. 2137-2143). """ - name = 'LSVI-UCB' - - def __init__(self, - env, - horizon, - feature_map_fn, - feature_map_kwargs=None, - gamma=0.99, - bonus_scale_factor=1.0, - reg_factor=0.1, - **kwargs): + name = "LSVI-UCB" + + def __init__( + self, + env, + horizon, + feature_map_fn, + feature_map_kwargs=None, + gamma=0.99, + bonus_scale_factor=1.0, + reg_factor=0.1, + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.n_episodes = None @@ -142,23 +146,29 @@ def __init__(self, # if self.bonus_scale_factor == 0.0: - self.name = 'LSVI-Random-Expl' + self.name = "LSVI-Random-Expl" # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf: - logger.warning("{}: Reward range is infinity. ".format(self.name) - + "Clipping it to 1.") + logger.warning( + "{}: Reward range is infinity. ".format(self.name) + "Clipping it to 1." + ) r_range = 1.0 if self.gamma == 1.0: self.v_max = r_range * horizon else: - self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon)) / (1.0 - self.gamma) + self.v_max = ( + r_range + * (1.0 - np.power(self.gamma, self.horizon)) + / (1.0 - self.gamma) + ) # - assert isinstance(self.env.action_space, Discrete), \ - "LSVI-UCB requires discrete actions." + assert isinstance( + self.env.action_space, Discrete + ), "LSVI-UCB requires discrete actions." # assert len(self.feature_map.shape) == 1 @@ -196,9 +206,9 @@ def reset(self): self._rewards = np.zeros(self.n_episodes) # self.feat_hist = np.zeros((self.n_episodes * self.horizon, self.dim)) - self.feat_ns_all_actions = np.zeros((self.n_episodes * self.horizon, - self.env.action_space.n, - self.dim)) + self.feat_ns_all_actions = np.zeros( + (self.n_episodes * self.horizon, self.env.action_space.n, self.dim) + ) # self.w_policy = None @@ -210,7 +220,8 @@ def fit(self, budget, **kwargs): if self.n_episodes is not None: logger.warning( "[LSVI-UCB]: Calling fit() more than once will reset the algorithm" - + " (to realocate memory according to the number of episodes).") + + " (to realocate memory according to the number of episodes)." 
+ ) self.n_episodes = budget self.reset() @@ -252,8 +263,7 @@ def run_episode(self): # self.lambda_mat += np.outer(feat, feat) # update inverse - self.lambda_mat_inv -= \ - (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) + self.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) # update history self.reward_hist[self.total_time_steps] = reward @@ -265,8 +275,9 @@ def run_episode(self): tt = self.total_time_steps self.feat_hist[tt, :] = self.feature_map.map(state, action) for aa in range(self.env.action_space.n): - self.feat_ns_all_actions[tt, aa, :] = \ - self.feature_map.map(next_state, aa) + self.feat_ns_all_actions[tt, aa, :] = self.feature_map.map( + next_state, aa + ) # increments self.total_time_steps += 1 @@ -293,8 +304,9 @@ def _compute_q(self, q_w, state, action, bonus_factor): """q_w is the vector representation of the Q function.""" feat = self.feature_map.map(state, action) inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt(inverse_counts) \ - + self.v_max * inverse_counts * (bonus_factor > 0.0) + bonus = bonus_factor * np.sqrt(inverse_counts) + self.v_max * inverse_counts * ( + bonus_factor > 0.0 + ) q = feat.dot(q_w) + bonus return q @@ -305,23 +317,26 @@ def _compute_q_vec(self, q_w, state, bonus_factor): # q_vec[aa] = self._compute_q(q_w, state, aa, bonus_factor) feat = self.feature_map.map(state, aa) inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt(inverse_counts) \ - + self.v_max * inverse_counts * (bonus_factor > 0.0) + bonus = bonus_factor * np.sqrt( + inverse_counts + ) + self.v_max * inverse_counts * (bonus_factor > 0.0) q_vec[aa] = feat.dot(q_w) + bonus # q_vec[aa] = min(q_vec[aa], self.v_max) # !!!!!!!!! return q_vec def _run_lsvi(self, bonus_factor): # run value iteration - q_w = run_lsvi_jit(self.dim, - self.horizon, - bonus_factor, - self.lambda_mat_inv, - self.reward_hist, - self.gamma, - self.feat_hist, - self.env.action_space.n, - self.feat_ns_all_actions, - self.v_max, - self.total_time_steps) + q_w = run_lsvi_jit( + self.dim, + self.horizon, + bonus_factor, + self.lambda_mat_inv, + self.reward_hist, + self.gamma, + self.feat_hist, + self.env.action_space.n, + self.feat_ns_all_actions, + self.v_max, + self.total_time_steps, + ) return q_w diff --git a/rlberry/agents/mbqvi/mbqvi.py b/rlberry/agents/mbqvi/mbqvi.py index e32d8322c..4b1e3c92a 100644 --- a/rlberry/agents/mbqvi/mbqvi.py +++ b/rlberry/agents/mbqvi/mbqvi.py @@ -41,21 +41,19 @@ class MBQVIAgent(AgentWithSimplePolicy): name = "MBQVI" - def __init__(self, env, - n_samples=10, - gamma=0.99, - horizon=None, - epsilon=1e-6, - **kwargs): + def __init__( + self, env, n_samples=10, gamma=0.99, horizon=None, epsilon=1e-6, **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) # initialize base class - assert self.env.is_generative(), \ - "MBQVI requires a generative model." - assert isinstance(self.env.observation_space, Discrete), \ - "MBQVI requires a finite state space." - assert isinstance(self.env.action_space, Discrete), \ - "MBQVI requires a finite action space." + assert self.env.is_generative(), "MBQVI requires a generative model." + assert isinstance( + self.env.observation_space, Discrete + ), "MBQVI requires a finite state space." + assert isinstance( + self.env.action_space, Discrete + ), "MBQVI requires a finite action space." 
# self.n_samples = n_samples @@ -102,15 +100,14 @@ def fit(self, budget=None, **kwargs): count += 1 if count % 10000 == 0: completed = 100 * count / total_samples - logger.debug("[{}] ... {}/{} ({:0.0f}%)".format( - self.name, - count, - total_samples, - completed)) + logger.debug( + "[{}] ... {}/{} ({:0.0f}%)".format( + self.name, count, total_samples, completed + ) + ) # build model and run VI - logger.debug( - f"{self.name} building model and running backward induction...") + logger.debug(f"{self.name} building model and running backward induction...") N_sa = np.maximum(self.N_sa, 1) self.R_hat = self.S_sa / N_sa @@ -122,15 +119,16 @@ def fit(self, budget=None, **kwargs): info["n_samples"] = self.n_samples info["total_samples"] = total_samples if self.horizon is None: - assert self.gamma < 1.0, \ - "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration(self.R_hat, self.P_hat, - self.gamma, self.epsilon) + assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" + self.Q, self.V, n_it = value_iteration( + self.R_hat, self.P_hat, self.gamma, self.epsilon + ) info["n_iterations"] = n_it info["precision"] = self.epsilon else: - self.Q, self.V = backward_induction(self.R_hat, self.P_hat, - self.horizon, self.gamma) + self.Q, self.V = backward_induction( + self.R_hat, self.P_hat, self.horizon, self.gamma + ) info["n_iterations"] = self.horizon info["precision"] = 0.0 return info diff --git a/rlberry/agents/optql/optql.py b/rlberry/agents/optql/optql.py index f0fc66750..d4e4d91df 100644 --- a/rlberry/agents/optql/optql.py +++ b/rlberry/agents/optql/optql.py @@ -36,16 +36,19 @@ class OptQLAgent(AgentWithSimplePolicy): Is Q-Learning Provably Efficient? https://arxiv.org/abs/1807.03765 """ + name = "OptQL" - def __init__(self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - add_bonus_after_update=False, - **kwargs): + def __init__( + self, + env, + gamma=1.0, + horizon=100, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + add_bonus_after_update=False, + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -62,8 +65,10 @@ def __init__(self, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." + ) r_range = 1.0 self.v_max = np.zeros(self.horizon) @@ -88,9 +93,9 @@ def reset(self, **kwargs): self.Q = np.ones((H, S, A)) self.Q_bar = np.ones((H, S, A)) for hh in range(self.horizon): - self.V[hh, :] *= (self.horizon - hh) - self.Q[hh, :, :] *= (self.horizon - hh) - self.Q_bar[hh, :, :] *= (self.horizon - hh) + self.V[hh, :] *= self.horizon - hh + self.Q[hh, :, :] *= self.horizon - hh + self.Q_bar[hh, :, :] *= self.horizon - hh if self.add_bonus_after_update: self.Q *= 0.0 @@ -99,16 +104,17 @@ def reset(self, **kwargs): self.episode = 0 # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter(self.env.observation_space, - self.env.action_space) + self.counter = DiscreteCounter( + self.env.observation_space, self.env.action_space + ) def policy(self, observation): - """ Recommended policy. 
""" + """Recommended policy.""" state = observation return self.Q_bar[0, state, :].argmax() def _get_action(self, state, hh=0): - """ Sampling policy. """ + """Sampling policy.""" return self.Q_bar[hh, state, :].argmax() def _compute_bonus(self, n, hh): @@ -118,7 +124,8 @@ def _compute_bonus(self, n, hh): return bonus else: raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _update(self, state, action, next_state, reward, hh): self.N_sa[hh, state, action] += 1 @@ -131,14 +138,20 @@ def _update(self, state, action, next_state, reward, hh): # bonus in the update if not self.add_bonus_after_update: target = reward + bonus + self.gamma * self.V[hh + 1, next_state] - self.Q[hh, state, action] = (1 - alpha) * self.Q[hh, state, action] + alpha * target + self.Q[hh, state, action] = (1 - alpha) * self.Q[ + hh, state, action + ] + alpha * target self.V[hh, state] = min(self.v_max[hh], self.Q[hh, state, :].max()) self.Q_bar[hh, state, action] = self.Q[hh, state, action] # bonus outside the update else: target = reward + self.gamma * self.V[hh + 1, next_state] # bonus not here - self.Q[hh, state, action] = (1 - alpha) * self.Q[hh, state, action] + alpha * target - self.Q_bar[hh, state, action] = self.Q[hh, state, action] + bonus # bonus here + self.Q[hh, state, action] = (1 - alpha) * self.Q[ + hh, state, action + ] + alpha * target + self.Q_bar[hh, state, action] = ( + self.Q[hh, state, action] + bonus + ) # bonus here self.V[hh, state] = min(self.v_max[hh], self.Q_bar[hh, state, :].max()) def _run_episode(self): @@ -164,7 +177,9 @@ def _run_episode(self): # writer if self.writer is not None: self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("n_visited_states", self.counter.get_n_visited_states(), self.episode) + self.writer.add_scalar( + "n_visited_states", self.counter.get_n_visited_states(), self.episode + ) # return sum of rewards collected in the episode return episode_rewards diff --git a/rlberry/agents/tests/test_dynprog.py b/rlberry/agents/tests/test_dynprog.py index f8f4bc6b4..feb3e9f4e 100644 --- a/rlberry/agents/tests/test_dynprog.py +++ b/rlberry/agents/tests/test_dynprog.py @@ -22,24 +22,26 @@ def get_random_mdp(S, A): return R, P -@pytest.mark.parametrize("gamma, S, A", - [ - (0.001, 2, 1), - (0.25, 2, 1), - (0.5, 2, 1), - (0.75, 2, 1), - (0.999, 2, 1), - (0.001, 4, 2), - (0.25, 4, 2), - (0.5, 4, 2), - (0.75, 4, 2), - (0.999, 4, 2), - (0.001, 20, 4), - (0.25, 20, 4), - (0.5, 20, 4), - (0.75, 20, 4), - (0.999, 20, 4) - ]) +@pytest.mark.parametrize( + "gamma, S, A", + [ + (0.001, 2, 1), + (0.25, 2, 1), + (0.5, 2, 1), + (0.75, 2, 1), + (0.999, 2, 1), + (0.001, 4, 2), + (0.25, 4, 2), + (0.5, 4, 2), + (0.75, 4, 2), + (0.999, 4, 2), + (0.001, 20, 4), + (0.25, 20, 4), + (0.5, 20, 4), + (0.75, 20, 4), + (0.999, 20, 4), + ], +) def test_bellman_operator_monotonicity_and_contraction(gamma, S, A): rng = seeding.Seeder(123).rng vmax = 1.0 / (1.0 - gamma) @@ -67,14 +69,10 @@ def test_bellman_operator_monotonicity_and_contraction(gamma, S, A): assert np.greater(TQ2, TQ3).sum() == 0 -@pytest.mark.parametrize("gamma, S, A", - [ - (0.01, 10, 4), - (0.25, 10, 4), - (0.5, 10, 4), - (0.75, 10, 4), - (0.99, 10, 4) - ]) +@pytest.mark.parametrize( + "gamma, S, A", + [(0.01, 10, 4), (0.25, 10, 4), (0.5, 10, 4), (0.75, 10, 4), (0.99, 10, 4)], +) def test_value_iteration(gamma, S, A): for epsilon in np.logspace(-1, -6, num=5): for sim in range(5): @@ -88,11 
+86,7 @@ def test_value_iteration(gamma, S, A): assert np.abs(TQ - Q).max() <= epsilon -@pytest.mark.parametrize("horizon, S, A", - [ - (10, 5, 4), - (20, 10, 4) - ]) +@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) def test_backward_induction(horizon, S, A): for sim in range(5): # generate random MDP @@ -116,11 +110,7 @@ def test_backward_induction(horizon, S, A): assert np.array_equal(V, V2) -@pytest.mark.parametrize("horizon, S, A", - [ - (10, 5, 4), - (20, 10, 4) - ]) +@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) def test_backward_induction_sd(horizon, S, A): """ Test stage-dependent MDPs @@ -146,11 +136,7 @@ def test_backward_induction_sd(horizon, S, A): assert np.array_equal(V, Vstat) -@pytest.mark.parametrize("horizon, gamma, S, A", - [ - (None, 0.5, 10, 4), - (10, 1.0, 10, 4) - ]) +@pytest.mark.parametrize("horizon, gamma, S, A", [(None, 0.5, 10, 4), (10, 1.0, 10, 4)]) def test_value_iteration_agent(horizon, gamma, S, A): for sim in range(5): # generate random MDP diff --git a/rlberry/agents/tests/test_kernel_based.py b/rlberry/agents/tests/test_kernel_based.py index 4edefaa42..65abac706 100644 --- a/rlberry/agents/tests/test_kernel_based.py +++ b/rlberry/agents/tests/test_kernel_based.py @@ -5,17 +5,20 @@ from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -@pytest.mark.parametrize("kernel_type", [ - "uniform", - "triangular", - "gaussian", - "epanechnikov", - "quartic", - "triweight", - "tricube", - "cosine", - "exp-2" -]) +@pytest.mark.parametrize( + "kernel_type", + [ + "uniform", + "triangular", + "gaussian", + "epanechnikov", + "quartic", + "triweight", + "tricube", + "cosine", + "exp-2", + ], +) def test_rs_kernel_ucbvi(kernel_type): for horizon in [None, 30]: env = get_benchmark_env(level=1) @@ -27,7 +30,8 @@ def test_rs_kernel_ucbvi(kernel_type): min_dist=0.2, bandwidth=0.05, beta=1.0, - kernel_type=kernel_type) + kernel_type=kernel_type, + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) @@ -39,21 +43,16 @@ def test_str_to_int(): def test_rs_ucbvi(): env = get_benchmark_env(level=1) - agent = RSUCBVIAgent(env, - gamma=0.99, - horizon=30, - bonus_scale_factor=0.1) + agent = RSUCBVIAgent(env, gamma=0.99, horizon=30, bonus_scale_factor=0.1) agent.fit(budget=5) agent.policy(env.observation_space.sample()) def test_rs_ucbvi_reward_free(): env = get_benchmark_env(level=1) - agent = RSUCBVIAgent(env, - gamma=0.99, - horizon=30, - bonus_scale_factor=0.1, - reward_free=True) + agent = RSUCBVIAgent( + env, gamma=0.99, horizon=30, bonus_scale_factor=0.1, reward_free=True + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) assert agent.R_hat.sum() == 0.0 diff --git a/rlberry/agents/tests/test_lsvi_ucb.py b/rlberry/agents/tests/test_lsvi_ucb.py index 9dc3abeaa..31480e1d3 100644 --- a/rlberry/agents/tests/test_lsvi_ucb.py +++ b/rlberry/agents/tests/test_lsvi_ucb.py @@ -37,10 +37,9 @@ def feature_map_fn(_env): return FeatMapClass(_env.observation_space.n, _env.action_space.n) reg_factor = 0.1 - agent = LSVIUCBAgent(env, - feature_map_fn=feature_map_fn, - horizon=10, - reg_factor=reg_factor) + agent = LSVIUCBAgent( + env, feature_map_fn=feature_map_fn, horizon=10, reg_factor=reg_factor + ) agent.reseed(123) agent.fit(budget=50) assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv) @@ -57,14 +56,17 @@ def feature_map_fn(_env): for state, action in zip(agent.state_hist, agent.action_hist): N_sa[state, action] += 1.0 - assert 
np.allclose(agent.lambda_mat_inv.diagonal(), - 1.0 / (N_sa.flatten() + reg_factor)) + assert np.allclose( + agent.lambda_mat_inv.diagonal(), 1.0 / (N_sa.flatten() + reg_factor) + ) for ss in range(S): for aa in range(A): feat = agent.feature_map.map(ss, aa) - assert np.allclose(feat @ (agent.lambda_mat_inv.T @ feat), - 1.0 / (N_sa[ss, aa] + reg_factor)) + assert np.allclose( + feat @ (agent.lambda_mat_inv.T @ feat), + 1.0 / (N_sa[ss, aa] + reg_factor), + ) def test_lsvi_without_bonus(): @@ -88,8 +90,7 @@ def lsvi_debug_gather_data(agent): # agent.lambda_mat += np.outer(feat, feat) # update inverse - agent.lambda_mat_inv -= \ - (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) + agent.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) # update history agent.reward_hist[count] = reward @@ -101,8 +102,9 @@ def lsvi_debug_gather_data(agent): tt = agent.total_time_steps agent.feat_hist[tt, :] = agent.feature_map.map(state, action) for aa in range(agent.env.action_space.n): - agent.feat_ns_all_actions[tt, aa, :] = \ - agent.feature_map.map(next_state, aa) + agent.feat_ns_all_actions[tt, aa, :] = agent.feature_map.map( + next_state, aa + ) # increments agent.total_time_steps += 1 @@ -114,11 +116,9 @@ def lsvi_debug_gather_data(agent): def feature_map_fn(_env): return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - agent = LSVIUCBAgent(env, - feature_map_fn=feature_map_fn, - horizon=20, - gamma=0.99, - reg_factor=1e-5) + agent = LSVIUCBAgent( + env, feature_map_fn=feature_map_fn, horizon=20, gamma=0.99, reg_factor=1e-5 + ) agent.reseed(123) agent.n_episodes = 100 agent.reset() @@ -150,12 +150,14 @@ def test_lsvi_random_exploration(): def feature_map_fn(_env): return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - agent = LSVIUCBAgent(env, - feature_map_fn=feature_map_fn, - horizon=20, - gamma=0.99, - reg_factor=1e-5, - bonus_scale_factor=0.0) + agent = LSVIUCBAgent( + env, + feature_map_fn=feature_map_fn, + horizon=20, + gamma=0.99, + reg_factor=1e-5, + bonus_scale_factor=0.0, + ) agent.reseed(123) agent.fit(budget=250) @@ -184,11 +186,14 @@ def test_lsvi_optimism(): def feature_map_fn(_env): return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - agent = LSVIUCBAgent(env, gamma=0.99, - feature_map_fn=feature_map_fn, - horizon=3, - bonus_scale_factor=3, - reg_factor=0.000001) + agent = LSVIUCBAgent( + env, + gamma=0.99, + feature_map_fn=feature_map_fn, + horizon=3, + bonus_scale_factor=3, + reg_factor=0.000001, + ) agent.fit(budget=250) # near optimal Q @@ -202,9 +207,8 @@ def feature_map_fn(_env): Q_optimistic = np.zeros((S, A)) for ss in range(S): Q_optimistic[ss, :] = agent._compute_q_vec( - agent.w_vec[0, :], - ss, - agent.bonus_scale_factor) + agent.w_vec[0, :], ss, agent.bonus_scale_factor + ) print(Q) print(Q_optimistic) diff --git a/rlberry/agents/tests/test_optql.py b/rlberry/agents/tests/test_optql.py index 381c30b4a..35adf21d6 100644 --- a/rlberry/agents/tests/test_optql.py +++ b/rlberry/agents/tests/test_optql.py @@ -4,9 +4,6 @@ def test_optql(): env = GridWorld(walls=(), nrows=5, ncols=5) - agent = OptQLAgent(env, - horizon=11, - gamma=0.99, - bonus_scale_factor=0.1) + agent = OptQLAgent(env, horizon=11, gamma=0.99, bonus_scale_factor=0.1) agent.fit(budget=50) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_ucbvi.py b/rlberry/agents/tests/test_ucbvi.py index 75cd4f0f6..641fe0c02 100644 --- a/rlberry/agents/tests/test_ucbvi.py +++ b/rlberry/agents/tests/test_ucbvi.py @@ -3,24 
+3,28 @@ from rlberry.envs.finite import GridWorld -@pytest.mark.parametrize("gamma, stage_dependent, real_time_dp", - [ - (1.0, True, True), - (1.0, True, False), - (1.0, False, True), - (1.0, False, False), - (0.9, True, True), - (0.9, True, False), - (0.9, False, True), - (0.9, False, False), - ]) +@pytest.mark.parametrize( + "gamma, stage_dependent, real_time_dp", + [ + (1.0, True, True), + (1.0, True, False), + (1.0, False, True), + (1.0, False, False), + (0.9, True, True), + (0.9, True, False), + (0.9, False, True), + (0.9, False, False), + ], +) def test_ucbvi(gamma, stage_dependent, real_time_dp): env = GridWorld(walls=(), nrows=5, ncols=5) - agent = UCBVIAgent(env, - horizon=11, - stage_dependent=stage_dependent, - gamma=gamma, - real_time_dp=real_time_dp, - bonus_scale_factor=0.1) + agent = UCBVIAgent( + env, + horizon=11, + stage_dependent=stage_dependent, + gamma=gamma, + real_time_dp=real_time_dp, + bonus_scale_factor=0.1, + ) agent.fit(budget=50) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/a2c/a2c.py b/rlberry/agents/torch/a2c/a2c.py index 6fe9b475e..75df7875a 100644 --- a/rlberry/agents/torch/a2c/a2c.py +++ b/rlberry/agents/torch/a2c/a2c.py @@ -62,29 +62,33 @@ class A2CAgent(AgentWithSimplePolicy): name = "A2C" - def __init__(self, env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.01, - optimizer_type='ADAM', - k_epochs=5, - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - use_bonus=False, - uncertainty_estimator_kwargs=None, - device="cuda:best", - **kwargs): + def __init__( + self, + env, + batch_size=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + learning_rate=0.01, + optimizer_type="ADAM", + k_epochs=5, + policy_net_fn=None, + value_net_fn=None, + policy_net_kwargs=None, + value_net_kwargs=None, + use_bonus=False, + uncertainty_estimator_kwargs=None, + device="cuda:best", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.use_bonus = use_bonus if self.use_bonus: self.env = UncertaintyEstimatorWrapper( - self.env, **uncertainty_estimator_kwargs) + self.env, **uncertainty_estimator_kwargs + ) self.batch_size = batch_size self.horizon = horizon @@ -104,8 +108,7 @@ def __init__(self, env, self.policy_net_fn = policy_net_fn or default_policy_net_fn self.value_net_fn = value_net_fn or default_value_net_fn - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -117,24 +120,24 @@ def __init__(self, env, self.reset() def reset(self, **kwargs): - self.cat_policy = self.policy_net_fn( - self.env, - **self.policy_net_kwargs).to(self.device) + self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.policy_optimizer = optimizer_factory( - self.cat_policy.parameters(), - **self.optimizer_kwargs) + self.cat_policy.parameters(), **self.optimizer_kwargs + ) - self.value_net = self.value_net_fn( - self.env, - **self.value_net_kwargs).to(self.device) + self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( + self.device + ) self.value_optimizer = optimizer_factory( - self.value_net.parameters(), - **self.optimizer_kwargs) + self.value_net.parameters(), **self.optimizer_kwargs + ) - self.cat_policy_old = self.policy_net_fn( - self.env, - **self.policy_net_kwargs).to(self.device) + self.cat_policy_old 
= self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) self.MseLoss = nn.MSELoss() @@ -183,8 +186,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save in batch self.memory.rewards.append(reward + bonus) # add bonus here @@ -214,8 +217,9 @@ def _update(self): # monte carlo estimate of rewards rewards = [] discounted_reward = 0 - for reward, is_terminal in zip(reversed(self.memory.rewards), - reversed(self.memory.is_terminals)): + for reward, is_terminal in zip( + reversed(self.memory.rewards), reversed(self.memory.is_terminals) + ): if is_terminal: discounted_reward = 0 discounted_reward = reward + (self.gamma * discounted_reward) @@ -239,13 +243,14 @@ def _update(self): # normalize the advantages advantages = rewards - state_values.detach() - advantages = (advantages - advantages.mean()) \ - / (advantages.std() + 1e-8) + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # find pg loss - pg_loss = - logprobs * advantages - loss = pg_loss \ - + 0.5 * self.MseLoss(state_values, rewards) \ + pg_loss = -logprobs * advantages + loss = ( + pg_loss + + 0.5 * self.MseLoss(state_values, rewards) - self.entr_coef * dist_entropy + ) # take gradient step self.policy_optimizer.zero_grad() @@ -264,21 +269,18 @@ def _update(self): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) - k_epochs = trial.suggest_categorical('k_epochs', - [1, 5, 10, 20]) + k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, - 'k_epochs': k_epochs, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, + "k_epochs": k_epochs, } diff --git a/rlberry/agents/torch/avec/avec_ppo.py b/rlberry/agents/torch/avec/avec_ppo.py index a3176f34a..40e082280 100644 --- a/rlberry/agents/torch/avec/avec_ppo.py +++ b/rlberry/agents/torch/avec/avec_ppo.py @@ -93,32 +93,36 @@ class AVECPPOAgent(AgentWithSimplePolicy): name = "AVECPPO" - def __init__(self, env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - vf_coef=0., - avec_coef=1., - learning_rate=0.0003, - optimizer_type='ADAM', - eps_clip=0.2, - k_epochs=10, - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - use_bonus=False, - uncertainty_estimator_kwargs=None, - device="cuda:best", - **kwargs): + def __init__( + self, + env, + batch_size=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + vf_coef=0.0, + avec_coef=1.0, + learning_rate=0.0003, + optimizer_type="ADAM", + eps_clip=0.2, + k_epochs=10, + policy_net_fn=None, + value_net_fn=None, + policy_net_kwargs=None, + value_net_kwargs=None, + 
use_bonus=False, + uncertainty_estimator_kwargs=None, + device="cuda:best", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.use_bonus = use_bonus if self.use_bonus: self.env = UncertaintyEstimatorWrapper( - self.env, **uncertainty_estimator_kwargs) + self.env, **uncertainty_estimator_kwargs + ) self.learning_rate = learning_rate self.gamma = gamma @@ -141,8 +145,7 @@ def __init__(self, env, self.policy_net_fn = policy_net_fn or default_policy_net_fn self.value_net_fn = value_net_fn or default_value_net_fn - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -154,26 +157,23 @@ def __init__(self, env, self.reset() def reset(self, **kwargs): - self.cat_policy = self.policy_net_fn( - self.env, - **self.policy_net_kwargs - ).to(self.device) + self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.policy_optimizer = optimizer_factory( - self.cat_policy.parameters(), - **self.optimizer_kwargs) + self.cat_policy.parameters(), **self.optimizer_kwargs + ) - self.value_net = self.value_net_fn( - self.env, - **self.value_net_kwargs - ).to(self.device) + self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( + self.device + ) self.value_optimizer = optimizer_factory( - self.value_net.parameters(), - **self.optimizer_kwargs) + self.value_net.parameters(), **self.optimizer_kwargs + ) - self.cat_policy_old = self.policy_net_fn( - self.env, - **self.policy_net_kwargs - ).to(self.device) + self.cat_policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) self.MseLoss = nn.MSELoss() @@ -223,8 +223,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save in batch self.memory.rewards.append(reward + bonus) # add bonus here @@ -255,8 +255,9 @@ def _update(self): # monte carlo estimate of rewards rewards = [] discounted_reward = 0 - for reward, is_terminal in zip(reversed(self.memory.rewards), - reversed(self.memory.is_terminals)): + for reward, is_terminal in zip( + reversed(self.memory.rewards), reversed(self.memory.is_terminals) + ): if is_terminal: discounted_reward = 0 discounted_reward = reward + (self.gamma * discounted_reward) @@ -284,16 +285,18 @@ def _update(self): # normalize the advantages advantages = rewards - state_values.detach() - advantages = (advantages - advantages.mean()) / \ - (advantages.std() + 1e-8) + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # find surrogate loss surr1 = ratios * advantages - surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 - + self.eps_clip) * advantages - loss = -torch.min(surr1, surr2) \ - + self.avec_coef * self._avec_loss(state_values, rewards) \ - + self.vf_coef * self.MseLoss(state_values, rewards) \ + surr2 = ( + torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages + ) + loss = ( + -torch.min(surr1, surr2) + + self.avec_coef * self._avec_loss(state_values, rewards) + + self.vf_coef * self.MseLoss(state_values, rewards) - self.entr_coef * dist_entropy + ) # take gradient step self.policy_optimizer.zero_grad() @@ -328,25 
+331,21 @@ def _avec_loss(self, y_pred, y_true): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) - eps_clip = trial.suggest_categorical('eps_clip', - [0.1, 0.2, 0.3]) + eps_clip = trial.suggest_categorical("eps_clip", [0.1, 0.2, 0.3]) - k_epochs = trial.suggest_categorical('k_epochs', - [1, 5, 10, 20]) + k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, - 'eps_clip': eps_clip, - 'k_epochs': k_epochs, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, + "eps_clip": eps_clip, + "k_epochs": k_epochs, } diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index df8ada635..978fc9fd9 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -4,13 +4,24 @@ import numpy as np from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.utils.memories import Transition, PrioritizedReplayMemory, TransitionReplayMemory +from rlberry.agents.utils.memories import ( + Transition, + PrioritizedReplayMemory, + TransitionReplayMemory, +) from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter +from rlberry.exploration_tools.online_discretization_counter import ( + OnlineDiscretizationCounter, +) from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper from rlberry.agents.torch.dqn.exploration import exploration_factory -from rlberry.agents.torch.utils.training import loss_function_factory, model_factory, size_model_config, \ - trainable_parameters, optimizer_factory +from rlberry.agents.torch.utils.training import ( + loss_function_factory, + model_factory, + size_model_config, + trainable_parameters, + optimizer_factory, +) from rlberry.seeding import Seeder from rlberry.utils.factory import load from rlberry.utils.torch import choose_device @@ -75,47 +86,47 @@ class DQNAgent(AgentWithSimplePolicy): prioritized_replay: bool Use prioritized replay. 
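+        If False, a plain (non-prioritized) TransitionReplayMemory is used instead.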
""" - name = 'DQN' - - def __init__(self, - env, - horizon=256, - gamma=0.99, - loss_function="l2", - batch_size=100, - device="cuda:best", - target_update=1, - learning_rate=0.001, - epsilon_init=1.0, - epsilon_final=0.1, - epsilon_decay=5000, - optimizer_type='ADAM', - qvalue_net_fn=None, - qvalue_net_kwargs=None, - double=True, - memory_capacity=10000, - use_bonus=False, - uncertainty_estimator_kwargs=None, - prioritized_replay=True, - update_frequency=1, - **kwargs): + + name = "DQN" + + def __init__( + self, + env, + horizon=256, + gamma=0.99, + loss_function="l2", + batch_size=100, + device="cuda:best", + target_update=1, + learning_rate=0.001, + epsilon_init=1.0, + epsilon_final=0.1, + epsilon_decay=5000, + optimizer_type="ADAM", + qvalue_net_fn=None, + qvalue_net_kwargs=None, + double=True, + memory_capacity=10000, + use_bonus=False, + uncertainty_estimator_kwargs=None, + prioritized_replay=True, + update_frequency=1, + **kwargs, + ): # Wrap arguments and initialize base class - memory_kwargs = { - 'capacity': memory_capacity, - 'n_steps': 1, - 'gamma': gamma - } + memory_kwargs = {"capacity": memory_capacity, "n_steps": 1, "gamma": gamma} exploration_kwargs = { - 'method': "EpsilonGreedy", - 'temperature': epsilon_init, - 'final_temperature': epsilon_final, - 'tau': epsilon_decay, + "method": "EpsilonGreedy", + "temperature": epsilon_init, + "final_temperature": epsilon_final, + "tau": epsilon_decay, } AgentWithSimplePolicy.__init__(self, env, **kwargs) self.use_bonus = use_bonus if self.use_bonus: self.env = UncertaintyEstimatorWrapper( - self.env, **uncertainty_estimator_kwargs) + self.env, **uncertainty_estimator_kwargs + ) self.horizon = horizon self.exploration_kwargs = exploration_kwargs or {} self.memory_kwargs = memory_kwargs or {} @@ -123,40 +134,49 @@ def __init__(self, self.target_update = target_update self.double = double - assert isinstance(env.action_space, spaces.Discrete), \ - "Only compatible with Discrete action spaces." + assert isinstance( + env.action_space, spaces.Discrete + ), "Only compatible with Discrete action spaces." 
self.prioritized_replay = prioritized_replay - memory_class = PrioritizedReplayMemory if prioritized_replay else TransitionReplayMemory + memory_class = ( + PrioritizedReplayMemory if prioritized_replay else TransitionReplayMemory + ) self.memory = memory_class(**self.memory_kwargs) - self.exploration_policy = \ - exploration_factory(self.env.action_space, - **self.exploration_kwargs) + self.exploration_policy = exploration_factory( + self.env.action_space, **self.exploration_kwargs + ) self.training = True self.steps = 0 self.episode = 0 - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} self.device = choose_device(device) self.loss_function = loss_function self.gamma = gamma qvalue_net_kwargs = qvalue_net_kwargs or {} - qvalue_net_fn = load(qvalue_net_fn) if isinstance(qvalue_net_fn, str) else \ - qvalue_net_fn or default_qvalue_net_fn + qvalue_net_fn = ( + load(qvalue_net_fn) + if isinstance(qvalue_net_fn, str) + else qvalue_net_fn or default_qvalue_net_fn + ) self.value_net = qvalue_net_fn(self.env, **qvalue_net_kwargs) self.target_net = qvalue_net_fn(self.env, **qvalue_net_kwargs) self.target_net.load_state_dict(self.value_net.state_dict()) self.target_net.eval() - logger.info("Number of trainable parameters: {}" - .format(trainable_parameters(self.value_net))) + logger.info( + "Number of trainable parameters: {}".format( + trainable_parameters(self.value_net) + ) + ) self.value_net.to(self.device) self.target_net.to(self.device) self.loss_function = loss_function_factory(self.loss_function) - self.optimizer = optimizer_factory(self.value_net.parameters(), - **self.optimizer_kwargs) + self.optimizer = optimizer_factory( + self.value_net.parameters(), **self.optimizer_kwargs + ) self.update_frequency = update_frequency self.steps = 0 @@ -167,21 +187,36 @@ def fit(self, budget: int, **kwargs): state = self.env.reset() values = self.get_state_action_values(state) for i, value in enumerate(values): - self.writer.add_scalar(f"agent/action_value_{i}", value, self.episode) + self.writer.add_scalar( + f"agent/action_value_{i}", value, self.episode + ) total_reward, total_bonus, total_success, length = self._run_episode() if self.episode % 20 == 0: - logger.info(f"Episode {self.episode + 1}/{budget}, total reward {total_reward}") + logger.info( + f"Episode {self.episode + 1}/{budget}, total reward {total_reward}" + ) if self.writer: self.writer.add_scalar("episode_rewards", total_reward, self.episode) - self.writer.add_scalar("episode/total_reward", total_reward, self.episode) + self.writer.add_scalar( + "episode/total_reward", total_reward, self.episode + ) self.writer.add_scalar("episode/total_bonus", total_bonus, self.episode) - self.writer.add_scalar("episode/total_success", total_success, self.episode) + self.writer.add_scalar( + "episode/total_success", total_success, self.episode + ) self.writer.add_scalar("episode/length", length, self.episode) - if self.use_bonus and \ - (isinstance(self.env.uncertainty_estimator, OnlineDiscretizationCounter) or - isinstance(self.env.uncertainty_estimator, DiscreteCounter)): - n_visited_states = (self.env.uncertainty_estimator.N_sa.sum(axis=1) > 0).sum() - self.writer.add_scalar("debug/n_visited_states", n_visited_states, self.episode) + if self.use_bonus and ( + isinstance( + self.env.uncertainty_estimator, OnlineDiscretizationCounter + ) + or isinstance(self.env.uncertainty_estimator, DiscreteCounter) + ): + n_visited_states = ( + 
self.env.uncertainty_estimator.N_sa.sum(axis=1) > 0 + ).sum() + self.writer.add_scalar( + "debug/n_visited_states", n_visited_states, self.episode + ) def _run_episode(self): total_reward = total_bonus = total_success = time = 0 @@ -194,8 +229,8 @@ def _run_episode(self): # bonus used only for logging, here bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] self.record(state, action, reward, next_state, done, info) state = next_state @@ -295,52 +330,60 @@ def compute_bellman_residual(self, batch): The residuals over the batch, and the computed target. """ # Concatenate the batch elements - state = torch.cat(tuple(torch.tensor([batch.state], - dtype=torch.float))).to(self.device) - action = torch.tensor(batch.action, - dtype=torch.long).to(self.device) - reward = torch.tensor(batch.reward, - dtype=torch.float).to(self.device) + state = torch.cat(tuple(torch.tensor([batch.state], dtype=torch.float))).to( + self.device + ) + action = torch.tensor(batch.action, dtype=torch.long).to(self.device) + reward = torch.tensor(batch.reward, dtype=torch.float).to(self.device) if self.use_bonus: - bonus = self.env.bonus_batch(state, action).to(self.device) * self.exploration_policy.epsilon + bonus = ( + self.env.bonus_batch(state, action).to(self.device) + * self.exploration_policy.epsilon + ) if self.writer: - self.writer.add_scalar("debug/minibatch_mean_bonus", bonus.mean().item(), self.episode) - self.writer.add_scalar("debug/minibatch_mean_reward", reward.mean().item(), self.episode) + self.writer.add_scalar( + "debug/minibatch_mean_bonus", bonus.mean().item(), self.episode + ) + self.writer.add_scalar( + "debug/minibatch_mean_reward", reward.mean().item(), self.episode + ) reward += bonus - next_state = torch.cat(tuple(torch.tensor([batch.next_state], - dtype=torch.float))).to(self.device) - terminal = torch.tensor(batch.terminal, - dtype=torch.bool).to(self.device) + next_state = torch.cat( + tuple(torch.tensor([batch.next_state], dtype=torch.float)) + ).to(self.device) + terminal = torch.tensor(batch.terminal, dtype=torch.bool).to(self.device) batch = Transition(state, action, reward, next_state, terminal, batch.info) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = self.value_net(batch.state) - state_action_values = \ - state_action_values.gather(1, batch.action.unsqueeze(1)).squeeze(1) + state_action_values = state_action_values.gather( + 1, batch.action.unsqueeze(1) + ).squeeze(1) with torch.no_grad(): # Compute V(s_{t+1}) for all next states. 
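+            # (kept at zero for terminal states; overwritten below for non-terminal ones)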
- next_state_values = \ - torch.zeros(batch.reward.shape).to(self.device) + next_state_values = torch.zeros(batch.reward.shape).to(self.device) if self.double: # Double Q-learning: pick best actions from policy network _, best_actions = self.value_net(batch.next_state).max(1) # Double Q-learning: estimate action values # from target network - best_values = self.target_net( - batch.next_state - ).gather(1, best_actions.unsqueeze(1)) \ + best_values = ( + self.target_net(batch.next_state) + .gather(1, best_actions.unsqueeze(1)) .squeeze(1) + ) else: best_values, _ = self.target_net(batch.next_state).max(1) - next_state_values[~batch.terminal] \ - = best_values[~batch.terminal] + next_state_values[~batch.terminal] = best_values[~batch.terminal] # Compute the expected Q values target_state_action_value = batch.reward + self.gamma * next_state_values # Compute residuals - residuals = self.loss_function(state_action_values, target_state_action_value, reduction='none') + residuals = self.loss_function( + state_action_values, target_state_action_value, reduction="none" + ) return residuals, target_state_action_value def get_batch_state_values(self, states): @@ -359,9 +402,9 @@ def get_batch_state_values(self, states): * [a1*; ...; aN*] the array of corresponding optimal action indexes for each state """ - values, actions = self.value_net(torch.tensor(states, - dtype=torch.float) - .to(self.device)).max(1) + values, actions = self.value_net( + torch.tensor(states, dtype=torch.float).to(self.device) + ).max(1) return values.data.cpu().numpy(), actions.data.cpu().numpy() def get_batch_state_action_values(self, states): @@ -378,9 +421,11 @@ def get_batch_state_action_values(self, states): values:[[Q11, ..., Q1n]; ...] the array of all action values for each state """ - return self.value_net(torch.tensor(states, - dtype=torch.float) - .to(self.device)).data.cpu().numpy() + return ( + self.value_net(torch.tensor(states, dtype=torch.float).to(self.device)) + .data.cpu() + .numpy() + ) def get_state_value(self, state): """ @@ -442,22 +487,24 @@ def set_time(self, time): def eval_mode(self): self.training = False - self.exploration_kwargs['method'] = "Greedy" - self.exploration_policy = \ - exploration_factory(self.env.action_space, - **self.exploration_kwargs) + self.exploration_kwargs["method"] = "Greedy" + self.exploration_policy = exploration_factory( + self.env.action_space, **self.exploration_kwargs + ) def save(self, filename, **kwargs): - state = {'state_dict': self.value_net.state_dict(), - 'optimizer': self.optimizer.state_dict()} + state = { + "state_dict": self.value_net.state_dict(), + "optimizer": self.optimizer.state_dict(), + } torch.save(state, filename) return filename def load(self, filename, **kwargs): checkpoint = torch.load(filename, map_location=self.device) - self.value_net.load_state_dict(checkpoint['state_dict']) - self.target_net.load_state_dict(checkpoint['state_dict']) - self.optimizer.load_state_dict(checkpoint['optimizer']) + self.value_net.load_state_dict(checkpoint["state_dict"]) + self.target_net.load_state_dict(checkpoint["state_dict"]) + self.optimizer.load_state_dict(checkpoint["optimizer"]) return filename def initialize_model(self): @@ -470,39 +517,39 @@ def set_writer(self, writer): except AttributeError: pass if self.writer: - obs_shape = self.env.observation_space.shape \ - if isinstance(self.env.observation_space, spaces.Box) else \ - self.env.observation_space.spaces[0].shape - model_input = torch.zeros((1, *obs_shape), dtype=torch.float, - device=self.device) + 
obs_shape = ( + self.env.observation_space.shape + if isinstance(self.env.observation_space, spaces.Box) + else self.env.observation_space.spaces[0].shape + ) + model_input = torch.zeros( + (1, *obs_shape), dtype=torch.float, device=self.device + ) self.writer.add_graph(self.value_net, input_to_model=(model_input,)) - self.writer.add_scalar("agent/trainable_parameters", - trainable_parameters(self.value_net), 0) + self.writer.add_scalar( + "agent/trainable_parameters", trainable_parameters(self.value_net), 0 + ) # # For hyperparameter optimization # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [32, 64, 128, 256, 512]) - gamma = trial.suggest_categorical('gamma', - [0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256, 512]) + gamma = trial.suggest_categorical("gamma", [0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - target_update = trial.suggest_categorical('target_update', - [1, 250, 500, 1000]) + target_update = trial.suggest_categorical("target_update", [1, 250, 500, 1000]) - epsilon_final = trial.suggest_loguniform('epsilon_final', 1e-2, 1e-1) + epsilon_final = trial.suggest_loguniform("epsilon_final", 1e-2, 1e-1) - epsilon_decay = trial.suggest_categorical('target_update', - [1000, 5000, 10000]) + epsilon_decay = trial.suggest_categorical("target_update", [1000, 5000, 10000]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'target_update': target_update, - 'epsilon_final': epsilon_final, - 'epsilon_decay': epsilon_decay, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "target_update": target_update, + "epsilon_final": epsilon_final, + "epsilon_decay": epsilon_decay, } diff --git a/rlberry/agents/torch/dqn/exploration.py b/rlberry/agents/torch/dqn/exploration.py index 4a3b24150..70d5ac70e 100644 --- a/rlberry/agents/torch/dqn/exploration.py +++ b/rlberry/agents/torch/dqn/exploration.py @@ -25,8 +25,8 @@ def sample(self): """ distribution = self.get_distribution() return self.np_random.choice( - list(distribution.keys()), 1, - p=np.array(list(distribution.values())))[0] + list(distribution.keys()), 1, p=np.array(list(distribution.values())) + )[0] def seed(self, seeder=None): """ @@ -58,12 +58,9 @@ class EpsilonGreedy(DiscreteDistribution): probability 1-epsilon. 
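+    Epsilon is annealed exponentially from `temperature` to `final_temperature`
+    with time constant `tau`.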
""" - def __init__(self, - action_space, - temperature=1.0, - final_temperature=0.1, - tau=5000, - **kwargs): + def __init__( + self, action_space, temperature=1.0, final_temperature=0.1, tau=5000, **kwargs + ): super().__init__(**kwargs) self.action_space = action_space self.temperature = temperature @@ -81,8 +78,10 @@ def __init__(self, self.seed() def get_distribution(self): - distribution = {action: self.epsilon / self.action_space.n - for action in range(self.action_space.n)} + distribution = { + action: self.epsilon / self.action_space.n + for action in range(self.action_space.n) + } distribution[self.optimal_action] += 1 - self.epsilon return distribution @@ -98,13 +97,11 @@ def update(self, values): Whether to update epsilon schedule """ self.optimal_action = np.argmax(values) - self.epsilon = self.final_temperature \ - + (self.temperature - self.final_temperature) * \ - np.exp(- self.time / self.tau) + self.epsilon = self.final_temperature + ( + self.temperature - self.final_temperature + ) * np.exp(-self.time / self.tau) if self.writer: - self.writer.add_scalar('exploration/epsilon', - self.epsilon, - self.time) + self.writer.add_scalar("exploration/epsilon", self.epsilon, self.time) def step_time(self): self.time += 1 @@ -133,8 +130,10 @@ def __init__(self, action_space, **kwargs): def get_distribution(self): optimal_action = np.argmax(self.values) - return {action: 1 if action == optimal_action - else 0 for action in range(self.action_space.n)} + return { + action: 1 if action == optimal_action else 0 + for action in range(self.action_space.n) + } def update(self, values): self.values = values @@ -155,9 +154,9 @@ def exploration_factory(action_space, method="EpsilonGreedy", **kwargs): ------- A new exploration policy. """ - if method == 'Greedy': + if method == "Greedy": return Greedy(action_space, **kwargs) - elif method == 'EpsilonGreedy': + elif method == "EpsilonGreedy": return EpsilonGreedy(action_space, **kwargs) else: raise ValueError("Unknown exploration method") diff --git a/rlberry/agents/torch/ppo/ppo.py b/rlberry/agents/torch/ppo/ppo.py index 5b32c5b97..2292059d9 100644 --- a/rlberry/agents/torch/ppo/ppo.py +++ b/rlberry/agents/torch/ppo/ppo.py @@ -71,34 +71,39 @@ class PPOAgent(AgentWithSimplePolicy): name = "PPO" - def __init__(self, env, - batch_size=64, - update_frequency=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - vf_coef=0.5, - learning_rate=0.01, - optimizer_type='ADAM', - eps_clip=0.2, - k_epochs=5, - use_gae=True, - gae_lambda=0.95, - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - device="cuda:best", - use_bonus=False, - uncertainty_estimator_kwargs=None, - **kwargs): # TODO: sort arguments + def __init__( + self, + env, + batch_size=64, + update_frequency=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + vf_coef=0.5, + learning_rate=0.01, + optimizer_type="ADAM", + eps_clip=0.2, + k_epochs=5, + use_gae=True, + gae_lambda=0.95, + policy_net_fn=None, + value_net_fn=None, + policy_net_kwargs=None, + value_net_kwargs=None, + device="cuda:best", + use_bonus=False, + uncertainty_estimator_kwargs=None, + **kwargs + ): # TODO: sort arguments AgentWithSimplePolicy.__init__(self, env, **kwargs) # bonus self.use_bonus = use_bonus if self.use_bonus: - self.env = UncertaintyEstimatorWrapper(self.env, **uncertainty_estimator_kwargs) + self.env = UncertaintyEstimatorWrapper( + self.env, **uncertainty_estimator_kwargs + ) # algorithm parameters self.gamma = gamma @@ -137,8 +142,7 @@ def __init__(self, env, 
self.device = choose_device(device) - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -156,13 +160,23 @@ def from_config(cls, **kwargs): return cls(**kwargs) def reset(self, **kwargs): - self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to(self.device) - self.policy_optimizer = optimizer_factory(self.cat_policy.parameters(), **self.optimizer_kwargs) - - self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to(self.device) - self.value_optimizer = optimizer_factory(self.value_net.parameters(), **self.optimizer_kwargs) - - self.cat_policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to(self.device) + self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) + self.policy_optimizer = optimizer_factory( + self.cat_policy.parameters(), **self.optimizer_kwargs + ) + + self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( + self.device + ) + self.value_optimizer = optimizer_factory( + self.value_net.parameters(), **self.optimizer_kwargs + ) + + self.cat_policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) self.MseLoss = nn.MSELoss() # TODO: turn into argument @@ -215,8 +229,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save transition states.append(state) @@ -238,7 +252,9 @@ def _run_episode(self): state_values = torch.squeeze(state_values).tolist() # TODO: add the option to normalize before computing returns/advantages? 
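+        # returns and advantages are computed backwards over the horizon (GAE when use_gae=True)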
- returns, advantages = self._compute_returns_avantages(rewards, is_terminals, state_values) + returns, advantages = self._compute_returns_avantages( + rewards, is_terminals, state_values + ) # save in batch self.memory.states.extend(states) @@ -258,7 +274,9 @@ def _run_episode(self): self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) # update - if self.episode % self.update_frequency == 0: # TODO: maybe change to update in function of n_steps instead + if ( + self.episode % self.update_frequency == 0 + ): # TODO: maybe change to update in function of n_steps instead self._update() self.memory.clear_memory() del self.returns[:] # TODO: add to memory (cf reset) @@ -292,7 +310,9 @@ def _update(self): for k in range(n_batches): # sample batch - batch_idx = np.arange(k * self.batch_size, min((k + 1) * self.batch_size, n_samples)) + batch_idx = np.arange( + k * self.batch_size, min((k + 1) * self.batch_size, n_samples) + ) old_states = shuffled_states[batch_idx] old_actions = shuffled_actions[batch_idx] old_logprobs = shuffled_logprobs[batch_idx] @@ -313,14 +333,21 @@ def _update(self): # rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5) # normalize the advantages - old_advantages = old_advantages.view(-1, ) + old_advantages = old_advantages.view( + -1, + ) if self.normalize_advantages: - old_advantages = (old_advantages - old_advantages.mean()) / (old_advantages.std() + 1e-10) + old_advantages = (old_advantages - old_advantages.mean()) / ( + old_advantages.std() + 1e-10 + ) # compute surrogate loss surr1 = ratios * old_advantages - surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * old_advantages + surr2 = ( + torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) + * old_advantages + ) surr_loss = torch.min(surr1, surr2) # compute value function loss @@ -330,7 +357,7 @@ def _update(self): loss_entropy = self.entr_coef * dist_entropy # compute total loss - loss = - surr_loss + loss_vf - loss_entropy + loss = -surr_loss + loss_vf - loss_entropy # take gradient step self.policy_optimizer.zero_grad() @@ -343,8 +370,16 @@ def _update(self): # log if self.writer: - self.writer.add_scalar("fit/surrogate_loss", surr_loss.mean().cpu().detach().numpy(), self.episode) - self.writer.add_scalar("fit/entropy_loss", dist_entropy.mean().cpu().detach().numpy(), self.episode) + self.writer.add_scalar( + "fit/surrogate_loss", + surr_loss.mean().cpu().detach().numpy(), + self.episode, + ) + self.writer.add_scalar( + "fit/entropy_loss", + dist_entropy.mean().cpu().detach().numpy(), + self.episode, + ) # copy new weights into old policy self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) @@ -357,22 +392,39 @@ def _compute_returns_avantages(self, rewards, is_terminals, state_values): if not self.use_gae: for t in reversed(range(self.horizon)): if t == self.horizon - 1: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * state_values[-1] + returns[t] = ( + rewards[t] + + self.gamma * (1 - is_terminals[t]) * state_values[-1] + ) else: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] + returns[t] = ( + rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] + ) advantages[t] = returns[t] - state_values[t] else: last_adv = 0 for t in reversed(range(self.horizon)): if t == self.horizon - 1: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * state_values[-1] + returns[t] = ( + rewards[t] + + self.gamma * (1 - is_terminals[t]) * state_values[-1] + ) td_error = returns[t] - state_values[t] 
else: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] - td_error = rewards[t] + self.gamma * (1 - is_terminals[t]) * state_values[t + 1] - state_values[t] - - last_adv = self.gae_lambda * self.gamma * (1 - is_terminals[t]) * last_adv + td_error + returns[t] = ( + rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] + ) + td_error = ( + rewards[t] + + self.gamma * (1 - is_terminals[t]) * state_values[t + 1] + - state_values[t] + ) + + last_adv = ( + self.gae_lambda * self.gamma * (1 - is_terminals[t]) * last_adv + + td_error + ) advantages[t] = last_adv return returns, advantages @@ -382,25 +434,21 @@ def _compute_returns_avantages(self, rewards, is_terminals, state_values): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) - eps_clip = trial.suggest_categorical('eps_clip', - [0.1, 0.2, 0.3]) + eps_clip = trial.suggest_categorical("eps_clip", [0.1, 0.2, 0.3]) - k_epochs = trial.suggest_categorical('k_epochs', - [1, 5, 10, 20]) + k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, - 'eps_clip': eps_clip, - 'k_epochs': k_epochs, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, + "eps_clip": eps_clip, + "k_epochs": k_epochs, } diff --git a/rlberry/agents/torch/reinforce/reinforce.py b/rlberry/agents/torch/reinforce/reinforce.py index 80255877c..460d6720b 100644 --- a/rlberry/agents/torch/reinforce/reinforce.py +++ b/rlberry/agents/torch/reinforce/reinforce.py @@ -54,19 +54,22 @@ class REINFORCEAgent(AgentWithSimplePolicy): name = "REINFORCE" - def __init__(self, env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.0001, - normalize=True, - optimizer_type='ADAM', - policy_net_fn=None, - policy_net_kwargs=None, - use_bonus_if_available=False, - device="cuda:best", - **kwargs): + def __init__( + self, + env, + batch_size=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + learning_rate=0.0001, + normalize=True, + optimizer_type="ADAM", + policy_net_fn=None, + policy_net_kwargs=None, + use_bonus_if_available=False, + device="cuda:best", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.batch_size = batch_size @@ -86,8 +89,7 @@ def __init__(self, env, # self.policy_net_fn = policy_net_fn or default_policy_net_fn - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -99,14 +101,13 @@ def __init__(self, env, self.reset() def reset(self, **kwargs): - self.policy_net = self.policy_net_fn( - self.env, - **self.policy_net_kwargs - ).to(self.device) + self.policy_net = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.policy_optimizer = optimizer_factory( - 
self.policy_net.parameters(), - **self.optimizer_kwargs) + self.policy_net.parameters(), **self.optimizer_kwargs + ) self.memory = Memory() @@ -140,8 +141,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus_if_available: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save in batch self.memory.states.append(state) @@ -177,8 +178,9 @@ def _update(self): # monte carlo estimate of rewards rewards = [] discounted_reward = 0 - for reward, is_terminal in zip(reversed(self.memory.rewards), - reversed(self.memory.is_terminals)): + for reward, is_terminal in zip( + reversed(self.memory.rewards), reversed(self.memory.is_terminals) + ): if is_terminal: discounted_reward = 0 discounted_reward = reward + (self.gamma * discounted_reward) @@ -211,17 +213,15 @@ def _update(self): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, } diff --git a/rlberry/agents/torch/tests/test_actor_critic_algos.py b/rlberry/agents/torch/tests/test_actor_critic_algos.py index 989cf63dd..c76b83728 100644 --- a/rlberry/agents/torch/tests/test_actor_critic_algos.py +++ b/rlberry/agents/torch/tests/test_actor_critic_algos.py @@ -11,21 +11,20 @@ def test_a2c_agent(): horizon = 30 def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, - action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return counter - agent = A2CAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - k_epochs=4, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent = A2CAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + k_epochs=4, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=n_episodes) agent.policy(env.observation_space.sample()) @@ -35,12 +34,14 @@ def test_a2c_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = A2CAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - k_epochs=4, - use_bonus=False) + agent = A2CAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + k_epochs=4, + use_bonus=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) @@ -56,22 +57,21 @@ def test_ppo_agent(): horizon = 30 def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, - action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return 
counter - agent = PPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1 - )) + agent = PPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1 + ), + ) agent.fit(budget=n_episodes) agent.policy(env.observation_space.sample()) @@ -81,13 +81,15 @@ def test_ppo_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = PPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - use_bonus=False) + agent = PPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + use_bonus=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) @@ -104,23 +106,22 @@ def test_avec_ppo_agent(): # def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, - action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return counter - agent = AVECPPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - batch_size=1, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0) - ) + agent = AVECPPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + batch_size=1, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) @@ -130,14 +131,16 @@ def test_avec_ppo_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = AVECPPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - batch_size=1, - use_bonus=False) + agent = AVECPPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + batch_size=1, + use_bonus=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py index 293934951..07760e438 100644 --- a/rlberry/agents/torch/tests/test_dqn.py +++ b/rlberry/agents/torch/tests/test_dqn.py @@ -1,6 +1,8 @@ from rlberry.envs import gym_make from rlberry.agents.torch.dqn import DQNAgent -from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter +from rlberry.exploration_tools.online_discretization_counter import ( + OnlineDiscretizationCounter, +) from rlberry.exploration_tools.torch.rnd import RandomNetworkDistillation from rlberry.seeding import Seeder import numpy as np @@ -11,27 +13,28 @@ def test_dqn_agent(): def uncertainty_estimator_fn(observation_space, action_space): counter = OnlineDiscretizationCounter( - observation_space, - action_space, - min_dist=0.25) + observation_space, action_space, min_dist=0.25 + ) return counter - agent = DQNAgent(env, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent = DQNAgent( + env, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + 
uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) # test seeding of exploration policy - agent2 = DQNAgent(env, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent2 = DQNAgent( + env, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.reseed(Seeder(123)) agent2.reseed(Seeder(123)) @@ -47,11 +50,12 @@ def uncertainty_estimator_fn(observation_space, action_space): counter = RandomNetworkDistillation(observation_space, action_space) return counter - agent = DQNAgent(env, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent = DQNAgent( + env, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/tests/test_reinforce.py b/rlberry/agents/torch/tests/test_reinforce.py index 56640ad6a..5df650288 100644 --- a/rlberry/agents/torch/tests/test_reinforce.py +++ b/rlberry/agents/torch/tests/test_reinforce.py @@ -1,8 +1,7 @@ from rlberry.agents.torch import REINFORCEAgent from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.wrappers.uncertainty_estimator_wrapper import \ - UncertaintyEstimatorWrapper +from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper def test_reinforce_agent(): @@ -12,19 +11,20 @@ def test_reinforce_agent(): # def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return counter - env = UncertaintyEstimatorWrapper(_env, - uncertainty_estimator_fn, - bonus_scale_factor=1.0) + env = UncertaintyEstimatorWrapper( + _env, uncertainty_estimator_fn, bonus_scale_factor=1.0 + ) # - agent = REINFORCEAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=True) + agent = REINFORCEAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + use_bonus_if_available=True, + ) agent.fit(budget=n_episodes) agent.policy(env.observation_space.sample()) @@ -34,11 +34,13 @@ def test_reinforce_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = REINFORCEAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=False) + agent = REINFORCEAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + use_bonus_if_available=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) assert agent.episode == 5 diff --git a/rlberry/agents/torch/tests/test_torch_models.py b/rlberry/agents/torch/tests/test_torch_models.py index 5a0330d12..8ba3f0639 100644 --- a/rlberry/agents/torch/tests/test_torch_models.py +++ b/rlberry/agents/torch/tests/test_torch_models.py @@ -10,42 +10,34 @@ def test_mlp(): - model = MultiLayerPerceptron(in_size=5, - layer_sizes=[10, 10, 10], - out_size=10, - reshape=False) + model = MultiLayerPerceptron( + in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False + ) x = torch.rand(1, 5) y = 
model.forward(x) assert y.shape[1] == 10 def test_mlp_policy(): - model = MultiLayerPerceptron(in_size=5, - layer_sizes=[10, 10, 10], - out_size=10, - reshape=False, - is_policy=True) + model = MultiLayerPerceptron( + in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False, is_policy=True + ) x = torch.rand(1, 5) scores = model.action_scores(x) assert scores.shape[1] == 10 def test_cnn(): - model = ConvolutionalNetwork(in_channels=10, - in_height=20, - in_width=30, - out_size=15) + model = ConvolutionalNetwork(in_channels=10, in_height=20, in_width=30, out_size=15) x = torch.rand(1, 10, 20, 30) y = model.forward(x) assert y.shape[1] == 15 def test_cnn_policy(): - model = ConvolutionalNetwork(in_channels=10, - in_height=20, - in_width=30, - out_size=15, - is_policy=True) + model = ConvolutionalNetwork( + in_channels=10, in_height=20, in_width=30, out_size=15, is_policy=True + ) x = torch.rand(1, 10, 20, 30) scores = model.action_scores(x) assert scores.shape[1] == 15 @@ -58,4 +50,3 @@ def test_ego_attention(): def test_self_attention(): _ = SelfAttention() - diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 4fe78526e..33233de04 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,12 +6,31 @@ # loss_function_factory assert isinstance(loss_function_factory("l2").__name__, type(F.mse_loss.__name__)) assert isinstance(loss_function_factory("l1").__name__, type(F.l1_loss.__name__)) -assert isinstance(loss_function_factory("smooth_l1").__name__, type(F.smooth_l1_loss.__name__)) -assert isinstance(loss_function_factory("bce").__name__, type(F.binary_cross_entropy.__name__)) +assert isinstance( + loss_function_factory("smooth_l1").__name__, type(F.smooth_l1_loss.__name__) +) +assert isinstance( + loss_function_factory("bce").__name__, type(F.binary_cross_entropy.__name__) +) # optimizer_factory env = get_benchmark_env(level=1) -assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] == 0.001 -assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["betas"] == (0.9, 0.999) -assert optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults["lr"] == 0.01 -assert optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults["alpha"] == 0.99 +assert ( + optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] + == 0.001 +) +assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults[ + "betas" +] == (0.9, 0.999) +assert ( + optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ + "lr" + ] + == 0.01 +) +assert ( + optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ + "alpha" + ] + == 0.99 +) diff --git a/rlberry/agents/torch/utils/attention_models.py b/rlberry/agents/torch/utils/attention_models.py index 8145441e5..47d984487 100644 --- a/rlberry/agents/torch/utils/attention_models.py +++ b/rlberry/agents/torch/utils/attention_models.py @@ -10,138 +10,122 @@ class EgoAttention(BaseModule): - def __init__(self, - feature_size=64, - heads=4, - dropout_factor=0): + def __init__(self, feature_size=64, heads=4, dropout_factor=0): super().__init__() self.feature_size = feature_size self.heads = heads self.dropout_factor = dropout_factor self.features_per_head = int(self.feature_size / self.heads) - self.value_all = nn.Linear(self.feature_size, - self.feature_size, 
- bias=False) - self.key_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.query_ego = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.attention_combine = nn.Linear(self.feature_size, - self.feature_size, - bias=False) + self.value_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.key_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.query_ego = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.attention_combine = nn.Linear( + self.feature_size, self.feature_size, bias=False + ) @classmethod def default_config(cls): - return { - } + return {} def forward(self, ego, others, mask=None): batch_size = others.shape[0] n_entities = others.shape[1] + 1 - input_all = torch.cat((ego.view(batch_size, 1, - self.feature_size), others), dim=1) + input_all = torch.cat( + (ego.view(batch_size, 1, self.feature_size), others), dim=1 + ) # Dimensions: Batch, entity, head, feature_per_head - key_all = self.key_all(input_all).view(batch_size, - n_entities, - self.heads, - self.features_per_head) - value_all = self.value_all(input_all).view(batch_size, - n_entities, - self.heads, - self.features_per_head) - query_ego = self.query_ego(ego).view(batch_size, 1, - self.heads, - self.features_per_head) + key_all = self.key_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + value_all = self.value_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + query_ego = self.query_ego(ego).view( + batch_size, 1, self.heads, self.features_per_head + ) # Dimensions: Batch, head, entity, feature_per_head key_all = key_all.permute(0, 2, 1, 3) value_all = value_all.permute(0, 2, 1, 3) query_ego = query_ego.permute(0, 2, 1, 3) if mask is not None: - mask = mask.view((batch_size, 1, 1, - n_entities)).repeat((1, self.heads, 1, 1)) - value, attention_matrix = attention(query_ego, - key_all, - value_all, - mask, - nn.Dropout(self.dropout_factor)) - result = (self.attention_combine( - value.reshape((batch_size, - self.feature_size))) + ego.squeeze(1)) / 2 + mask = mask.view((batch_size, 1, 1, n_entities)).repeat( + (1, self.heads, 1, 1) + ) + value, attention_matrix = attention( + query_ego, key_all, value_all, mask, nn.Dropout(self.dropout_factor) + ) + result = ( + self.attention_combine(value.reshape((batch_size, self.feature_size))) + + ego.squeeze(1) + ) / 2 return result, attention_matrix class SelfAttention(BaseModule): - def __init__(self, - feature_size=64, - heads=4, - dropout_factor=0, - **kwargs): + def __init__(self, feature_size=64, heads=4, dropout_factor=0, **kwargs): super().__init__(**kwargs) self.feature_size = feature_size self.heads = heads self.dropout_factor = dropout_factor self.features_per_head = int(self.feature_size / self.heads) - self.value_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.key_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.query_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.attention_combine = nn.Linear(self.feature_size, - self.feature_size, - bias=False) + self.value_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.key_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.query_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.attention_combine = nn.Linear( + self.feature_size, self.feature_size, bias=False + ) def forward(self, ego, others, mask=None): 
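+        # concatenate the ego entity with the others, then apply multi-head self-attention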
batch_size = others.shape[0] n_entities = others.shape[1] + 1 - input_all = torch.cat((ego.view(batch_size, 1, - self.feature_size), - others), dim=1) + input_all = torch.cat( + (ego.view(batch_size, 1, self.feature_size), others), dim=1 + ) # Dimensions: Batch, entity, head, feature_per_head - key_all = self.key_all(input_all).view(batch_size, n_entities, - self.heads, - self.features_per_head) - value_all = self.value_all(input_all).view(batch_size, n_entities, - self.heads, - self.features_per_head) - query_all = self.query_all(input_all).view(batch_size, - n_entities, - self.heads, - self.features_per_head) + key_all = self.key_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + value_all = self.value_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + query_all = self.query_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) # Dimensions: Batch, head, entity, feature_per_head key_all = key_all.permute(0, 2, 1, 3) value_all = value_all.permute(0, 2, 1, 3) query_all = query_all.permute(0, 2, 1, 3) if mask is not None: - mask = mask.view((batch_size, 1, 1, - n_entities)).repeat((1, self.heads, 1, 1)) - value, attention_matrix = attention(query_all, key_all, value_all, - mask, - nn.Dropout(self.dropout_factor)) - result = (self.attention_combine( - value.reshape((batch_size, n_entities, self.feature_size))) - + input_all) / 2 + mask = mask.view((batch_size, 1, 1, n_entities)).repeat( + (1, self.heads, 1, 1) + ) + value, attention_matrix = attention( + query_all, key_all, value_all, mask, nn.Dropout(self.dropout_factor) + ) + result = ( + self.attention_combine( + value.reshape((batch_size, n_entities, self.feature_size)) + ) + + input_all + ) / 2 return result, attention_matrix class EgoAttentionNetwork(BaseModule): - def __init__(self, - in_size=None, - out_size=None, - presence_feature_idx=0, - embedding_layer_kwargs=None, - attention_layer_kwargs=None, - output_layer_kwargs=None, - **kwargs): + def __init__( + self, + in_size=None, + out_size=None, + presence_feature_idx=0, + embedding_layer_kwargs=None, + attention_layer_kwargs=None, + output_layer_kwargs=None, + **kwargs + ): super().__init__(**kwargs) self.out_size = out_size self.presence_feature_idx = presence_feature_idx @@ -171,7 +155,7 @@ def split_input(self, x, mask=None): others = x[:, 1:, :] if mask is None: aux = self.presence_feature_idx - mask = x[:, :, aux:aux + 1] < 0.5 + mask = x[:, :, aux : aux + 1] < 0.5 return ego, others, mask def forward_attention(self, x): diff --git a/rlberry/agents/torch/utils/models.py b/rlberry/agents/torch/utils/models.py index f93dc4de6..729969c81 100644 --- a/rlberry/agents/torch/utils/models.py +++ b/rlberry/agents/torch/utils/models.py @@ -23,35 +23,50 @@ def default_policy_net_fn(env): elif isinstance(env.observation_space, spaces.Tuple): obs_shape = env.observation_space.spaces[0].shape else: - raise ValueError("Incompatible observation space: {}".format(env.observation_space)) + raise ValueError( + "Incompatible observation space: {}".format(env.observation_space) + ) if len(obs_shape) == 3: if obs_shape[0] < obs_shape[1] and obs_shape[0] < obs_shape[1]: # Assume CHW observation space - model_config = {"type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2])} + model_config = { + "type": "ConvolutionalNetwork", + "is_policy": True, + "in_channels": int(obs_shape[0]), + "in_height": 
int(obs_shape[1]), + "in_width": int(obs_shape[2]), + } elif obs_shape[2] < obs_shape[0] and obs_shape[2] < obs_shape[1]: # Assume WHC observation space - model_config = {"type": "ConvolutionalNetwork", - "is_policy": True, - "transpose_obs": True, - "in_channels": int(obs_shape[2]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[0])} + model_config = { + "type": "ConvolutionalNetwork", + "is_policy": True, + "transpose_obs": True, + "in_channels": int(obs_shape[2]), + "in_height": int(obs_shape[1]), + "in_width": int(obs_shape[0]), + } elif len(obs_shape) == 2: - model_config = {"type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1])} + model_config = { + "type": "ConvolutionalNetwork", + "is_policy": True, + "in_channels": int(1), + "in_height": int(obs_shape[0]), + "in_width": int(obs_shape[1]), + } elif len(obs_shape) == 1: - model_config = {"type": "MultiLayerPerceptron", "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64], "reshape": False, "is_policy": True} + model_config = { + "type": "MultiLayerPerceptron", + "in_size": int(obs_shape[0]), + "layer_sizes": [64, 64], + "reshape": False, + "is_policy": True, + } else: - raise ValueError("Incompatible observation shape: {}".format(env.observation_space.shape)) + raise ValueError( + "Incompatible observation shape: {}".format(env.observation_space.shape) + ) if isinstance(env.action_space, spaces.Discrete): model_config["out_size"] = env.action_space.n @@ -70,21 +85,34 @@ def default_value_net_fn(env): elif isinstance(env.observation_space, spaces.Tuple): obs_shape = env.observation_space.spaces[0].shape else: - raise ValueError("Incompatible observation space: {}".format(env.observation_space)) + raise ValueError( + "Incompatible observation space: {}".format(env.observation_space) + ) # Assume CHW observation space if len(obs_shape) == 3: - model_config = {"type": "ConvolutionalNetwork", "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2])} + model_config = { + "type": "ConvolutionalNetwork", + "in_channels": int(obs_shape[0]), + "in_height": int(obs_shape[1]), + "in_width": int(obs_shape[2]), + } elif len(obs_shape) == 2: - model_config = {"type": "ConvolutionalNetwork", "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1])} + model_config = { + "type": "ConvolutionalNetwork", + "in_channels": int(1), + "in_height": int(obs_shape[0]), + "in_width": int(obs_shape[1]), + } elif len(obs_shape) == 1: - model_config = {"type": "MultiLayerPerceptron", "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64]} + model_config = { + "type": "MultiLayerPerceptron", + "in_size": int(obs_shape[0]), + "layer_sizes": [64, 64], + } else: - raise ValueError("Incompatible observation shape: {}".format(env.observation_space.shape)) + raise ValueError( + "Incompatible observation shape: {}".format(env.observation_space.shape) + ) model_config["out_size"] = 1 @@ -97,7 +125,7 @@ def __init__(self, obs_size, hidden_size, n_actions): self.net = nn.Sequential( nn.Linear(obs_size, hidden_size), nn.ReLU(), - nn.Linear(hidden_size, n_actions) + nn.Linear(hidden_size, n_actions), ) def forward(self, x): @@ -117,15 +145,15 @@ def __init__(self, activation_type="RELU", reset_type="XAVIER"): self.reset_type = reset_type def _init_weights(self, m): - if hasattr(m, 'weight'): + if hasattr(m, "weight"): if self.reset_type == "XAVIER": torch.nn.init.xavier_uniform_(m.weight.data) 
elif self.reset_type == "ZEROS": - torch.nn.init.constant_(m.weight.data, 0.) + torch.nn.init.constant_(m.weight.data, 0.0) else: raise ValueError("Unknown reset type") - if hasattr(m, 'bias') and m.bias is not None: - torch.nn.init.constant_(m.bias.data, 0.) + if hasattr(m, "bias") and m.bias is not None: + torch.nn.init.constant_(m.bias.data, 0.0) def reset(self): self.apply(self._init_weights) @@ -134,7 +162,9 @@ def reset(self): class Table(torch.nn.Module): def __init__(self, state_size, action_size): super().__init__() - self.policy = nn.Embedding.from_pretrained(torch.zeros(state_size, action_size), freeze=False) + self.policy = nn.Embedding.from_pretrained( + torch.zeros(state_size, action_size), freeze=False + ) self.softmax = nn.Softmax(dim=-1) def forward(self, x): @@ -146,14 +176,16 @@ def action_scores(self, x): class MultiLayerPerceptron(BaseModule): - def __init__(self, - in_size=None, - layer_sizes=None, - reshape=True, - out_size=None, - activation="RELU", - is_policy=False, - **kwargs): + def __init__( + self, + in_size=None, + layer_sizes=None, + reshape=True, + out_size=None, + activation="RELU", + is_policy=False, + **kwargs + ): super().__init__(**kwargs) self.reshape = reshape self.layer_sizes = layer_sizes or [64, 64] @@ -162,8 +194,7 @@ def __init__(self, self.is_policy = is_policy self.softmax = nn.Softmax(dim=-1) sizes = [in_size] + self.layer_sizes - layers_list = [nn.Linear(sizes[i], sizes[i + 1]) - for i in range(len(sizes) - 1)] + layers_list = [nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)] self.layers = nn.ModuleList(layers_list) if out_size: self.predict = nn.Linear(sizes[-1], out_size) @@ -193,12 +224,14 @@ def action_scores(self, x): class DuelingNetwork(BaseModule): - def __init__(self, - in_size=None, - base_module_kwargs=None, - value_kwargs=None, - advantage_kwargs=None, - out_size=None): + def __init__( + self, + in_size=None, + base_module_kwargs=None, + value_kwargs=None, + advantage_kwargs=None, + out_size=None, + ): super().__init__() self.out_size = out_size base_module_kwargs = base_module_kwargs or {} @@ -217,21 +250,24 @@ def forward(self, x): x = self.base_module(x) value = self.value(x).expand(-1, self.out_size) advantage = self.advantage(x) - return value + advantage \ - - advantage.mean(1).unsqueeze(1).expand(-1, self.out_size) + return ( + value + advantage - advantage.mean(1).unsqueeze(1).expand(-1, self.out_size) + ) class ConvolutionalNetwork(nn.Module): - def __init__(self, - activation="RELU", - in_channels=None, - in_height=None, - in_width=None, - head_mlp_kwargs=None, - out_size=None, - is_policy=False, - transpose_obs=False, - **kwargs): + def __init__( + self, + activation="RELU", + in_channels=None, + in_height=None, + in_width=None, + head_mlp_kwargs=None, + out_size=None, + is_policy=False, + transpose_obs=False, + **kwargs + ): super().__init__() self.activation = activation_factory(activation) self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=2, stride=2) diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index b175c963a..1a5080cfb 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -29,8 +29,13 @@ def optimizer_factory(params, optimizer_type="ADAM", **kwargs): def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: from rlberry.agents.torch.utils.attention_models import EgoAttentionNetwork - from rlberry.agents.torch.utils.models import MultiLayerPerceptron, DuelingNetwork, ConvolutionalNetwork, \ 
- Table + from rlberry.agents.torch.utils.models import ( + MultiLayerPerceptron, + DuelingNetwork, + ConvolutionalNetwork, + Table, + ) + if type == "MultiLayerPerceptron": return MultiLayerPerceptron(**kwargs) elif type == "DuelingNetwork": @@ -50,8 +55,7 @@ def model_factory_from_env(env, **kwargs): return model_factory(**kwargs) -def size_model_config(env, - **model_config): +def size_model_config(env, **model_config): """ Update the configuration of a model depending on the environment observation/action spaces. diff --git a/rlberry/agents/ucbvi/ucbvi.py b/rlberry/agents/ucbvi/ucbvi.py index 99476b5f4..424fd7e0c 100644 --- a/rlberry/agents/ucbvi/ucbvi.py +++ b/rlberry/agents/ucbvi/ucbvi.py @@ -3,7 +3,10 @@ import gym.spaces as spaces from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.ucbvi.utils import update_value_and_get_action, update_value_and_get_action_sd +from rlberry.agents.ucbvi.utils import ( + update_value_and_get_action, + update_value_and_get_action_sd, +) from rlberry.exploration_tools.discrete_counter import DiscreteCounter from rlberry.agents.dynprog.utils import backward_induction_sd from rlberry.agents.dynprog.utils import backward_induction_in_place @@ -56,18 +59,21 @@ class UCBVIAgent(AgentWithSimplePolicy): Advances in Neural Information Processing Systems. 2019. https://papers.nips.cc/paper/2019/file/25caef3a545a1fff2ff4055484f0e758-Paper.pdf """ + name = "UCBVI" - def __init__(self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - stage_dependent=False, - real_time_dp=False, - **kwargs): + def __init__( + self, + env, + gamma=1.0, + horizon=100, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + reward_free=False, + stage_dependent=False, + real_time_dp=False, + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -86,15 +92,16 @@ def __init__(self, # other checks assert gamma >= 0 and gamma <= 1.0 if self.horizon is None: - assert gamma < 1.0, \ - "If no horizon is given, gamma must be smaller than 1." + assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." + ) r_range = 1.0 self.v_max = np.zeros(self.horizon) @@ -146,12 +153,13 @@ def reset(self, **kwargs): self.episode = 0 # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter(self.env.observation_space, - self.env.action_space) + self.counter = DiscreteCounter( + self.env.observation_space, self.env.action_space + ) # update name if self.real_time_dp: - self.name = 'UCBVI-RTDP' + self.name = "UCBVI-RTDP" def policy(self, observation): state = observation @@ -159,7 +167,7 @@ def policy(self, observation): return self.Q_policy[0, state, :].argmax() def _get_action(self, state, hh=0): - """ Sampling policy. 
""" + """Sampling policy.""" if not self.real_time_dp: assert self.Q is not None return self.Q[hh, state, :].argmax() @@ -176,7 +184,8 @@ def _get_action(self, state, hh=0): self.P_hat, self.B_sa, self.gamma, - self.v_max) + self.v_max, + ) def _compute_bonus(self, n, hh): # reward-free @@ -191,7 +200,8 @@ def _compute_bonus(self, n, hh): return bonus else: raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _update(self, state, action, next_state, reward, hh): if self.stage_dependent: @@ -201,7 +211,9 @@ def _update(self, state, action, next_state, reward, hh): prev_r = self.R_hat[hh, state, action] prev_p = self.P_hat[hh, state, action, :] - self.R_hat[hh, state, action] = (1.0 - 1.0 / nn) * prev_r + reward * 1.0 / nn + self.R_hat[hh, state, action] = ( + 1.0 - 1.0 / nn + ) * prev_r + reward * 1.0 / nn self.P_hat[hh, state, action, :] = (1.0 - 1.0 / nn) * prev_p self.P_hat[hh, state, action, next_state] += 1.0 / nn @@ -251,7 +263,8 @@ def _run_episode(self): self.R_hat + self.B_sa, self.P_hat, self.gamma, - self.v_max[0]) + self.v_max[0], + ) else: backward_induction_in_place( self.Q, @@ -260,7 +273,8 @@ def _run_episode(self): self.P_hat, self.horizon, self.gamma, - self.v_max[0]) + self.v_max[0], + ) # update info self.episode += 1 @@ -268,7 +282,9 @@ def _run_episode(self): # writer if self.writer is not None: self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("n_visited_states", self.counter.get_n_visited_states(), self.episode) + self.writer.add_scalar( + "n_visited_states", self.counter.get_n_visited_states(), self.episode + ) # return sum of rewards collected in the episode return episode_rewards @@ -289,7 +305,8 @@ def fit(self, budget: int, **kwargs): self.R_hat, self.P_hat, self.gamma, - self.v_max[0]) + self.v_max[0], + ) else: backward_induction_in_place( self.Q_policy, @@ -298,4 +315,5 @@ def fit(self, budget: int, **kwargs): self.P_hat, self.horizon, self.gamma, - self.v_max[0]) + self.v_max[0], + ) diff --git a/rlberry/agents/ucbvi/utils.py b/rlberry/agents/ucbvi/utils.py index dd530e809..255affeed 100644 --- a/rlberry/agents/ucbvi/utils.py +++ b/rlberry/agents/ucbvi/utils.py @@ -2,14 +2,7 @@ @numba_jit -def update_value_and_get_action(state, - hh, - V, - R_hat, - P_hat, - B_sa, - gamma, - v_max): +def update_value_and_get_action(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): """ state : int hh : int @@ -50,14 +43,7 @@ def update_value_and_get_action(state, @numba_jit -def update_value_and_get_action_sd(state, - hh, - V, - R_hat, - P_hat, - B_sa, - gamma, - v_max): +def update_value_and_get_action_sd(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): """ state : int hh : int diff --git a/rlberry/agents/utils/memories.py b/rlberry/agents/utils/memories.py index 2787190ec..6ec890402 100644 --- a/rlberry/agents/utils/memories.py +++ b/rlberry/agents/utils/memories.py @@ -2,8 +2,9 @@ import operator from collections import namedtuple -Transition = namedtuple('Transition', - ('state', 'action', 'reward', 'next_state', 'terminal', 'info')) +Transition = namedtuple( + "Transition", ("state", "action", "reward", "next_state", "terminal", "info") +) class ReplayMemory(object): @@ -11,9 +12,7 @@ class ReplayMemory(object): Container that stores and samples transitions. 
""" - def __init__(self, - capacity=10000, - **kwargs): + def __init__(self, capacity=10000, **kwargs): self.capacity = int(capacity) self.memory = [] self.position = 0 @@ -59,22 +58,20 @@ def _encode_sample(self, idxes): rewards.append(reward) next_states.append(np.array(next_state, copy=False)) dones.append(done) - return Transition(np.array(states), - np.array(actions), - np.array(rewards), - np.array(next_states), - np.array(dones), - {}) + return Transition( + np.array(states), + np.array(actions), + np.array(rewards), + np.array(next_states), + np.array(dones), + {}, + ) class PrioritizedReplayMemory(TransitionReplayMemory): """Code from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py""" - def __init__(self, - capacity=10000, - alpha=0.5, - beta=0.5, - **kwargs): + def __init__(self, capacity=10000, alpha=0.5, beta=0.5, **kwargs): """Create Prioritized Replay buffer. Parameters ---------- @@ -251,7 +248,9 @@ def __init__(self, capacity, operation, neutral_element): neutral element for the operation above. eg. float('-inf') for max and 0 for sum. """ - assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." + assert ( + capacity > 0 and capacity & (capacity - 1) == 0 + ), "capacity must be positive and a power of 2." self._capacity = capacity self._value = [neutral_element for _ in range(2 * capacity)] self._operation = operation @@ -268,7 +267,7 @@ def _reduce_helper(self, start, end, node, node_start, node_end): else: return self._operation( self._reduce_helper(start, mid, 2 * node, node_start, mid), - self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) + self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end), ) def reduce(self, start=0, end=None): @@ -303,8 +302,7 @@ def __setitem__(self, idx, val): idx //= 2 while idx >= 1: self._value[idx] = self._operation( - self._value[2 * idx], - self._value[2 * idx + 1] + self._value[2 * idx], self._value[2 * idx + 1] ) idx //= 2 @@ -316,9 +314,7 @@ def __getitem__(self, idx): class SumSegmentTree(SegmentTree): def __init__(self, capacity): super(SumSegmentTree, self).__init__( - capacity=capacity, - operation=operator.add, - neutral_element=0.0 + capacity=capacity, operation=operator.add, neutral_element=0.0 ) def sum(self, start=0, end=None): @@ -357,9 +353,7 @@ def find_prefixsum_idx(self, prefixsum): class MinSegmentTree(SegmentTree): def __init__(self, capacity): super(MinSegmentTree, self).__init__( - capacity=capacity, - operation=min, - neutral_element=float('inf') + capacity=capacity, operation=min, neutral_element=float("inf") ) def min(self, start=0, end=None): diff --git a/rlberry/colab_utils/display_setup.py b/rlberry/colab_utils/display_setup.py index 583a8fbbd..302e589eb 100644 --- a/rlberry/colab_utils/display_setup.py +++ b/rlberry/colab_utils/display_setup.py @@ -5,26 +5,31 @@ import base64 from pyvirtualdisplay import Display from IPython import display as ipythondisplay + # from IPython.display import clear_output from pathlib import Path -def show_video(filename=None, directory='./videos'): +def show_video(filename=None, directory="./videos"): """ Either show all videos in a directory (if filename is None) or show video corresponding to filename. 
""" html = [] if filename is not None: - files = Path('./').glob(filename) + files = Path("./").glob(filename) else: files = Path(directory).glob("*.mp4") for mp4 in files: video_b64 = base64.b64encode(mp4.read_bytes()) - html.append(''''''.format(mp4, video_b64.decode('ascii'))) + """.format( + mp4, video_b64.decode("ascii") + ) + ) ipythondisplay.display(ipythondisplay.HTML(data="
".join(html))) diff --git a/rlberry/envs/basewrapper.py b/rlberry/envs/basewrapper.py index f0bb3c6e4..597f36208 100644 --- a/rlberry/envs/basewrapper.py +++ b/rlberry/envs/basewrapper.py @@ -63,7 +63,7 @@ def __getattr__(self, attr): The first condition is to avoid infinite recursion when deep copying. See https://stackoverflow.com/a/47300262 """ - if attr[:2] == '__': + if attr[:2] == "__": raise AttributeError(attr) if attr in self.__dict__: return getattr(self, attr) @@ -94,7 +94,7 @@ def step(self, action): def sample(self, state, action): return self.env.sample(state, action) - def render(self, mode='human', **kwargs): + def render(self, mode="human", **kwargs): return self.env.render(mode=mode, **kwargs) def close(self): @@ -116,8 +116,9 @@ def is_online(self): def is_generative(self): try: - self.env.sample(self.env.observation_space.sample(), - self.env.action_space.sample()) + self.env.sample( + self.env.observation_space.sample(), self.env.action_space.sample() + ) return True except Exception: return False @@ -126,4 +127,4 @@ def __repr__(self): return str(self) def __str__(self): - return '<{}{}>'.format(type(self).__name__, self.env) + return "<{}{}>".format(type(self).__name__, self.env) diff --git a/rlberry/envs/benchmarks/ball_exploration/ball2d.py b/rlberry/envs/benchmarks/ball_exploration/ball2d.py index deb2070c2..bb1b43d7e 100644 --- a/rlberry/envs/benchmarks/ball_exploration/ball2d.py +++ b/rlberry/envs/benchmarks/ball_exploration/ball2d.py @@ -55,11 +55,13 @@ def __init__(self): self.horizon = 30 # self.p = 2 - self.action_list = [np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0])] + self.action_list = [ + np.array([0.0, 0.0]), + 0.05 * np.array([1.0, 0.0]), + -0.05 * np.array([1.0, 0.0]), + 0.05 * np.array([0.0, 1.0]), + -0.05 * np.array([0.0, 1.0]), + ] self.reward_amplitudes = [] self.reward_smoothness = [] @@ -70,17 +72,19 @@ def __init__(self): self.sigma_init = 0.001 self.mu_init = np.array([0.0, 0.0]) - PBall2D.__init__(self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init) + PBall2D.__init__( + self, + self.p, + self.action_list, + self.reward_amplitudes, + self.reward_smoothness, + self.reward_centers, + self.A, + self.B, + self.sigma, + self.sigma_init, + self.mu_init, + ) self.name = "Ball Exploration Benchmark - Level 0 (Reward-Free)" @@ -88,6 +92,7 @@ def __init__(self): # Level 1 # + class BallLevel1(PBall2D): """ Dense rewards @@ -97,11 +102,13 @@ def __init__(self): self.horizon = 30 # self.p = 2 - self.action_list = [np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0])] + self.action_list = [ + np.array([0.0, 0.0]), + 0.05 * np.array([1.0, 0.0]), + -0.05 * np.array([1.0, 0.0]), + 0.05 * np.array([0.0, 1.0]), + -0.05 * np.array([0.0, 1.0]), + ] self.reward_amplitudes = np.array([1.0]) self.reward_smoothness = np.array([0.5 * np.sqrt(2)]) @@ -112,17 +119,19 @@ def __init__(self): self.sigma_init = 0.001 self.mu_init = np.array([0.0, 0.0]) - PBall2D.__init__(self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init) + PBall2D.__init__( + self, + self.p, + self.action_list, + self.reward_amplitudes, + self.reward_smoothness, + 
self.reward_centers, + self.A, + self.B, + self.sigma, + self.sigma_init, + self.mu_init, + ) self.name = "Ball Exploration Benchmark - Level 1" @@ -130,6 +139,7 @@ def __init__(self): # Level 2 # + class BallLevel2(BallLevel1): """ Sparse rewards @@ -174,8 +184,10 @@ def __init__(self): self.reward_amplitudes = np.array([1.0, 0.1]) self.reward_smoothness = np.array([0.2, 0.5 * np.sqrt(2)]) - self.reward_centers = [np.array([-0.5, -0.5]), # far sparse - np.array([0.5, 0.5])] # dense + self.reward_centers = [ + np.array([-0.5, -0.5]), # far sparse + np.array([0.5, 0.5]), + ] # dense self.name = "Ball Exploration Benchmark - Level 4" @@ -183,6 +195,7 @@ def __init__(self): # Level 5 # + class BallLevel5(BallLevel4): """ Far sparse reward (as lvl 2) + dense suboptimal rewards, noisier @@ -193,6 +206,7 @@ def __init__(self): self.sigma = 0.025 self.name = "Ball Exploration Benchmark - Level 5" + # if __name__ == '__main__': # env = get_benchmark_env(1) # env.enable_rendering() diff --git a/rlberry/envs/benchmarks/ball_exploration/pball.py b/rlberry/envs/benchmarks/ball_exploration/pball.py index ff554a481..a196805cf 100644 --- a/rlberry/envs/benchmarks/ball_exploration/pball.py +++ b/rlberry/envs/benchmarks/ball_exploration/pball.py @@ -81,17 +81,19 @@ class PBall(Model): name = "LP-Ball" - def __init__(self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init): + def __init__( + self, + p, + action_list, + reward_amplitudes, + reward_smoothness, + reward_centers, + A, + B, + sigma, + sigma_init, + mu_init, + ): """ Parameters ----------- @@ -121,8 +123,10 @@ def __init__(self, assert p >= 1, "PBall requires p>=1" if p not in [2, np.inf]: - logger.warning("For p!=2 or p!=np.inf, PBall \ -does not make true projections onto the lp ball.") + logger.warning( + "For p!=2 or p!=np.inf, PBall \ +does not make true projections onto the lp ball." 
+ ) self.p = p self.d, self.dp = B.shape # d and d' self.m = len(action_list) @@ -146,11 +150,13 @@ def __init__(self, assert len(self.reward_amplitudes) == len(self.reward_smoothness) assert len(self.reward_amplitudes) == len(self.reward_centers) if len(self.reward_amplitudes) > 0: - assert self.reward_amplitudes.max() <= 1.0 and \ - self.reward_amplitudes.min() >= 0.0, \ - "reward amplitudes b_i must be in [0, 1]" - assert self.reward_smoothness.min() > 0.0, \ - "reward smoothness c_i must be > 0" + assert ( + self.reward_amplitudes.max() <= 1.0 + and self.reward_amplitudes.min() >= 0.0 + ), "reward amplitudes b_i must be in [0, 1]" + assert ( + self.reward_smoothness.min() > 0.0 + ), "reward smoothness c_i must be > 0" self.reward_range = (0, 1.0) # @@ -163,8 +169,9 @@ def reset(self, state=None): if state is not None: self.state = state else: - self.state = self.mu_init \ - + self.sigma_init * self.seeder.rng.normal(size=self.d) + self.state = self.mu_init + self.sigma_init * self.seeder.rng.normal( + size=self.d + ) # projection to unit ball self.state = projection_to_pball(self.state, self.p) return self.state.copy() @@ -175,8 +182,11 @@ def sample(self, state, action): # next state action_vec = self.action_list[action] - next_s = self.A.dot(state) + self.B.dot(action_vec) \ - + self.sigma * self.rng.normal(size=self.d) + next_s = ( + self.A.dot(state) + + self.B.dot(action_vec) + + self.sigma * self.rng.normal(size=self.d) + ) next_s = projection_to_pball(next_s, self.p) # done and reward @@ -220,31 +230,42 @@ def get_transitions_lipschitz_constant(self): return np.linalg.norm(self.A, ord=order) # If p!=1, p!=2 or p!=np.inf, return upper bound on the induced norm. - return np.power(self.d, 1.0 / self.p) * np.linalg.norm(self.A, - ord=np.inf) + return np.power(self.d, 1.0 / self.p) * np.linalg.norm(self.A, ord=np.inf) class PBall2D(RenderInterface2D, PBall): - def __init__(self, - p=2, - action_list=[0.05 * np.array([1, 0]), - -0.05 * np.array([1, 0]), - 0.05 * np.array([0, 1]), - -0.05 * np.array([0, 1])], - reward_amplitudes=np.array([1.0]), - reward_smoothness=np.array([0.25]), - reward_centers=[np.array([0.75, 0.0])], - A=np.eye(2), - B=np.eye(2), - sigma=0.01, - sigma_init=0.001, - mu_init=np.array([0.0, 0.0]) - ): + def __init__( + self, + p=2, + action_list=[ + 0.05 * np.array([1, 0]), + -0.05 * np.array([1, 0]), + 0.05 * np.array([0, 1]), + -0.05 * np.array([0, 1]), + ], + reward_amplitudes=np.array([1.0]), + reward_smoothness=np.array([0.25]), + reward_centers=[np.array([0.75, 0.0])], + A=np.eye(2), + B=np.eye(2), + sigma=0.01, + sigma_init=0.001, + mu_init=np.array([0.0, 0.0]), + ): # Initialize PBall - PBall.__init__(self, p, action_list, reward_amplitudes, - reward_smoothness, - reward_centers, - A, B, sigma, sigma_init, mu_init) + PBall.__init__( + self, + p, + action_list, + reward_amplitudes, + reward_smoothness, + reward_centers, + A, + B, + sigma, + sigma_init, + mu_init, + ) # Render interface RenderInterface2D.__init__(self) @@ -285,8 +306,9 @@ def get_background(self): # reward position for ii, ampl in enumerate(self.reward_amplitudes): - contour = self._get_ball_shape(self.reward_centers[ii], - self.reward_smoothness[ii]) + contour = self._get_ball_shape( + self.reward_centers[ii], self.reward_smoothness[ii] + ) ampl = 1.0 - ampl # dark violet = more reward contour.set_color((0.5, 0.0, 0.5 * (1.0 + ampl))) bg.add_shape(contour) @@ -320,15 +342,16 @@ class SimplePBallND(PBall): PBall environment in d dimensions with simple dynamics. 
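The dynamics above repeatedly project the state back onto the unit lp ball; as
the warning notes, only p=2 and p=np.inf give exact projections, which reduce to
a radial shrink and a coordinate-wise clip respectively. A minimal sketch
(function name illustrative, not the module's own helper):

    import numpy as np

    def project_to_unit_ball(x, p=2):
        if p == 2:
            norm = np.linalg.norm(x)
            return x if norm <= 1.0 else x / norm  # radial shrink onto the sphere
        if p == np.inf:
            return np.clip(x, -1.0, 1.0)           # clip each coordinate
        raise NotImplementedError("only p=2 and p=inf are handled in this sketch")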
""" - def __init__(self, - p=2, - dim=2, - action_amplitude=0.05, - r_smoothness=0.25, - sigma=0.01, - sigma_init=0.001, - mu_init=None - ): + def __init__( + self, + p=2, + dim=2, + action_amplitude=0.05, + r_smoothness=0.25, + sigma=0.01, + sigma_init=0.001, + mu_init=None, + ): # Action list action_list = [] for dd in range(dim): @@ -352,10 +375,20 @@ def __init__(self, mu_init = np.zeros(dim) # Initialize PBall - PBall.__init__(self, p, action_list, reward_amplitudes, - reward_smoothness, - reward_centers, - A, B, sigma, sigma_init, mu_init) + PBall.__init__( + self, + p, + action_list, + reward_amplitudes, + reward_smoothness, + reward_centers, + A, + B, + sigma, + sigma_init, + mu_init, + ) + # if __name__ == '__main__': # env = PBall2D(p=5) diff --git a/rlberry/envs/benchmarks/generalization/twinrooms.py b/rlberry/envs/benchmarks/generalization/twinrooms.py index 693b3f568..6d444b876 100644 --- a/rlberry/envs/benchmarks/generalization/twinrooms.py +++ b/rlberry/envs/benchmarks/generalization/twinrooms.py @@ -33,11 +33,10 @@ class TwinRooms(RenderInterface2D, Model): when array_observation is True. Only the functions env.reset() and env.step() are covered. """ + name = "TwinRooms" - def __init__(self, - noise_room1=0.01, - noise_room2=0.01): + def __init__(self, noise_room1=0.01, noise_room2=0.01): Model.__init__(self) RenderInterface2D.__init__(self) @@ -60,7 +59,7 @@ def __init__(self, # rendering info self.set_clipping_area((0, 2, 0, 1)) self.set_refresh_interval(100) # in milliseconds - self.renderer_type = 'opengl' + self.renderer_type = "opengl" # reset self.reset() @@ -119,8 +118,11 @@ def sample(self, state, action): else: raise ValueError("Invalid action") - next_state = state + displacement \ - + self.room_noises[self.current_room] * self.rng.normal(size=2) + next_state = ( + state + + displacement + + self.room_noises[self.current_room] * self.rng.normal(size=2) + ) # clip to room next_state = self._clip_to_room(next_state) @@ -152,7 +154,10 @@ def get_background(self): bg.add_shape(shape) # rewards - for (x, y) in [self.base_reward_pos, self.base_reward_pos + np.array([1.0, 0.0])]: + for (x, y) in [ + self.base_reward_pos, + self.base_reward_pos + np.array([1.0, 0.0]), + ]: reward = circle_shape((x, y), 0.1, n_points=50) reward.type = "POLYGON" reward.set_color((0.0, 0.5, 0.0)) diff --git a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py b/rlberry/envs/benchmarks/grid_exploration/apple_gold.py index 3517225ab..e444cb71f 100644 --- a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py +++ b/rlberry/envs/benchmarks/grid_exploration/apple_gold.py @@ -34,6 +34,7 @@ class AppleGold(GridWorld): for Hard-Exploration Tasks arXiv preprint arXiv:1907.10247 """ + name = "AppleGold" def __init__(self, reward_free=False, array_observation=False): @@ -70,26 +71,24 @@ def __init__(self, reward_free=False, array_observation=False): if self.reward_free: reward_at = {} else: - reward_at = { - (7, 7): 10.0, - (8, 2): 1.0, - (10, 3): 1.0 - } + reward_at = {(7, 7): 10.0, (8, 2): 1.0, (10, 3): 1.0} for jj in range(7, 16): for ii in range(1, 12): if (ii, jj) not in walls and (ii, jj) != (7, 7): reward_at[(ii, jj)] = -0.05 # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + 
terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=default_reward, + ) # spaces if self.array_observation: diff --git a/rlberry/envs/benchmarks/grid_exploration/four_room.py b/rlberry/envs/benchmarks/grid_exploration/four_room.py index 6f96775d6..b40cf208b 100644 --- a/rlberry/envs/benchmarks/grid_exploration/four_room.py +++ b/rlberry/envs/benchmarks/grid_exploration/four_room.py @@ -30,12 +30,10 @@ class FourRoom(GridWorld): when array_observation is True. Only the functions env.reset() and env.step() are covered. """ + name = "FourRoom" - def __init__(self, - reward_free=False, - difficulty=0, - array_observation=False): + def __init__(self, reward_free=False, difficulty=0, array_observation=False): self.reward_free = reward_free self.difficulty = difficulty self.array_observation = array_observation @@ -77,15 +75,17 @@ def __init__(self, } # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=default_reward, + ) # spaces if self.array_observation: diff --git a/rlberry/envs/benchmarks/grid_exploration/nroom.py b/rlberry/envs/benchmarks/grid_exploration/nroom.py index 04ef975c1..f94079123 100644 --- a/rlberry/envs/benchmarks/grid_exploration/nroom.py +++ b/rlberry/envs/benchmarks/grid_exploration/nroom.py @@ -55,20 +55,23 @@ class NRoom(GridWorld): when array_observation is True. Only the functions env.reset() and env.step() are covered. 
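As with the other grid benchmarks, interaction goes through the usual
reset/step loop. A short usage sketch (constructor arguments as in this file;
the 4-tuple step() return is assumed, matching the gym-style API used in the
rest of the code base):

    from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom

    env = NRoom(nrooms=3, array_observation=False)
    observation = env.reset()
    total_reward = 0.0
    for _ in range(100):
        action = env.action_space.sample()  # random exploration policy
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break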
""" + name = "N-Room" - def __init__(self, - nrooms=7, - reward_free=False, - array_observation=False, - room_size=5, - success_probability=0.95, - remove_walls=False, - initial_state_distribution='center', - include_traps=False): + def __init__( + self, + nrooms=7, + reward_free=False, + array_observation=False, + room_size=5, + success_probability=0.95, + remove_walls=False, + initial_state_distribution="center", + include_traps=False, + ): assert nrooms > 0, "nrooms must be > 0" - assert initial_state_distribution in ('center', 'uniform') + assert initial_state_distribution in ("center", "uniform") self.reward_free = reward_free self.array_observation = array_observation @@ -116,12 +119,13 @@ def __init__(self, # existing rooms if count < self.nrooms: # remove top wall - if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) \ - or ((room_c == 0) and (room_r % 2 == 1)): + if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) or ( + (room_c == 0) and (room_r % 2 == 1) + ): if room_r != self.room_nrows - 1: wall_to_remove = self._convert_room_coord_to_global( - room_r, room_c, - self.room_size, self.room_size // 2) + room_r, room_c, self.room_size, self.room_size // 2 + ) if wall_to_remove in walls: walls.remove(wall_to_remove) # rooms to remove @@ -129,30 +133,37 @@ def __init__(self, for ii in range(-1, self.room_size + 1): for jj in range(-1, self.room_size + 1): wall_to_include = self._convert_room_coord_to_global( - room_r, room_c, - ii, jj) - if wall_to_include[0] >= 0 and wall_to_include[0] < nrows \ - and wall_to_include[1] >= 0 and wall_to_include[1] < ncols \ - and (wall_to_include not in walls): + room_r, room_c, ii, jj + ) + if ( + wall_to_include[0] >= 0 + and wall_to_include[0] < nrows + and wall_to_include[1] >= 0 + and wall_to_include[1] < ncols + and (wall_to_include not in walls) + ): walls.append(wall_to_include) pass # start coord if count == nrooms // 2: start_coord = self._convert_room_coord_to_global( - room_r, room_c, - self.room_size // 2, self.room_size // 2) + room_r, room_c, self.room_size // 2, self.room_size // 2 + ) # terminal state if count == nrooms - 1: terminal_state = self._convert_room_coord_to_global( - room_r, room_c, - self.room_size // 2, self.room_size // 2) + room_r, room_c, self.room_size // 2, self.room_size // 2 + ) # trap if include_traps: self.traps.append( self._convert_room_coord_to_global( - room_r, room_c, - self.room_size // 2 + 1, self.room_size // 2 + 1) + room_r, + room_c, + self.room_size // 2 + 1, + self.room_size // 2 + 1, + ) ) count += 1 @@ -164,7 +175,7 @@ def __init__(self, reward_at = { terminal_state: 1.0, start_coord: 0.01, - (self.room_size // 2, self.room_size // 2): 0.1 + (self.room_size // 2, self.room_size // 2): 0.1, } # Check remove_walls @@ -172,18 +183,20 @@ def __init__(self, walls = () # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=0.0) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=0.0, + ) # Check initial distribution - if initial_state_distribution == 'uniform': + if initial_state_distribution == "uniform": distr = np.ones(self.observation_space.n) / self.observation_space.n self.set_initial_state_distribution(distr) @@ -192,7 +205,9 @@ def 
__init__(self, self.discrete_observation_space = self.observation_space self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - def _convert_room_coord_to_global(self, room_row, room_col, room_coord_row, room_coord_col): + def _convert_room_coord_to_global( + self, room_row, room_col, room_coord_row, room_coord_col + ): col_offset = (self.room_size + 1) * room_col row_offset = (self.room_size + 1) * room_row diff --git a/rlberry/envs/benchmarks/grid_exploration/six_room.py b/rlberry/envs/benchmarks/grid_exploration/six_room.py index 131c3a3ef..b8b9232a4 100644 --- a/rlberry/envs/benchmarks/grid_exploration/six_room.py +++ b/rlberry/envs/benchmarks/grid_exploration/six_room.py @@ -25,6 +25,7 @@ class SixRoom(GridWorld): when array_observation is True. Only the functions env.reset() and env.step() are covered. """ + name = "SixRoom" def __init__(self, reward_free=False, array_observation=False): @@ -60,15 +61,17 @@ def __init__(self, reward_free=False, array_observation=False): } # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=default_reward, + ) # spaces if self.array_observation: diff --git a/rlberry/envs/bullet3/pybullet_envs/__init__.py b/rlberry/envs/bullet3/pybullet_envs/__init__.py index 533cc6a97..796a8424a 100644 --- a/rlberry/envs/bullet3/pybullet_envs/__init__.py +++ b/rlberry/envs/bullet3/pybullet_envs/__init__.py @@ -12,29 +12,29 @@ def register(id, *args, **kvargs): # ------------bullet------------- register( - id='PendulumBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumBulletEnv', + id="PendulumBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumBulletEnv", max_episode_steps=1000, reward_threshold=950.0, ) register( - id='PendulumSwingupBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumSwingupBulletEnv', + id="PendulumSwingupBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumSwingupBulletEnv", max_episode_steps=1000, reward_threshold=800.0, ) register( - id='DiscretePendulumBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumBulletEnv', + id="DiscretePendulumBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumBulletEnv", max_episode_steps=1000, reward_threshold=950.0, ) register( - id='DiscretePendulumSwingupBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumSwingupBulletEnv', + id="DiscretePendulumSwingupBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumSwingupBulletEnv", max_episode_steps=1000, reward_threshold=800.0, ) diff --git a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py b/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py index 0288c40f6..9e9e2d8fa 100644 --- a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py +++ b/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py @@ -1,6 +1,9 @@ from gym import spaces from pybullet_envs.env_bases import MJCFBaseBulletEnv -from 
pybullet_envs.gym_pendulum_envs import InvertedPendulumBulletEnv, InvertedPendulumSwingupBulletEnv +from pybullet_envs.gym_pendulum_envs import ( + InvertedPendulumBulletEnv, + InvertedPendulumSwingupBulletEnv, +) from pybullet_envs.scene_abstract import SingleRobotEmptyScene from rlberry.envs.bullet3.pybullet_envs.robot_pendula import Pendulum, PendulumSwingup @@ -16,7 +19,9 @@ def __init__(self): self.stateId = -1 def create_single_player_scene(self, bullet_client): - return SingleRobotEmptyScene(bullet_client, gravity=9.81, timestep=0.02, frame_skip=1) + return SingleRobotEmptyScene( + bullet_client, gravity=9.81, timestep=0.02, frame_skip=1 + ) def step(self, a): self.robot.apply_action(a) @@ -28,7 +33,7 @@ def step(self, a): done = False else: reward = 1.0 - done = np.abs(self.robot.theta) > .2 + done = np.abs(self.robot.theta) > 0.2 self.rewards = [float(reward)] self.HUD(state, a, done) return state, sum(self.rewards), done, {} diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py b/rlberry/envs/bullet3/pybullet_envs/robot_bases.py index e6c3190d4..d2dc50e75 100644 --- a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py +++ b/rlberry/envs/bullet3/pybullet_envs/robot_bases.py @@ -10,69 +10,113 @@ class MJCFBasedRobot2(MJCFBasedRobot): def reset(self, bullet_client): self._p = bullet_client # print("Created bullet_client with id=", self._p._client) - if (self.doneLoading == 0): + if self.doneLoading == 0: self.ordered_joints = [] self.doneLoading = 1 if self.self_collision: - self.objects = self._p.loadMJCF(os.path.join(data.getDataPath(), "mjcf", - self.model_xml), - flags=pybullet.URDF_USE_SELF_COLLISION | - pybullet.URDF_USE_SELF_COLLISION_EXCLUDE_ALL_PARENTS | - pybullet.URDF_GOOGLEY_UNDEFINED_COLORS) - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( - self._p, self.objects) + self.objects = self._p.loadMJCF( + os.path.join(data.getDataPath(), "mjcf", self.model_xml), + flags=pybullet.URDF_USE_SELF_COLLISION + | pybullet.URDF_USE_SELF_COLLISION_EXCLUDE_ALL_PARENTS + | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ) + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene(self._p, self.objects) else: self.objects = self._p.loadMJCF( - os.path.join(data.getDataPath(), "mjcf", self.model_xml, - flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS)) - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( - self._p, self.objects) + os.path.join( + data.getDataPath(), + "mjcf", + self.model_xml, + flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ) + ) + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene(self._p, self.objects) self.robot_specific_reset(self._p) - s = self.calc_state( + s = ( + self.calc_state() ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use return s class URDFBasedRobot2(URDFBasedRobot): - def __init__(self, - model_urdf, - robot_name, - action_dim, - obs_dim, - basePosition=[0, 0, 0], - baseOrientation=[0, 0, 0, 1], - fixed_base=False, - self_collision=False): - super().__init__(model_urdf, robot_name, action_dim, obs_dim, basePosition, baseOrientation, fixed_base, - self_collision) + def __init__( + self, + model_urdf, + robot_name, + action_dim, + obs_dim, + basePosition=[0, 0, 0], + baseOrientation=[0, 0, 0, 1], + fixed_base=False, + self_collision=False, + ): + super().__init__( + model_urdf, + robot_name, + action_dim, + obs_dim, + basePosition, + baseOrientation, + fixed_base, + 
self_collision, + ) self.doneLoading = 0 def reset(self, bullet_client): self._p = bullet_client - if (self.doneLoading == 0): + if self.doneLoading == 0: self.ordered_joints = [] self.doneLoading = 1 if self.self_collision: - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene( self._p, - self._p.loadURDF(os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, - flags=pybullet.URDF_USE_SELF_COLLISION | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS)) + self._p.loadURDF( + os.path.join(data.getDataPath(), self.model_urdf), + basePosition=self.basePosition, + baseOrientation=self.baseOrientation, + useFixedBase=self.fixed_base, + flags=pybullet.URDF_USE_SELF_COLLISION + | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ), + ) else: - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene( self._p, - self._p.loadURDF(os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS)) + self._p.loadURDF( + os.path.join(data.getDataPath(), self.model_urdf), + basePosition=self.basePosition, + baseOrientation=self.baseOrientation, + useFixedBase=self.fixed_base, + flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ), + ) self.robot_specific_reset(self._p) - s = self.calc_state( + s = ( + self.calc_state() ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use self.potential = self.calc_potential() diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py b/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py index b95a86136..871fe45c8 100644 --- a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py +++ b/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py @@ -1,7 +1,10 @@ import gym import numpy as np -from rlberry.envs.bullet3.pybullet_envs.robot_bases import MJCFBasedRobot2, URDFBasedRobot2 +from rlberry.envs.bullet3.pybullet_envs.robot_bases import ( + MJCFBasedRobot2, + URDFBasedRobot2, +) class Pendulum(URDFBasedRobot2): @@ -9,23 +12,25 @@ class Pendulum(URDFBasedRobot2): def __init__(self): # MJCFBasedRobot2.__init__(self, 'pendulum.xml', 'pole', action_dim=1, obs_dim=2) - URDFBasedRobot2.__init__(self, 'pendulum.urdf', 'pole', action_dim=1, obs_dim=2) + URDFBasedRobot2.__init__(self, "pendulum.urdf", "pole", action_dim=1, obs_dim=2) self.action_space = gym.spaces.Box(shape=(1,), low=-20, high=20) def robot_specific_reset(self, bullet_client): self._p = bullet_client self.pole = self.parts["pole"] self.j1 = self.jdict["hinge"] - u = self.np_random.uniform(low=-.1, high=.1) + u = self.np_random.uniform(low=-0.1, high=0.1) self.j1.reset_current_position(u if not self.swingup else np.pi + u, 0) self.j1.set_motor_torque(0) def apply_action(self, a): - assert (np.isfinite(a).all()) + assert np.isfinite(a).all() if not np.isfinite(a).all(): print("a is inf") a[0] = 0 - self.j1.set_motor_torque(np.clip(a[0], self.action_space.low, self.action_space.high)) + self.j1.set_motor_torque( + np.clip(a[0], self.action_space.low, self.action_space.high) + ) def calc_state(self): self.theta, theta_dot = self.j1.current_position() diff --git a/rlberry/envs/classic_control/acrobot.py 
b/rlberry/envs/classic_control/acrobot.py index c15069e0a..41d16cbd7 100644 --- a/rlberry/envs/classic_control/acrobot.py +++ b/rlberry/envs/classic_control/acrobot.py @@ -16,8 +16,13 @@ from rlberry.rendering.common_shapes import bar_shape, circle_shape __copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" -__credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann", - "William Dabney", "Jonathan P. How"] +__credits__ = [ + "Alborz Geramifard", + "Robert H. Klein", + "Christoph Dann", + "William Dabney", + "Jonathan P. How", +] __license__ = "BSD 3-Clause" __author__ = "Christoph Dann " @@ -71,24 +76,25 @@ class Acrobot(RenderInterface2D, Model): than the original version which employs Euler integration, see the AcrobotLegacy class. """ + name = "Acrobot" - dt = .2 + dt = 0.2 - LINK_LENGTH_1 = 1. # [m] - LINK_LENGTH_2 = 1. # [m] - LINK_MASS_1 = 1. #: [kg] mass of link 1 - LINK_MASS_2 = 1. #: [kg] mass of link 2 + LINK_LENGTH_1 = 1.0 # [m] + LINK_LENGTH_2 = 1.0 # [m] + LINK_MASS_1 = 1.0 #: [kg] mass of link 1 + LINK_MASS_2 = 1.0 #: [kg] mass of link 2 LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 - LINK_MOI = 1. #: moments of inertia for both links + LINK_MOI = 1.0 #: moments of inertia for both links MAX_VEL_1 = 4 * np.pi MAX_VEL_2 = 9 * np.pi - AVAIL_TORQUE = [-1., 0., +1] + AVAIL_TORQUE = [-1.0, 0.0, +1] - torque_noise_max = 0. + torque_noise_max = 0.0 #: use dynamics equations from the nips paper or the book book_or_nips = "book" @@ -123,8 +129,10 @@ def reset(self): return self._get_ob() def step(self, action): - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) # save state for rendering if self.is_render_enabled(): @@ -135,8 +143,7 @@ def step(self, action): # Add noise to the force action if self.torque_noise_max > 0: - torque += self.rng.uniform(-self.torque_noise_max, - self.torque_noise_max) + torque += self.rng.uniform(-self.torque_noise_max, self.torque_noise_max) # Now, augment the state with our force action so it can be passed to # _dsdt @@ -158,17 +165,18 @@ def step(self, action): ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) self.state = ns terminal = self._terminal() - reward = -1. if not terminal else 0. + reward = -1.0 if not terminal else 0.0 return self._get_ob(), reward, terminal, {} def _get_ob(self): s = self.state - return np.array([np.cos(s[0]), np.sin(s[0]), np.cos(s[1]), - np.sin(s[1]), s[2], s[3]]) + return np.array( + [np.cos(s[0]), np.sin(s[0]), np.cos(s[1]), np.sin(s[1]), s[2], s[3]] + ) def _terminal(self): s = self.state - return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.) + return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.0) def _dsdt(self, s_augmented, t): m1 = self.LINK_MASS_1 @@ -185,26 +193,35 @@ def _dsdt(self, s_augmented, t): theta2 = s[1] dtheta1 = s[2] dtheta2 = s[3] - d1 = m1 * lc1 ** 2 + m2 * \ - (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * np.cos(theta2)) + I1 + I2 + d1 = ( + m1 * lc1 ** 2 + + m2 * (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * np.cos(theta2)) + + I1 + + I2 + ) d2 = m2 * (lc2 ** 2 + l1 * lc2 * np.cos(theta2)) + I2 - phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.) 
- phi1 = - m2 * l1 * lc2 * dtheta2 ** 2 * np.sin(theta2) \ - - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) \ - + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) + phi2 + phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.0) + phi1 = ( + -m2 * l1 * lc2 * dtheta2 ** 2 * np.sin(theta2) + - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) + + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) + + phi2 + ) if self.book_or_nips == "nips": # the following line is consistent with the description in the # paper - ddtheta2 = (a + d2 / d1 * phi1 - phi2) / \ - (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) + ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) else: # the following line is consistent with the java implementation # and the book - ddtheta2 = (a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * - np.sin(theta2) - phi2) \ - / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) + ddtheta2 = ( + a + + d2 / d1 * phi1 + - m2 * l1 * lc2 * dtheta1 ** 2 * np.sin(theta2) + - phi2 + ) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 - return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.) + return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0) # # Below: code for rendering @@ -219,10 +236,14 @@ def get_scene(self, state): p0 = (0.0, 0.0) - p1 = (self.LINK_LENGTH_1 * np.sin(state[0]), - -self.LINK_LENGTH_1 * np.cos(state[0])) - p2 = (p1[0] + self.LINK_LENGTH_2 * np.sin(state[0] + state[1]), - p1[1] - self.LINK_LENGTH_2 * np.cos(state[0] + state[1])) + p1 = ( + self.LINK_LENGTH_1 * np.sin(state[0]), + -self.LINK_LENGTH_1 * np.cos(state[0]), + ) + p2 = ( + p1[0] + self.LINK_LENGTH_2 * np.sin(state[0] + state[1]), + p1[1] - self.LINK_LENGTH_2 * np.cos(state[0] + state[1]), + ) link1 = bar_shape(p0, p1, 0.1) link1.set_color((255 / 255, 140 / 255, 0 / 255)) diff --git a/rlberry/envs/classic_control/mountain_car.py b/rlberry/envs/classic_control/mountain_car.py index 6692541e1..6f8fa0589 100644 --- a/rlberry/envs/classic_control/mountain_car.py +++ b/rlberry/envs/classic_control/mountain_car.py @@ -59,6 +59,7 @@ class MountainCar(RenderInterface2D, Model): Episode Termination: The car position is more than 0.5 """ + name = "MountainCar" def __init__(self, goal_velocity=0): @@ -91,8 +92,10 @@ def __init__(self, goal_velocity=0): self.reset() def step(self, action): - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) # save state for rendering if self.is_render_enabled(): @@ -110,23 +113,24 @@ def reset(self): def sample(self, state, action): if not isinstance(state, np.ndarray): state = np.array(state) - assert self.observation_space.contains(state), \ - "Invalid state as argument of reset()." - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.observation_space.contains( + state + ), "Invalid state as argument of reset()." 
+ assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) position = state[0] velocity = state[1] - velocity += (action - 1) * self.force \ - + math.cos(3 * position) * (-self.gravity) + velocity += (action - 1) * self.force + math.cos(3 * position) * (-self.gravity) velocity = np.clip(velocity, -self.max_speed, self.max_speed) position += velocity position = np.clip(position, self.min_position, self.max_position) - if (position == self.min_position and velocity < 0): + if position == self.min_position and velocity < 0: velocity = 0 - done = bool(position >= self.goal_position and - velocity >= self.goal_velocity) + done = bool(position >= self.goal_position and velocity >= self.goal_velocity) reward = 0.0 if done: reward = 1.0 @@ -136,7 +140,7 @@ def sample(self, state, action): @staticmethod def _height(xs): - return np.sin(3 * xs) * .45 + .55 + return np.sin(3 * xs) * 0.45 + 0.55 # # Below: code for rendering @@ -154,8 +158,7 @@ def get_background(self): mountain.add_vertex((0.6, -1.0)) n_points = 50 - obs_range = self.observation_space.high[0] \ - - self.observation_space.low[0] + obs_range = self.observation_space.high[0] - self.observation_space.low[0] eps = obs_range / (n_points - 1) for ii in reversed(range(n_points)): x = self.observation_space.low[0] + ii * eps diff --git a/rlberry/envs/classic_control/pendulum.py b/rlberry/envs/classic_control/pendulum.py index e068da35f..26d45f0e3 100644 --- a/rlberry/envs/classic_control/pendulum.py +++ b/rlberry/envs/classic_control/pendulum.py @@ -22,6 +22,7 @@ class Pendulum(RenderInterface2D, Model): the pendulum starts in a random position, and the goal is to swing it up so it stays upright. """ + name = "Pendulum" def __init__(self): @@ -30,23 +31,23 @@ def __init__(self): RenderInterface2D.__init__(self) # environment parameters - self.max_speed = 8. - self.max_torque = 2. + self.max_speed = 8.0 + self.max_torque = 2.0 self.dt = 0.5 - self.gravity = 10. - self.mass = 1. - self.length = 1. + self.gravity = 10.0 + self.mass = 1.0 + self.length = 1.0 # rendering info self.set_clipping_area((-2.2, 2.2, -2.2, 2.2)) self.set_refresh_interval(10) # observation and action spaces - high = np.array([1., 1., self.max_speed]) + high = np.array([1.0, 1.0, self.max_speed]) low = -high - self.action_space = spaces.Box(low=-self.max_torque, - high=self.max_torque, - shape=(1,)) + self.action_space = spaces.Box( + low=-self.max_torque, high=self.max_torque, shape=(1,) + ) self.observation_space = spaces.Box(low=low, high=high) # initialize @@ -60,8 +61,10 @@ def reset(self): return self._get_ob() def step(self, action): - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) # save state for rendering if self.is_render_enabled(): @@ -75,11 +78,19 @@ def step(self, action): action = np.clip(action, -self.max_torque, self.max_torque)[0] self.last_action = action # for rendering - costs = angle_normalize(theta) ** 2 + .1 * thetadot ** 2 + .001 * (action ** 2) + costs = ( + angle_normalize(theta) ** 2 + 0.1 * thetadot ** 2 + 0.001 * (action ** 2) + ) # compute the next state after action - newthetadot = thetadot + (-3 * gravity / (2 * length) * np.sin(theta + np.pi) + - 3. 
/ (mass * length ** 2) * action) * dt + newthetadot = ( + thetadot + + ( + -3 * gravity / (2 * length) * np.sin(theta + np.pi) + + 3.0 / (mass * length ** 2) * action + ) + * dt + ) newtheta = theta + newthetadot * dt newthetadot = np.clip(newthetadot, -self.max_speed, self.max_speed) @@ -102,8 +113,7 @@ def get_scene(self, state): scene = Scene() p0 = (0.0, 0.0) - p1 = (self.length * np.sin(state[0]), - -self.length * np.cos(state[0])) + p1 = (self.length * np.sin(state[0]), -self.length * np.cos(state[0])) link = bar_shape(p0, p1, 0.1) link.set_color((255 / 255, 105 / 255, 30 / 255)) @@ -118,4 +128,4 @@ def get_scene(self, state): def angle_normalize(x): - return (((x + np.pi) % (2 * np.pi)) - np.pi) + return ((x + np.pi) % (2 * np.pi)) - np.pi diff --git a/rlberry/envs/finite/finite_mdp.py b/rlberry/envs/finite/finite_mdp.py index ff092b7f5..57649a15b 100644 --- a/rlberry/envs/finite/finite_mdp.py +++ b/rlberry/envs/finite/finite_mdp.py @@ -62,8 +62,9 @@ def reset(self): Reset the environment to a default state. """ if isinstance(self.initial_state_distribution, np.ndarray): - self.state = self.rng.choice(self._states, - p=self.initial_state_distribution) + self.state = self.rng.choice( + self._states, p=self.initial_state_distribution + ) else: self.state = self.initial_state_distribution return self.state @@ -159,17 +160,20 @@ def log(self): """ Print the structure of the MDP. """ - indent = ' ' + indent = " " for s in self._states: logger.info(f"State {s} {indent}") for a in self._actions: logger.info(f"{indent} Action {a}") for ss in self._states: if self.P[s, a, ss] > 0.0: - logger.info(f'{2 * indent} transition to {ss} ' - f'with prob {self.P[s, a, ss]: .2f}') + logger.info( + f"{2 * indent} transition to {ss} " + f"with prob {self.P[s, a, ss]: .2f}" + ) logger.info("~~~~~~~~~~~~~~~~~~~~") + # if __name__ == '__main__': # S = 3 # A = 2 diff --git a/rlberry/envs/finite/gridworld.py b/rlberry/envs/finite/gridworld.py index dfc036199..3088b8198 100644 --- a/rlberry/envs/finite/gridworld.py +++ b/rlberry/envs/finite/gridworld.py @@ -37,17 +37,20 @@ class GridWorld(RenderInterface2D, FiniteMDP): reward received at states not in 'reward_at' """ + name = "GridWorld" - def __init__(self, - nrows=5, - ncols=5, - start_coord=(0, 0), - terminal_states=None, - success_probability=0.9, - reward_at=None, - walls=((1, 1), (2, 2)), - default_reward=0.0): + def __init__( + self, + nrows=5, + ncols=5, + start_coord=(0, 0), + terminal_states=None, + success_probability=0.9, + reward_at=None, + walls=((1, 1), (2, 2)), + default_reward=0.0, + ): # Grid dimensions self.nrows = nrows self.ncols = ncols @@ -79,8 +82,8 @@ def __init__(self, self.start_coord = tuple(start_coord) # Actions (string to index & index to string) - self.a_str2idx = {'left': 0, 'right': 1, 'down': 2, 'up': 3} - self.a_idx2str = {0: 'left', 1: 'right', 2: 'down', 3: 'up'} + self.a_str2idx = {"left": 0, "right": 1, "down": 2, "up": 3} + self.a_idx2str = {0: "left", 1: "right", 2: "down", 3: "up"} # -------------------------------------------- # The variables below are defined in _build() @@ -99,8 +102,9 @@ def __init__(self, # Build self._build() init_state_idx = self.coord2index[start_coord] - FiniteMDP.__init__(self, self.R, self.P, - initial_state_distribution=init_state_idx) + FiniteMDP.__init__( + self, self.R, self.P, initial_state_distribution=init_state_idx + ) RenderInterface2D.__init__(self) self.reset() self.reward_range = (self.R.min(), self.R.max()) @@ -108,7 +112,7 @@ def __init__(self, # rendering info 
self.set_clipping_area((0, self.ncols, 0, self.nrows)) self.set_refresh_interval(100) # in milliseconds - self.renderer_type = 'pygame' + self.renderer_type = "pygame" def is_terminal(self, state): state_coord = self.index2coord[state] @@ -158,8 +162,7 @@ def _build_transition_probabilities(self): for s in range(Ns): s_coord = self.index2coord[s] neighbors = self._get_neighbors(*s_coord) - valid_neighbors = [neighbors[nn][0] for nn in neighbors - if neighbors[nn][1]] + valid_neighbors = [neighbors[nn][0] for nn in neighbors if neighbors[nn][1]] n_valid = len(valid_neighbors) for a in range(Na): # each action corresponds to a direction for nn in neighbors: @@ -167,23 +170,23 @@ def _build_transition_probabilities(self): if next_s_coord in valid_neighbors: next_s = self.coord2index[next_s_coord] if a == nn: # action is successful - self.P[s, a, next_s] = self.success_probability \ - + (1 - self.success_probability) \ - * (n_valid == 1) + self.P[s, a, next_s] = self.success_probability + ( + 1 - self.success_probability + ) * (n_valid == 1) elif neighbors[a][0] not in valid_neighbors: self.P[s, a, s] = 1.0 else: if n_valid > 1: - self.P[s, a, next_s] = \ - (1.0 - self.success_probability) \ - / (n_valid - 1) + self.P[s, a, next_s] = ( + 1.0 - self.success_probability + ) / (n_valid - 1) def _get_neighbors(self, row, col): aux = {} - aux['left'] = (row, col - 1) # left - aux['right'] = (row, col + 1) # right - aux['up'] = (row - 1, col) # up - aux['down'] = (row + 1, col) # down + aux["left"] = (row, col - 1) # left + aux["right"] = (row, col + 1) # right + aux["up"] = (row - 1, col) # up + aux["down"] = (row + 1, col) # down neighbors = {} for direction_str in aux: direction = self.a_str2idx[direction_str] @@ -193,10 +196,10 @@ def _get_neighbors(self, row, col): def get_transition_support(self, state): row, col = self.index2coord[state] - neighbors = [(row, col - 1), (row, col + 1), - (row - 1, col), (row + 1, col)] - return [self.coord2index[coord] for coord in neighbors - if self._is_valid(*coord)] + neighbors = [(row, col - 1), (row, col + 1), (row - 1, col), (row + 1, col)] + return [ + self.coord2index[coord] for coord in neighbors if self._is_valid(*coord) + ] def _is_valid(self, row, col): if (row, col) in self.walls: @@ -208,38 +211,38 @@ def _is_valid(self, row, col): return True def _build_ascii(self): - grid = [[''] * self.ncols for rr in range(self.nrows)] - grid_idx = [[''] * self.ncols for rr in range(self.nrows)] + grid = [[""] * self.ncols for rr in range(self.nrows)] + grid_idx = [[""] * self.ncols for rr in range(self.nrows)] for rr in range(self.nrows): for cc in range(self.ncols): if (rr, cc) in self.walls: - grid[rr][cc] = 'x ' + grid[rr][cc] = "x " else: - grid[rr][cc] = 'o ' + grid[rr][cc] = "o " grid_idx[rr][cc] = str(self.coord2index[(rr, cc)]).zfill(3) for (rr, cc) in self.reward_at: rwd = self.reward_at[(rr, cc)] if rwd > 0: - grid[rr][cc] = '+ ' + grid[rr][cc] = "+ " if rwd < 0: - grid[rr][cc] = '-' + grid[rr][cc] = "-" - grid[self.start_coord[0]][self.start_coord[1]] = 'I ' + grid[self.start_coord[0]][self.start_coord[1]] = "I " # current position of the agent x, y = self.index2coord[self.state] - grid[x][y] = 'A ' + grid[x][y] = "A " # - grid_ascii = '' + grid_ascii = "" for rr in range(self.nrows + 1): if rr < self.nrows: - grid_ascii += str(rr).zfill(2) + 2 * ' ' \ - + ' '.join(grid[rr]) + '\n' + grid_ascii += str(rr).zfill(2) + 2 * " " + " ".join(grid[rr]) + "\n" else: - grid_ascii += 3 * ' ' + ' '.join([str(jj).zfill(2) for jj - in range(self.ncols)]) + 
grid_ascii += 3 * " " + " ".join( + [str(jj).zfill(2) for jj in range(self.ncols)] + ) self.grid_ascii = grid_ascii self.grid_idx = grid_idx @@ -247,21 +250,22 @@ def _build_ascii(self): def display_values(self, values): assert len(values) == self.Ns - grid_values = [['X'.ljust(9)] * self.ncols for ii in range(self.nrows)] + grid_values = [["X".ljust(9)] * self.ncols for ii in range(self.nrows)] for s_idx in range(self.Ns): v = values[s_idx] row, col = self.index2coord[s_idx] grid_values[row][col] = ("%0.2f" % v).ljust(9) - grid_values_ascii = '' + grid_values_ascii = "" for rr in range(self.nrows + 1): if rr < self.nrows: - grid_values_ascii += str(rr).zfill(2) + 2 * ' ' \ - + ' '.join(grid_values[rr]) + '\n' + grid_values_ascii += ( + str(rr).zfill(2) + 2 * " " + " ".join(grid_values[rr]) + "\n" + ) else: - grid_values_ascii += 4 * ' ' \ - + ' '.join([str(jj).zfill(2).ljust(9) for jj - in range(self.ncols)]) + grid_values_ascii += 4 * " " + " ".join( + [str(jj).zfill(2).ljust(9) for jj in range(self.ncols)] + ) logger.info(grid_values_ascii) def print_transition_at(self, row, col, action): @@ -272,8 +276,10 @@ def print_transition_at(self, row, col, action): a_idx = self.a_str2idx[action] for next_s_idx, prob in enumerate(self.P[s_idx, a_idx]): if prob > 0: - logger.info("to (%d, %d) with prob %f" % - (self.index2coord[next_s_idx] + (prob,))) + logger.info( + "to (%d, %d) with prob %f" + % (self.index2coord[next_s_idx] + (prob,)) + ) def render_ascii(self): logger.info(self._build_ascii()) @@ -330,10 +336,8 @@ def get_layout_array(self, state_data=None, fill_walls_with=np.nan): return layout def get_layout_img( - self, - state_data=None, - colormap_name='cool', - wall_color=(0.0, 0.0, 0.0)): + self, state_data=None, colormap_name="cool", wall_color=(0.0, 0.0, 0.0) + ): """ Returns an image array representing the value of `state_data` on the gridworld layout. @@ -367,7 +371,9 @@ def get_layout_img( if np.isnan(layout[rr, cc]): img[self.nrows - 1 - rr, cc, :] = wall_color else: - img[self.nrows - 1 - rr, cc, :3] = scalar_map.to_rgba(layout[rr, cc])[:3] + img[self.nrows - 1 - rr, cc, :3] = scalar_map.to_rgba( + layout[rr, cc] + )[:3] return img def get_background(self): @@ -423,6 +429,7 @@ def get_scene(self, state): scene.add_shape(agent) return scene + # if __name__ == '__main__': # env = GridWorld(nrows=5, ncols=5, # reward_at={(4, 4): 1, (4, 3): -1}) diff --git a/rlberry/envs/gym_make.py b/rlberry/envs/gym_make.py index 93766e049..a75deab04 100644 --- a/rlberry/envs/gym_make.py +++ b/rlberry/envs/gym_make.py @@ -30,9 +30,11 @@ def gym_make(id, wrap_spaces=False, **kwargs): def atari_make(id, scalarize=True, **kwargs): from stable_baselines3.common.env_util import make_atari_env from stable_baselines3.common.vec_env import VecFrameStack + env = make_atari_env(env_id=id, **kwargs) env = VecFrameStack(env, n_stack=4) if scalarize: from rlberry.wrappers.scalarize import ScalarizeEnvWrapper + env = ScalarizeEnvWrapper(env) return env diff --git a/rlberry/envs/interface/model.py b/rlberry/envs/interface/model.py index d5fd2852a..fb29cc927 100644 --- a/rlberry/envs/interface/model.py +++ b/rlberry/envs/interface/model.py @@ -90,8 +90,10 @@ def sample(self, state, action): raise NotImplementedError("sample() method not implemented.") def is_online(self): - logger.warning("Checking if Model is\ -online calls reset() and step() methods.") + logger.warning( + "Checking if Model is\ +online calls reset() and step() methods." 
+ ) try: self.reset() self.step(self.action_space.sample()) @@ -103,11 +105,12 @@ def is_online(self): raise def is_generative(self): - logger.warning("Checking if Model is \ -generative calls sample() method.") + logger.warning( + "Checking if Model is \ +generative calls sample() method." + ) try: - self.sample(self.observation_space.sample(), - self.action_space.sample()) + self.sample(self.observation_space.sample(), self.action_space.sample()) return True except Exception as ex: if isinstance(ex, NotImplementedError): @@ -121,5 +124,5 @@ def unwrapped(self): @property def rng(self): - """ Random number generator. """ + """Random number generator.""" return self.seeder.rng diff --git a/rlberry/envs/tests/test_env_seeding.py b/rlberry/envs/tests/test_env_seeding.py index 6b8ca2f2c..44477528a 100644 --- a/rlberry/envs/tests/test_env_seeding.py +++ b/rlberry/envs/tests/test_env_seeding.py @@ -21,7 +21,7 @@ Pendulum, FourRoom, SixRoom, - AppleGold + AppleGold, ] @@ -63,7 +63,9 @@ def test_env_seeding(ModelClass): env4.reseed(seeder4) env5 = ModelClass() - env5.reseed(seeder1) # same seeder as env1, but different trajectories. This is expected. + env5.reseed( + seeder1 + ) # same seeder as env1, but different trajectories. This is expected. seeding.safe_reseed(env4, seeder4) diff --git a/rlberry/envs/tests/test_gym_env_seeding.py b/rlberry/envs/tests/test_gym_env_seeding.py index 2db8b8653..b5e1872d5 100644 --- a/rlberry/envs/tests/test_gym_env_seeding.py +++ b/rlberry/envs/tests/test_gym_env_seeding.py @@ -8,9 +8,9 @@ from copy import deepcopy gym_envs = [ - 'Acrobot-v1', - 'CartPole-v1', - 'MountainCar-v0', + "Acrobot-v1", + "CartPole-v1", + "MountainCar-v0", ] diff --git a/rlberry/envs/tests/test_instantiation.py b/rlberry/envs/tests/test_instantiation.py index 5a12fd9b9..632a66a32 100644 --- a/rlberry/envs/tests/test_instantiation.py +++ b/rlberry/envs/tests/test_instantiation.py @@ -23,7 +23,7 @@ FourRoom, SixRoom, AppleGold, - NRoom + NRoom, ] @@ -57,13 +57,14 @@ def test_rendering_calls(ModelClass): def test_gridworld_aux_functions(): - env = GridWorld(nrows=5, ncols=8, walls=((1, 1),), - reward_at={(4, 4): 1, (4, 3): -1}) + env = GridWorld( + nrows=5, ncols=8, walls=((1, 1),), reward_at={(4, 4): 1, (4, 3): -1} + ) env.log() # from FiniteMDP env.render_ascii() # from GridWorld vals = np.arange(env.observation_space.n) env.display_values(vals) - env.print_transition_at(0, 0, 'up') + env.print_transition_at(0, 0, "up") layout = env.get_layout_array(vals, fill_walls_with=np.inf) for rr in range(env.nrows): @@ -89,20 +90,24 @@ def test_pball_env(p): env.get_transitions_lipschitz_constant() -@pytest.mark.parametrize("reward_free, difficulty, array_observation", - [ - (True, 0, False), - (False, 0, False), - (False, 0, True), - (False, 1, False), - (False, 1, True), - (False, 2, False), - (False, 2, True), - ]) +@pytest.mark.parametrize( + "reward_free, difficulty, array_observation", + [ + (True, 0, False), + (False, 0, False), + (False, 0, True), + (False, 1, False), + (False, 1, True), + (False, 2, False), + (False, 2, True), + ], +) def test_four_room(reward_free, difficulty, array_observation): - env = FourRoom(reward_free=reward_free, - difficulty=difficulty, - array_observation=array_observation) + env = FourRoom( + reward_free=reward_free, + difficulty=difficulty, + array_observation=array_observation, + ) initial_state = env.reset() next_state, reward, _, _ = env.step(1) @@ -121,13 +126,15 @@ def test_four_room(reward_free, difficulty, array_observation): assert 
isinstance(next_state, np.ndarray) -@pytest.mark.parametrize("reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ]) +@pytest.mark.parametrize( + "reward_free, array_observation", + [ + (False, False), + (False, True), + (True, False), + (True, True), + ], +) def test_six_room(reward_free, array_observation): env = SixRoom(reward_free=reward_free, array_observation=array_observation) @@ -145,13 +152,15 @@ def test_six_room(reward_free, array_observation): assert isinstance(next_state, np.ndarray) -@pytest.mark.parametrize("reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ]) +@pytest.mark.parametrize( + "reward_free, array_observation", + [ + (False, False), + (False, True), + (True, False), + (True, True), + ], +) def test_apple_gold(reward_free, array_observation): env = AppleGold(reward_free=reward_free, array_observation=array_observation) @@ -168,23 +177,27 @@ def test_apple_gold(reward_free, array_observation): assert isinstance(next_state, np.ndarray) -@pytest.mark.parametrize("reward_free, array_observation, initial_state_distribution", - [ - (False, False, 'center'), - (False, True, 'center'), - (True, False, 'center'), - (True, True, 'center'), - (True, False, 'uniform'), - ]) +@pytest.mark.parametrize( + "reward_free, array_observation, initial_state_distribution", + [ + (False, False, "center"), + (False, True, "center"), + (True, False, "center"), + (True, True, "center"), + (True, False, "uniform"), + ], +) def test_n_room(reward_free, array_observation, initial_state_distribution): - env = NRoom(reward_free=reward_free, - array_observation=array_observation, - initial_state_distribution=initial_state_distribution) + env = NRoom( + reward_free=reward_free, + array_observation=array_observation, + initial_state_distribution=initial_state_distribution, + ) initial_state = env.reset() next_state, reward, _, _ = env.step(1) - if initial_state_distribution == 'uniform': + if initial_state_distribution == "uniform": assert env.initial_state_distribution[0] == 1.0 / env.observation_space.n assert env.observation_space.contains(initial_state) diff --git a/rlberry/experiment/generator.py b/rlberry/experiment/generator.py index 816016a7e..982b4bf0a 100644 --- a/rlberry/experiment/generator.py +++ b/rlberry/experiment/generator.py @@ -27,14 +27,17 @@ def experiment_generator(): """ args = docopt(__doc__) for (_, agent_manager_kwargs) in parse_experiment_config( - Path(args[""]), - n_fit=int(args["--n_fit"]), - output_base_dir=args["--output_dir"], - parallelization=args["--parallelization"]): + Path(args[""]), + n_fit=int(args["--n_fit"]), + output_base_dir=args["--output_dir"], + parallelization=args["--parallelization"], + ): if args["--enable_tensorboard"]: if check_packages.TENSORBOARD_INSTALLED: agent_manager_kwargs.update(dict(enable_tensorboard=True)) else: - logger.warning('Option --enable_tensorboard is not available: tensorboard is not installed.') + logger.warning( + "Option --enable_tensorboard is not available: tensorboard is not installed." 
+ ) yield AgentManager(**agent_manager_kwargs) diff --git a/rlberry/experiment/load_results.py b/rlberry/experiment/load_results.py index 8bf02a1f1..434642344 100644 --- a/rlberry/experiment/load_results.py +++ b/rlberry/experiment/load_results.py @@ -45,10 +45,10 @@ def load_experiment_results(output_dir, experiment_name): output_data['data_dir'][agent_name] = directory from which the results were loaded """ output_data = {} - output_data['agent_list'] = [] - output_data['manager'] = {} - output_data['dataframes'] = {} - output_data['data_dir'] = {} + output_data["agent_list"] = [] + output_data["manager"] = {} + output_data["dataframes"] = {} + output_data["data_dir"] = {} # preprocess input if not isinstance(output_dir, list): @@ -58,14 +58,16 @@ def load_experiment_results(output_dir, experiment_name): ndirs = len(output_dir) if ndirs > 1: - assert len(experiment_name) == ndirs, "Number of experiment names must match the number of output_dirs " + assert ( + len(experiment_name) == ndirs + ), "Number of experiment names must match the number of output_dirs " else: output_dir = len(experiment_name) * output_dir results_dirs = [] for dd, exper in zip(output_dir, experiment_name): results_dirs.append(Path(dd) / Path(exper).stem) - output_data['experiment_dirs'] = results_dirs + output_data["experiment_dirs"] = results_dirs # Subdirectories with data for each agent subdirs = [] @@ -75,31 +77,33 @@ def load_experiment_results(output_dir, experiment_name): # Create dictionary dict[agent_name] = most recent result dir data_dirs = {} for dd in subdirs: - data_dirs[dd.name] = _get_most_recent_path([f for f in dd.iterdir() if f.is_dir()]) - data_dirs[dd.name] = data_dirs[dd.name] / 'manager_data' + data_dirs[dd.name] = _get_most_recent_path( + [f for f in dd.iterdir() if f.is_dir()] + ) + data_dirs[dd.name] = data_dirs[dd.name] / "manager_data" # Load data from each subdir for agent_name in data_dirs: - output_data['agent_list'].append(agent_name) + output_data["agent_list"].append(agent_name) # store data_dir - output_data['data_dir'][agent_name] = data_dirs[agent_name] + output_data["data_dir"][agent_name] = data_dirs[agent_name] # store AgentManager - output_data['manager'][agent_name] = None - fname = data_dirs[agent_name] / 'manager_obj.pickle' + output_data["manager"][agent_name] = None + fname = data_dirs[agent_name] / "manager_obj.pickle" try: - output_data['manager'][agent_name] = AgentManager.load(fname) + output_data["manager"][agent_name] = AgentManager.load(fname) except Exception: - logger.warning(f'Could not load AgentManager instance for {agent_name}.') + logger.warning(f"Could not load AgentManager instance for {agent_name}.") logger.info("... loaded " + str(fname)) # store data frames dataframes = {} - csv_files = [f for f in data_dirs[agent_name].iterdir() if f.suffix == '.csv'] + csv_files = [f for f in data_dirs[agent_name].iterdir() if f.suffix == ".csv"] for ff in csv_files: dataframes[ff.stem] = pd.read_csv(ff) logger.info("... 
loaded " + str(ff)) - output_data['dataframes'][agent_name] = dataframes + output_data["dataframes"][agent_name] = dataframes return output_data diff --git a/rlberry/experiment/tests/old_test_experiment_generator.py b/rlberry/experiment/tests/old_test_experiment_generator.py index 0d6e6c8ab..44834c336 100644 --- a/rlberry/experiment/tests/old_test_experiment_generator.py +++ b/rlberry/experiment/tests/old_test_experiment_generator.py @@ -6,8 +6,7 @@ def test_mock_args(monkeypatch): monkeypatch.setattr( - "sys.argv", - ['', 'rlberry/experiment/tests/params_experiment.yaml'] + "sys.argv", ["", "rlberry/experiment/tests/params_experiment.yaml"] ) random_numbers = [] @@ -16,25 +15,25 @@ def test_mock_args(monkeypatch): random_numbers.append(rng.uniform(size=10)) assert agent_manager.agent_class is RSUCBVIAgent - assert agent_manager._base_init_kwargs['horizon'] == 51 + assert agent_manager._base_init_kwargs["horizon"] == 51 assert agent_manager.fit_budget == 10 - assert agent_manager.eval_kwargs['eval_horizon'] == 51 + assert agent_manager.eval_kwargs["eval_horizon"] == 51 - assert agent_manager._base_init_kwargs['lp_metric'] == 2 - assert agent_manager._base_init_kwargs['min_dist'] == 0.0 - assert agent_manager._base_init_kwargs['max_repr'] == 800 - assert agent_manager._base_init_kwargs['bonus_scale_factor'] == 1.0 - assert agent_manager._base_init_kwargs['reward_free'] is True + assert agent_manager._base_init_kwargs["lp_metric"] == 2 + assert agent_manager._base_init_kwargs["min_dist"] == 0.0 + assert agent_manager._base_init_kwargs["max_repr"] == 800 + assert agent_manager._base_init_kwargs["bonus_scale_factor"] == 1.0 + assert agent_manager._base_init_kwargs["reward_free"] is True train_env = agent_manager.train_env[0](**agent_manager.train_env[1]) assert train_env.reward_free is False assert train_env.array_observation is True - if agent_manager.agent_name == 'rsucbvi': - assert agent_manager._base_init_kwargs['gamma'] == 1.0 + if agent_manager.agent_name == "rsucbvi": + assert agent_manager._base_init_kwargs["gamma"] == 1.0 - elif agent_manager.agent_name == 'rsucbvi_alternative': - assert agent_manager._base_init_kwargs['gamma'] == 0.9 + elif agent_manager.agent_name == "rsucbvi_alternative": + assert agent_manager._base_init_kwargs["gamma"] == 0.9 else: raise ValueError() diff --git a/rlberry/experiment/yaml_utils.py b/rlberry/experiment/yaml_utils.py index e9852512e..2471c960f 100644 --- a/rlberry/experiment/yaml_utils.py +++ b/rlberry/experiment/yaml_utils.py @@ -3,7 +3,7 @@ import yaml from rlberry.utils.factory import load -_AGENT_KEYS = ('init_kwargs', 'eval_kwargs', 'fit_kwargs') +_AGENT_KEYS = ("init_kwargs", "eval_kwargs", "fit_kwargs") def read_yaml(path): @@ -97,10 +97,12 @@ def read_env_config(config_path): return load(env_config["constructor"]), env_config["params"] -def parse_experiment_config(path: Path, - n_fit: int = 4, - output_base_dir: str = 'results', - parallelization: str = 'process') -> Generator[Tuple[int, dict], None, None]: +def parse_experiment_config( + path: Path, + n_fit: int = 4, + output_base_dir: str = "results", + parallelization: str = "process", +) -> Generator[Tuple[int, dict], None, None]: """ Read .yaml files. set global seed and convert to AgentManager instances. 
@@ -165,20 +167,20 @@ def parse_experiment_config(path: Path, last = idx # kwargs - init_kwargs = agent_config['init_kwargs'] - eval_kwargs = agent_config['eval_kwargs'] - fit_kwargs = agent_config['fit_kwargs'] + init_kwargs = agent_config["init_kwargs"] + eval_kwargs = agent_config["eval_kwargs"] + fit_kwargs = agent_config["fit_kwargs"] # check if there are global kwargs - if 'global_init_kwargs' in config: - init_kwargs.update(config['global_init_kwargs']) - if 'global_eval_kwargs' in config: - eval_kwargs.update(config['global_eval_kwargs']) - if 'global_fit_kwargs' in config: - fit_kwargs.update(config['global_fit_kwargs']) + if "global_init_kwargs" in config: + init_kwargs.update(config["global_init_kwargs"]) + if "global_eval_kwargs" in config: + eval_kwargs.update(config["global_eval_kwargs"]) + if "global_fit_kwargs" in config: + fit_kwargs.update(config["global_fit_kwargs"]) # pop fit_budget from fit_kwargs - fit_budget = fit_kwargs.pop('fit_budget') + fit_budget = fit_kwargs.pop("fit_budget") # append run index to dir output_dir = output_dir / str(last + 1) @@ -196,10 +198,11 @@ def parse_experiment_config(path: Path, output_dir=output_dir, parallelization=parallelization, seed=seed, - create_unique_out_dir=False) # output_dir is already made unique above + create_unique_out_dir=False, + ) # output_dir is already made unique above -if __name__ == '__main__': - filename = 'examples/demo_experiment/params_experiment.yaml' +if __name__ == "__main__": + filename = "examples/demo_experiment/params_experiment.yaml" for (seed, agent_manager) in parse_experiment_config(Path(filename)): print(seed) diff --git a/rlberry/exploration_tools/discrete_counter.py b/rlberry/exploration_tools/discrete_counter.py index a14dceae2..549a39955 100644 --- a/rlberry/exploration_tools/discrete_counter.py +++ b/rlberry/exploration_tools/discrete_counter.py @@ -19,13 +19,15 @@ class DiscreteCounter(UncertaintyEstimator): Returns bonuses in 1/n ** rate_power. 
""" - def __init__(self, - observation_space, - action_space, - n_bins_obs=10, - n_bins_actions=10, - rate_power=0.5, - **kwargs): + def __init__( + self, + observation_space, + action_space, + n_bins_obs=10, + n_bins_actions=10, + rate_power=0.5, + **kwargs + ): UncertaintyEstimator.__init__(self, observation_space, action_space) self.rate_power = rate_power @@ -37,16 +39,14 @@ def __init__(self, self.n_states = observation_space.n else: self.continuous_state = True - self.state_discretizer = Discretizer(self.observation_space, - n_bins_obs) + self.state_discretizer = Discretizer(self.observation_space, n_bins_obs) self.n_states = self.state_discretizer.discrete_space.n if isinstance(action_space, Discrete): self.n_actions = action_space.n else: self.continuous_action = True - self.action_discretizer = Discretizer(self.action_space, - n_bins_actions) + self.action_discretizer = Discretizer(self.action_space, n_bins_actions) self.n_actions = self.action_discretizer.discrete_space.n self.N_sa = np.zeros((self.n_states, self.n_actions)) @@ -61,12 +61,12 @@ def _preprocess(self, state, action): def reset(self): self.N_sa = np.zeros((self.n_states, self.n_actions)) - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def update(self, state, action, next_state=None, reward=None, **kwargs): state, action = self._preprocess(state, action) self.N_sa[state, action] += 1 - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def measure(self, state, action, **kwargs): state, action = self._preprocess(state, action) n = np.maximum(1.0, self.N_sa[state, action]) diff --git a/rlberry/exploration_tools/online_discretization_counter.py b/rlberry/exploration_tools/online_discretization_counter.py index e0ba65f3d..98fc11fac 100644 --- a/rlberry/exploration_tools/online_discretization_counter.py +++ b/rlberry/exploration_tools/online_discretization_counter.py @@ -10,29 +10,32 @@ @numba_jit -def map_to_representative(state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr): +def map_to_representative( + state, + lp_metric, + representative_states, + n_representatives, + min_dist, + scaling, + accept_new_repr, +): """ Map state to representative state. """ dist_to_closest = np.inf argmin = -1 for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], - lp_metric, scaling) + dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) if dist < dist_to_closest: dist_to_closest = dist argmin = ii max_representatives = representative_states.shape[0] - if dist_to_closest > min_dist \ - and n_representatives < max_representatives \ - and accept_new_repr: + if ( + dist_to_closest > min_dist + and n_representatives < max_representatives + and accept_new_repr + ): new_index = n_representatives representative_states[new_index, :] = state return new_index, 0.0 @@ -69,15 +72,17 @@ class OnlineDiscretizationCounter(UncertaintyEstimator): returns bonuses in n^power. 
""" - def __init__(self, - observation_space, - action_space, - lp_metric=2, - min_dist=0.1, - max_repr=1000, - scaling=None, - rate_power=1, - **kwargs): + def __init__( + self, + observation_space, + action_space, + lp_metric=2, + min_dist=0.1, + max_repr=1000, + scaling=None, + rate_power=1, + **kwargs + ): UncertaintyEstimator.__init__(self, observation_space, action_space) assert isinstance(action_space, Discrete) @@ -94,8 +99,7 @@ def __init__(self, if scaling is None: # if high and low are bounded if self.observation_space.is_bounded(): - scaling = self.observation_space.high \ - - self.observation_space.low + scaling = self.observation_space.high - self.observation_space.low # if high or low are unbounded else: scaling = np.ones(self.state_dim) @@ -118,40 +122,42 @@ def reset(self): self._overflow_warning = False def _get_representative_state(self, state, accept_new_repr=True): - state_idx, dist_to_closest \ - = map_to_representative(state, - self.lp_metric, - self.representative_states, - self.n_representatives, - self.min_dist, - self.scaling, - accept_new_repr) + state_idx, dist_to_closest = map_to_representative( + state, + self.lp_metric, + self.representative_states, + self.n_representatives, + self.min_dist, + self.scaling, + accept_new_repr, + ) # check if new representative state if state_idx == self.n_representatives: self.n_representatives += 1 - if self.n_representatives >= self.max_repr \ - and (not self._overflow_warning): - logger.warning("OnlineDiscretizationCounter reached \ -the maximum number of representative states.") + if self.n_representatives >= self.max_repr and (not self._overflow_warning): + logger.warning( + "OnlineDiscretizationCounter reached \ +the maximum number of representative states." + ) self._overflow_warning = True return state_idx, dist_to_closest - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def update(self, state, action, next_state=None, reward=None, **kwargs): state_idx, _ = self._get_representative_state(state) self.N_sa[state_idx, action] += 1 - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def measure(self, state, action, **kwargs): n = np.maximum(1.0, self.count(state, action)) return np.power(1 / n, self.rate_power) def count(self, state, action): state_idx, dist_to_closest = self._get_representative_state( - state, - accept_new_repr=False) + state, accept_new_repr=False + ) # if state is too far from the closest representative, # its count is zero. 
if dist_to_closest > self.min_dist: diff --git a/rlberry/exploration_tools/tests/test_discrete_counter.py b/rlberry/exploration_tools/tests/test_discrete_counter.py index d80dc56ad..25a8c36d0 100644 --- a/rlberry/exploration_tools/tests/test_discrete_counter.py +++ b/rlberry/exploration_tools/tests/test_discrete_counter.py @@ -4,13 +4,17 @@ from rlberry.envs import MountainCar from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter +from rlberry.exploration_tools.online_discretization_counter import ( + OnlineDiscretizationCounter, +) @pytest.mark.parametrize("rate_power", [0.5, 1]) def test_discrete_env(rate_power): env = GridWorld() - counter = DiscreteCounter(env.observation_space, env.action_space, rate_power=rate_power) + counter = DiscreteCounter( + env.observation_space, env.action_space, rate_power=rate_power + ) for N in range(10, 20): assert counter.get_n_visited_states() == 0 @@ -37,7 +41,9 @@ def test_discrete_env(rate_power): @pytest.mark.parametrize("rate_power", [0.5, 1]) def test_continuous_state_env(rate_power): env = MountainCar() - counter = DiscreteCounter(env.observation_space, env.action_space, rate_power=rate_power) + counter = DiscreteCounter( + env.observation_space, env.action_space, rate_power=rate_power + ) for N in [10, 20]: for _ in range(50): @@ -60,9 +66,9 @@ def test_continuous_state_env(rate_power): @pytest.mark.parametrize("rate_power", [True, False]) def test_continuous_state_env_2(rate_power): env = MountainCar() - counter = OnlineDiscretizationCounter(env.observation_space, - env.action_space, - rate_power=rate_power) + counter = OnlineDiscretizationCounter( + env.observation_space, env.action_space, rate_power=rate_power + ) for N in [10, 20]: for _ in range(50): @@ -81,10 +87,9 @@ def test_continuous_state_env_2(rate_power): def test_continuous_state_env_3(): env = NRoom(nrooms=3, array_observation=True) - counter = OnlineDiscretizationCounter(env.observation_space, - env.action_space, - rate_power=0.5, - min_dist=0.0) + counter = OnlineDiscretizationCounter( + env.observation_space, env.action_space, rate_power=0.5, min_dist=0.0 + ) for N in range(10, 20): assert counter.get_n_visited_states() == 0 @@ -101,6 +106,8 @@ def test_continuous_state_env_3(): assert np.allclose(counter.measure(continuous_ss, aa), np.sqrt(1.0 / N)) assert counter.get_n_visited_states() == env.discrete_observation_space.n - assert np.allclose(counter.get_entropy(), np.log2(env.discrete_observation_space.n)) + assert np.allclose( + counter.get_entropy(), np.log2(env.discrete_observation_space.n) + ) counter.reset() diff --git a/rlberry/exploration_tools/torch/rnd.py b/rlberry/exploration_tools/torch/rnd.py index 8e137c91c..1acf88de7 100644 --- a/rlberry/exploration_tools/torch/rnd.py +++ b/rlberry/exploration_tools/torch/rnd.py @@ -24,29 +24,34 @@ def get_network(shape, embedding_dim): else: raise ValueError("Unknown image convention") - return ConvolutionalNetwork(in_channels=C, - in_width=W, - in_height=H, - out_size=embedding_dim, - activation="ELU", - transpose_obs=transpose_obs, - is_policy=False) + return ConvolutionalNetwork( + in_channels=C, + in_width=W, + in_height=H, + out_size=embedding_dim, + activation="ELU", + transpose_obs=transpose_obs, + is_policy=False, + ) elif len(shape) == 2: H, W = shape - return ConvolutionalNetwork(in_channels=1, - in_width=W, - in_height=H, - activation="ELU", 
- out_size=embedding_dim) + return ConvolutionalNetwork( + in_channels=1, + in_width=W, + in_height=H, + activation="ELU", + out_size=embedding_dim, + ) elif len(shape) == 1: - return MultiLayerPerceptron(in_size=shape[0], - activation="RELU", - layer_sizes=[64, 64], - out_size=embedding_dim) + return MultiLayerPerceptron( + in_size=shape[0], + activation="RELU", + layer_sizes=[64, 64], + out_size=embedding_dim, + ) else: - raise ValueError("Incompatible observation shape: {}" - .format(shape)) + raise ValueError("Incompatible observation shape: {}".format(shape)) class RandomNetworkDistillation(UncertaintyEstimator): @@ -58,20 +63,22 @@ class RandomNetworkDistillation(UncertaintyEstimator): In International Conference on Learning Representations. """ - def __init__(self, - observation_space, - action_space, - learning_rate=0.001, - update_period=100, - embedding_dim=10, - net_fn=None, - net_kwargs=None, - device="cuda:best", - rate_power=0.5, - batch_size=10, - memory_size=10000, - with_action=False, - **kwargs): + def __init__( + self, + observation_space, + action_space, + learning_rate=0.001, + update_period=100, + embedding_dim=10, + net_fn=None, + net_kwargs=None, + device="cuda:best", + rate_power=0.5, + batch_size=10, + memory_size=10000, + with_action=False, + **kwargs + ): assert isinstance(observation_space, spaces.Box) UncertaintyEstimator.__init__(self, observation_space, action_space) self.learning_rate = learning_rate @@ -79,8 +86,14 @@ def __init__(self, self.update_period = update_period self.embedding_dim = embedding_dim out_size = embedding_dim * action_space.n if with_action else embedding_dim - self.net_fn = load(net_fn) if isinstance(net_fn, str) else \ - net_fn or partial(get_network, shape=observation_space.shape, embedding_dim=out_size) + self.net_fn = ( + load(net_fn) + if isinstance(net_fn, str) + else net_fn + or partial( + get_network, shape=observation_space.shape, embedding_dim=out_size + ) + ) self.net_kwargs = net_kwargs or {} if "out_size" in self.net_kwargs: self.net_kwargs["out_size"] = out_size @@ -97,7 +110,8 @@ def reset(self, **kwargs): self.rnd_optimizer = torch.optim.Adam( self.predictor_network.parameters(), lr=self.learning_rate, - betas=(0.9, 0.999)) + betas=(0.9, 0.999), + ) self.count = 0 self.loss = torch.tensor(0.0).to(self.device) @@ -111,24 +125,27 @@ def _get_embeddings(self, state, action=None, batch=False, all_actions=False): predicted_embedding = self.predictor_network(state) if self.with_action: - random_embedding = random_embedding.view((state.shape[0], self.action_space.n, -1)) - predicted_embedding = predicted_embedding.view((state.shape[0], self.action_space.n, -1)) + random_embedding = random_embedding.view( + (state.shape[0], self.action_space.n, -1) + ) + predicted_embedding = predicted_embedding.view( + (state.shape[0], self.action_space.n, -1) + ) if not all_actions: action = action.long().to(self.device) if not batch: action = action.unsqueeze(0) - action = action.unsqueeze(1).repeat(1, random_embedding.shape[-1]).unsqueeze(1) + action = ( + action.unsqueeze(1) + .repeat(1, random_embedding.shape[-1]) + .unsqueeze(1) + ) random_embedding = random_embedding.gather(1, action).squeeze(1) predicted_embedding = predicted_embedding.gather(1, action).squeeze(1) return random_embedding, predicted_embedding - @preprocess_args(expected_type='torch') - def update(self, - state, - action=None, - next_state=None, - reward=None, - **kwargs): + @preprocess_args(expected_type="torch") + def update(self, state, action=None, next_state=None, 
reward=None, **kwargs): batch = [(state, action)] if self.batch_size > 0 and not self.memory.is_empty(): @@ -139,10 +156,11 @@ def update(self, if self.with_action: actions = torch.stack(actions) - random_embedding, predicted_embedding = self._get_embeddings(states, actions, batch=True) + random_embedding, predicted_embedding = self._get_embeddings( + states, actions, batch=True + ) - self.loss += self.loss_fn(random_embedding.detach(), - predicted_embedding) + self.loss += self.loss_fn(random_embedding.detach(), predicted_embedding) self.count += 1 if self.count % self.update_period == 0: @@ -152,19 +170,27 @@ def update(self, self.rnd_optimizer.step() self.loss = torch.tensor(0.0).to(self.device) - @preprocess_args(expected_type='torch') + @preprocess_args(expected_type="torch") def measure(self, state, action=None, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings(state, action, batch=False) - error = torch.norm(predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1) + random_embedding, predicted_embedding = self._get_embeddings( + state, action, batch=False + ) + error = torch.norm( + predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 + ) return error.pow(2 * self.rate_power).item() - @preprocess_args(expected_type='torch') + @preprocess_args(expected_type="torch") def measure_batch(self, states, actions, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings(states, actions, batch=True) - error = torch.norm(predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1) + random_embedding, predicted_embedding = self._get_embeddings( + states, actions, batch=True + ) + error = torch.norm( + predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 + ) return error.pow(2 * self.rate_power) - @preprocess_args(expected_type='torch') + @preprocess_args(expected_type="torch") def measure_batch_all_actions(self, states, **kwargs): """ Measure N(s,a) for all a in A. 
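
The bonus returned by measure() and measure_batch() above is the L2 prediction error raised to twice rate_power; in our notation (f is the frozen random target network, \hat f_\theta the trained predictor, \rho = rate_power):

    b(s, a) = \lVert \hat f_\theta(s, a) - f(s, a) \rVert_2^{\,2\rho}

The predictor side of this error is what update() accumulates into self.loss and optimizes once every update_period calls.
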
@@ -178,6 +204,10 @@ def measure_batch_all_actions(self, states, **kwargs): N(s,a): an array of shape B x A """ assert self.with_action - random_embedding, predicted_embedding = self._get_embeddings(states, None, batch=True, all_actions=True) - error = torch.norm(predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1) + random_embedding, predicted_embedding = self._get_embeddings( + states, None, batch=True, all_actions=True + ) + error = torch.norm( + predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 + ) return error.pow(2 * self.rate_power) diff --git a/rlberry/exploration_tools/torch/tests/test_rnd.py b/rlberry/exploration_tools/torch/tests/test_rnd.py index 60d13fa50..60649b096 100644 --- a/rlberry/exploration_tools/torch/tests/test_rnd.py +++ b/rlberry/exploration_tools/torch/tests/test_rnd.py @@ -12,7 +12,8 @@ def test_rnd(): env.action_space, learning_rate=0.1, update_period=100, - embedding_dim=2) + embedding_dim=2, + ) # Test state = env.reset() diff --git a/rlberry/exploration_tools/typing.py b/rlberry/exploration_tools/typing.py index 0b41c70a2..b51aabf64 100644 --- a/rlberry/exploration_tools/typing.py +++ b/rlberry/exploration_tools/typing.py @@ -9,9 +9,9 @@ def _get_type(arg): if _TORCH_INSTALLED and isinstance(arg, torch.Tensor): - return 'torch' + return "torch" elif isinstance(arg, np.ndarray): - return 'numpy' + return "numpy" else: return type(arg) @@ -32,7 +32,7 @@ def process_type(arg, expected_type): if arg is None: return None - if expected_type == 'torch': + if expected_type == "torch": assert _TORCH_INSTALLED, "expected_type is 'torch', but torch is not installed!" if isinstance(arg, torch.Tensor): return arg @@ -42,7 +42,7 @@ def process_type(arg, expected_type): return torch.tensor(arg) else: return arg - elif expected_type == 'numpy': + elif expected_type == "numpy": if isinstance(arg, np.ndarray): return arg elif _TORCH_INSTALLED and isinstance(arg, torch.Tensor): diff --git a/rlberry/exploration_tools/uncertainty_estimator.py b/rlberry/exploration_tools/uncertainty_estimator.py index 923b9f9d3..868b4c90e 100644 --- a/rlberry/exploration_tools/uncertainty_estimator.py +++ b/rlberry/exploration_tools/uncertainty_estimator.py @@ -22,10 +22,13 @@ def measure(self, state, action, **kwargs): def measure_batch(self, states, actions, **kwargs): batch = [self.measure(s, a, **kwargs) for s, a in zip(states, actions)] - if _get_type(batch[0]) == 'torch': + if _get_type(batch[0]) == "torch": import torch + return torch.FloatTensor(batch) return np.array(batch) def measure_batch_all_actions(self, states): - return np.array([[self.measure(s, a) for a in range(self.action_space.n)] for s in states]) + return np.array( + [[self.measure(s, a) for a in range(self.action_space.n)] for s in states] + ) diff --git a/rlberry/manager/agent_manager.py b/rlberry/manager/agent_manager.py index 9c4ec685c..0215501fd 100644 --- a/rlberry/manager/agent_manager.py +++ b/rlberry/manager/agent_manager.py @@ -36,6 +36,7 @@ # Aux # + class AgentHandler: """ Wraps an Agent so that it can be either loaded in memory @@ -58,13 +59,9 @@ class AgentHandler: Arguments required by __init__ method of agent_class. 
""" - def __init__(self, - id, - filename, - seeder, - agent_class, - agent_instance=None, - agent_kwargs=None) -> None: + def __init__( + self, id, filename, seeder, agent_class, agent_instance=None, agent_kwargs=None + ) -> None: self._id = id self._fname = Path(filename) self._seeder = seeder @@ -92,12 +89,16 @@ def is_loaded(self): def load(self) -> bool: try: - self._agent_instance = self._agent_class.load(self._fname, **self._agent_kwargs) + self._agent_instance = self._agent_class.load( + self._fname, **self._agent_kwargs + ) safe_reseed(self._agent_instance.env, self._seeder) return True except Exception as ex: self._agent_instance = None - logger.error(f'Failed call to AgentHandler.load() for {self._agent_class}: {ex}') + logger.error( + f"Failed call to AgentHandler.load() for {self._agent_class}: {ex}" + ) return False def dump(self): @@ -107,7 +108,9 @@ def dump(self): # saved_filename might have appended the correct extension, for instance, # so self._fname must be updated. if not saved_filename: - logger.warning(f'Instance of {self._agent_class} cannot be saved and will be kept in memory.') + logger.warning( + f"Instance of {self._agent_class} cannot be saved and will be kept in memory." + ) return self._fname = Path(saved_filename) del self._agent_instance @@ -117,16 +120,18 @@ def __getattr__(self, attr): """ Allows AgentHandler to behave like the handled Agent. """ - if attr[:2] == '__': + if attr[:2] == "__": raise AttributeError(attr) if attr in self.__dict__: return getattr(self, attr) - assert not self.is_empty(), 'Calling AgentHandler with no agent instance stored.' + assert ( + not self.is_empty() + ), "Calling AgentHandler with no agent instance stored." if not self.is_loaded(): loaded = self.load() if not loaded: - raise RuntimeError(f'Could not load Agent from {self._fname}.') + raise RuntimeError(f"Could not load Agent from {self._fname}.") return getattr(self._agent_instance, attr) @@ -195,24 +200,26 @@ class AgentManager: init_kwargs_per_instance will be used. """ - def __init__(self, - agent_class, - train_env, - fit_budget=None, - eval_env=None, - init_kwargs=None, - fit_kwargs=None, - eval_kwargs=None, - agent_name=None, - n_fit=4, - output_dir=None, - parallelization='thread', - worker_logging_level='INFO', - seed=None, - enable_tensorboard=False, - create_unique_out_dir=True, - default_writer_kwargs=None, - init_kwargs_per_instance=None): + def __init__( + self, + agent_class, + train_env, + fit_budget=None, + eval_env=None, + init_kwargs=None, + fit_kwargs=None, + eval_kwargs=None, + agent_name=None, + n_fit=4, + output_dir=None, + parallelization="thread", + worker_logging_level="INFO", + seed=None, + enable_tensorboard=False, + create_unique_out_dir=True, + default_writer_kwargs=None, + init_kwargs_per_instance=None, + ): # agent_class should only be None when the constructor is called # by the class method AgentManager.load(), since the agent class # will be loaded. 
@@ -229,10 +236,12 @@ def __init__(self, # Check train_env and eval_env assert isinstance( - train_env, Tuple), "[AgentManager]train_env must be Tuple (constructor, kwargs)" + train_env, Tuple + ), "[AgentManager]train_env must be Tuple (constructor, kwargs)" if eval_env is not None: assert isinstance( - eval_env, Tuple), "[AgentManager]train_env must be Tuple (constructor, kwargs)" + eval_env, Tuple + ), "[AgentManager]train_env must be Tuple (constructor, kwargs)" # create oject identifier self.unique_id = metadata_utils.get_unique_id(self) @@ -265,25 +274,27 @@ def __init__(self, self.fit_budget = fit_budget else: try: - self.fit_budget = self.fit_kwargs.pop('fit_budget') + self.fit_budget = self.fit_kwargs.pop("fit_budget") except KeyError: - raise ValueError('[AgentManager] fit_budget missing in __init__().') + raise ValueError("[AgentManager] fit_budget missing in __init__().") # extra params per instance if init_kwargs_per_instance is not None: assert len(init_kwargs_per_instance) == n_fit init_kwargs_per_instance = deepcopy(init_kwargs_per_instance) - self.init_kwargs_per_instance = init_kwargs_per_instance or [dict() for _ in range(n_fit)] + self.init_kwargs_per_instance = init_kwargs_per_instance or [ + dict() for _ in range(n_fit) + ] # output dir if output_dir is None: output_dir = metadata_utils.RLBERRY_TEMP_DATA_DIR - self.output_dir = Path(output_dir) / 'manager_data' + self.output_dir = Path(output_dir) / "manager_data" if create_unique_out_dir: - self.output_dir = self.output_dir / (self.agent_name + '_' + self.unique_id) + self.output_dir = self.output_dir / (self.agent_name + "_" + self.unique_id) # Create list of writers for each agent that will be trained # 'default' will keep Agent's use of DefaultWriter. - self.writers = [('default', None) for _ in range(n_fit)] + self.writers = [("default", None) for _ in range(n_fit)] # Parameters to setup Agent's DefaultWriter self.agent_default_writer_kwargs = [ @@ -291,22 +302,24 @@ def __init__(self, name=self.agent_name, log_interval=3, tensorboard_kwargs=None, - execution_metadata=metadata_utils.ExecutionMetadata(obj_worker_id=idx) + execution_metadata=metadata_utils.ExecutionMetadata(obj_worker_id=idx), ) for idx in range(n_fit) ] self.tensorboard_dir = None if enable_tensorboard: - self.tensorboard_dir = self.output_dir / 'tensorboard' + self.tensorboard_dir = self.output_dir / "tensorboard" for idx, params in enumerate(self.agent_default_writer_kwargs): - params['tensorboard_kwargs'] = dict( + params["tensorboard_kwargs"] = dict( log_dir=self.tensorboard_dir / str(idx) ) # Update DefaultWriter according to user's settings. default_writer_kwargs = default_writer_kwargs or {} if default_writer_kwargs: - logger.warning('(Re)defining the following DefaultWriter' - f' parameters in AgentManager: {list(default_writer_kwargs.keys())}') + logger.warning( + "(Re)defining the following DefaultWriter" + f" parameters in AgentManager: {list(default_writer_kwargs.keys())}" + ) for ii in range(n_fit): self.agent_default_writer_kwargs[ii].update(default_writer_kwargs) @@ -324,13 +337,15 @@ def __init__(self, def _init_optuna_storage_url(self): self.output_dir.mkdir(parents=True, exist_ok=True) - self.db_filename = self.output_dir / 'optuna_data.db' + self.db_filename = self.output_dir / "optuna_data.db" if create_database(self.db_filename): self.optuna_storage_url = f"sqlite:///{self.db_filename}" else: self.db_filename = None self.optuna_storage_url = "sqlite:///:memory:" - logger.warning(f'Unable to create databate {self.db_filename}. 
Using sqlite:///:memory:') + logger.warning( + f"Unable to create databate {self.db_filename}. Using sqlite:///:memory:" + ) def _set_init_kwargs(self): init_seeders = self.seeder.spawn(self.n_fit, squeeze=False) @@ -344,7 +359,9 @@ def _set_init_kwargs(self): copy_env=False, seeder=init_seeders[ii], output_dir=Path(self.output_dir) / f"output_{ii}", - _execution_metadata=self.agent_default_writer_kwargs[ii]['execution_metadata'], + _execution_metadata=self.agent_default_writer_kwargs[ii][ + "execution_metadata" + ], _default_writer_kwargs=self.agent_default_writer_kwargs[ii], ) ) @@ -357,7 +374,7 @@ def _reset_agent_handlers(self): self.agent_handlers = [ AgentHandler( id=ii, - filename=self.output_dir / Path(f'agent_handlers/idx_{ii}'), + filename=self.output_dir / Path(f"agent_handlers/idx_{ii}"), seeder=handlers_seeders[ii], agent_class=self.agent_class, agent_instance=None, @@ -379,7 +396,9 @@ def get_writer_data(self): def get_agent_instances(self): if self.agent_handlers: - return [agent_handler.get_instance() for agent_handler in self.agent_handlers] + return [ + agent_handler.get_instance() for agent_handler in self.agent_handlers + ] return [] def eval_agents(self, n_simulations: Optional[int] = None) -> list: @@ -403,11 +422,13 @@ def eval_agents(self, n_simulations: Optional[int] = None) -> list: agent_idx = self.eval_seeder.rng.choice(len(self.agent_handlers)) agent = self.agent_handlers[agent_idx] if agent.is_empty(): - logger.error('Calling eval() in an AgentManager instance contaning an empty AgentHandler.' - ' Returning [].') + logger.error( + "Calling eval() in an AgentManager instance contaning an empty AgentHandler." + " Returning []." + ) return [] values.append(agent.eval(**self.eval_kwargs)) - logger.info(f'[eval]... simulation {ii + 1}/{n_simulations}') + logger.info(f"[eval]... simulation {ii + 1}/{n_simulations}") return values def clear_output_dir(self): @@ -415,7 +436,7 @@ def clear_output_dir(self): try: shutil.rmtree(self.output_dir) except FileNotFoundError: - logger.warning(f'No directory {self.output_dir} found to be deleted.') + logger.warning(f"No directory {self.output_dir} found to be deleted.") def clear_handlers(self): """Delete files from output_dir/agent_handlers that are managed by this class.""" @@ -443,8 +464,9 @@ def set_writer(self, idx, writer_fn, writer_kwargs=None): AgentManager fits `n_fit` agents, the writer of each one of them needs to be set separetely. 
""" - assert idx >= 0 and idx < self.n_fit, \ - "Invalid index sent to AgentManager.set_writer()" + assert ( + idx >= 0 and idx < self.n_fit + ), "Invalid index sent to AgentManager.set_writer()" writer_kwargs = writer_kwargs or {} self.writers[idx] = (writer_fn, writer_kwargs) @@ -464,29 +486,36 @@ def fit(self, budget=None, **kwargs): for handler in self.agent_handlers: handler.dump() - if self.parallelization == 'thread': + if self.parallelization == "thread": executor_class = concurrent.futures.ThreadPoolExecutor lock = threading.Lock() - elif self.parallelization == 'process': + elif self.parallelization == "process": executor_class = functools.partial( concurrent.futures.ProcessPoolExecutor, - mp_context=multiprocessing.get_context('spawn')) + mp_context=multiprocessing.get_context("spawn"), + ) lock = multiprocessing.Manager().Lock() else: - raise ValueError(f'Invalid backend for parallelization: {self.parallelization}') - - args = [( - lock, - handler, - self.agent_class, - budget, - init_kwargs, - deepcopy(self.fit_kwargs), - writer, - self.worker_logging_level, - seeder) - for init_kwargs, handler, seeder, writer - in zip(self.init_kwargs, self.agent_handlers, seeders, self.writers)] + raise ValueError( + f"Invalid backend for parallelization: {self.parallelization}" + ) + + args = [ + ( + lock, + handler, + self.agent_class, + budget, + init_kwargs, + deepcopy(self.fit_kwargs), + writer, + self.worker_logging_level, + seeder, + ) + for init_kwargs, handler, seeder, writer in zip( + self.init_kwargs, self.agent_handlers, seeders, self.writers + ) + ] if len(args) == 1: workers_output = [_fit_worker(args[0])] @@ -499,9 +528,7 @@ def fit(self, budget=None, **kwargs): workers_output = [] for future in concurrent.futures.as_completed(futures): - workers_output.append( - future.result() - ) + workers_output.append(future.result()) executor.shutdown() workers_output.sort(key=lambda x: x.id) @@ -536,7 +563,7 @@ def save(self): output_dir.mkdir(parents=True, exist_ok=True) # save optimized hyperparameters if self.best_hyperparams is not None: - fname = Path(output_dir) / 'best_hyperparams.json' + fname = Path(output_dir) / "best_hyperparams.json" _safe_serialize_json(self.best_hyperparams, fname) # save default_writer_data that can be aggregated in a pandas DataFrame if self.default_writer_data is not None: @@ -549,7 +576,7 @@ def save(self): try: output = pd.DataFrame(all_writer_data) # save - fname = Path(output_dir) / 'data.csv' + fname = Path(output_dir) / "data.csv" output.to_csv(fname, index=None) except Exception: logger.warning("Could not save default_writer_data.") @@ -563,7 +590,7 @@ def save(self): handler.dump() # save - filename = Path('manager_obj').with_suffix('.pickle') + filename = Path("manager_obj").with_suffix(".pickle") filename = output_dir / filename filename.parent.mkdir(parents=True, exist_ok=True) try: @@ -574,7 +601,9 @@ def save(self): try: with filename.open("wb") as ff: dill.dump(self.__dict__, ff) - logger.info("Saved AgentManager({}) using dill.".format(self.agent_name)) + logger.info( + "Saved AgentManager({}) using dill.".format(self.agent_name) + ) except Exception as ex: logger.warning("[AgentManager] Instance cannot be pickled: " + str(ex)) @@ -582,15 +611,15 @@ def save(self): @classmethod def load(cls, filename): - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") obj = cls(None, None, None) try: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = pickle.load(ff) 
logger.info("Loaded AgentManager using pickle.") except Exception: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = dill.load(ff) logger.info("Loaded AgentManager using dill.") @@ -598,18 +627,20 @@ def load(cls, filename): obj.__dict__.update(tmp_dict) return obj - def optimize_hyperparams(self, - n_trials=256, - timeout=60, - n_fit=2, - n_optuna_workers=2, - optuna_parallelization='thread', - sampler_method='optuna_default', - pruner_method='halving', - continue_previous=False, - fit_fraction=1.0, - sampler_kwargs=None, - disable_evaluation_writers=True): + def optimize_hyperparams( + self, + n_trials=256, + timeout=60, + n_fit=2, + n_optuna_workers=2, + optuna_parallelization="thread", + sampler_method="optuna_default", + pruner_method="halving", + continue_previous=False, + fit_fraction=1.0, + sampler_kwargs=None, + disable_evaluation_writers=True, + ): """ Run hyperparameter optimization and updates init_kwargs with the best hyperparameters found. @@ -670,7 +701,7 @@ def optimize_hyperparams(self, # # setup # - TEMP_DIR = self.output_dir / 'optim' + TEMP_DIR = self.output_dir / "optim" global _OPTUNA_INSTALLED if not _OPTUNA_INSTALLED: logging.error("Optuna not installed.") @@ -689,42 +720,43 @@ def optimize_hyperparams(self, if sampler_kwargs is None: sampler_kwargs = {} # get sampler - if sampler_method == 'random': + if sampler_method == "random": sampler = optuna.samplers.RandomSampler() - elif sampler_method == 'grid': - assert sampler_kwargs is not None, \ - "To use GridSampler, " + \ - "a search_space dictionary must be provided." + elif sampler_method == "grid": + assert sampler_kwargs is not None, ( + "To use GridSampler, " + + "a search_space dictionary must be provided." + ) sampler = optuna.samplers.GridSampler(**sampler_kwargs) - elif sampler_method == 'cmaes': + elif sampler_method == "cmaes": sampler = optuna.samplers.CmaEsSampler(**sampler_kwargs) - elif sampler_method == 'optuna_default': + elif sampler_method == "optuna_default": sampler = optuna.samplers.TPESampler(**sampler_kwargs) else: raise NotImplementedError( - "Sampler method %s is not implemented." % sampler_method) + "Sampler method %s is not implemented." % sampler_method + ) # get pruner - if pruner_method == 'halving': + if pruner_method == "halving": pruner = optuna.pruners.SuccessiveHalvingPruner( - min_resource=1, - reduction_factor=4, - min_early_stopping_rate=0) - elif pruner_method == 'none': + min_resource=1, reduction_factor=4, min_early_stopping_rate=0 + ) + elif pruner_method == "none": pruner = None else: raise NotImplementedError( - "Pruner method %s is not implemented." % pruner_method) + "Pruner method %s is not implemented." 
% pruner_method + ) # storage self._init_optuna_storage_url() storage = optuna.storages.RDBStorage(self.optuna_storage_url) # optuna study - study = optuna.create_study(sampler=sampler, - pruner=pruner, - storage=storage, - direction='maximize') + study = optuna.create_study( + sampler=sampler, pruner=pruner, storage=storage, direction="maximize" + ) self.optuna_study = study # save, to that optimization can be resumed later @@ -744,11 +776,11 @@ def optimize_hyperparams(self, n_fit=n_fit, temp_dir=TEMP_DIR, # TEMP_DIR disable_evaluation_writers=disable_evaluation_writers, - fit_fraction=fit_fraction + fit_fraction=fit_fraction, ) try: - if optuna_parallelization == 'thread': + if optuna_parallelization == "thread": with concurrent.futures.ThreadPoolExecutor() as executor: for _ in range(n_optuna_workers): executor.submit( @@ -756,21 +788,26 @@ def optimize_hyperparams(self, objective, n_trials=n_trials, timeout=timeout, - gc_after_trial=True) + gc_after_trial=True, + ) executor.shutdown() - elif optuna_parallelization == 'process': + elif optuna_parallelization == "process": with concurrent.futures.ProcessPoolExecutor( - mp_context=multiprocessing.get_context('spawn')) as executor: + mp_context=multiprocessing.get_context("spawn") + ) as executor: for _ in range(n_optuna_workers): executor.submit( study.optimize, objective, n_trials=n_trials // n_optuna_workers, timeout=timeout, - gc_after_trial=True) + gc_after_trial=True, + ) executor.shutdown() else: - raise ValueError(f'Invalid value for optuna_parallelization: {optuna_parallelization}.') + raise ValueError( + f"Invalid value for optuna_parallelization: {optuna_parallelization}." + ) except KeyboardInterrupt: logger.warning("Evaluation stopped.") @@ -779,21 +816,21 @@ def optimize_hyperparams(self, try: shutil.rmtree(TEMP_DIR) except FileNotFoundError as ex: - logger.warning(f'Could not delete {TEMP_DIR}: {ex}') + logger.warning(f"Could not delete {TEMP_DIR}: {ex}") # continue try: best_trial = study.best_trial except ValueError as ex: - logger.error(f'Hyperparam optimization failed due to the error: {ex}') + logger.error(f"Hyperparam optimization failed due to the error: {ex}") return dict() - logger.info(f'Number of finished trials: {len(study.trials)}') - logger.info('Best trial:') - logger.info(f'Value: {best_trial.value}') - logger.info('Params:') + logger.info(f"Number of finished trials: {len(study.trials)}") + logger.info("Best trial:") + logger.info(f"Value: {best_trial.value}") + logger.info("Params:") for key, value in best_trial.params.items(): - logger.info(f' {key}: {value}') + logger.info(f" {key}: {value}") # store best parameters self.best_hyperparams = best_trial.params @@ -817,8 +854,17 @@ def _fit_worker(args): """ Create and fit an agent instance """ - (lock, agent_handler, agent_class, fit_budget, init_kwargs, - fit_kwargs, writer, worker_logging_level, seeder) = args + ( + lock, + agent_handler, + agent_class, + fit_budget, + init_kwargs, + fit_kwargs, + writer, + worker_logging_level, + seeder, + ) = args # reseed external libraries set_external_seed(seeder) @@ -833,13 +879,15 @@ def _fit_worker(args): # create agent agent = agent_class(**init_kwargs) # seed agent - agent.reseed(seeder) # TODO: check if extra reseeding here is necessary + agent.reseed(seeder) # TODO: check if extra reseeding here is necessary agent_handler.set_instance(agent) # set writer if writer[0] is None: agent_handler.set_writer(None) - elif writer[0] != 'default': # 'default' corresponds to DefaultWriter created by Agent.__init__() + 
elif ( + writer[0] != "default" + ): # 'default' corresponds to DefaultWriter created by Agent.__init__() writer_fn = writer[0] writer_kwargs = writer[1] agent_handler.set_writer(writer_fn(**writer_kwargs)) @@ -868,22 +916,22 @@ def _safe_serialize_json(obj, filename): def default(obj): return f"<>" - with open(filename, 'w') as fp: + with open(filename, "w") as fp: json.dump(obj, fp, sort_keys=True, indent=4, default=default) def _optuna_objective( - trial, - base_init_kwargs, # self._base_init_kwargs - agent_class, # self.agent_class - train_env, # self.train_env - eval_env, - fit_budget, # self.fit_budget - eval_kwargs, # self.eval_kwargs - n_fit, - temp_dir, # TEMP_DIR - disable_evaluation_writers, - fit_fraction + trial, + base_init_kwargs, # self._base_init_kwargs + agent_class, # self.agent_class + train_env, # self.train_env + eval_env, + fit_budget, # self.fit_budget + eval_kwargs, # self.eval_kwargs + n_fit, + temp_dir, # TEMP_DIR + disable_evaluation_writers, + fit_fraction, ): kwargs = deepcopy(base_init_kwargs) @@ -902,13 +950,14 @@ def _optuna_objective( eval_env=eval_env, init_kwargs=kwargs, # kwargs are being optimized eval_kwargs=deepcopy(eval_kwargs), - agent_name='optim', + agent_name="optim", n_fit=n_fit, - worker_logging_level='INFO', - parallelization='thread', + worker_logging_level="INFO", + parallelization="thread", output_dir=temp_dir, enable_tensorboard=False, - create_unique_out_dir=True) + create_unique_out_dir=True, + ) if disable_evaluation_writers: for ii in range(params_stats.n_fit): diff --git a/rlberry/manager/evaluation.py b/rlberry/manager/evaluation.py index dd3a64b47..8d56a9ae0 100644 --- a/rlberry/manager/evaluation.py +++ b/rlberry/manager/evaluation.py @@ -6,12 +6,14 @@ logger = logging.getLogger(__name__) -def evaluate_agents(agent_manager_list, - n_simulations=5, - fignum=None, - show=True, - plot=True, - sns_kwargs=None): +def evaluate_agents( + agent_manager_list, + n_simulations=5, + fignum=None, + show=True, + plot=True, + sns_kwargs=None, +): """ Evaluate and compare each of the agents in agent_manager_list. @@ -41,13 +43,15 @@ def evaluate_agents(agent_manager_list, eval_outputs = [] for agent_manager in agent_manager_list: - logger.info(f'Evaluating {agent_manager.agent_name}...') + logger.info(f"Evaluating {agent_manager.agent_name}...") outputs = agent_manager.eval_agents(n_simulations) if len(outputs) > 0: eval_outputs.append(outputs) if len(eval_outputs) == 0: - logger.error('[evaluate_agents]: No evaluation data. Make sure AgentManager.fit() has been called.') + logger.error( + "[evaluate_agents]: No evaluation data. Make sure AgentManager.fit() has been called." + ) return # @@ -85,14 +89,16 @@ def evaluate_agents(agent_manager_list, return output -def plot_writer_data(agent_manager, - tag, - xtag=None, - fignum=None, - show=True, - preprocess_func=None, - title=None, - sns_kwargs=None): +def plot_writer_data( + agent_manager, + tag, + xtag=None, + fignum=None, + show=True, + preprocess_func=None, + title=None, + sns_kwargs=None, +): """ Given a list of AgentManager, plot data (corresponding to info) obtained in each episode. The dictionary returned by agents' .fit() method must contain a key equal to `info`. @@ -120,11 +126,11 @@ def plot_writer_data(agent_manager, ------- Pandas DataFrame with processed data used by seaborn's lineplot. 
""" - sns_kwargs = sns_kwargs or {'ci': 'sd'} + sns_kwargs = sns_kwargs or {"ci": "sd"} title = title or tag if preprocess_func is not None: - ylabel = 'value' + ylabel = "value" else: ylabel = tag preprocess_func = preprocess_func or (lambda x: x) @@ -143,34 +149,36 @@ def plot_writer_data(agent_manager, if writer_data is not None: for idx in writer_data: df = writer_data[idx] - processed_df = pd.DataFrame(df[df['tag'] == tag]) - processed_df['value'] = preprocess_func(processed_df['value'].values) + processed_df = pd.DataFrame(df[df["tag"] == tag]) + processed_df["value"] = preprocess_func(processed_df["value"].values) # update name according to AgentManager name - processed_df['name'] = agent_name + processed_df["name"] = agent_name # add column with xtag, if given if xtag is not None: - df_xtag = pd.DataFrame(df[df['tag'] == xtag]) - processed_df[xtag] = df_xtag['value'].values + df_xtag = pd.DataFrame(df[df["tag"] == xtag]) + processed_df[xtag] = df_xtag["value"].values data_list.append(processed_df) if len(data_list) == 0: - logger.error('[plot_writer_data]: No data to be plotted.') + logger.error("[plot_writer_data]: No data to be plotted.") return all_writer_data = pd.concat(data_list, ignore_index=True) - data = all_writer_data[all_writer_data['tag'] == tag] + data = all_writer_data[all_writer_data["tag"] == tag] if xtag is None: - xtag = 'global_step' + xtag = "global_step" if data[xtag].notnull().sum() > 0: xx = xtag - if data['global_step'].isna().sum() > 0: - logger.warning(f'Plotting {tag} vs {xtag}, but {xtag} might be missing for some agents.') + if data["global_step"].isna().sum() > 0: + logger.warning( + f"Plotting {tag} vs {xtag}, but {xtag} might be missing for some agents." + ) else: xx = data.index plt.figure(fignum) - lineplot_kwargs = dict(x=xx, y='value', hue='name', style='name', data=data) + lineplot_kwargs = dict(x=xx, y="value", hue="name", style="name", data=data) lineplot_kwargs.update(sns_kwargs) sns.lineplot(**lineplot_kwargs) plt.title(title) diff --git a/rlberry/manager/multiple_managers.py b/rlberry/manager/multiple_managers.py index 9ca22429d..498b3f42f 100644 --- a/rlberry/manager/multiple_managers.py +++ b/rlberry/manager/multiple_managers.py @@ -40,15 +40,11 @@ def run(self, save=False): with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] for inst in self.instances: - futures.append( - executor.submit(fit_stats, inst, save=save) - ) + futures.append(executor.submit(fit_stats, inst, save=save)) fitted_instances = [] for future in concurrent.futures.as_completed(futures): - fitted_instances.append( - future.result() - ) + fitted_instances.append(future.result()) self.instances = fitted_instances diff --git a/rlberry/manager/remote_agent_manager.py b/rlberry/manager/remote_agent_manager.py index 5d296bf34..397252c80 100644 --- a/rlberry/manager/remote_agent_manager.py +++ b/rlberry/manager/remote_agent_manager.py @@ -26,6 +26,7 @@ class RemoteAgentManager: Parameters for AgentManager instance. Some parameters (as agent_class, train_env, eval_env) can be defined using a ResourceRequest. 
""" + def __init__( self, client: BerryClient, @@ -46,13 +47,11 @@ def __init__( if msg.command == interface.Command.RAISE_EXCEPTION: raise Exception(msg.message) - self._remote_agent_manager_filename = pathlib.Path( - msg.info['filename'] - ) + self._remote_agent_manager_filename = pathlib.Path(msg.info["filename"]) # get useful attributes - self.agent_name = msg.info['agent_name'] - self.output_dir = pathlib.Path(msg.info['output_dir']) # to save locally + self.agent_name = msg.info["agent_name"] + self.output_dir = pathlib.Path(msg.info["output_dir"]) # to save locally def set_client(self, client: BerryClient): self._client = client @@ -75,21 +74,25 @@ def get_writer_data(self): ) if msg.command == interface.Command.RAISE_EXCEPTION: raise Exception(msg.message) - raw_data = msg.data['writer_data'] + raw_data = msg.data["writer_data"] writer_data = dict() for idx in raw_data: csv_content = raw_data[idx] - writer_data[idx] = pd.read_csv(io.StringIO(csv_content), sep=',') + writer_data[idx] = pd.read_csv(io.StringIO(csv_content), sep=",") # check if tensorboard data was received # If so, read file and unzip it. - tensorboard_bin_data = msg.data['tensorboard_bin_data'] + tensorboard_bin_data = msg.data["tensorboard_bin_data"] if tensorboard_bin_data is not None: - tensorboard_bin_data = base64.b64decode(tensorboard_bin_data.encode('ascii')) - zip_file = open(self.output_dir / 'tensorboard_data.zip', "wb") + tensorboard_bin_data = base64.b64decode( + tensorboard_bin_data.encode("ascii") + ) + zip_file = open(self.output_dir / "tensorboard_data.zip", "wb") zip_file.write(tensorboard_bin_data) zip_file.close() - with zipfile.ZipFile(self.output_dir / 'tensorboard_data.zip', 'r') as zip_ref: + with zipfile.ZipFile( + self.output_dir / "tensorboard_data.zip", "r" + ) as zip_ref: zip_ref.extractall(self.output_dir) return writer_data @@ -98,9 +101,8 @@ def fit(self, budget=None, **kwargs): interface.Message.create( command=interface.Command.AGENT_MANAGER_FIT, params=dict( - filename=self.remote_file, - budget=budget, - extra_params=kwargs), + filename=self.remote_file, budget=budget, extra_params=kwargs + ), data=None, ) ) @@ -111,15 +113,13 @@ def eval_agents(self, n_simulations: Optional[int] = None): msg = self._client.send( interface.Message.create( command=interface.Command.AGENT_MANAGER_EVAL, - params=dict( - filename=self.remote_file, - n_simulations=n_simulations), + params=dict(filename=self.remote_file, n_simulations=n_simulations), data=None, ) ) if msg.command == interface.Command.RAISE_EXCEPTION: raise Exception(msg.message) - out = msg.data['output'] + out = msg.data["output"] return out def clear_output_dir(self): @@ -146,11 +146,7 @@ def clear_handlers(self): def set_writer(self, idx, writer_fn, writer_kwargs=None): """Note: Use ResourceRequest for writer_fn.""" - params = dict( - idx=idx, - writer_fn=writer_fn, - writer_kwargs=writer_kwargs - ) + params = dict(idx=idx, writer_fn=writer_fn, writer_kwargs=writer_kwargs) msg = self._client.send( interface.Message.create( command=interface.Command.AGENT_MANAGER_SET_WRITER, @@ -189,34 +185,40 @@ def save(self): output_dir.mkdir(parents=True, exist_ok=True) # save - filename = pathlib.Path('remote_manager_obj').with_suffix('.pickle') + filename = pathlib.Path("remote_manager_obj").with_suffix(".pickle") filename = output_dir / filename filename.parent.mkdir(parents=True, exist_ok=True) try: with filename.open("wb") as ff: pickle.dump(self.__dict__, ff) - logger.info("Saved RemoteAgentManager({}) using pickle.".format(self.agent_name)) 
+ logger.info( + "Saved RemoteAgentManager({}) using pickle.".format(self.agent_name) + ) except Exception: try: with filename.open("wb") as ff: dill.dump(self.__dict__, ff) - logger.info("Saved RemoteAgentManager({}) using dill.".format(self.agent_name)) + logger.info( + "Saved RemoteAgentManager({}) using dill.".format(self.agent_name) + ) except Exception as ex: - logger.warning("[RemoteAgentManager] Instance cannot be pickled: " + str(ex)) + logger.warning( + "[RemoteAgentManager] Instance cannot be pickled: " + str(ex) + ) return filename @classmethod def load(cls, filename): - filename = pathlib.Path(filename).with_suffix('.pickle') + filename = pathlib.Path(filename).with_suffix(".pickle") obj = cls(None) try: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = pickle.load(ff) logger.info("Loaded RemoteAgentManager using pickle.") except Exception: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = dill.load(ff) logger.info("Loaded RemoteAgentManager using dill.") diff --git a/rlberry/manager/tests/test_agent_manager.py b/rlberry/manager/tests/test_agent_manager.py index 22500a92c..4de7f56b8 100644 --- a/rlberry/manager/tests/test_agent_manager.py +++ b/rlberry/manager/tests/test_agent_manager.py @@ -5,11 +5,7 @@ class DummyAgent(AgentWithSimplePolicy): - def __init__(self, - env, - hyperparameter1=0, - hyperparameter2=0, - **kwargs): + def __init__(self, env, hyperparameter1=0, hyperparameter2=0, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.name = "DummyAgent" self.fitted = False @@ -24,8 +20,8 @@ def fit(self, budget, **kwargs): self.total_budget += budget for ii in range(budget): if self.writer is not None: - self.writer.add_scalar('a', 5) - self.writer.add_scalar('b', 6, ii) + self.writer.add_scalar("a", 5) + self.writer.add_scalar("b", 6, ii) return None def policy(self, observation): @@ -33,12 +29,9 @@ def policy(self, observation): @classmethod def sample_parameters(cls, trial): - hyperparameter1 \ - = trial.suggest_categorical('hyperparameter1', [1, 2, 3]) - hyperparameter2 \ - = trial.suggest_uniform('hyperparameter2', -10, 10) - return {'hyperparameter1': hyperparameter1, - 'hyperparameter2': hyperparameter2} + hyperparameter1 = trial.suggest_categorical("hyperparameter1", [1, 2, 3]) + hyperparameter2 = trial.suggest_uniform("hyperparameter2", -10, 10) + return {"hyperparameter1": hyperparameter1, "hyperparameter2": hyperparameter2} def test_agent_manager_1(): @@ -57,11 +50,24 @@ def test_agent_manager_1(): # Run AgentManager params_per_instance = [dict(hyperparameter2=ii) for ii in range(4)] stats_agent1 = AgentManager( - DummyAgent, train_env, fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, seed=123, init_kwargs_per_instance=params_per_instance) + DummyAgent, + train_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + init_kwargs_per_instance=params_per_instance, + ) stats_agent2 = AgentManager( - DummyAgent, train_env, fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, seed=123) + DummyAgent, + train_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + ) agent_manager_list = [stats_agent1, stats_agent2] for st in agent_manager_list: st.fit() @@ -75,7 +81,7 @@ def test_agent_manager_1(): assert instance.hyperparameter2 == 100 # learning curves - plot_writer_data(agent_manager_list, tag='episode_rewards', show=False) + plot_writer_data(agent_manager_list, 
tag="episode_rewards", show=False) # compare final policies evaluate_agents(agent_manager_list, show=False) @@ -110,15 +116,25 @@ def test_agent_manager_2(): # Run AgentManager stats_agent1 = AgentManager( - DummyAgent, train_env, eval_env=eval_env, - fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, - seed=123) + DummyAgent, + train_env, + eval_env=eval_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + ) stats_agent2 = AgentManager( - DummyAgent, train_env, eval_env=eval_env, - fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, - seed=123) + DummyAgent, + train_env, + eval_env=eval_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + ) agent_manager_list = [stats_agent1, stats_agent2] for st in agent_manager_list: st.fit() @@ -128,7 +144,7 @@ def test_agent_manager_2(): evaluate_agents(agent_manager_list, show=False) # learning curves - plot_writer_data(agent_manager_list, tag='episode_rewards', show=False) + plot_writer_data(agent_manager_list, tag="episode_rewards", show=False) # check if fitted for agent_manager in agent_manager_list: @@ -154,7 +170,10 @@ def test_agent_manager_2(): def test_agent_manager_partial_fit_and_tuple_env(): # Define train and evaluation envs - train_env = (GridWorld, None) # tuple (constructor, kwargs) must also work in AgentManager + train_env = ( + GridWorld, + None, + ) # tuple (constructor, kwargs) must also work in AgentManager # Parameters params = {} @@ -162,15 +181,23 @@ def test_agent_manager_partial_fit_and_tuple_env(): # Run AgentManager stats = AgentManager( - DummyAgent, train_env, - init_kwargs=params, n_fit=4, - fit_budget=5, eval_kwargs=eval_kwargs, - seed=123) + DummyAgent, + train_env, + init_kwargs=params, + n_fit=4, + fit_budget=5, + eval_kwargs=eval_kwargs, + seed=123, + ) stats2 = AgentManager( - DummyAgent, train_env, - init_kwargs=params, n_fit=4, - fit_budget=5, eval_kwargs=eval_kwargs, - seed=123) + DummyAgent, + train_env, + init_kwargs=params, + n_fit=4, + fit_budget=5, + eval_kwargs=eval_kwargs, + seed=123, + ) # Run partial fit stats.fit(10) @@ -182,7 +209,9 @@ def test_agent_manager_partial_fit_and_tuple_env(): stats2.fit() # learning curves - plot_writer_data([stats], tag='episode_rewards', show=False, preprocess_func=np.cumsum) + plot_writer_data( + [stats], tag="episode_rewards", show=False, preprocess_func=np.cumsum + ) # compare final policies evaluate_agents([stats], show=False) diff --git a/rlberry/manager/tests/test_agent_manager_seeding.py b/rlberry/manager/tests/test_agent_manager_seeding.py index aea00ace0..8f7cc1a08 100644 --- a/rlberry/manager/tests/test_agent_manager_seeding.py +++ b/rlberry/manager/tests/test_agent_manager_seeding.py @@ -8,30 +8,24 @@ import pytest -@pytest.mark.parametrize("env, agent_class", - [ - ((MountainCar, {}), RSUCBVIAgent), - ((gym_make, {'id': 'MountainCar-v0'}), RSUCBVIAgent), - ((gym.make, {'id': 'MountainCar-v0'}), RSUCBVIAgent), - ((MountainCar, {}), A2CAgent), - ((gym_make, {'id': 'MountainCar-v0'}), A2CAgent), - ((gym.make, {'id': 'MountainCar-v0'}), A2CAgent) - ]) +@pytest.mark.parametrize( + "env, agent_class", + [ + ((MountainCar, {}), RSUCBVIAgent), + ((gym_make, {"id": "MountainCar-v0"}), RSUCBVIAgent), + ((gym.make, {"id": "MountainCar-v0"}), RSUCBVIAgent), + ((MountainCar, {}), A2CAgent), + ((gym_make, {"id": "MountainCar-v0"}), A2CAgent), + ((gym.make, {"id": "MountainCar-v0"}), A2CAgent), + ], +) def 
test_agent_manager_and_multiple_managers_seeding(env, agent_class): agent_manager = AgentManager( - agent_class, - env, - fit_budget=2, - init_kwargs={'horizon': 10}, - n_fit=6, - seed=3456) + agent_class, env, fit_budget=2, init_kwargs={"horizon": 10}, n_fit=6, seed=3456 + ) agent_manager_test = AgentManager( - agent_class, - env, - fit_budget=2, - init_kwargs={'horizon': 10}, - n_fit=6, - seed=3456) + agent_class, env, fit_budget=2, init_kwargs={"horizon": 10}, n_fit=6, seed=3456 + ) multimanagers = MultipleManagers() multimanagers.append(agent_manager) diff --git a/rlberry/manager/tests/test_hyperparam_optim.py b/rlberry/manager/tests/test_hyperparam_optim.py index 1fa00d514..47d3134d5 100644 --- a/rlberry/manager/tests/test_hyperparam_optim.py +++ b/rlberry/manager/tests/test_hyperparam_optim.py @@ -6,11 +6,7 @@ class DummyAgent(AgentWithSimplePolicy): - def __init__(self, - env, - hyperparameter1=0, - hyperparameter2=0, - **kwargs): + def __init__(self, env, hyperparameter1=0, hyperparameter2=0, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.name = "DummyAgent" self.fitted = False @@ -29,12 +25,9 @@ def policy(self, observation): @classmethod def sample_parameters(cls, trial): - hyperparameter1 \ - = trial.suggest_categorical('hyperparameter1', [1, 2, 3]) - hyperparameter2 \ - = trial.suggest_uniform('hyperparameter2', -10, 10) - return {'hyperparameter1': hyperparameter1, - 'hyperparameter2': hyperparameter2} + hyperparameter1 = trial.suggest_categorical("hyperparameter1", [1, 2, 3]) + hyperparameter2 = trial.suggest_uniform("hyperparameter2", -10, 10) + return {"hyperparameter1": hyperparameter1, "hyperparameter2": hyperparameter2} def test_hyperparam_optim_tpe(): @@ -42,12 +35,14 @@ def test_hyperparam_optim_tpe(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - fit_budget=1, - init_kwargs={}, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + fit_budget=1, + init_kwargs={}, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with TPE sampler # using hyperopt default values @@ -61,12 +56,14 @@ def test_hyperparam_optim_random(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - init_kwargs={}, - fit_budget=1, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + init_kwargs={}, + fit_budget=1, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with random sampler stats_agent.optimize_hyperparams(sampler_method="random", n_trials=5) @@ -78,20 +75,21 @@ def test_hyperparam_optim_grid(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - init_kwargs={}, - fit_budget=1, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + init_kwargs={}, + fit_budget=1, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with grid sampler - search_space = {"hyperparameter1": [1, 2, 3], - "hyperparameter2": [-5, 0, 5]} + search_space = {"hyperparameter1": [1, 2, 3], "hyperparameter2": [-5, 0, 5]} sampler_kwargs = {"search_space": search_space} - stats_agent.optimize_hyperparams(n_trials=3 * 3, - sampler_method="grid", - sampler_kwargs=sampler_kwargs) + stats_agent.optimize_hyperparams( + n_trials=3 * 3, sampler_method="grid", sampler_kwargs=sampler_kwargs + ) 
stats_agent.clear_output_dir() @@ -100,12 +98,14 @@ def test_hyperparam_optim_cmaes(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - init_kwargs={}, - fit_budget=1, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + init_kwargs={}, + fit_budget=1, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with CMA-ES sampler stats_agent.optimize_hyperparams(sampler_method="cmaes", n_trials=5) @@ -119,27 +119,35 @@ def sample_parameters(cls, trial): """ Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/) """ - gamma = trial.suggest_categorical('gamma', [0.1, 0.99]) - return {'gamma': gamma} - - env = (GridWorld, dict( - nrows=3, ncols=10, - reward_at={(1, 1): 0.1, (2, 9): 1.0}, - walls=((1, 4), (2, 4), (1, 5)), - success_probability=0.9)) - - vi_params = {'gamma': 0.1, 'epsilon': 1e-3} - - vi_stats = AgentManager(ValueIterationAgentToOptimize, - env, - fit_budget=0, - eval_kwargs=dict(eval_horizon=20), - init_kwargs=vi_params, - n_fit=4, - seed=123) - - vi_stats.optimize_hyperparams(n_trials=5, n_fit=1, - sampler_method='random', pruner_method='none') + gamma = trial.suggest_categorical("gamma", [0.1, 0.99]) + return {"gamma": gamma} + + env = ( + GridWorld, + dict( + nrows=3, + ncols=10, + reward_at={(1, 1): 0.1, (2, 9): 1.0}, + walls=((1, 4), (2, 4), (1, 5)), + success_probability=0.9, + ), + ) + + vi_params = {"gamma": 0.1, "epsilon": 1e-3} + + vi_stats = AgentManager( + ValueIterationAgentToOptimize, + env, + fit_budget=0, + eval_kwargs=dict(eval_horizon=20), + init_kwargs=vi_params, + n_fit=4, + seed=123, + ) + + vi_stats.optimize_hyperparams( + n_trials=5, n_fit=1, sampler_method="random", pruner_method="none" + ) assert vi_stats.optuna_study vi_stats.clear_output_dir() diff --git a/rlberry/manager/utils.py b/rlberry/manager/utils.py index a3c4fc409..347ed0ed2 100644 --- a/rlberry/manager/utils.py +++ b/rlberry/manager/utils.py @@ -6,7 +6,7 @@ def create_database(db_file): connection = None try: connection = sqlite3.connect(db_file) - print(f'Connected to {db_file} (sqlite3 version = {sqlite3.version})') + print(f"Connected to {db_file} (sqlite3 version = {sqlite3.version})") except sqlite3.Error as err: print(err) diff --git a/rlberry/metadata_utils.py b/rlberry/metadata_utils.py index 7a4302451..a443b6420 100644 --- a/rlberry/metadata_utils.py +++ b/rlberry/metadata_utils.py @@ -5,10 +5,10 @@ # Default output directory used by the library. -RLBERRY_DEFAULT_DATA_DIR = 'rlberry_data/' +RLBERRY_DEFAULT_DATA_DIR = "rlberry_data/" # Temporary directory used by the library -RLBERRY_TEMP_DATA_DIR = 'rlberry_data/temp/' +RLBERRY_TEMP_DATA_DIR = "rlberry_data/temp/" def get_unique_id(obj): @@ -19,7 +19,7 @@ def get_unique_id(obj): # uuid4() is an universal id, but there might be issues if called simultaneously in different processes. # This function combines id(), uuid4(), and a timestamp in a single ID, and hashes it. timestamp = datetime.timestamp(datetime.now()) - timestamp = str(timestamp).replace('.', '') + timestamp = str(timestamp).replace(".", "") str_id = timestamp + str(id(obj)) + uuid.uuid4().hex str_id = hashlib.md5(str_id.encode()).hexdigest() return str_id @@ -38,5 +38,6 @@ class ExecutionMetadata(NamedTuple): obj_info : dict, default: None Extra info about the object. 
""" + obj_worker_id: int = -1 obj_info: Optional[dict] = None diff --git a/rlberry/network/client.py b/rlberry/network/client.py index ea8cc3036..51e5ffba3 100644 --- a/rlberry/network/client.py +++ b/rlberry/network/client.py @@ -6,7 +6,7 @@ from rlberry.network.utils import serialize_message -class BerryClient(): +class BerryClient: """ rlberry client @@ -17,9 +17,10 @@ class BerryClient(): port : int Integer from 1-65535 """ + def __init__( self, - host='127.0.0.1', + host="127.0.0.1", port: int = 65432, ) -> None: assert port >= 1 and port <= 65535 @@ -27,9 +28,9 @@ def __init__( self._port = port def send( - self, - *messages: interface.Message, - print_response: bool = False, + self, + *messages: interface.Message, + print_response: bool = False, ) -> Union[List[interface.Message], interface.Message]: returned_messages = [] pp = pprint.PrettyPrinter(indent=4) diff --git a/rlberry/network/interface.py b/rlberry/network/interface.py index 7ee4b5cc8..929a3f366 100644 --- a/rlberry/network/interface.py +++ b/rlberry/network/interface.py @@ -2,22 +2,22 @@ from typing import Any, Dict, Mapping, NamedTuple, Optional -REQUEST_PREFIX = 'ResourceRequest_' +REQUEST_PREFIX = "ResourceRequest_" class Command: - NONE = 'NONE' - RAISE_EXCEPTION = 'RAISE_EXCEPTION' - ECHO = 'ECHO' - LIST_RESOURCES = 'LIST_RESOURCES' - AGENT_MANAGER_CREATE_INSTANCE = 'AGENT_MANAGER_CREATE_INSTANCE' - AGENT_MANAGER_FIT = 'AGENT_MANAGER_FIT' - AGENT_MANAGER_EVAL = 'AGENT_MANAGER_EVAL' - AGENT_MANAGER_CLEAR_OUTPUT_DIR = 'AGENT_MANAGER_CLEAR_OUTPUT_DIR' - AGENT_MANAGER_CLEAR_HANDLERS = 'AGENT_MANAGER_CLEAR_HANDLERS' - AGENT_MANAGER_SET_WRITER = 'AGENT_MANAGER_SET_WRITER' - AGENT_MANAGER_OPTIMIZE_HYPERPARAMS = 'AGENT_MANAGER_OPTIMIZE_HYPERPARAMS' - AGENT_MANAGER_GET_WRITER_DATA = 'AGENT_MANAGER_GET_WRITER_DATA' + NONE = "NONE" + RAISE_EXCEPTION = "RAISE_EXCEPTION" + ECHO = "ECHO" + LIST_RESOURCES = "LIST_RESOURCES" + AGENT_MANAGER_CREATE_INSTANCE = "AGENT_MANAGER_CREATE_INSTANCE" + AGENT_MANAGER_FIT = "AGENT_MANAGER_FIT" + AGENT_MANAGER_EVAL = "AGENT_MANAGER_EVAL" + AGENT_MANAGER_CLEAR_OUTPUT_DIR = "AGENT_MANAGER_CLEAR_OUTPUT_DIR" + AGENT_MANAGER_CLEAR_HANDLERS = "AGENT_MANAGER_CLEAR_HANDLERS" + AGENT_MANAGER_SET_WRITER = "AGENT_MANAGER_SET_WRITER" + AGENT_MANAGER_OPTIMIZE_HYPERPARAMS = "AGENT_MANAGER_OPTIMIZE_HYPERPARAMS" + AGENT_MANAGER_GET_WRITER_DATA = "AGENT_MANAGER_GET_WRITER_DATA" class BerryServerInfo: @@ -26,7 +26,7 @@ class BerryServerInfo: class Message(NamedTuple): - message: Optional[str] = '' + message: Optional[str] = "" command: Optional[Command] = None params: Optional[Mapping[str, Any]] = None data: Optional[Mapping[str, Any]] = None @@ -37,13 +37,14 @@ def to_dict(self): @classmethod def create( - cls, - message: Optional[str] = '', - command: Optional[Command] = None, - params: Optional[Mapping[str, Any]] = None, - data: Optional[Mapping[str, Any]] = None, - info: Optional[Mapping[str, Any]] = None): - command = command or '' + cls, + message: Optional[str] = "", + command: Optional[Command] = None, + params: Optional[Mapping[str, Any]] = None, + data: Optional[Mapping[str, Any]] = None, + info: Optional[Mapping[str, Any]] = None, + ): + command = command or "" params = params or dict() data = data or dict() info = info or dict() @@ -81,8 +82,8 @@ def send_data(socket, data): """ adapted from: https://stackoverflow.com/a/63532988 """ - print(f'[rlberry.network] sending {len(data)} bytes...') - socket.sendall(struct.pack('>I', len(data)) + data) + print(f"[rlberry.network] sending {len(data)} bytes...") 
+ socket.sendall(struct.pack(">I", len(data)) + data) def receive_data(socket): @@ -92,11 +93,11 @@ def receive_data(socket): data_size_packed = socket.recv(4) if not data_size_packed: return data_size_packed - data_size = struct.unpack('>I', data_size_packed)[0] + data_size = struct.unpack(">I", data_size_packed)[0] received_data = b"" remaining_size = min(next_power_of_two(data_size), 4096) while remaining_size > 0: received_data += socket.recv(remaining_size) remaining_size = data_size - len(received_data) - print(f'[rlberry.network] ... received {len(received_data)}/{data_size} bytes.') + print(f"[rlberry.network] ... received {len(received_data)}/{data_size} bytes.") return received_data diff --git a/rlberry/network/server.py b/rlberry/network/server.py index 5c737fc1b..ab0926416 100644 --- a/rlberry/network/server.py +++ b/rlberry/network/server.py @@ -5,7 +5,11 @@ import json import rlberry.network.server_utils as server_utils from rlberry.network import interface -from rlberry.network.utils import apply_fn_to_tree, map_request_to_obj, serialize_message +from rlberry.network.utils import ( + apply_fn_to_tree, + map_request_to_obj, + serialize_message, +) from rlberry.envs import gym_make from typing import Optional @@ -18,14 +22,16 @@ def __init__(self, client_socket, client_address, resources, timeout): self._socket = client_socket self._address = client_address self._resources = resources - self._logger = logging.getLogger('ClientHandler') + self._logger = logging.getLogger("ClientHandler") self._timeout = timeout def _process_message(self, message: interface.Message): """Replace resource requests in 'message' by available resources.""" message = message.to_dict() message = apply_fn_to_tree( - lambda key, val: map_request_to_obj(key, val, self._resources), message, apply_to_nodes=True + lambda key, val: map_request_to_obj(key, val, self._resources), + message, + apply_to_nodes=True, ) return interface.Message.from_dict(message) @@ -39,8 +45,8 @@ def _execute_message(self, message: interface.Message): interface.send_data(self._socket, serialize_message(response)) except Exception as ex: response = interface.Message.create( - command=interface.Command.RAISE_EXCEPTION, - message=str(ex)) + command=interface.Command.RAISE_EXCEPTION, message=str(ex) + ) interface.send_data(self._socket, serialize_message(response)) return 1 return 0 @@ -49,7 +55,9 @@ def run(self): with self._socket: try: while True: - print(f'\n Handling client @ {self._address}') + print( + f"\n Handling client @ {self._address}" + ) self._socket.settimeout(self._timeout) message_bytes = interface.receive_data(self._socket) if not message_bytes: @@ -57,17 +65,17 @@ def run(self): # process bytes message = interface.Message.from_dict(json.loads(message_bytes)) message = self._process_message(message) - print(f' Received message: \n{message}') + print(f" Received message: \n{message}") # execute message commands and send back a response self._execute_message(message) except Exception as ex: - print(f' [ERROR]: {ex}') + print(f" [ERROR]: {ex}") self._logger.exception(ex) finally: - print(f' Finished client @ {self._address}') + print(f" Finished client @ {self._address}") -class BerryServer(): +class BerryServer: """ rlberry server @@ -87,9 +95,10 @@ class BerryServer(): Number of received client sockets after which to terminate the server. If None, does not terminate. 
""" + def __init__( self, - host='127.0.0.1', + host="127.0.0.1", port: int = 65432, backlog: int = 5, resources: Optional[interface.Resources] = None, @@ -109,42 +118,55 @@ def __init__( # Define basic resources if resources is None: self._resources = dict( - gym_make=interface.ResourceItem( - obj=gym_make, - description='gym_make'), + gym_make=interface.ResourceItem(obj=gym_make, description="gym_make"), ) else: for _, val in resources.items(): - if set(val.keys()) != set(['obj', 'description']): + if set(val.keys()) != set(["obj", "description"]): raise ValueError( "resources items must be a dictionary with keys ['obj', 'description']." - f" Received: {list(val.keys())}") + f" Received: {list(val.keys())}" + ) def start(self): - print(f'\n\nStarting BerryServer @ (host, port) = ({self._host}, {self._port}).\n\n') + print( + f"\n\nStarting BerryServer @ (host, port) = ({self._host}, {self._port}).\n\n" + ) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((self._host, self._port)) s.listen(self._backlog) - with concurrent.futures.ProcessPoolExecutor(mp_context=multiprocessing.get_context('spawn')) as executor: + with concurrent.futures.ProcessPoolExecutor( + mp_context=multiprocessing.get_context("spawn") + ) as executor: futures = [] while True: - print(f' BerryServer({self._host}, {self._port}): waiting for connection...') - client_socket, client_address = s.accept() # wait for connection + print( + f" BerryServer({self._host}, {self._port}): waiting for connection..." + ) + client_socket, client_address = s.accept() # wait for connection self._client_socket_counter += 1 client_handler = ClientHandler( client_socket, client_address, self._resources, - self._client_socket_timeout) - print(f' BerryServer({self._host}, {self._port}): ' - f'new client @ {client_address}') + self._client_socket_timeout, + ) + print( + f" BerryServer({self._host}, {self._port}): " + f"new client @ {client_address}" + ) futures.append(executor.submit(client_handler.run)) - if self._terminate_after and self._client_socket_counter >= self._terminate_after: - print(' Terminating server (main process): ' - 'reached max number of client sockets.') + if ( + self._terminate_after + and self._client_socket_counter >= self._terminate_after + ): + print( + " Terminating server (main process): " + "reached max number of client sockets." 
+ ) break -if __name__ == '__main__': +if __name__ == "__main__": server = BerryServer() server.start() diff --git a/rlberry/network/server_utils.py b/rlberry/network/server_utils.py index e5beed83c..4a63ee0fd 100644 --- a/rlberry/network/server_utils.py +++ b/rlberry/network/server_utils.py @@ -7,38 +7,40 @@ def execute_message( - message: interface.Message, - resources: interface.Resources) -> interface.Message: + message: interface.Message, resources: interface.Resources +) -> interface.Message: response = interface.Message.create(command=interface.Command.ECHO) # LIST_RESOURCES if message.command == interface.Command.LIST_RESOURCES: info = {} for rr in resources: - info[rr] = resources[rr]['description'] + info[rr] = resources[rr]["description"] response = interface.Message.create(info=info) # AGENT_MANAGER_CREATE_INSTANCE elif message.command == interface.Command.AGENT_MANAGER_CREATE_INSTANCE: params = message.params base_dir = pathlib.Path(metadata_utils.RLBERRY_DEFAULT_DATA_DIR) - if 'output_dir' in params: - params['output_dir'] = base_dir / 'server_data' / params['output_dir'] + if "output_dir" in params: + params["output_dir"] = base_dir / "server_data" / params["output_dir"] else: - params['output_dir'] = base_dir / 'server_data/' + params["output_dir"] = base_dir / "server_data/" agent_manager = AgentManager(**params) filename = str(agent_manager.save()) response = interface.Message.create( info=dict( filename=filename, agent_name=agent_manager.agent_name, - output_dir=str(agent_manager.output_dir).replace('server_data/', 'client_data/') + output_dir=str(agent_manager.output_dir).replace( + "server_data/", "client_data/" + ), ) ) del agent_manager # AGENT_MANAGER_FIT elif message.command == interface.Command.AGENT_MANAGER_FIT: - filename = message.params['filename'] - budget = message.params['budget'] - extra_params = message.params['extra_params'] + filename = message.params["filename"] + budget = message.params["budget"] + extra_params = message.params["extra_params"] agent_manager = AgentManager.load(filename) agent_manager.fit(budget, **extra_params) agent_manager.save() @@ -46,45 +48,49 @@ def execute_message( del agent_manager # AGENT_MANAGER_EVAL elif message.command == interface.Command.AGENT_MANAGER_EVAL: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) - eval_output = agent_manager.eval_agents(message.params['n_simulations']) + eval_output = agent_manager.eval_agents(message.params["n_simulations"]) response = interface.Message.create(data=dict(output=eval_output)) del agent_manager # AGENT_MANAGER_CLEAR_OUTPUT_DIR elif message.command == interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) agent_manager.clear_output_dir() - response = interface.Message.create(message=f'Cleared output dir: {agent_manager.output_dir}') + response = interface.Message.create( + message=f"Cleared output dir: {agent_manager.output_dir}" + ) del agent_manager # AGENT_MANAGER_CLEAR_HANDLERS elif message.command == interface.Command.AGENT_MANAGER_CLEAR_HANDLERS: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) agent_manager.clear_handlers() agent_manager.save() - response = interface.Message.create(message=f'Cleared handlers: {filename}') + response = interface.Message.create(message=f"Cleared handlers: {filename}") del 
agent_manager # AGENT_MANAGER_SET_WRITER elif message.command == interface.Command.AGENT_MANAGER_SET_WRITER: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) - agent_manager.set_writer(**message.params['kwargs']) + agent_manager.set_writer(**message.params["kwargs"]) agent_manager.save() del agent_manager # AGENT_MANAGER_OPTIMIZE_HYPERPARAMS elif message.command == interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) - best_params_dict = agent_manager.optimize_hyperparams(**message.params['kwargs']) + best_params_dict = agent_manager.optimize_hyperparams( + **message.params["kwargs"] + ) agent_manager.save() del agent_manager response = interface.Message.create(data=best_params_dict) # AGENT_MANAGER_GET_WRITER_DATA elif message.command == interface.Command.AGENT_MANAGER_GET_WRITER_DATA: # writer scalar data - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) writer_data = agent_manager.get_writer_data() writer_data = writer_data or dict() @@ -95,14 +101,17 @@ def execute_message( if agent_manager.tensorboard_dir is not None: tensorboard_zip_file = rlberry.utils.io.zipdir( agent_manager.tensorboard_dir, - agent_manager.output_dir / 'tensorboard_data.zip') + agent_manager.output_dir / "tensorboard_data.zip", + ) if tensorboard_zip_file is not None: tensorboard_bin_data = open(tensorboard_zip_file, "rb").read() - tensorboard_bin_data = base64.b64encode(tensorboard_bin_data).decode('ascii') + tensorboard_bin_data = base64.b64encode(tensorboard_bin_data).decode( + "ascii" + ) response = interface.Message.create( data=dict( - writer_data=writer_data, - tensorboard_bin_data=tensorboard_bin_data) + writer_data=writer_data, tensorboard_bin_data=tensorboard_bin_data + ) ) del agent_manager # end diff --git a/rlberry/network/utils.py b/rlberry/network/utils.py index a2feb7fb1..67e2ae1f7 100644 --- a/rlberry/network/utils.py +++ b/rlberry/network/utils.py @@ -4,18 +4,21 @@ from typing import Any, Callable, Mapping, Optional, Tuple, Union -Tree = Union[Any, Tuple, Mapping[Any, 'Tree']] +Tree = Union[Any, Tuple, Mapping[Any, "Tree"]] def apply_fn_to_tree( - fn: Callable[[Any, Any], Tuple[Any, Any]], - tree: Tree, - is_leaf: Optional[Callable[[Any], Any]] = None, - apply_to_nodes: Optional[bool] = False): + fn: Callable[[Any, Any], Tuple[Any, Any]], + tree: Tree, + is_leaf: Optional[Callable[[Any], Any]] = None, + apply_to_nodes: Optional[bool] = False, +): """ new_key, new_val = fn(key, my_dict[key]) """ - is_leaf = is_leaf or (lambda x: not isinstance(x, Mapping) and not isinstance(x, Tuple)) + is_leaf = is_leaf or ( + lambda x: not isinstance(x, Mapping) and not isinstance(x, Tuple) + ) if is_leaf(tree): return deepcopy(tree) if isinstance(tree, Mapping): @@ -27,12 +30,16 @@ def apply_fn_to_tree( new_key, new_val = fn(key, tree[key]) new_tree.pop(key) new_tree[new_key] = new_val - return {key: apply_fn_to_tree( - fn, val, is_leaf, apply_to_nodes) for (key, val) in new_tree.items()} + return { + key: apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) + for (key, val) in new_tree.items() + } elif isinstance(tree, Tuple): - return tuple([apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) for val in tree]) + return tuple( + [apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) for val in tree] + ) else: - raise RuntimeError('Tree is not a 
Mapping or Tuple.') + raise RuntimeError("Tree is not a Mapping or Tuple.") def _map_resource_request_to_dict(key, val): @@ -46,29 +53,31 @@ def _map_resource_request_to_dict(key, val): def map_request_to_obj(key, val, resources: interface.Resources): if key.startswith(interface.REQUEST_PREFIX): - new_key = key[len(interface.REQUEST_PREFIX):] - resource_name = val['name'] + new_key = key[len(interface.REQUEST_PREFIX) :] + resource_name = val["name"] try: - resource_kwargs = val['kwargs'] + resource_kwargs = val["kwargs"] except KeyError: resource_kwargs = None if resource_name in resources: if resource_kwargs: - new_val = (resources[resource_name]['obj'], resource_kwargs) + new_val = (resources[resource_name]["obj"], resource_kwargs) else: - new_val = resources[resource_name]['obj'] + new_val = resources[resource_name]["obj"] return new_key, new_val else: - raise RuntimeError(f'Unavailable requested resource: {resource_name}') + raise RuntimeError(f"Unavailable requested resource: {resource_name}") else: return key, val def serialize_message(message: interface.Message) -> bytes: message = message.to_dict() - message = apply_fn_to_tree(_map_resource_request_to_dict, message, apply_to_nodes=True) + message = apply_fn_to_tree( + _map_resource_request_to_dict, message, apply_to_nodes=True + ) def default(obj): return f"<>" - return str.encode(json.dumps(message, default=default)) \ No newline at end of file + return str.encode(json.dumps(message, default=default)) diff --git a/rlberry/rendering/opengl_render2d.py b/rlberry/rendering/opengl_render2d.py index 269427cd2..562b35aea 100644 --- a/rlberry/rendering/opengl_render2d.py +++ b/rlberry/rendering/opengl_render2d.py @@ -8,10 +8,10 @@ from rlberry.rendering import Scene logger = logging.getLogger(__name__) -environ['PYGAME_HIDE_SUPPORT_PROMPT'] = '1' +environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" _IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = '' +_IMPORT_ERROR_MSG = "" try: import pygame as pg from pygame.locals import DOUBLEBUF, OPENGL @@ -93,16 +93,24 @@ def initGL(self): """ glMatrixMode(GL_PROJECTION) glLoadIdentity() - gluOrtho2D(self.clipping_area[0], self.clipping_area[1], - self.clipping_area[2], self.clipping_area[3]) + gluOrtho2D( + self.clipping_area[0], + self.clipping_area[1], + self.clipping_area[2], + self.clipping_area[3], + ) def display(self): """ Callback function, handler for window re-paint """ # Set background color (clear background) - glClearColor(self.background_color[0], self.background_color[1], - self.background_color[2], 1.0) + glClearColor( + self.background_color[0], + self.background_color[1], + self.background_color[2], + 1.0, + ) glClear(GL_COLOR_BUFFER_BIT) # Display background @@ -193,7 +201,9 @@ def run_graphics(self, loop=True): def get_gl_image_str(self): # see https://gist.github.com/Jerdak/7364746 glReadBuffer(GL_FRONT) - pixels = glReadPixels(0, 0, self.window_width, self.window_height, GL_RGB, GL_UNSIGNED_BYTE) + pixels = glReadPixels( + 0, 0, self.window_width, self.window_height, GL_RGB, GL_UNSIGNED_BYTE + ) return pixels def get_video_data(self): @@ -223,9 +233,9 @@ def get_video_data(self): # See https://stackoverflow.com/a/42754578/5691288 # string_image = self.get_gl_image_str() - temp_surf = pg.image.fromstring(string_image, - (self.window_width, - self.window_height), 'RGB') + temp_surf = pg.image.fromstring( + string_image, (self.window_width, self.window_height), "RGB" + ) tmp_arr = pg.surfarray.array3d(temp_surf) imgdata = np.moveaxis(tmp_arr, 0, 1) imgdata = np.flipud(imgdata) diff --git 
a/rlberry/rendering/pygame_render2d.py b/rlberry/rendering/pygame_render2d.py index 4b1bf8889..a2a43a842 100644 --- a/rlberry/rendering/pygame_render2d.py +++ b/rlberry/rendering/pygame_render2d.py @@ -9,10 +9,10 @@ logger = logging.getLogger(__name__) -environ['PYGAME_HIDE_SUPPORT_PROMPT'] = '1' +environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" _IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = '' +_IMPORT_ERROR_MSG = "" try: import pygame as pg @@ -97,7 +97,7 @@ def draw_geometric2d(self, shape): """ Draw a 2D shape, of type GeometricPrimitive """ - if shape.type in ['POLYGON']: + if shape.type in ["POLYGON"]: area = self.clipping_area width_range = area[1] - area[0] height_range = area[3] - area[2] @@ -113,15 +113,13 @@ def draw_geometric2d(self, shape): pg_vertex = (xx, yy) vertices.append(pg_vertex) - color = (255 * shape.color[0], - 255 * shape.color[1], - 255 * shape.color[2]) + color = (255 * shape.color[0], 255 * shape.color[1], 255 * shape.color[2]) pg.draw.polygon(self.screen, color, vertices) else: raise NotImplementedError( - "Shape type %s not implemented in pygame renderer." - % shape.type) + "Shape type %s not implemented in pygame renderer." % shape.type + ) def run_graphics(self, loop=True): """ @@ -180,10 +178,10 @@ def get_video_data(self): # # See https://stackoverflow.com/a/42754578/5691288 # - string_image = pg.image.tostring(self.screen, 'RGB') - temp_surf = pg.image.fromstring(string_image, - (self.window_width, - self.window_height), 'RGB') + string_image = pg.image.tostring(self.screen, "RGB") + temp_surf = pg.image.fromstring( + string_image, (self.window_width, self.window_height), "RGB" + ) tmp_arr = pg.surfarray.array3d(temp_surf) imgdata = np.moveaxis(tmp_arr, 0, 1) video_data.append(imgdata) diff --git a/rlberry/rendering/render_interface.py b/rlberry/rendering/render_interface.py index 563df3b8f..13c8d415c 100644 --- a/rlberry/rendering/render_interface.py +++ b/rlberry/rendering/render_interface.py @@ -34,6 +34,7 @@ def save_video(self, filename, **kwargs): Save video file. """ pass + def get_video(self, **kwargs): """ Get video data. 
@@ -62,12 +63,12 @@ def __init__(self): self._clipping_area = (-1.0, 1.0, -1.0, 1.0) # (left,right,bottom,top) # rendering type, either 'pygame' or 'opengl' - self.renderer_type = 'opengl' + self.renderer_type = "opengl" def get_renderer(self): - if self.renderer_type == 'opengl': + if self.renderer_type == "opengl": return OpenGLRender2D() - elif self.renderer_type == 'pygame': + elif self.renderer_type == "pygame": return PyGameRender2D() else: raise NotImplementedError("Unknown renderer type.") @@ -155,8 +156,6 @@ def get_video(self, framerate=25, **kwargs): return renderer.get_video_data() - - def save_video(self, filename, framerate=25, **kwargs): video_data = self.get_video(framerate=framerate, **kwargs) video_write(filename, video_data, framerate=framerate) diff --git a/rlberry/rendering/tests/test_rendering_interface.py b/rlberry/rendering/tests/test_rendering_interface.py index 678f93fc9..e17c2761f 100644 --- a/rlberry/rendering/tests/test_rendering_interface.py +++ b/rlberry/rendering/tests/test_rendering_interface.py @@ -31,7 +31,7 @@ SimplePBallND, FourRoom, SixRoom, - AppleGold + AppleGold, ] @@ -62,10 +62,10 @@ def test_render2d_interface(ModelClass): next_s, _, _, _ = env.step(action) state = next_s env.render(loop=False) - env.save_video('test_video.mp4') + env.save_video("test_video.mp4") env.clear_render_buffer() try: - os.remove('test_video.mp4') + os.remove("test_video.mp4") except Exception: pass @@ -85,9 +85,9 @@ def test_render2d_interface_wrapped(ModelClass): next_s, _, _, _ = env.step(action) state = next_s env.render(loop=False) - env.save_video('test_video.mp4') + env.save_video("test_video.mp4") env.clear_render_buffer() try: - os.remove('test_video.mp4') + os.remove("test_video.mp4") except Exception: pass diff --git a/rlberry/rendering/utils.py b/rlberry/rendering/utils.py index bd23e4f53..f02ce89bf 100644 --- a/rlberry/rendering/utils.py +++ b/rlberry/rendering/utils.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -def video_write(fn, images, framerate=60, vcodec='libx264'): +def video_write(fn, images, framerate=60, vcodec="libx264"): """ Save list of images to a video file. 
@@ -38,29 +38,34 @@ def video_write(fn, images, framerate=60, vcodec='libx264'): if not _FFMPEG_INSTALLED: logger.error( "video_write(): Unable to save video, ffmpeg-python \ - package required (https://github.com/kkroening/ffmpeg-python)") + package required (https://github.com/kkroening/ffmpeg-python)" + ) return if not isinstance(images, np.ndarray): images = np.asarray(images) _, height, width, channels = images.shape process = ( - ffmpeg - .input('pipe:', format='rawvideo', pix_fmt='rgb24', - s='{}x{}'.format(width, height), r=framerate) - .output(fn, pix_fmt='yuv420p', vcodec=vcodec) - .overwrite_output() - .run_async(pipe_stdin=True) + ffmpeg.input( + "pipe:", + format="rawvideo", + pix_fmt="rgb24", + s="{}x{}".format(width, height), + r=framerate, + ) + .output(fn, pix_fmt="yuv420p", vcodec=vcodec) + .overwrite_output() + .run_async(pipe_stdin=True) ) for frame in images: - process.stdin.write( - frame - .astype(np.uint8) - .tobytes() - ) + process.stdin.write(frame.astype(np.uint8).tobytes()) process.stdin.close() process.wait() except Exception as ex: - logger.warning("Not possible to save \ -video, due to exception: {}".format(str(ex))) + logger.warning( + "Not possible to save \ +video, due to exception: {}".format( + str(ex) + ) + ) diff --git a/rlberry/seeding/tests/test_seeding.py b/rlberry/seeding/tests/test_seeding.py index 279a10fc3..8e87bf529 100644 --- a/rlberry/seeding/tests/test_seeding.py +++ b/rlberry/seeding/tests/test_seeding.py @@ -13,8 +13,12 @@ def test_seeder_basic(): assert (data1 != data2).sum() > 5 assert (data2 != data3).sum() == 0 - assert seeder2.spawn(1).generate_state(1)[0] == seeder3.spawn(1).generate_state(1)[0] - assert seeder1.spawn(1).generate_state(1)[0] != seeder3.spawn(1).generate_state(1)[0] + assert ( + seeder2.spawn(1).generate_state(1)[0] == seeder3.spawn(1).generate_state(1)[0] + ) + assert ( + seeder1.spawn(1).generate_state(1)[0] != seeder3.spawn(1).generate_state(1)[0] + ) def test_seeder_initialized_from_seeder(): diff --git a/rlberry/seeding/tests/test_threads.py b/rlberry/seeding/tests/test_threads.py index 83f211e9a..010655c0e 100644 --- a/rlberry/seeding/tests/test_threads.py +++ b/rlberry/seeding/tests/test_threads.py @@ -23,7 +23,5 @@ def test_multithread_seeding(): results = [] for future in concurrent.futures.as_completed(futures): - results.append( - future.result() - ) + results.append(future.result()) assert results[0] != results[1], f"error in simulation {(ii, jj)}" diff --git a/rlberry/seeding/tests/test_threads_torch.py b/rlberry/seeding/tests/test_threads_torch.py index 08dfa1a3c..585e7713a 100644 --- a/rlberry/seeding/tests/test_threads_torch.py +++ b/rlberry/seeding/tests/test_threads_torch.py @@ -30,7 +30,5 @@ def test_torch_multithread_seeding(): results = [] for future in concurrent.futures.as_completed(futures): - results.append( - future.result() - ) + results.append(future.result()) assert results[0] != results[1], f"error in simulation {(ii, jj)}" diff --git a/rlberry/spaces/box.py b/rlberry/spaces/box.py index 2ff60f81e..2409eaf97 100644 --- a/rlberry/spaces/box.py +++ b/rlberry/spaces/box.py @@ -64,8 +64,7 @@ def sample(self): * (-oo, b] : shifted negative exponential distribution * (-oo, oo) : normal distribution """ - high = self.high if self.dtype.kind == 'f' \ - else self.high.astype('int64') + 1 + high = self.high if self.dtype.kind == "f" else self.high.astype("int64") + 1 sample = np.empty(self.shape) # Masking arrays which classify the coordinates according to interval @@ -76,19 +75,22 @@ def 
sample(self): bounded = self.bounded_below & self.bounded_above # Vectorized sampling by interval type - sample[unbounded] = self.rng.normal( - size=unbounded[unbounded].shape) - - sample[low_bounded] = self.rng.exponential( - size=low_bounded[low_bounded].shape) + self.low[low_bounded] - - sample[upp_bounded] = -self.rng.exponential( - size=upp_bounded[upp_bounded].shape) + self.high[upp_bounded] - - sample[bounded] = self.rng.uniform(low=self.low[bounded], - high=high[bounded], - size=bounded[bounded].shape) - if self.dtype.kind == 'i': + sample[unbounded] = self.rng.normal(size=unbounded[unbounded].shape) + + sample[low_bounded] = ( + self.rng.exponential(size=low_bounded[low_bounded].shape) + + self.low[low_bounded] + ) + + sample[upp_bounded] = ( + -self.rng.exponential(size=upp_bounded[upp_bounded].shape) + + self.high[upp_bounded] + ) + + sample[bounded] = self.rng.uniform( + low=self.low[bounded], high=high[bounded], size=bounded[bounded].shape + ) + if self.dtype.kind == "i": sample = np.floor(sample) return sample.astype(self.dtype) diff --git a/rlberry/spaces/from_gym.py b/rlberry/spaces/from_gym.py index 939a3740c..6bd87ce06 100644 --- a/rlberry/spaces/from_gym.py +++ b/rlberry/spaces/from_gym.py @@ -3,31 +3,36 @@ def convert_space_from_gym(space): - if isinstance(space, gym.spaces.Box) and (not isinstance(space, rlberry.spaces.Box)): + if isinstance(space, gym.spaces.Box) and ( + not isinstance(space, rlberry.spaces.Box) + ): return rlberry.spaces.Box( - space.low, - space.high, - shape=space.shape, - dtype=space.dtype + space.low, space.high, shape=space.shape, dtype=space.dtype ) - if isinstance(space, gym.spaces.Discrete) and (not isinstance(space, rlberry.spaces.Discrete)): - return rlberry.spaces.Discrete( - n=space.n - ) - if isinstance(space, gym.spaces.MultiBinary) and (not isinstance(space, rlberry.spaces.MultiBinary)): - return rlberry.spaces.MultiBinary( - n=space.n - ) - if isinstance(space, gym.spaces.MultiDiscrete) and (not isinstance(space, rlberry.spaces.MultiDiscrete)): + if isinstance(space, gym.spaces.Discrete) and ( + not isinstance(space, rlberry.spaces.Discrete) + ): + return rlberry.spaces.Discrete(n=space.n) + if isinstance(space, gym.spaces.MultiBinary) and ( + not isinstance(space, rlberry.spaces.MultiBinary) + ): + return rlberry.spaces.MultiBinary(n=space.n) + if isinstance(space, gym.spaces.MultiDiscrete) and ( + not isinstance(space, rlberry.spaces.MultiDiscrete) + ): return rlberry.spaces.MultiDiscrete( nvec=space.nvec, dtype=space.dtype, ) - if isinstance(space, gym.spaces.Tuple) and (not isinstance(space, rlberry.spaces.Tuple)): + if isinstance(space, gym.spaces.Tuple) and ( + not isinstance(space, rlberry.spaces.Tuple) + ): return rlberry.spaces.Tuple( spaces=[convert_space_from_gym(sp) for sp in space.spaces] ) - if isinstance(space, gym.spaces.Dict) and (not isinstance(space, rlberry.spaces.Dict)): + if isinstance(space, gym.spaces.Dict) and ( + not isinstance(space, rlberry.spaces.Dict) + ): converted_spaces = dict() for key in space.spaces: converted_spaces[key] = convert_space_from_gym(space.spaces[key]) diff --git a/rlberry/spaces/multi_binary.py b/rlberry/spaces/multi_binary.py index d3703526d..84bb27b4f 100644 --- a/rlberry/spaces/multi_binary.py +++ b/rlberry/spaces/multi_binary.py @@ -44,5 +44,4 @@ def reseed(self, seed_seq=None): self.seeder.reseed(seed_seq) def sample(self): - return self.rng.integers(low=0, high=2, - size=self.n, dtype=self.dtype) + return self.rng.integers(low=0, high=2, size=self.n, dtype=self.dtype) diff --git 
a/rlberry/spaces/tests/test_from_gym.py b/rlberry/spaces/tests/test_from_gym.py index 779c55597..945553fe8 100644 --- a/rlberry/spaces/tests/test_from_gym.py +++ b/rlberry/spaces/tests/test_from_gym.py @@ -18,24 +18,26 @@ def test_discrete_space(n): assert sp.contains(sp.sample()) -@pytest.mark.parametrize("low, high, dim", - [ - (1.0, 10.0, 1), - (1.0, 10.0, 2), - (1.0, 10.0, 4), - (-10.0, 1.0, 1), - (-10.0, 1.0, 2), - (-10.0, 1.0, 4), - (-np.inf, 1.0, 1), - (-np.inf, 1.0, 2), - (-np.inf, 1.0, 4), - (1.0, np.inf, 1), - (1.0, np.inf, 2), - (1.0, np.inf, 4), - (-np.inf, np.inf, 1), - (-np.inf, np.inf, 2), - (-np.inf, np.inf, 4), - ]) +@pytest.mark.parametrize( + "low, high, dim", + [ + (1.0, 10.0, 1), + (1.0, 10.0, 2), + (1.0, 10.0, 4), + (-10.0, 1.0, 1), + (-10.0, 1.0, 2), + (-10.0, 1.0, 4), + (-np.inf, 1.0, 1), + (-np.inf, 1.0, 2), + (-np.inf, 1.0, 4), + (1.0, np.inf, 1), + (1.0, np.inf, 2), + (1.0, np.inf, 4), + (-np.inf, np.inf, 1), + (-np.inf, np.inf, 2), + (-np.inf, np.inf, 4), + ], +) def test_box_space_case_1(low, high, dim): shape = (dim, 1) gym_sp = gym.spaces.Box(low, high, shape=shape) @@ -43,7 +45,7 @@ def test_box_space_case_1(low, high, dim): assert isinstance(sp, rlberry.spaces.Box) sp.reseed(123) for _ in range(2 ** dim): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) @pytest.mark.parametrize( @@ -53,8 +55,9 @@ def test_box_space_case_1(low, high, dim): (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, 10.0])), (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), (np.array([-np.inf, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), - (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])) - ]) + (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])), + ], +) def test_box_space_case_2(low, high): gym_sp = gym.spaces.Box(low, high, dtype=np.float64) sp = convert_space_from_gym(gym_sp) @@ -65,7 +68,7 @@ def test_box_space_case_2(low, high): else: assert sp.is_bounded() for ii in range(2 ** sp.shape[0]): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) def test_tuple(): @@ -101,26 +104,36 @@ def test_multibinary(): def test_dict(): - nested_observation_space = gym.spaces.Dict({ - 'sensors': gym.spaces.Dict({ - 'position': gym.spaces.Box(low=-100, high=100, shape=(3,)), - 'velocity': gym.spaces.Box(low=-1, high=1, shape=(3,)), - 'front_cam': gym.spaces.Tuple(( - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)) - )), - 'rear_cam': gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - }), - 'ext_controller': gym.spaces.MultiDiscrete((5, 2, 2)), - 'inner_state': gym.spaces.Dict({ - 'charge': gym.spaces.Discrete(100), - 'system_checks': gym.spaces.MultiBinary(10), - 'job_status': gym.spaces.Dict({ - 'task': gym.spaces.Discrete(5), - 'progress': gym.spaces.Box(low=0, high=100, shape=()), - }) - }) - }) + nested_observation_space = gym.spaces.Dict( + { + "sensors": gym.spaces.Dict( + { + "position": gym.spaces.Box(low=-100, high=100, shape=(3,)), + "velocity": gym.spaces.Box(low=-1, high=1, shape=(3,)), + "front_cam": gym.spaces.Tuple( + ( + gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + ) + ), + "rear_cam": gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + } + ), + "ext_controller": gym.spaces.MultiDiscrete((5, 2, 2)), + "inner_state": gym.spaces.Dict( + { + "charge": gym.spaces.Discrete(100), + "system_checks": gym.spaces.MultiBinary(10), + "job_status": gym.spaces.Dict( + { + "task": 
gym.spaces.Discrete(5), + "progress": gym.spaces.Box(low=0, high=100, shape=()), + } + ), + } + ), + } + ) gym_sp = nested_observation_space sp = convert_space_from_gym(gym_sp) assert isinstance(sp, rlberry.spaces.Dict) diff --git a/rlberry/spaces/tests/test_spaces.py b/rlberry/spaces/tests/test_spaces.py index 53837c6f7..ddb735814 100644 --- a/rlberry/spaces/tests/test_spaces.py +++ b/rlberry/spaces/tests/test_spaces.py @@ -19,29 +19,31 @@ def test_discrete_space(n): assert sp.contains(sp.sample()) -@pytest.mark.parametrize("low, high, dim", - [ - (1.0, 10.0, 1), - (1.0, 10.0, 2), - (1.0, 10.0, 4), - (-10.0, 1.0, 1), - (-10.0, 1.0, 2), - (-10.0, 1.0, 4), - (-np.inf, 1.0, 1), - (-np.inf, 1.0, 2), - (-np.inf, 1.0, 4), - (1.0, np.inf, 1), - (1.0, np.inf, 2), - (1.0, np.inf, 4), - (-np.inf, np.inf, 1), - (-np.inf, np.inf, 2), - (-np.inf, np.inf, 4), - ]) +@pytest.mark.parametrize( + "low, high, dim", + [ + (1.0, 10.0, 1), + (1.0, 10.0, 2), + (1.0, 10.0, 4), + (-10.0, 1.0, 1), + (-10.0, 1.0, 2), + (-10.0, 1.0, 4), + (-np.inf, 1.0, 1), + (-np.inf, 1.0, 2), + (-np.inf, 1.0, 4), + (1.0, np.inf, 1), + (1.0, np.inf, 2), + (1.0, np.inf, 4), + (-np.inf, np.inf, 1), + (-np.inf, np.inf, 2), + (-np.inf, np.inf, 4), + ], +) def test_box_space_case_1(low, high, dim): shape = (dim, 1) sp = Box(low, high, shape=shape) for ii in range(2 ** dim): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) @pytest.mark.parametrize( @@ -51,8 +53,9 @@ def test_box_space_case_1(low, high, dim): (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, 10.0])), (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), (np.array([-np.inf, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), - (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])) - ]) + (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])), + ], +) def test_box_space_case_2(low, high): sp = Box(low, high) if (-np.inf in low) or (np.inf in high): @@ -60,7 +63,7 @@ def test_box_space_case_2(low, high): else: assert sp.is_bounded() for ii in range(2 ** sp.shape[0]): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) def test_tuple(): @@ -88,26 +91,36 @@ def test_multibinary(): def test_dict(): - nested_observation_space = Dict({ - 'sensors': Dict({ - 'position': Box(low=-100, high=100, shape=(3,)), - 'velocity': Box(low=-1, high=1, shape=(3,)), - 'front_cam': Tuple(( - Box(low=0, high=1, shape=(10, 10, 3)), - Box(low=0, high=1, shape=(10, 10, 3)) - )), - 'rear_cam': Box(low=0, high=1, shape=(10, 10, 3)), - }), - 'ext_controller': MultiDiscrete((5, 2, 2)), - 'inner_state': Dict({ - 'charge': Discrete(100), - 'system_checks': MultiBinary(10), - 'job_status': Dict({ - 'task': Discrete(5), - 'progress': Box(low=0, high=100, shape=()), - }) - }) - }) + nested_observation_space = Dict( + { + "sensors": Dict( + { + "position": Box(low=-100, high=100, shape=(3,)), + "velocity": Box(low=-1, high=1, shape=(3,)), + "front_cam": Tuple( + ( + Box(low=0, high=1, shape=(10, 10, 3)), + Box(low=0, high=1, shape=(10, 10, 3)), + ) + ), + "rear_cam": Box(low=0, high=1, shape=(10, 10, 3)), + } + ), + "ext_controller": MultiDiscrete((5, 2, 2)), + "inner_state": Dict( + { + "charge": Discrete(100), + "system_checks": MultiBinary(10), + "job_status": Dict( + { + "task": Discrete(5), + "progress": Box(low=0, high=100, shape=()), + } + ), + } + ), + } + ) sp = nested_observation_space for _ in range(10): assert sp.contains(sp.sample()) diff --git a/rlberry/utils/binsearch.py b/rlberry/utils/binsearch.py index 
e23ed327a..e0a8d2a4c 100644 --- a/rlberry/utils/binsearch.py +++ b/rlberry/utils/binsearch.py @@ -22,10 +22,10 @@ def binary_search_nd(x_vec, bins): aux = 1 assert dim == len(x_vec), "dimension mismatch in binary_search_nd()" for dd in range(dim): - index_dd = np.searchsorted(bins[dd], x_vec[dd], side='right') - 1 + index_dd = np.searchsorted(bins[dd], x_vec[dd], side="right") - 1 assert index_dd != -1, "error in binary_search_nd()" flat_index += aux * index_dd - aux *= (len(bins[dd]) - 1) + aux *= len(bins[dd]) - 1 return flat_index @@ -39,8 +39,7 @@ def unravel_index_uniform_bin(flat_index, dim, n_per_dim): if __name__ == "__main__": - bins = [(0, 1, 2, 3, 4), - (0, 1, 2, 3, 4)] + bins = [(0, 1, 2, 3, 4), (0, 1, 2, 3, 4)] x = [3.9, 3.5] index = binary_search_nd(x, bins) print(index) diff --git a/rlberry/utils/io.py b/rlberry/utils/io.py index 94f7092ce..cb269f29a 100644 --- a/rlberry/utils/io.py +++ b/rlberry/utils/io.py @@ -1,4 +1,3 @@ - import os import zipfile import pathlib @@ -22,12 +21,13 @@ def zipdir(dir_path, ouput_fname): dir_path = pathlib.Path(dir_path) if not dir_path.exists(): return None - ouput_fname = pathlib.Path(ouput_fname).with_suffix('.zip') - zipf = zipfile.ZipFile(ouput_fname, 'w', zipfile.ZIP_DEFLATED) + ouput_fname = pathlib.Path(ouput_fname).with_suffix(".zip") + zipf = zipfile.ZipFile(ouput_fname, "w", zipfile.ZIP_DEFLATED) for root, _, files in os.walk(dir_path): for file in files: - zipf.write(os.path.join(root, file), - os.path.relpath(os.path.join(root, file), - os.path.join(dir_path, '..'))) + zipf.write( + os.path.join(root, file), + os.path.relpath(os.path.join(root, file), os.path.join(dir_path, "..")), + ) zipf.close() return ouput_fname diff --git a/rlberry/utils/jit_setup.py b/rlberry/utils/jit_setup.py index 20a566ac8..c5f2f55a1 100644 --- a/rlberry/utils/jit_setup.py +++ b/rlberry/utils/jit_setup.py @@ -10,6 +10,7 @@ numba_jit = jit(nopython=True) else: + def numba_jit(func, **options): """This decorator does not modify the decorated function.""" return func diff --git a/rlberry/utils/logging.py b/rlberry/utils/logging.py index 05748c213..db9d0898d 100644 --- a/rlberry/utils/logging.py +++ b/rlberry/utils/logging.py @@ -3,10 +3,12 @@ import gym -def configure_logging(level: str = "INFO", - file_path: Path = None, - file_level: str = "DEBUG", - default_msg: str = "") -> None: +def configure_logging( + level: str = "INFO", + file_path: Path = None, + file_level: str = "DEBUG", + default_msg: str = "", +) -> None: """ Set the logging configuration @@ -28,29 +30,19 @@ def configure_logging(level: str = "INFO", "version": 1, "disable_existing_loggers": False, "formatters": { - "standard": { - "format": default_msg + "[%(levelname)s] %(message)s " - }, + "standard": {"format": default_msg + "[%(levelname)s] %(message)s "}, "detailed": { "format": default_msg + "[%(name)s:%(levelname)s] %(message)s " - } + }, }, "handlers": { "default": { "level": level, "formatter": "standard", - "class": "logging.StreamHandler" + "class": "logging.StreamHandler", } }, - "loggers": { - "": { - "handlers": [ - "default" - ], - "level": "DEBUG", - "propagate": True - } - } + "loggers": {"": {"handlers": ["default"], "level": "DEBUG", "propagate": True}}, } if file_path: config["handlers"][file_path.name] = { @@ -58,10 +50,10 @@ def configure_logging(level: str = "INFO", "filename": file_path, "level": file_level, "formatter": "detailed", - "mode": 'w' + "mode": "w", } config["loggers"][""]["handlers"].append(file_path.name) logging.config.dictConfig(config) 
gym.logger.set_level(logging.getLevelName(level)) - numba_logger = logging.getLogger('numba') + numba_logger = logging.getLogger("numba") numba_logger.setLevel(logging.WARNING) diff --git a/rlberry/utils/math.py b/rlberry/utils/math.py index a6ce0e886..5fcb09841 100644 --- a/rlberry/utils/math.py +++ b/rlberry/utils/math.py @@ -1,11 +1,7 @@ import numpy as np from typing import Union, Tuple -Interval = Union[ - np.ndarray, - Tuple[float, float], - Tuple[np.ndarray, np.ndarray] -] +Interval = Union[np.ndarray, Tuple[float, float], Tuple[np.ndarray, np.ndarray]] def lmap(v: np.ndarray, x: Interval, y: Interval) -> np.ndarray: diff --git a/rlberry/utils/space_discretizer.py b/rlberry/utils/space_discretizer.py index bd8ab3df1..217a7a525 100644 --- a/rlberry/utils/space_discretizer.py +++ b/rlberry/utils/space_discretizer.py @@ -6,8 +6,9 @@ class Discretizer: def __init__(self, space, n_bins): - assert isinstance(space, Box), \ - "Discretization is only implemented for Box spaces." + assert isinstance( + space, Box + ), "Discretization is only implemented for Box spaces." assert space.is_bounded() self.space = space self.n_bins = n_bins diff --git a/rlberry/utils/tests/test_binsearch.py b/rlberry/utils/tests/test_binsearch.py index 00172d8fa..fd94adde4 100644 --- a/rlberry/utils/tests/test_binsearch.py +++ b/rlberry/utils/tests/test_binsearch.py @@ -23,13 +23,9 @@ def test_binary_search_nd(): assert binary_search_nd(vec3, bins) == 1 + 3 * 1 + 3 * 3 * 0 -@pytest.mark.parametrize("i, j, k, N", - [ - (0, 0, 0, 5), - (0, 1, 2, 5), - (4, 3, 2, 5), - (4, 4, 4, 5) - ]) +@pytest.mark.parametrize( + "i, j, k, N", [(0, 0, 0, 5), (0, 1, 2, 5), (4, 3, 2, 5), (4, 4, 4, 5)] +) def test_unravel_index_uniform_bin(i, j, k, N): # index = i + N * j + N * N * k dim = 3 diff --git a/rlberry/utils/tests/test_metrics.py b/rlberry/utils/tests/test_metrics.py index d18eac384..487dcbb61 100644 --- a/rlberry/utils/tests/test_metrics.py +++ b/rlberry/utils/tests/test_metrics.py @@ -11,7 +11,7 @@ def test_metrics(dim): scaling_2 = 0.5 * np.ones(dim) for p in range(1, 10): - assert np.abs(metric_lp(x, y, p, scaling_1) - - np.power(dim, 1.0 / p)) < 1e-15 - assert np.abs(metric_lp(x, y, p, scaling_2) - - 2 * np.power(dim, 1.0 / p)) < 1e-15 + assert np.abs(metric_lp(x, y, p, scaling_1) - np.power(dim, 1.0 / p)) < 1e-15 + assert ( + np.abs(metric_lp(x, y, p, scaling_2) - 2 * np.power(dim, 1.0 / p)) < 1e-15 + ) diff --git a/rlberry/utils/torch.py b/rlberry/utils/torch.py index 5370fa901..f72d78d5d 100644 --- a/rlberry/utils/torch.py +++ b/rlberry/utils/torch.py @@ -10,25 +10,28 @@ def get_gpu_memory_map(): - result = check_output(['nvidia-smi', - '--query-gpu=memory.used', - '--format=csv,nounits,noheader']) + result = check_output( + ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"] + ) return [int(x) for x in result.split()] def least_used_device(): - """ Get the GPU device with most available memory. """ + """Get the GPU device with most available memory.""" if not torch.cuda.is_available(): raise RuntimeError("cuda unavailable") - if shutil.which('nvidia-smi') is None: - raise RuntimeError("nvidia-smi unavailable: \ -cannot select device with most least memory used.") + if shutil.which("nvidia-smi") is None: + raise RuntimeError( + "nvidia-smi unavailable: \ +cannot select device with most least memory used." 
+ ) memory_map = get_gpu_memory_map() device_id = np.argmin(memory_map) - logger.info(f"Choosing GPU device: {device_id}, " - f"memory used: {memory_map[device_id]}") + logger.info( + f"Choosing GPU device: {device_id}, " f"memory used: {memory_map[device_id]}" + ) return torch.device("cuda:{}".format(device_id)) @@ -37,7 +40,9 @@ def choose_device(preferred_device, default_device="cpu"): try: preferred_device = least_used_device() except RuntimeError: - logger.info(f"Could not find least used device (nvidia-smi might be missing), use cuda:0 instead") + logger.info( + f"Could not find least used device (nvidia-smi might be missing), use cuda:0 instead" + ) if torch.cuda.is_available(): return choose_device("cuda:0") else: @@ -45,8 +50,10 @@ def choose_device(preferred_device, default_device="cpu"): try: torch.zeros((1,), device=preferred_device) # Test availability except (RuntimeError, AssertionError) as e: - logger.info(f"Preferred device {preferred_device} unavailable ({e})." - f"Switching to default {default_device}") + logger.info( + f"Preferred device {preferred_device} unavailable ({e})." + f"Switching to default {default_device}" + ) return default_device return preferred_device @@ -55,9 +62,12 @@ def get_memory(pid=None): if not pid: pid = os.getpid() command = "nvidia-smi" - result = run(command, stdout=PIPE, stderr=PIPE, - universal_newlines=True, shell=True).stdout - m = re.findall("\| *[0-9] *" - + str(pid) - + " *C *.*python.*? +([0-9]+).*\|", result, re.MULTILINE) + result = run( + command, stdout=PIPE, stderr=PIPE, universal_newlines=True, shell=True + ).stdout + m = re.findall( + "\| *[0-9] *" + str(pid) + " *C *.*python.*? +([0-9]+).*\|", + result, + re.MULTILINE, + ) return [int(mem) for mem in m] diff --git a/rlberry/utils/writers.py b/rlberry/utils/writers.py index 7e714117b..56a31e6de 100644 --- a/rlberry/utils/writers.py +++ b/rlberry/utils/writers.py @@ -38,11 +38,13 @@ class DefaultWriter: """ def __init__( - self, name: str, - log_interval: int = 3, - tensorboard_kwargs: Optional[dict] = None, - execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, - maxlen: Optional[int] = None): + self, + name: str, + log_interval: int = 3, + tensorboard_kwargs: Optional[dict] = None, + execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, + maxlen: Optional[int] = None, + ): self._name = name self._log_interval = log_interval self._execution_metadata = execution_metadata @@ -53,8 +55,12 @@ def __init__( self.reset() # initialize tensorboard - if (tensorboard_kwargs is not None) and (not check_packages.TENSORBOARD_INSTALLED): - logger.warning('[DefaultWriter]: received tensorboard_kwargs, but tensorboard is not installed.') + if (tensorboard_kwargs is not None) and ( + not check_packages.TENSORBOARD_INSTALLED + ): + logger.warning( + "[DefaultWriter]: received tensorboard_kwargs, but tensorboard is not installed." 
+ ) self._tensorboard_kwargs = tensorboard_kwargs self._tensorboard_logdir = None self._summary_writer = None @@ -74,13 +80,19 @@ def summary_writer(self): @property def data(self): - df = pd.DataFrame(columns=('name', 'tag', 'value', 'global_step')) + df = pd.DataFrame(columns=("name", "tag", "value", "global_step")) for tag in self._data: df = df.append(pd.DataFrame(self._data[tag]), ignore_index=True) return df def add_scalar( - self, tag: str, scalar_value: float, global_step: Optional[int] = None, walltime=None, new_style=False): + self, + tag: str, + scalar_value: float, + global_step: Optional[int] = None, + walltime=None, + new_style=False, + ): """ Behaves as SummaryWriter.add_scalar(). @@ -102,34 +114,46 @@ def add_scalar( style (simple_value field). New style could lead to faster data loading. """ if self._summary_writer: - self._summary_writer.add_scalar(tag, scalar_value, global_step, walltime, new_style) + self._summary_writer.add_scalar( + tag, scalar_value, global_step, walltime, new_style + ) self._add_scalar(tag, scalar_value, global_step) - def _add_scalar(self, tag: str, scalar_value: float, global_step: Optional[int] = None): + def _add_scalar( + self, tag: str, scalar_value: float, global_step: Optional[int] = None + ): """ Store scalar value in self._data. """ # Update data structures if tag not in self._data: self._data[tag] = dict() - self._data[tag]['name'] = deque(maxlen=self._maxlen) - self._data[tag]['tag'] = deque(maxlen=self._maxlen) - self._data[tag]['value'] = deque(maxlen=self._maxlen) - self._data[tag]['global_step'] = deque(maxlen=self._maxlen) - - self._data[tag]['name'].append(self._name) # used in plots, when aggregating several writers - self._data[tag]['tag'].append(tag) # useful to convert all data to a single DataFrame - self._data[tag]['value'].append(scalar_value) + self._data[tag]["name"] = deque(maxlen=self._maxlen) + self._data[tag]["tag"] = deque(maxlen=self._maxlen) + self._data[tag]["value"] = deque(maxlen=self._maxlen) + self._data[tag]["global_step"] = deque(maxlen=self._maxlen) + + self._data[tag]["name"].append( + self._name + ) # used in plots, when aggregating several writers + self._data[tag]["tag"].append( + tag + ) # useful to convert all data to a single DataFrame + self._data[tag]["value"].append(scalar_value) if global_step is None: - self._data[tag]['global_step'].append(np.nan) + self._data[tag]["global_step"].append(np.nan) else: - self._data[tag]['global_step'].append(global_step) + self._data[tag]["global_step"].append(global_step) # Append time interval corresponding to global_step if global_step is not None and self._log_time: - assert tag != 'dw_time_elapsed', 'The tag dw_time_elapsed is reserved.' + assert tag != "dw_time_elapsed", "The tag dw_time_elapsed is reserved." 
self._log_time = False - self._add_scalar(tag='dw_time_elapsed', scalar_value=timer() - self._initial_time, global_step=global_step) + self._add_scalar( + tag="dw_time_elapsed", + scalar_value=timer() - self._initial_time, + global_step=global_step, + ) self._log_time = True # Log @@ -144,18 +168,18 @@ def _log(self): max_global_step = 0 if time_elapsed > self._log_interval: self._time_last_log = t_now - message = '' + message = "" for tag in self._data: - val = self._data[tag]['value'][-1] - gstep = self._data[tag]['global_step'][-1] - message += f'{tag} = {val} | ' + val = self._data[tag]["value"][-1] + gstep = self._data[tag]["global_step"][-1] + message += f"{tag} = {val} | " if not np.isnan(gstep): max_global_step = max(max_global_step, gstep) header = self._name if self._execution_metadata: - header += f'[worker: {self._execution_metadata.obj_worker_id}]' - message = f'[{header}] | max_global_step = {max_global_step} | ' + message + header += f"[worker: {self._execution_metadata.obj_worker_id}]" + message = f"[{header}] | max_global_step = {max_global_step} | " + message logger.info(message) def __getattr__(self, attr): @@ -163,7 +187,7 @@ def __getattr__(self, attr): Calls SummaryWriter methods, if self._summary_writer is not None. Otherwise, does nothing. """ - if attr[:2] == '__': + if attr[:2] == "__": raise AttributeError(attr) if attr in self.__dict__: return getattr(self, attr) @@ -172,6 +196,7 @@ def __getattr__(self, attr): def method(*args, **kwargs): pass + return method # @@ -185,7 +210,11 @@ def __getstate__(self): def __setstate__(self, newstate): # Re-create summary writer with the same logdir - if newstate['_summary_writer']: - newstate['_tensorboard_kwargs'].update(dict(log_dir=newstate['_tensorboard_logdir'])) - newstate['_summary_writer'] = SummaryWriter(**newstate['_tensorboard_kwargs']) + if newstate["_summary_writer"]: + newstate["_tensorboard_kwargs"].update( + dict(log_dir=newstate["_tensorboard_logdir"]) + ) + newstate["_summary_writer"] = SummaryWriter( + **newstate["_tensorboard_kwargs"] + ) self.__dict__.update(newstate) diff --git a/rlberry/wrappers/discretize_state.py b/rlberry/wrappers/discretize_state.py index 168b4e1e7..2aa4c6bab 100644 --- a/rlberry/wrappers/discretize_state.py +++ b/rlberry/wrappers/discretize_state.py @@ -24,8 +24,9 @@ def __init__(self, _env, n_bins): self._bins = [] self._open_bins = [] for dd in range(self.dim): - range_dd = self.env.observation_space.high[dd] \ - - self.env.observation_space.low[dd] + range_dd = ( + self.env.observation_space.high[dd] - self.env.observation_space.low[dd] + ) epsilon = range_dd / n_bins bins_dd = [] for bb in range(n_bins + 1): @@ -41,8 +42,7 @@ def __init__(self, _env, n_bins): # List of discretized states self.discretized_states = np.zeros((self.dim, n_states)) for ii in range(n_states): - self.discretized_states[:, ii] = \ - self.get_continuous_state(ii, False) + self.discretized_states[:, ii] = self.get_continuous_state(ii, False) def reset(self): return self.get_discrete_state(self.env.reset()) @@ -55,11 +55,9 @@ def step(self, action): def sample(self, discrete_state, action): # map disctete state to continuous one assert self.observation_space.contains(discrete_state) - continuous_state = self.get_continuous_state(discrete_state, - randomize=True) + continuous_state = self.get_continuous_state(discrete_state, randomize=True) # sample in the true environment - next_state, reward, done, info = \ - self.env.sample(continuous_state, action) + next_state, reward, done, info = 
self.env.sample(continuous_state, action) # discretize next state next_state = binary_search_nd(next_state, self._bins) @@ -69,20 +67,21 @@ def get_discrete_state(self, continuous_state): return binary_search_nd(continuous_state, self._bins) def get_continuous_state(self, discrete_state, randomize=False): - assert discrete_state >= 0 \ - and discrete_state < self.observation_space.n, \ - "invalid discrete_state" + assert ( + discrete_state >= 0 and discrete_state < self.observation_space.n + ), "invalid discrete_state" # get multi-index - index \ - = unravel_index_uniform_bin(discrete_state, self.dim, self.n_bins) + index = unravel_index_uniform_bin(discrete_state, self.dim, self.n_bins) # get state continuous_state = np.zeros(self.dim) for dd in range(self.dim): continuous_state[dd] = self._bins[dd][index[dd]] if randomize: - range_dd = self.env.observation_space.high[dd] \ - - self.env.observation_space.low[dd] + range_dd = ( + self.env.observation_space.high[dd] + - self.env.observation_space.low[dd] + ) epsilon = range_dd / self.n_bins continuous_state[dd] += epsilon * self.rng.uniform() return continuous_state diff --git a/rlberry/wrappers/gym_utils.py b/rlberry/wrappers/gym_utils.py index f765db85b..736c8eb04 100644 --- a/rlberry/wrappers/gym_utils.py +++ b/rlberry/wrappers/gym_utils.py @@ -13,10 +13,7 @@ def convert_space_from_gym(gym_space): # # elif isinstance(gym_space, gym.spaces.Box): - return Box(gym_space.low, - gym_space.high, - gym_space.shape, - gym_space.dtype) + return Box(gym_space.low, gym_space.high, gym_space.shape, gym_space.dtype) # # elif isinstance(gym_space, gym.spaces.Tuple): diff --git a/rlberry/wrappers/tests/test_basewrapper.py b/rlberry/wrappers/tests/test_basewrapper.py index edbc33ab4..2545ffa59 100644 --- a/rlberry/wrappers/tests/test_basewrapper.py +++ b/rlberry/wrappers/tests/test_basewrapper.py @@ -14,12 +14,11 @@ def test_wrapper(): # calling some functions wrapped.reset() wrapped.step(wrapped.action_space.sample()) - wrapped.sample(wrapped.observation_space.sample(), - wrapped.action_space.sample()) + wrapped.sample(wrapped.observation_space.sample(), wrapped.action_space.sample()) def test_gym_wrapper(): - gym_env = gym.make('Acrobot-v1') + gym_env = gym.make("Acrobot-v1") wrapped = Wrapper(gym_env) assert isinstance(wrapped, Model) assert wrapped.is_online() diff --git a/rlberry/wrappers/tests/test_common_wrappers.py b/rlberry/wrappers/tests/test_common_wrappers.py index 1f45b093a..4b842e5c9 100644 --- a/rlberry/wrappers/tests/test_common_wrappers.py +++ b/rlberry/wrappers/tests/test_common_wrappers.py @@ -10,8 +10,7 @@ from rlberry.wrappers.rescale_reward import RescaleRewardWrapper from rlberry.agents import RSUCBVIAgent from rlberry.wrappers.autoreset import AutoResetWrapper -from rlberry.wrappers.uncertainty_estimator_wrapper import \ - UncertaintyEstimatorWrapper +from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper from rlberry.wrappers.vis2d import Vis2dWrapper @@ -58,8 +57,8 @@ def test_rescale_reward(): _ = wrapped.reset() for _ in range(100): _, reward, _, _ = wrapped.sample( - wrapped.observation_space.sample(), - wrapped.action_space.sample()) + wrapped.observation_space.sample(), wrapped.action_space.sample() + ) assert reward <= 10 + tol and reward >= -10 - tol _ = wrapped.reset() @@ -140,28 +139,30 @@ def test_uncertainty_est_wrapper(): env = GridWorld() def uncertainty_est_fn(observation_space, action_space): - return DiscreteCounter(observation_space, - action_space) + return 
DiscreteCounter(observation_space, action_space) - w_env = UncertaintyEstimatorWrapper( - env, - uncertainty_est_fn, - bonus_scale_factor=1.0) + w_env = UncertaintyEstimatorWrapper(env, uncertainty_est_fn, bonus_scale_factor=1.0) for ii in range(10): w_env.reset() _, _, _, info = w_env.step(0) nn = w_env.uncertainty_estimator.count(0, 0) assert nn == ii + 1 - assert info['exploration_bonus'] == pytest.approx(1 / np.sqrt(nn)) + assert info["exploration_bonus"] == pytest.approx(1 / np.sqrt(nn)) def test_vis2dwrapper(): env = MountainCar() env = Vis2dWrapper(env, n_bins_obs=20, memory_size=200) - agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, - bonus_scale_factor=0.1, copy_env=False, min_dist=0.1) + agent = RSUCBVIAgent( + env, + gamma=0.99, + horizon=200, + bonus_scale_factor=0.1, + copy_env=False, + min_dist=0.1, + ) agent.fit(budget=15) env.plot_trajectories(show=False) diff --git a/rlberry/wrappers/tests/test_gym_space_conversion.py b/rlberry/wrappers/tests/test_gym_space_conversion.py index 5b335034e..4a6eb0463 100644 --- a/rlberry/wrappers/tests/test_gym_space_conversion.py +++ b/rlberry/wrappers/tests/test_gym_space_conversion.py @@ -7,11 +7,11 @@ def convert_and_compare(sp, rlberry_space): sp_conv = convert_space_from_gym(sp) - assert (isinstance(sp_conv, rlberry_space)) + assert isinstance(sp_conv, rlberry_space) sp_conv.reseed() for _ in range(100): - assert (sp.contains(sp_conv.sample())) - assert (sp_conv.contains(sp.sample())) + assert sp.contains(sp_conv.sample()) + assert sp_conv.contains(sp.sample()) @pytest.mark.parametrize("n", list(range(1, 10))) @@ -20,24 +20,26 @@ def test_discrete_space(n): convert_and_compare(sp, rlberry.spaces.Discrete) -@pytest.mark.parametrize("low, high, dim", - [ - (1.0, 10.0, 1), - (1.0, 10.0, 2), - (1.0, 10.0, 4), - (-10.0, 1.0, 1), - (-10.0, 1.0, 2), - (-10.0, 1.0, 4), - (-np.inf, 1.0, 1), - (-np.inf, 1.0, 2), - (-np.inf, 1.0, 4), - (1.0, np.inf, 1), - (1.0, np.inf, 2), - (1.0, np.inf, 4), - (-np.inf, np.inf, 1), - (-np.inf, np.inf, 2), - (-np.inf, np.inf, 4), - ]) +@pytest.mark.parametrize( + "low, high, dim", + [ + (1.0, 10.0, 1), + (1.0, 10.0, 2), + (1.0, 10.0, 4), + (-10.0, 1.0, 1), + (-10.0, 1.0, 2), + (-10.0, 1.0, 4), + (-np.inf, 1.0, 1), + (-np.inf, 1.0, 2), + (-np.inf, 1.0, 4), + (1.0, np.inf, 1), + (1.0, np.inf, 2), + (1.0, np.inf, 4), + (-np.inf, np.inf, 1), + (-np.inf, np.inf, 2), + (-np.inf, np.inf, 4), + ], +) def test_box_space_case(low, high, dim): shape = (dim, 1) sp = gym.spaces.Box(low, high, shape=shape) @@ -63,25 +65,35 @@ def test_multibinary(): def test_dict(): - nested_observation_space = gym.spaces.Dict({ - 'sensors': gym.spaces.Dict({ - 'position': gym.spaces.Box(low=-100, high=100, shape=(3,)), - 'velocity': gym.spaces.Box(low=-1, high=1, shape=(3,)), - 'front_cam': gym.spaces.Tuple(( - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)) - )), - 'rear_cam': gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - }), - 'ext_controller': gym.spaces.MultiDiscrete((5, 2, 2)), - 'inner_state': gym.spaces.Dict({ - 'charge': gym.spaces.Discrete(100), - 'system_checks': gym.spaces.MultiBinary(10), - 'job_status': gym.spaces.Dict({ - 'task': gym.spaces.Discrete(5), - 'progress': gym.spaces.Box(low=0, high=100, shape=()), - }) - }) - }) + nested_observation_space = gym.spaces.Dict( + { + "sensors": gym.spaces.Dict( + { + "position": gym.spaces.Box(low=-100, high=100, shape=(3,)), + "velocity": gym.spaces.Box(low=-1, high=1, shape=(3,)), + "front_cam": gym.spaces.Tuple( + ( + 
gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + ) + ), + "rear_cam": gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + } + ), + "ext_controller": gym.spaces.MultiDiscrete((5, 2, 2)), + "inner_state": gym.spaces.Dict( + { + "charge": gym.spaces.Discrete(100), + "system_checks": gym.spaces.MultiBinary(10), + "job_status": gym.spaces.Dict( + { + "task": gym.spaces.Discrete(5), + "progress": gym.spaces.Box(low=0, high=100, shape=()), + } + ), + } + ), + } + ) sp = nested_observation_space convert_and_compare(sp, rlberry.spaces.Dict) diff --git a/rlberry/wrappers/tests/test_wrapper_seeding.py b/rlberry/wrappers/tests/test_wrapper_seeding.py index c1584b99b..3422117ff 100644 --- a/rlberry/wrappers/tests/test_wrapper_seeding.py +++ b/rlberry/wrappers/tests/test_wrapper_seeding.py @@ -16,14 +16,7 @@ except Exception: _GYM_INSTALLED = False -classes = [ - MountainCar, - GridWorld, - Chain, - PBall2D, - SimplePBallND, - Acrobot -] +classes = [MountainCar, GridWorld, Chain, PBall2D, SimplePBallND, Acrobot] def get_env_trajectory(env, horizon): @@ -121,7 +114,7 @@ def test_double_wrapper_copy_reseeding(ModelClass): def test_gym_copy_reseeding(): seeder = Seeder(123) if _GYM_INSTALLED: - gym_env = gym.make('Acrobot-v1') + gym_env = gym.make("Acrobot-v1") env = Wrapper(gym_env) env.reseed(seeder) @@ -137,7 +130,7 @@ def test_gym_copy_reseeding(): def test_gym_copy_reseeding_2(): seeder = Seeder(123) if _GYM_INSTALLED: - gym_env = gym.make('Acrobot-v1') + gym_env = gym.make("Acrobot-v1") # nested wrapping env = RescaleRewardWrapper(Wrapper(Wrapper(gym_env)), (0, 1)) env.reseed(seeder) diff --git a/rlberry/wrappers/uncertainty_estimator_wrapper.py b/rlberry/wrappers/uncertainty_estimator_wrapper.py index b3b35d419..4648edece 100644 --- a/rlberry/wrappers/uncertainty_estimator_wrapper.py +++ b/rlberry/wrappers/uncertainty_estimator_wrapper.py @@ -33,24 +33,28 @@ class UncertaintyEstimatorWrapper(Wrapper): Scale factor for the bonus. 
""" - def __init__(self, - env, - uncertainty_estimator_fn, - uncertainty_estimator_kwargs=None, - bonus_scale_factor=1.0, - bonus_max=np.inf): + def __init__( + self, + env, + uncertainty_estimator_fn, + uncertainty_estimator_kwargs=None, + bonus_scale_factor=1.0, + bonus_max=np.inf, + ): Wrapper.__init__(self, env) self.bonus_scale_factor = bonus_scale_factor self.bonus_max = bonus_max uncertainty_estimator_kwargs = uncertainty_estimator_kwargs or {} - uncertainty_estimator_fn = load(uncertainty_estimator_fn) if isinstance(uncertainty_estimator_fn, str) else \ - uncertainty_estimator_fn + uncertainty_estimator_fn = ( + load(uncertainty_estimator_fn) + if isinstance(uncertainty_estimator_fn, str) + else uncertainty_estimator_fn + ) self.uncertainty_estimator = uncertainty_estimator_fn( - env.observation_space, - env.action_space, - **uncertainty_estimator_kwargs) + env.observation_space, env.action_space, **uncertainty_estimator_kwargs + ) self.previous_obs = None def reset(self): @@ -61,20 +65,16 @@ def _update_and_get_bonus(self, state, action, next_state, reward): if self.previous_obs is None: return 0.0 # - self.uncertainty_estimator.update(state, - action, - next_state, - reward) + self.uncertainty_estimator.update(state, action, next_state, reward) return self.bonus(state, action) def step(self, action): observation, reward, done, info = self.env.step(action) # update uncertainty and compute bonus - bonus = self._update_and_get_bonus(self.previous_obs, - action, - observation, - reward) + bonus = self._update_and_get_bonus( + self.previous_obs, action, observation, reward + ) # self.previous_obs = observation @@ -82,25 +82,35 @@ def step(self, action): if info is None: info = {} else: - if 'exploration_bonus' in info: - logger.error("UncertaintyEstimatorWrapper Error: info has" + - " already a key named exploration_bonus!") + if "exploration_bonus" in info: + logger.error( + "UncertaintyEstimatorWrapper Error: info has" + + " already a key named exploration_bonus!" + ) - info['exploration_bonus'] = bonus + info["exploration_bonus"] = bonus return observation, reward, done, info def sample(self, state, action): logger.warning( - '[UncertaintyEstimatorWrapper]: sample()' - + ' method does not consider nor update bonuses.') + "[UncertaintyEstimatorWrapper]: sample()" + + " method does not consider nor update bonuses." 
+ ) return self.env.sample(state, action) def bonus(self, state, action=None): - bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure(state, action) + bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure( + state, action + ) return np.clip(bonus, 0, self.bonus_max) def bonus_batch(self, states, actions=None): - bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure_batch(states, actions) - return np.clip(bonus, 0, self.bonus_max) if isinstance(bonus, np.ndarray) else torch.clamp(bonus, 0, - self.bonus_max) + bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure_batch( + states, actions + ) + return ( + np.clip(bonus, 0, self.bonus_max) + if isinstance(bonus, np.ndarray) + else torch.clamp(bonus, 0, self.bonus_max) + ) diff --git a/rlberry/wrappers/vis2d.py b/rlberry/wrappers/vis2d.py index d410d7a4f..70991cb2e 100644 --- a/rlberry/wrappers/vis2d.py +++ b/rlberry/wrappers/vis2d.py @@ -11,7 +11,9 @@ class Transition: - def __init__(self, raw_state, state, action, reward, n_total_visits, n_episode_visits): + def __init__( + self, raw_state, state, action, reward, n_total_visits, n_episode_visits + ): self.raw_state = raw_state self.state = state self.action = action @@ -79,12 +81,14 @@ class Vis2dWrapper(Wrapper): kwargs for state_preprocess_fn """ - def __init__(self, - env, - n_bins_obs=10, - memory_size=100, - state_preprocess_fn=None, - state_preprocess_kwargs=None): + def __init__( + self, + env, + n_bins_obs=10, + memory_size=100, + state_preprocess_fn=None, + state_preprocess_kwargs=None, + ): Wrapper.__init__(self, env) if state_preprocess_fn is None: @@ -95,12 +99,12 @@ def __init__(self, self.state_preprocess_kwargs = state_preprocess_kwargs or {} self.memory = TrajectoryMemory(memory_size) - self.total_visit_counter = DiscreteCounter(self.env.observation_space, - self.env.action_space, - n_bins_obs=n_bins_obs) - self.episode_visit_counter = DiscreteCounter(self.env.observation_space, - self.env.action_space, - n_bins_obs=n_bins_obs) + self.total_visit_counter = DiscreteCounter( + self.env.observation_space, self.env.action_space, n_bins_obs=n_bins_obs + ) + self.episode_visit_counter = DiscreteCounter( + self.env.observation_space, self.env.action_space, n_bins_obs=n_bins_obs + ) self.current_state = None self.curret_step = 0 @@ -122,31 +126,35 @@ def step(self, action): self.total_visit_counter.update(ss, aa, ns, reward) self.episode_visit_counter.update(ss, aa, ns, reward) # store transition - transition = Transition(ss, - self.state_preprocess_fn(ss, self.env, **self.state_preprocess_kwargs), - aa, - reward, - self.total_visit_counter.count(ss, aa), - self.episode_visit_counter.count(ss, aa)) + transition = Transition( + ss, + self.state_preprocess_fn(ss, self.env, **self.state_preprocess_kwargs), + aa, + reward, + self.total_visit_counter.count(ss, aa), + self.episode_visit_counter.count(ss, aa), + ) self.memory.append(transition) # update current state self.current_state = observation return observation, reward, done, info - def plot_trajectories(self, - fignum=None, - figsize=(6, 6), - hide_axis=True, - show=True, - video_filename=None, - colormap_name='cool', - framerate=15, - n_skip=1, - dot_scale_factor=2.5, - alpha=0.25, - xlim=None, - ylim=None, - dot_size_means='episode_visits'): + def plot_trajectories( + self, + fignum=None, + figsize=(6, 6), + hide_axis=True, + show=True, + video_filename=None, + colormap_name="cool", + framerate=15, + n_skip=1, + dot_scale_factor=2.5, + alpha=0.25, + xlim=None, + 
ylim=None, + dot_size_means="episode_visits", + ): """ Plot history of trajectories in a scatter plot. Colors distinguish recent and old trajectories, the size of the dots represent @@ -194,8 +202,10 @@ def plot_trajectories(self, # discretizer try: discretizer = self.episode_visit_counter.state_discretizer - epsilon = min(discretizer._bins[0][1] - discretizer._bins[0][0], - discretizer._bins[1][1] - discretizer._bins[1][0]) + epsilon = min( + discretizer._bins[0][1] - discretizer._bins[0][0], + discretizer._bins[1][1] - discretizer._bins[1][0], + ) except Exception: epsilon = 0.01 @@ -225,15 +235,18 @@ def plot_trajectories(self, states = np.array([traj[ii].state for ii in range(len(traj))]) - if dot_size_means == 'episode_visits': - sizes = np.array( - [traj[ii].n_episode_visits for ii in range(len(traj))] - ) - elif dot_size_means == 'total_visits': + if dot_size_means == "episode_visits": + sizes = np.array([traj[ii].n_episode_visits for ii in range(len(traj))]) + elif dot_size_means == "total_visits": raw_states = [traj[ii].raw_state for ii in range(len(traj))] sizes = np.array( [ - np.sum([self.total_visit_counter.count(ss, aa) for aa in range(self.env.action_space.n)]) + np.sum( + [ + self.total_visit_counter.count(ss, aa) + for aa in range(self.env.action_space.n) + ] + ) for ss in raw_states ] ) @@ -243,13 +256,19 @@ def plot_trajectories(self, sizes = 1 + sizes sizes = (dot_scale_factor ** 2) * 100 * epsilon * sizes / sizes.max() - ax.scatter(x=states[:, 0], y=states[:, 1], color=color, s=sizes, alpha=alpha) + ax.scatter( + x=states[:, 0], y=states[:, 1], color=color, s=sizes, alpha=alpha + ) plt.tight_layout() if video_filename is not None: canvas.draw() - image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + image_from_plot = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8 + ) + image_from_plot = image_from_plot.reshape( + fig.canvas.get_width_height()[::-1] + (3,) + ) images.append(image_from_plot) if video_filename is not None: @@ -261,21 +280,23 @@ def plot_trajectories(self, if show: plt.show() - def plot_trajectory_actions(self, - fignum=None, - figsize=(8, 6), - n_traj_to_show=10, - hide_axis=True, - show=True, - video_filename=None, - colormap_name='Paired', - framerate=15, - n_skip=1, - dot_scale_factor=2.5, - alpha=1.0, - action_description=None, - xlim=None, - ylim=None): + def plot_trajectory_actions( + self, + fignum=None, + figsize=(8, 6), + n_traj_to_show=10, + hide_axis=True, + show=True, + video_filename=None, + colormap_name="Paired", + framerate=15, + n_skip=1, + dot_scale_factor=2.5, + alpha=1.0, + action_description=None, + xlim=None, + ylim=None, + ): """ Plot actions (one action = one color) chosen in recent trajectories. 
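For context, the plotting helpers reformatted in this diff can be exercised as in the wrapper test earlier in this patch. A minimal sketch (the MountainCar import path and the small fitting budget are assumptions for illustration, not part of the patch)::

    from rlberry.agents import RSUCBVIAgent
    from rlberry.envs.classic_control import MountainCar
    from rlberry.wrappers.vis2d import Vis2dWrapper

    # Wrap the environment so that discretized visits and transitions are recorded.
    env = Vis2dWrapper(MountainCar(), n_bins_obs=20, memory_size=200)
    agent = RSUCBVIAgent(
        env,
        gamma=0.99,
        horizon=200,
        bonus_scale_factor=0.1,
        copy_env=False,
        min_dist=0.1,
    )
    agent.fit(budget=15)

    # Dot sizes can reflect per-episode or total visit counts.
    env.plot_trajectories(show=False, dot_size_means="episode_visits")
    env.plot_trajectory_actions(show=False)
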
@@ -317,15 +338,17 @@ def plot_trajectory_actions(self, """ logger.info("Plotting...") - fignum = fignum or (str(self) + '-actions') + fignum = fignum or (str(self) + "-actions") colormap_fn = plt.get_cmap(colormap_name) action_description = action_description or list(range(self.env.action_space.n)) # discretizer try: discretizer = self.episode_visit_counter.state_discretizer - epsilon = min(discretizer._bins[0][1] - discretizer._bins[0][0], - discretizer._bins[1][1] - discretizer._bins[1][0]) + epsilon = min( + discretizer._bins[0][1] - discretizer._bins[0][0], + discretizer._bins[1][1] - discretizer._bins[1][0], + ) except Exception: epsilon = 0.01 @@ -368,19 +391,35 @@ def plot_trajectory_actions(self, for aa in range(self.env.action_space.n): states_aa = states[actions == aa] color = colormap_fn(aa / self.env.action_space.n) - ax.scatter(x=states_aa[:, 0], y=states_aa[:, 1], color=color, - s=sizes, alpha=alpha, - label=f'action = {action_description[aa]}') + ax.scatter( + x=states_aa[:, 0], + y=states_aa[:, 1], + color=color, + s=sizes, + alpha=alpha, + label=f"action = {action_description[aa]}", + ) # for unique legend entries, source: https://stackoverflow.com/a/57600060 - plt.legend(*[*zip(*{l: h for h, l in zip(*ax.get_legend_handles_labels())}.items())][::-1], - loc='upper left', bbox_to_anchor=(1.00, 1.00)) + plt.legend( + *[ + *zip( + *{l: h for h, l in zip(*ax.get_legend_handles_labels())}.items() + ) + ][::-1], + loc="upper left", + bbox_to_anchor=(1.00, 1.00), + ) plt.tight_layout() if video_filename is not None: canvas.draw() - image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + image_from_plot = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8 + ) + image_from_plot = image_from_plot.reshape( + fig.canvas.get_width_height()[::-1] + (3,) + ) images.append(image_from_plot) if video_filename is not None: diff --git a/setup.py b/setup.py index a1ad82a9d..0d14f7f4f 100644 --- a/setup.py +++ b/setup.py @@ -1,20 +1,20 @@ from setuptools import setup, find_packages -packages = find_packages(exclude=['docs', 'notebooks', 'assets']) +packages = find_packages(exclude=["docs", "notebooks", "assets"]) # # Base installation (interface only) # install_requires = [ - 'numpy>=1.17', - 'pygame', - 'matplotlib', - 'seaborn', - 'pandas', - 'gym', - 'dill', - 'docopt', - 'pyyaml', + "numpy>=1.17", + "pygame", + "matplotlib", + "seaborn", + "pandas", + "gym", + "dill", + "docopt", + "pyyaml", ] # @@ -23,50 +23,50 @@ # default installation default_requires = [ - 'numba', - 'optuna', - 'ffmpeg-python', - 'PyOpenGL', - 'PyOpenGL_accelerate', - 'pyvirtualdisplay', + "numba", + "optuna", + "ffmpeg-python", + "PyOpenGL", + "PyOpenGL_accelerate", + "pyvirtualdisplay", ] # tensorboard must be installed manually, due to conflicts with # dm-reverb-nightly[tensorflow] in jax_agents_requires torch_agents_requires = default_requires + [ - 'torch>=1.6.0', + "torch>=1.6.0", # 'tensorboard' ] jax_agents_requires = default_requires + [ - 'jax[cpu]', - 'chex', - 'dm-haiku', - 'optax', - 'dm-reverb[tensorflow]==0.5.0', - 'dm-tree', - 'rlax' + "jax[cpu]", + "chex", + "dm-haiku", + "optax", + "dm-reverb[tensorflow]==0.5.0", + "dm-tree", + "rlax", ] extras_require = { - 'default': default_requires, - 'jax_agents': jax_agents_requires, - 'torch_agents': torch_agents_requires, - 'deploy': ['sphinx', 'sphinx_rtd_theme'], + "default": default_requires, + "jax_agents": jax_agents_requires, + 
"torch_agents": torch_agents_requires, + "deploy": ["sphinx", "sphinx_rtd_theme"], } with open("README.md", "r") as fh: long_description = fh.read() setup( - name='rlberry', - version='0.2.1', - description='An easy-to-use reinforcement learning library for research and education', + name="rlberry", + version="0.2.1", + description="An easy-to-use reinforcement learning library for research and education", long_description=long_description, long_description_content_type="text/markdown", - author='Omar Darwiche Domingues, Yannis Flet-Berliac, Edouard Leurent, Pierre Menard, Xuedong Shang', - url='https://github.com/rlberry-py', - license='MIT', + author="Omar Darwiche Domingues, Yannis Flet-Berliac, Edouard Leurent, Pierre Menard, Xuedong Shang", + url="https://github.com/rlberry-py", + license="MIT", packages=packages, classifiers=[ "Development Status :: 4 - Beta", From 0a17c0041004047beb6e511938ecbed81b278bf2 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Wed, 4 May 2022 14:59:02 +0200 Subject: [PATCH 03/13] More details in DQN docstring about Qnet and test on changing default Qnet --- rlberry/agents/torch/dqn/dqn.py | 19 +++++++++++++++++++ rlberry/agents/torch/tests/test_dqn.py | 17 +++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 70ea82e63..b340b3eb3 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -83,6 +83,25 @@ class DQNAgent(AgentWithSimplePolicy): * Ouput shape = (batch_dim, chunk_size, number_of_actions) + Example: use `rlberry.agents.torch.utils.training.model_factory`, + `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` + parameter to modify the neural network:: + + model_configs = { + "type": "MultiLayerPerceptron", + "layer_sizes": (5, 5), + "reshape": False, + } + + def mlp(env, **kwargs): + model_config = size_model_config(env, **model_config) + return model_factory(**kwargs) + + agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) + + If not specified then it is set to MultiLayerPerceptron with 2 hidden layers + of size 64 + q_net_kwargs : optional, dict Parameters for q_net_constructor. use_double_dqn : bool, default = False diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py index 6e932b983..8dae4d51b 100644 --- a/rlberry/agents/torch/tests/test_dqn.py +++ b/rlberry/agents/torch/tests/test_dqn.py @@ -1,6 +1,7 @@ import pytest from rlberry.envs import gym_make from rlberry.agents.torch.dqn import DQNAgent +from rlberry.agents.torch.utils.training import model_factory @pytest.mark.parametrize( @@ -18,3 +19,19 @@ def test_dqn_agent(use_double_dqn, use_prioritized_replay): use_prioritized_replay=use_prioritized_replay, ) agent.fit(budget=500) + + model_configs = { + "type": "MultiLayerPerceptron", + "layer_sizes": (5, 5), + "reshape": False, + } + + def mlp(env, **kwargs): + """ + Returns a default Q value network. 
+ """ + kwargs["in_size"] = env.observation_space.shape[0] + return model_factory(**kwargs) + + new_agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) + new_agent.fit(budget=2000) \ No newline at end of file From cdfe2c07745df5808670e33cf558ef950a3f9b00 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Wed, 4 May 2022 15:02:28 +0200 Subject: [PATCH 04/13] blacked last commit --- rlberry/agents/torch/dqn/dqn.py | 4 ++-- rlberry/agents/torch/tests/test_dqn.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index b340b3eb3..2508ccb4a 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -84,7 +84,7 @@ class DQNAgent(AgentWithSimplePolicy): * Ouput shape = (batch_dim, chunk_size, number_of_actions) Example: use `rlberry.agents.torch.utils.training.model_factory`, - `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` + `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` parameter to modify the neural network:: model_configs = { @@ -98,7 +98,7 @@ def mlp(env, **kwargs): return model_factory(**kwargs) agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) - + If not specified then it is set to MultiLayerPerceptron with 2 hidden layers of size 64 diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py index 8dae4d51b..9bfbb1304 100644 --- a/rlberry/agents/torch/tests/test_dqn.py +++ b/rlberry/agents/torch/tests/test_dqn.py @@ -34,4 +34,4 @@ def mlp(env, **kwargs): return model_factory(**kwargs) new_agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) - new_agent.fit(budget=2000) \ No newline at end of file + new_agent.fit(budget=2000) From 4a25e6fc39e1ec09e3340f428db848eb65fac806 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Wed, 4 May 2022 15:08:40 +0200 Subject: [PATCH 05/13] None is more clearly stated in DQN docstring in q_net_constructor --- rlberry/agents/torch/dqn/dqn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 2508ccb4a..7c122a15a 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -73,7 +73,7 @@ class DQNAgent(AgentWithSimplePolicy): After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. optimizer_type : {"ADAM", "RMS_PROP"} Optimization algorithm. - q_net_constructor : Callable + q_net_constructor : Callable or None Function/constructor that returns a torch module for the Q-network: :code:`qnet = q_net_constructor(env, **kwargs)`. 
@@ -99,7 +99,7 @@ def mlp(env, **kwargs): agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) - If not specified then it is set to MultiLayerPerceptron with 2 hidden layers + If None then it is set to MultiLayerPerceptron with 2 hidden layers of size 64 q_net_kwargs : optional, dict From 1984059e9a58adb5cc7275098d4287c5f9a5e0d4 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Thu, 5 May 2022 18:00:14 +0200 Subject: [PATCH 06/13] Better docstring for DQN: explains what is str for q_net_constructor and gives shorter example when it is function --- rlberry/agents/torch/dqn/dqn.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 7c122a15a..92cf1a2ad 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -73,7 +73,7 @@ class DQNAgent(AgentWithSimplePolicy): After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. optimizer_type : {"ADAM", "RMS_PROP"} Optimization algorithm. - q_net_constructor : Callable or None + q_net_constructor : Callable, str or None Function/constructor that returns a torch module for the Q-network: :code:`qnet = q_net_constructor(env, **kwargs)`. @@ -83,8 +83,8 @@ class DQNAgent(AgentWithSimplePolicy): * Ouput shape = (batch_dim, chunk_size, number_of_actions) - Example: use `rlberry.agents.torch.utils.training.model_factory`, - `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` + Example: use `rlberry.agents.torch.utils.training.model_factory_from_env`, + and `q_net_kwargs` parameter to modify the neural network:: model_configs = { @@ -93,11 +93,16 @@ class DQNAgent(AgentWithSimplePolicy): "reshape": False, } - def mlp(env, **kwargs): - model_config = size_model_config(env, **model_config) - return model_factory(**kwargs) - - agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) + agent = DQNAgent(env, + q_net_constructor=model_factory_from_env, + q_net_kwargs=model_configs + ) + If str then it should correspond to the full path to the constructor function, + e.g.:: + agent = DQNAgent(env, + q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', + q_net_kwargs=model_configs + ) If None then it is set to MultiLayerPerceptron with 2 hidden layers of size 64 From 57ce593f5c353e6b425903f4d46224e4cbdecc5c Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Thu, 5 May 2022 18:04:08 +0200 Subject: [PATCH 07/13] blacked last commit --- rlberry/agents/torch/dqn/dqn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 92cf1a2ad..5462ead77 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -93,14 +93,14 @@ class DQNAgent(AgentWithSimplePolicy): "reshape": False, } - agent = DQNAgent(env, - q_net_constructor=model_factory_from_env, + agent = DQNAgent(env, + q_net_constructor=model_factory_from_env, q_net_kwargs=model_configs ) If str then it should correspond to the full path to the constructor function, e.g.:: - agent = DQNAgent(env, - q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', + agent = DQNAgent(env, + q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', q_net_kwargs=model_configs ) From 4d442c25e6253ca22be44709071ffb5d1f273fc1 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 15:47:48 +0200 Subject: [PATCH 08/13] 
model factory can take externally defined nn and load it from file + checks that externally defined nn is suitable for environment --- .../agents/torch/tests/test_torch_training.py | 61 ++++++- rlberry/agents/torch/utils/training.py | 157 ++++++++++++++++-- 2 files changed, 200 insertions(+), 18 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index fe5fb722c..663cd5ed8 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -1,7 +1,11 @@ import torch -from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory + +import os +from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory, model_factory, model_factory_from_env from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.agents.torch.utils.models import default_policy_net_fn +from rlberry.agents.torch.utils.models import default_policy_net_fn, Net, MultiLayerPerceptron +from rlberry.agents.torch.dqn import DQNAgent + # loss_function_factory assert isinstance(loss_function_factory("l2"), torch.nn.MSELoss) @@ -30,3 +34,56 @@ ] == 0.99 ) + + +#test model_factory + +obs_shape = env.observation_space.shape +n_act = env.action_space.n + +test_net = Net(obs_size=obs_shape[0],hidden_size=10, n_actions=n_act) + +test_net2 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=1) + + +test_net3 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=n_act, is_policy=True) + + +model_factory(net = test_net) +model_factory_from_env(env, net=test_net) +model_factory_from_env(env, net=test_net2, out_size = 1) +model_factory_from_env(env, net=test_net3, is_policy = True) + + + +# test loading pretrained nn +dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net)) + +dqn_agent.fit(50) + +torch.save(dqn_agent._qnet_online, "test_dqn.pickle") + + +parameters_to_save = dqn_agent._qnet_online.state_dict() +torch.save(parameters_to_save, "test_dqn.pt") + + + +model_factory(filename="test_dqn.pickle") +model_factory(net = test_net, filename="test_dqn.pt") + + + +dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(filename = "test_dqn.pickle")) + +dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net, filename = "test_dqn.pt")) + +assert dqn_agent._qnet_online.state_dict().keys() == parameters_to_save.keys() + +for k in parameters_to_save.keys(): + assert (dqn_agent._qnet_online.state_dict()[k] == parameters_to_save[k]).all() + +os.remove("test_dqn.pickle") +os.remove("test_dqn.pt") + +print("done") \ No newline at end of file diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index ed338b3bb..bd9883602 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -27,21 +27,67 @@ def optimizer_factory(params, optimizer_type="ADAM", **kwargs): raise ValueError("Unknown optimizer type: {}".format(optimizer_type)) -def model_factory_from_env(env, **kwargs): +def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filename = None, **net_kwargs): """Returns a torch module after setting up input/output dimensions according to an env. 
Parameters ---------- env: gym.Env Environment + type: {"MultiLayerPerceptron", + "ConvolutionalNetwork", + "DuelingNetwork", + "Table"}, default = "MultiLayerPerceptron" + Type of neural network. + net: torch.nn.Module or None + If not None, return this neural network. It can be used to pass user-defined neural network. + filename: str or None + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. **kwargs: Dict Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. """ - kwargs = size_model_config(env, **kwargs) - return model_factory(**kwargs) + + if filename is not None: + load_dict = load_from_file(filename) + if load_dict["model"] is not None: + net = load_dict["model"] + checkpoint = load_dict["checkpoint"] + else: + checkpoint = None + + + kwargs = size_model_config(env, type, **net_kwargs) + + if net is not None: + check_network(env, net, **kwargs) + + + return model_factory(type, net, checkpoint=checkpoint, **kwargs) + -def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: +def load_from_file(filename): + """Load a module or a checkpoint. + + Parameters + ---------- + filename: str + The path to a saved module or its 'state_dict'. It will load a net or a checkpoint. + """ + output_dict = dict(model = None, checkpoint = None) + + loaded = torch.load(filename) + if isinstance(loaded, torch.nn.Module): + output_dict["model"] = loaded + elif isinstance(loaded, dict): + output_dict["checkpoint"] = loaded + else: + raise ValueError("Invalid 'load_from_file'. File is expected to store either an entire model or its 'state_dict'.") + return output_dict + + + +def model_factory(type="MultiLayerPerceptron", net = None, filename = None, checkpoint = None, **net_kwargs) -> nn.Module: """Build a neural net of a given type. Parameters @@ -51,7 +97,13 @@ def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: "DuelingNetwork", "Table"}, default = "MultiLayerPerceptron" Type of neural network. - **kwargs: dict + net: torch.nn.Module or None + If not None, return this neural network. It can be used to pass user-defined neural network. + filename: str or None + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. + checkpoint: dict or None + If not None, then it is treated as a 'state_dict' that is assigned to a neural network model. 
+ **net_kwargs: dict Parameters that vary according to each neural net type, see * :class:`~rlberry.agents.torch.utils.models.MultiLayerPerceptron` @@ -69,19 +121,89 @@ def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: Table, ) - if type == "MultiLayerPerceptron": - return MultiLayerPerceptron(**kwargs) - elif type == "DuelingNetwork": - return DuelingNetwork(**kwargs) - elif type == "ConvolutionalNetwork": - return ConvolutionalNetwork(**kwargs) - elif type == "Table": - return Table(**kwargs) + if filename is not None: + load_dict = load_from_file(filename) + if load_dict["model"] is not None: + return load_dict["model"] + else: + checkpoint = load_dict["checkpoint"] + + + if net is not None: + model = net else: - raise ValueError("Unknown model type") + if type == "MultiLayerPerceptron": + model = MultiLayerPerceptron(**net_kwargs) + elif type == "DuelingNetwork": + model = DuelingNetwork(**net_kwargs) + elif type == "ConvolutionalNetwork": + model = ConvolutionalNetwork(**net_kwargs) + elif type == "Table": + model = Table(**net_kwargs) + else: + raise ValueError("Unknown model type") + + if checkpoint is not None: + model.load_state_dict(checkpoint) + + return model + + +def check_network(env, net, **model_config): + """ + Check the neural network that it satisfies the environment and predefined model_config. If the network is not good, it should raise an error. + + Parameters + ---------- + env : gym.Env + An environment. + net: torch.nn.Module + A neural network. + model_config : dict + Desired parameters. + """ + + if isinstance(env.observation_space, spaces.Box): + obs_shape = env.observation_space.shape + elif isinstance(env.observation_space, spaces.Tuple): + obs_shape = env.observation_space.spaces[0].shape + elif isinstance(env.observation_space, spaces.Discrete): + return model_config + + + if net is not None: + #check that it is compliant with environment + #input check + fake_input = torch.zeros(1, *obs_shape) + try: + output = net(fake_input) + except Exception as err: + print(f"NN input is not compatible with the environment. Got an error {err=}, {type(err)=}") + raise + #output check + if "is_policy" in model_config: + is_policy = model_config["is_policy"] + if is_policy: + assert isinstance(output, torch.distributions.distribution.Distribution), "Policy should return distribution over actions" + else: + if "out_size" in model_config: + out_size = [model_config["out_size"]] + else: + if isinstance(env.action_space, spaces.Discrete): + out_size = [env.action_space.n] + elif isinstance(env.action_space, spaces.Tuple): + out_size = [env.action_space.spaces[0].n] + elif isinstance(env.action_space, spaces.Box): + out_size = env.action_space.shape + assert output.shape == (1, *out_size), f"Output should be of size {out_size}" + + + + + -def size_model_config(env, **model_config): +def size_model_config(env, type = None, **model_config): """ Setup input/output dimensions for the configuration of a model depending on the environment observation/action spaces. @@ -90,6 +212,8 @@ def size_model_config(env, **model_config): ---------- env : gym.Env An environment. + type: str or None + Make configs corresponding to the chosen type of neural network. model_config : dict Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. 
If "out_size" is not given in model_config, assumes @@ -103,9 +227,10 @@ def size_model_config(env, **model_config): obs_shape = env.observation_space.spaces[0].shape elif isinstance(env.observation_space, spaces.Discrete): return model_config + # Assume CHW observation space - if "type" in model_config and model_config["type"] == "ConvolutionalNetwork": + if type == "ConvolutionalNetwork": if "transpose_obs" in model_config and not model_config["transpose_obs"]: # Assume CHW observation space if "in_channels" not in model_config: From c278e6c5e63e92be3cf243d220a7e8b9e4892409 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 15:48:44 +0200 Subject: [PATCH 09/13] blacked --- .../agents/torch/tests/test_torch_training.py | 52 ++++++++++----- rlberry/agents/torch/utils/training.py | 63 ++++++++++--------- 2 files changed, 67 insertions(+), 48 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 663cd5ed8..3e795eb29 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -1,9 +1,18 @@ import torch import os -from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory, model_factory, model_factory_from_env +from rlberry.agents.torch.utils.training import ( + loss_function_factory, + optimizer_factory, + model_factory, + model_factory_from_env, +) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.agents.torch.utils.models import default_policy_net_fn, Net, MultiLayerPerceptron +from rlberry.agents.torch.utils.models import ( + default_policy_net_fn, + Net, + MultiLayerPerceptron, +) from rlberry.agents.torch.dqn import DQNAgent @@ -36,28 +45,31 @@ ) -#test model_factory +# test model_factory obs_shape = env.observation_space.shape n_act = env.action_space.n -test_net = Net(obs_size=obs_shape[0],hidden_size=10, n_actions=n_act) +test_net = Net(obs_size=obs_shape[0], hidden_size=10, n_actions=n_act) -test_net2 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=1) +test_net2 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10], out_size=1) -test_net3 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=n_act, is_policy=True) +test_net3 = MultiLayerPerceptron( + in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True +) -model_factory(net = test_net) +model_factory(net=test_net) model_factory_from_env(env, net=test_net) -model_factory_from_env(env, net=test_net2, out_size = 1) -model_factory_from_env(env, net=test_net3, is_policy = True) - +model_factory_from_env(env, net=test_net2, out_size=1) +model_factory_from_env(env, net=test_net3, is_policy=True) # test loading pretrained nn -dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net)) +dqn_agent = DQNAgent( + env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net=test_net) +) dqn_agent.fit(50) @@ -68,15 +80,21 @@ torch.save(parameters_to_save, "test_dqn.pt") - model_factory(filename="test_dqn.pickle") -model_factory(net = test_net, filename="test_dqn.pt") - +model_factory(net=test_net, filename="test_dqn.pt") -dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(filename = "test_dqn.pickle")) +dqn_agent = DQNAgent( + env, + q_net_constructor=model_factory_from_env, + q_net_kwargs=dict(filename="test_dqn.pickle"), +) -dqn_agent = DQNAgent(env, 
q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net, filename = "test_dqn.pt")) +dqn_agent = DQNAgent( + env, + q_net_constructor=model_factory_from_env, + q_net_kwargs=dict(net=test_net, filename="test_dqn.pt"), +) assert dqn_agent._qnet_online.state_dict().keys() == parameters_to_save.keys() @@ -86,4 +104,4 @@ os.remove("test_dqn.pickle") os.remove("test_dqn.pt") -print("done") \ No newline at end of file +print("done") diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index bd9883602..ed9d9cf92 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -27,7 +27,9 @@ def optimizer_factory(params, optimizer_type="ADAM", **kwargs): raise ValueError("Unknown optimizer type: {}".format(optimizer_type)) -def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filename = None, **net_kwargs): +def model_factory_from_env( + env, type="MultiLayerPerceptron", net=None, filename=None, **net_kwargs +): """Returns a torch module after setting up input/output dimensions according to an env. Parameters @@ -42,7 +44,7 @@ def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filen net: torch.nn.Module or None If not None, return this neural network. It can be used to pass user-defined neural network. filename: str or None - The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. **kwargs: Dict Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. """ @@ -55,26 +57,23 @@ def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filen else: checkpoint = None - kwargs = size_model_config(env, type, **net_kwargs) if net is not None: check_network(env, net, **kwargs) - return model_factory(type, net, checkpoint=checkpoint, **kwargs) - def load_from_file(filename): """Load a module or a checkpoint. - + Parameters ---------- filename: str The path to a saved module or its 'state_dict'. It will load a net or a checkpoint. """ - output_dict = dict(model = None, checkpoint = None) + output_dict = dict(model=None, checkpoint=None) loaded = torch.load(filename) if isinstance(loaded, torch.nn.Module): @@ -82,12 +81,15 @@ def load_from_file(filename): elif isinstance(loaded, dict): output_dict["checkpoint"] = loaded else: - raise ValueError("Invalid 'load_from_file'. File is expected to store either an entire model or its 'state_dict'.") + raise ValueError( + "Invalid 'load_from_file'. File is expected to store either an entire model or its 'state_dict'." + ) return output_dict - -def model_factory(type="MultiLayerPerceptron", net = None, filename = None, checkpoint = None, **net_kwargs) -> nn.Module: +def model_factory( + type="MultiLayerPerceptron", net=None, filename=None, checkpoint=None, **net_kwargs +) -> nn.Module: """Build a neural net of a given type. Parameters @@ -100,9 +102,9 @@ def model_factory(type="MultiLayerPerceptron", net = None, filename = None, chec net: torch.nn.Module or None If not None, return this neural network. It can be used to pass user-defined neural network. filename: str or None - The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. 
checkpoint: dict or None - If not None, then it is treated as a 'state_dict' that is assigned to a neural network model. + If not None, then it is treated as a 'state_dict' that is assigned to a neural network model. **net_kwargs: dict Parameters that vary according to each neural net type, see @@ -127,22 +129,21 @@ def model_factory(type="MultiLayerPerceptron", net = None, filename = None, chec return load_dict["model"] else: checkpoint = load_dict["checkpoint"] - if net is not None: model = net else: if type == "MultiLayerPerceptron": - model = MultiLayerPerceptron(**net_kwargs) + model = MultiLayerPerceptron(**net_kwargs) elif type == "DuelingNetwork": - model = DuelingNetwork(**net_kwargs) + model = DuelingNetwork(**net_kwargs) elif type == "ConvolutionalNetwork": model = ConvolutionalNetwork(**net_kwargs) elif type == "Table": - model = Table(**net_kwargs) + model = Table(**net_kwargs) else: raise ValueError("Unknown model type") - + if checkpoint is not None: model.load_state_dict(checkpoint) @@ -169,22 +170,25 @@ def check_network(env, net, **model_config): obs_shape = env.observation_space.spaces[0].shape elif isinstance(env.observation_space, spaces.Discrete): return model_config - if net is not None: - #check that it is compliant with environment - #input check + # check that it is compliant with environment + # input check fake_input = torch.zeros(1, *obs_shape) try: output = net(fake_input) except Exception as err: - print(f"NN input is not compatible with the environment. Got an error {err=}, {type(err)=}") + print( + f"NN input is not compatible with the environment. Got an error {err=}, {type(err)=}" + ) raise - #output check + # output check if "is_policy" in model_config: is_policy = model_config["is_policy"] if is_policy: - assert isinstance(output, torch.distributions.distribution.Distribution), "Policy should return distribution over actions" + assert isinstance( + output, torch.distributions.distribution.Distribution + ), "Policy should return distribution over actions" else: if "out_size" in model_config: out_size = [model_config["out_size"]] @@ -195,15 +199,13 @@ def check_network(env, net, **model_config): out_size = [env.action_space.spaces[0].n] elif isinstance(env.action_space, spaces.Box): out_size = env.action_space.shape - assert output.shape == (1, *out_size), f"Output should be of size {out_size}" - - - - - + assert output.shape == ( + 1, + *out_size, + ), f"Output should be of size {out_size}" -def size_model_config(env, type = None, **model_config): +def size_model_config(env, type=None, **model_config): """ Setup input/output dimensions for the configuration of a model depending on the environment observation/action spaces. 
@@ -227,7 +229,6 @@ def size_model_config(env, type = None, **model_config): obs_shape = env.observation_space.spaces[0].shape elif isinstance(env.observation_space, spaces.Discrete): return model_config - # Assume CHW observation space if type == "ConvolutionalNetwork": From 19b9bfd720f393535feae3871afba288c179f7b7 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 17:34:45 +0200 Subject: [PATCH 10/13] more coverage --- .../agents/torch/tests/test_torch_training.py | 33 +++++++++++++++++++ rlberry/agents/torch/utils/training.py | 10 +++--- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 3e795eb29..3f4d81282 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,8 +6,11 @@ optimizer_factory, model_factory, model_factory_from_env, + check_network ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env +from rlberry.envs.finite import Chain +from rlberry.envs import gym_make from rlberry.agents.torch.utils.models import ( default_policy_net_fn, Net, @@ -24,6 +27,10 @@ # optimizer_factory env = get_benchmark_env(level=1) + +finite_env = Chain() + +cont_act_env = gym_make("Pendulum-v1") assert ( optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] == 0.001 @@ -59,11 +66,18 @@ in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True ) +test_net4 = MultiLayerPerceptron( + in_size=100, layer_sizes=[10], out_size=n_act +) + +test_net5 = MultiLayerPerceptron(in_size = cont_act_env.observation_space.shape[0], layer_sizes=[10], out_size=cont_act_env.action_space.shape[0]) + model_factory(net=test_net) model_factory_from_env(env, net=test_net) model_factory_from_env(env, net=test_net2, out_size=1) model_factory_from_env(env, net=test_net3, is_policy=True) +model_factory_from_env(cont_act_env, net=test_net5) # test loading pretrained nn @@ -78,6 +92,25 @@ parameters_to_save = dqn_agent._qnet_online.state_dict() torch.save(parameters_to_save, "test_dqn.pt") +torch.save((parameters_to_save, parameters_to_save), "test_dqn2.pt") + +try: + model_factory(filename="test_dqn2.pt") +except Exception as err: + os.remove("test_dqn2.pt") + print(err, "Bad file was removed.") + +try: + model_factory(type = "dummy") +except Exception as err: + print(err) + + +# This test should fail as +# try: +# check_network(cont_act_env, test_net) +# except Exception as err: +# print(err) model_factory(filename="test_dqn.pickle") diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index ed9d9cf92..41475e1ba 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -166,10 +166,12 @@ def check_network(env, net, **model_config): if isinstance(env.observation_space, spaces.Box): obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - elif isinstance(env.observation_space, spaces.Discrete): - return model_config + else: + raise NotImplementedError + # elif isinstance(env.observation_space, spaces.Tuple): + # obs_shape = env.observation_space.spaces[0].shape + # elif isinstance(env.observation_space, spaces.Discrete): + # return model_config if net is not None: # check that it is compliant with environment From b99188d32685f5efcf99c35cbc384e9b628998c2 Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:37:03 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../agents/torch/tests/test_torch_training.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 3f4d81282..c5a91c325 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,7 +6,7 @@ optimizer_factory, model_factory, model_factory_from_env, - check_network + check_network, ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.envs.finite import Chain @@ -66,11 +66,13 @@ in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True ) -test_net4 = MultiLayerPerceptron( - in_size=100, layer_sizes=[10], out_size=n_act -) +test_net4 = MultiLayerPerceptron(in_size=100, layer_sizes=[10], out_size=n_act) -test_net5 = MultiLayerPerceptron(in_size = cont_act_env.observation_space.shape[0], layer_sizes=[10], out_size=cont_act_env.action_space.shape[0]) +test_net5 = MultiLayerPerceptron( + in_size=cont_act_env.observation_space.shape[0], + layer_sizes=[10], + out_size=cont_act_env.action_space.shape[0], +) model_factory(net=test_net) @@ -101,12 +103,12 @@ print(err, "Bad file was removed.") try: - model_factory(type = "dummy") + model_factory(type="dummy") except Exception as err: print(err) -# This test should fail as +# This test should fail as # try: # check_network(cont_act_env, test_net) # except Exception as err: From 215bec51cc0355f8750e43ee8984a2fd20dc1745 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 18:40:24 +0200 Subject: [PATCH 12/13] blacked --- .../agents/torch/tests/test_torch_training.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 3f4d81282..c5a91c325 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,7 +6,7 @@ optimizer_factory, model_factory, model_factory_from_env, - check_network + check_network, ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.envs.finite import Chain @@ -66,11 +66,13 @@ in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True ) -test_net4 = MultiLayerPerceptron( - in_size=100, layer_sizes=[10], out_size=n_act -) +test_net4 = MultiLayerPerceptron(in_size=100, layer_sizes=[10], out_size=n_act) -test_net5 = MultiLayerPerceptron(in_size = cont_act_env.observation_space.shape[0], layer_sizes=[10], out_size=cont_act_env.action_space.shape[0]) +test_net5 = MultiLayerPerceptron( + in_size=cont_act_env.observation_space.shape[0], + layer_sizes=[10], + out_size=cont_act_env.action_space.shape[0], +) model_factory(net=test_net) @@ -101,12 +103,12 @@ print(err, "Bad file was removed.") try: - model_factory(type = "dummy") + model_factory(type="dummy") except Exception as err: print(err) -# This test should fail as +# This test should fail as # try: # check_network(cont_act_env, test_net) # except Exception as err: From 411d3ca1caa40f8b26416e4def403eb415046a71 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 18:43:52 +0200 Subject: [PATCH 13/13] flake 8 should be fine --- 
rlberry/agents/torch/tests/test_torch_training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index c5a91c325..478fa4564 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,7 +6,6 @@ optimizer_factory, model_factory, model_factory_from_env, - check_network, ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.envs.finite import Chain
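A minimal usage sketch of the `net` and `filename` arguments introduced for `model_factory` / `model_factory_from_env` in PATCH 08, distilled from the new `rlberry/agents/torch/tests/test_torch_training.py`; the environment choice and file names below are arbitrary, and the snippet assumes the patched signatures shown above::

    import torch

    from rlberry.agents.torch.dqn import DQNAgent
    from rlberry.agents.torch.utils.models import MultiLayerPerceptron
    from rlberry.agents.torch.utils.training import (
        model_factory,
        model_factory_from_env,
    )
    from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env

    env = get_benchmark_env(level=1)
    obs_shape = env.observation_space.shape
    n_act = env.action_space.n

    # Externally defined Q-network; model_factory_from_env runs check_network
    # on it, so its input/output sizes must match the environment.
    custom_qnet = MultiLayerPerceptron(
        in_size=obs_shape[0], layer_sizes=[10], out_size=n_act
    )

    # Pass the user-defined network through q_net_kwargs.
    agent = DQNAgent(
        env,
        q_net_constructor=model_factory_from_env,
        q_net_kwargs=dict(net=custom_qnet),
    )
    agent.fit(50)

    # Save either the full module or only its state_dict
    # (file names are arbitrary).
    torch.save(agent._qnet_online, "qnet_full.pickle")
    torch.save(agent._qnet_online.state_dict(), "qnet_params.pt")

    # Reload later: a saved module needs only `filename`,
    # a saved state_dict also needs the matching `net`.
    qnet_from_module = model_factory(filename="qnet_full.pickle")
    qnet_from_params = model_factory(net=custom_qnet, filename="qnet_params.pt")

    # The same keyword arguments work when building a new agent
    # from a previously saved network.
    warm_started_agent = DQNAgent(
        env,
        q_net_constructor=model_factory_from_env,
        q_net_kwargs=dict(filename="qnet_full.pickle"),
    )

If the supplied network does not match the environment (wrong input shape, wrong output size, or a non-distribution output when `is_policy=True`), `check_network` raises an error instead of silently accepting it; after PATCH 10 it also raises `NotImplementedError` for non-Box observation spaces.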