From 4ce9d7b4a28fe0a69e21aebdfb5f8b4553952bac Mon Sep 17 00:00:00 2001
From: SHILOVA Alena
Date: Fri, 21 Jan 2022 17:01:22 +0100
Subject: [PATCH 01/13] corrected bug in assertion: missing self.env

---
 rlberry/agents/torch/dqn/dqn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py
index 978fc9fd9..fec8f4b82 100644
--- a/rlberry/agents/torch/dqn/dqn.py
+++ b/rlberry/agents/torch/dqn/dqn.py
@@ -135,7 +135,7 @@ def __init__(
         self.double = double

         assert isinstance(
-            env.action_space, spaces.Discrete
+            self.env.action_space, spaces.Discrete
         ), "Only compatible with Discrete action spaces."

         self.prioritized_replay = prioritized_replay

From e2301521d9cc1bff400e0f87e2fe6d3b6db3fe8a Mon Sep 17 00:00:00 2001
From: riccardo
Date: Wed, 16 Feb 2022 16:15:23 +0100
Subject: [PATCH 02/13] blacked main

---
 docs/conf.py | 50 +--
 examples/demo_agents/video_plot_a2c.py | 7 +-
 examples/demo_agents/video_plot_dqn.py | 6 +-
 examples/demo_agents/video_plot_ppo.py | 8 +-
 .../demo_agents/video_plot_rs_kernel_ucbvi.py | 14 +-
 examples/demo_agents/video_plot_rsucbvi.py | 3 +-
 examples/demo_agents/video_plot_vi.py | 1 +
 examples/demo_env/video_plot_acrobot.py | 8 +-
 examples/demo_env/video_plot_gridworld.py | 1 -
 examples/demo_env/video_plot_pball.py | 31 +-
 examples/demo_env/video_plot_rooms.py | 12 +-
 examples/demo_examples/demo_adaptiveql.py | 48 ++-
 examples/demo_examples/demo_agent_manager.py | 36 +-
 .../demo_examples/demo_agent_manager_save.py | 21 +-
 .../demo_agent_manager_set_writer.py | 17 +-
 examples/demo_examples/demo_avecppo.py | 8 +-
 examples/demo_examples/demo_experiment/run.py | 6 +-
 .../demo_from_stable_baselines.py | 92 ++---
 .../demo_from_stable_baselines_atari.py | 78 ++--
 examples/demo_examples/demo_gym_wrapper.py | 5 +-
 .../demo_examples/demo_hyperparam_optim.py | 29 +-
 examples/demo_examples/demo_jax_dqn.py | 28 +-
 examples/demo_examples/demo_lsvi_ucb.py | 50 +--
 .../demo_examples/demo_network/run_client.py | 13 +-
 .../demo_network/run_remote_manager.py | 28 +-
 .../demo_examples/demo_network/run_server.py | 29 +-
 examples/demo_examples/demo_ppo_benchmark.py | 32 +-
 examples/demo_examples/demo_ppo_bonus.py | 71 ++--
 .../demo_examples/demo_ppo_partial_fit.py | 39 +-
 examples/demo_examples/demo_rnd.py | 6 +-
 examples/demo_examples/demo_seeding.py | 2 +-
 .../demo_examples/demo_ucbvi_and_opqtl.py | 45 ++-
 examples/demo_examples/demo_vis2d.py | 18 +-
 examples/plot_agent_manager.py | 52 ++-
 examples/plot_kernels.py | 4 +-
 rlberry/__init__.py | 2 +-
 rlberry/agents/adaptiveql/adaptiveql.py | 37 +-
 rlberry/agents/adaptiveql/tree.py | 36 +-
 rlberry/agents/agent.py | 52 +--
 rlberry/agents/dynprog/value_iteration.py | 18 +-
 rlberry/agents/jax/dqn/dqn.py | 192 +++++-----
 rlberry/agents/jax/nets/common.py | 5 +-
 rlberry/agents/jax/tests/old_test_tqn.py | 9 +-
 rlberry/agents/jax/utils/replay_buffer.py | 61 ++--
 rlberry/agents/kernel_based/common.py | 30 +-
 .../agents/kernel_based/rs_kernel_ucbvi.py | 186 ++++----
 rlberry/agents/kernel_based/rs_ucbvi.py | 111 +++---
 rlberry/agents/linear/lsvi_ucb.py | 123 ++++---
 rlberry/agents/mbqvi/mbqvi.py | 48 ++-
 rlberry/agents/optql/optql.py | 59 +--
 rlberry/agents/tests/test_dynprog.py | 68 ++--
 rlberry/agents/tests/test_kernel_based.py | 41 +--
 rlberry/agents/tests/test_lsvi_ucb.py | 66 ++--
 rlberry/agents/tests/test_optql.py | 5 +-
 rlberry/agents/tests/test_ucbvi.py | 38 +-
 rlberry/agents/torch/a2c/a2c.py | 110 +++---
 rlberry/agents/torch/avec/avec_ppo.py | 129 ++++---
rlberry/agents/torch/dqn/dqn.py | 295 ++++++++------- rlberry/agents/torch/dqn/exploration.py | 39 +- rlberry/agents/torch/ppo/ppo.py | 178 +++++---- rlberry/agents/torch/reinforce/reinforce.py | 70 ++-- .../torch/tests/test_actor_critic_algos.py | 129 +++---- rlberry/agents/torch/tests/test_dqn.py | 48 +-- rlberry/agents/torch/tests/test_reinforce.py | 36 +- .../agents/torch/tests/test_torch_models.py | 29 +- .../agents/torch/tests/test_torch_training.py | 31 +- .../agents/torch/utils/attention_models.py | 160 ++++---- rlberry/agents/torch/utils/models.py | 164 +++++---- rlberry/agents/torch/utils/training.py | 12 +- rlberry/agents/ucbvi/ucbvi.py | 72 ++-- rlberry/agents/ucbvi/utils.py | 18 +- rlberry/agents/utils/memories.py | 46 +-- rlberry/colab_utils/display_setup.py | 13 +- rlberry/envs/basewrapper.py | 11 +- .../benchmarks/ball_exploration/ball2d.py | 82 +++-- .../envs/benchmarks/ball_exploration/pball.py | 149 +++++--- .../benchmarks/generalization/twinrooms.py | 19 +- .../benchmarks/grid_exploration/apple_gold.py | 27 +- .../benchmarks/grid_exploration/four_room.py | 26 +- .../envs/benchmarks/grid_exploration/nroom.py | 89 +++-- .../benchmarks/grid_exploration/six_room.py | 21 +- .../envs/bullet3/pybullet_envs/__init__.py | 16 +- .../pybullet_envs/gym_pendulum_envs.py | 11 +- .../envs/bullet3/pybullet_envs/robot_bases.py | 118 ++++-- .../bullet3/pybullet_envs/robot_pendula.py | 15 +- rlberry/envs/classic_control/acrobot.py | 89 +++-- rlberry/envs/classic_control/mountain_car.py | 31 +- rlberry/envs/classic_control/pendulum.py | 44 ++- rlberry/envs/finite/finite_mdp.py | 14 +- rlberry/envs/finite/gridworld.py | 121 +++--- rlberry/envs/gym_make.py | 2 + rlberry/envs/interface/model.py | 17 +- rlberry/envs/tests/test_env_seeding.py | 6 +- rlberry/envs/tests/test_gym_env_seeding.py | 6 +- rlberry/envs/tests/test_instantiation.py | 99 ++--- rlberry/experiment/generator.py | 13 +- rlberry/experiment/load_results.py | 36 +- .../tests/old_test_experiment_generator.py | 25 +- rlberry/experiment/yaml_utils.py | 39 +- rlberry/exploration_tools/discrete_counter.py | 26 +- .../online_discretization_counter.py | 84 +++-- .../tests/test_discrete_counter.py | 29 +- rlberry/exploration_tools/torch/rnd.py | 144 +++++--- .../exploration_tools/torch/tests/test_rnd.py | 3 +- rlberry/exploration_tools/typing.py | 8 +- .../uncertainty_estimator.py | 7 +- rlberry/manager/agent_manager.py | 343 ++++++++++-------- rlberry/manager/evaluation.py | 66 ++-- rlberry/manager/multiple_managers.py | 8 +- rlberry/manager/remote_agent_manager.py | 62 ++-- rlberry/manager/tests/test_agent_manager.py | 103 ++++-- .../tests/test_agent_manager_seeding.py | 36 +- .../manager/tests/test_hyperparam_optim.py | 130 +++---- rlberry/manager/utils.py | 2 +- rlberry/metadata_utils.py | 7 +- rlberry/network/client.py | 11 +- rlberry/network/interface.py | 51 +-- rlberry/network/server.py | 76 ++-- rlberry/network/server_utils.py | 59 +-- rlberry/network/utils.py | 45 ++- rlberry/rendering/opengl_render2d.py | 30 +- rlberry/rendering/pygame_render2d.py | 22 +- rlberry/rendering/render_interface.py | 9 +- .../tests/test_rendering_interface.py | 10 +- rlberry/rendering/utils.py | 35 +- rlberry/seeding/tests/test_seeding.py | 8 +- rlberry/seeding/tests/test_threads.py | 4 +- rlberry/seeding/tests/test_threads_torch.py | 4 +- rlberry/spaces/box.py | 32 +- rlberry/spaces/from_gym.py | 37 +- rlberry/spaces/multi_binary.py | 3 +- rlberry/spaces/tests/test_from_gym.py | 97 ++--- rlberry/spaces/tests/test_spaces.py | 97 ++--- 
rlberry/utils/binsearch.py | 7 +- rlberry/utils/io.py | 12 +- rlberry/utils/jit_setup.py | 1 + rlberry/utils/logging.py | 32 +- rlberry/utils/math.py | 6 +- rlberry/utils/space_discretizer.py | 5 +- rlberry/utils/tests/test_binsearch.py | 10 +- rlberry/utils/tests/test_metrics.py | 8 +- rlberry/utils/torch.py | 44 ++- rlberry/utils/writers.py | 95 +++-- rlberry/wrappers/discretize_state.py | 29 +- rlberry/wrappers/gym_utils.py | 5 +- rlberry/wrappers/tests/test_basewrapper.py | 5 +- .../wrappers/tests/test_common_wrappers.py | 27 +- .../tests/test_gym_space_conversion.py | 94 ++--- .../wrappers/tests/test_wrapper_seeding.py | 13 +- .../wrappers/uncertainty_estimator_wrapper.py | 68 ++-- rlberry/wrappers/vis2d.py | 177 +++++---- setup.py | 68 ++-- 152 files changed, 4007 insertions(+), 3177 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 366d4ba5e..4893481f1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,52 +16,54 @@ import sphinx_gallery # noqa -sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../')) +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -project = 'rlberry' -copyright = '2021, rlberry team' -author = 'rlberry team' +project = "rlberry" +copyright = "2021, rlberry team" +author = "rlberry team" # The full version, including alpha/beta/rc tags -release = '0.1' +release = "0.1" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.doctest', - 'sphinx.ext.todo', - 'sphinx.ext.viewcode', - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.mathjax', - 'sphinx.ext.autosectionlabel', - 'sphinxcontrib.video', - "numpydoc", - "sphinx_gallery.gen_gallery", - 'myst_parser',] +extensions = [ + "sphinx.ext.doctest", + "sphinx.ext.todo", + "sphinx.ext.viewcode", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.mathjax", + "sphinx.ext.autosectionlabel", + "sphinxcontrib.video", + "numpydoc", + "sphinx_gallery.gen_gallery", + "myst_parser", +] autodoc_default_flags = ["members", "inherited-members"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['templates'] +templates_path = ["templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'themes'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "themes"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # The master toctree document. 
-master_doc = 'index' +master_doc = "index" # Copied from scikit-learn: # For maths, use mathjax by default and svg if NO_MATHJAX env variable is set @@ -81,7 +83,7 @@ # html_theme = "scikit-learn-fork" -html_theme_options = { "mathjax_path": mathjax_path} +html_theme_options = {"mathjax_path": mathjax_path} html_theme_path = ["themes"] @@ -98,6 +100,6 @@ "doc_module": "rlberry", "backreferences_dir": os.path.join("generated"), "reference_url": {"rlberry": None}, - 'matplotlib_animations':True, - 'remove_config_comments': True, + "matplotlib_animations": True, + "remove_config_comments": True, } diff --git a/examples/demo_agents/video_plot_a2c.py b/examples/demo_agents/video_plot_a2c.py index 50a158aec..35a8ba0a4 100644 --- a/examples/demo_agents/video_plot_a2c.py +++ b/examples/demo_agents/video_plot_a2c.py @@ -18,12 +18,7 @@ env = PBall2D() n_episodes = 400 horizon = 256 -agent = A2CAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - k_epochs=4) +agent = A2CAgent(env, horizon=horizon, gamma=0.99, learning_rate=0.001, k_epochs=4) agent.fit(budget=n_episodes) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_dqn.py b/examples/demo_agents/video_plot_dqn.py index 828eeff0c..340c53b10 100644 --- a/examples/demo_agents/video_plot_dqn.py +++ b/examples/demo_agents/video_plot_dqn.py @@ -39,7 +39,11 @@ print(f"Running DQN on {env}") agent.fit(budget=50) -vid = video_recorder.VideoRecorder(env,path="_video/video_plot_dqn.mp4", enabled=True,) +vid = video_recorder.VideoRecorder( + env, + path="_video/video_plot_dqn.mp4", + enabled=True, +) for episode in range(3): done = False diff --git a/examples/demo_agents/video_plot_ppo.py b/examples/demo_agents/video_plot_ppo.py index 0dc2444f3..9ace752be 100644 --- a/examples/demo_agents/video_plot_ppo.py +++ b/examples/demo_agents/video_plot_ppo.py @@ -20,12 +20,8 @@ horizon = 256 agent = PPOAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4) + env, horizon=horizon, gamma=0.99, learning_rate=0.001, eps_clip=0.2, k_epochs=4 +) agent.fit(budget=n_episodes) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_rs_kernel_ucbvi.py b/examples/demo_agents/video_plot_rs_kernel_ucbvi.py index 6afa1c965..0d30d5d1f 100644 --- a/examples/demo_agents/video_plot_rs_kernel_ucbvi.py +++ b/examples/demo_agents/video_plot_rs_kernel_ucbvi.py @@ -19,10 +19,16 @@ # rescake rewards to [0, 1] env = RescaleRewardWrapper(env, (0.0, 1.0)) -agent = RSKernelUCBVIAgent(env, gamma=0.99, horizon=300, - bonus_scale_factor=0.01, - min_dist=0.2, bandwidth=0.05, beta=1.0, - kernel_type="gaussian") +agent = RSKernelUCBVIAgent( + env, + gamma=0.99, + horizon=300, + bonus_scale_factor=0.01, + min_dist=0.2, + bandwidth=0.05, + beta=1.0, + kernel_type="gaussian", +) agent.fit(budget=500) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_rsucbvi.py b/examples/demo_agents/video_plot_rsucbvi.py index 8878f8c83..44dce662d 100644 --- a/examples/demo_agents/video_plot_rsucbvi.py +++ b/examples/demo_agents/video_plot_rsucbvi.py @@ -17,8 +17,7 @@ env = MountainCar() horizon = 170 print("Running RS-UCBVI on %s" % env.name) -agent = RSUCBVIAgent(env, gamma=0.99, horizon=horizon, - bonus_scale_factor=0.1) +agent = RSUCBVIAgent(env, gamma=0.99, horizon=horizon, bonus_scale_factor=0.1) agent.fit(budget=500) env.enable_rendering() diff --git a/examples/demo_agents/video_plot_vi.py b/examples/demo_agents/video_plot_vi.py index 2e7eb5ead..2065e9660 100644 --- 
a/examples/demo_agents/video_plot_vi.py +++ b/examples/demo_agents/video_plot_vi.py @@ -13,6 +13,7 @@ from rlberry.agents.dynprog import ValueIterationAgent from rlberry.envs.finite import Chain + env = Chain() agent = ValueIterationAgent(env, gamma=0.95) info = agent.fit() diff --git a/examples/demo_env/video_plot_acrobot.py b/examples/demo_env/video_plot_acrobot.py index d22c28b87..7a4129985 100644 --- a/examples/demo_env/video_plot_acrobot.py +++ b/examples/demo_env/video_plot_acrobot.py @@ -20,15 +20,13 @@ env = RescaleRewardWrapper(env, (0.0, 1.0)) n_episodes = 300 agent = RSUCBVIAgent( - env, - gamma=0.99, - horizon=300, - bonus_scale_factor=0.01, min_dist=0.25) + env, gamma=0.99, horizon=300, bonus_scale_factor=0.01, min_dist=0.25 +) agent.fit(budget=n_episodes) env.enable_rendering() state = env.reset() -for tt in range(2*agent.horizon): +for tt in range(2 * agent.horizon): action = agent.policy(state) next_state, reward, done, _ = env.step(action) state = next_state diff --git a/examples/demo_env/video_plot_gridworld.py b/examples/demo_env/video_plot_gridworld.py index c7d18e452..fe5e23c45 100644 --- a/examples/demo_env/video_plot_gridworld.py +++ b/examples/demo_env/video_plot_gridworld.py @@ -14,7 +14,6 @@ from rlberry.envs.finite import GridWorld - env = GridWorld(7, 10, walls=((2, 2), (3, 3))) agent = ValueIterationAgent(env, gamma=0.95) info = agent.fit() diff --git a/examples/demo_env/video_plot_pball.py b/examples/demo_env/video_plot_pball.py index cc0b85df2..af6c7c637 100644 --- a/examples/demo_env/video_plot_pball.py +++ b/examples/demo_env/video_plot_pball.py @@ -14,11 +14,7 @@ from rlberry.envs.benchmarks.ball_exploration import PBall2D p = 5 -A = np.array([ - [1.0, 0.1], - [-0.1, 1.0] -] -) +A = np.array([[1.0, 0.1], [-0.1, 1.0]]) reward_amplitudes = np.array([1.0, 0.5, 0.5]) reward_smoothness = np.array([0.25, 0.25, 0.25]) @@ -26,19 +22,24 @@ reward_centers = [ np.array([0.75 * np.cos(np.pi / 2), 0.75 * np.sin(np.pi / 2)]), np.array([0.75 * np.cos(np.pi / 6), 0.75 * np.sin(np.pi / 6)]), - np.array([0.75 * np.cos(5 * np.pi / 6), 0.75 * np.sin(5 * np.pi / 6)]) + np.array([0.75 * np.cos(5 * np.pi / 6), 0.75 * np.sin(5 * np.pi / 6)]), ] -action_list = [0.1 * np.array([1, 0]), - -0.1 * np.array([1, 0]), - 0.1 * np.array([0, 1]), - -0.1 * np.array([0, 1])] +action_list = [ + 0.1 * np.array([1, 0]), + -0.1 * np.array([1, 0]), + 0.1 * np.array([0, 1]), + -0.1 * np.array([0, 1]), +] -env = PBall2D(p=p, A=A, - reward_amplitudes=reward_amplitudes, - reward_centers=reward_centers, - reward_smoothness=reward_smoothness, - action_list=action_list) +env = PBall2D( + p=p, + A=A, + reward_amplitudes=reward_amplitudes, + reward_centers=reward_centers, + reward_smoothness=reward_smoothness, + action_list=action_list, +) env.enable_rendering() diff --git a/examples/demo_env/video_plot_rooms.py b/examples/demo_env/video_plot_rooms.py index 39e43d100..163ede1c6 100644 --- a/examples/demo_env/video_plot_rooms.py +++ b/examples/demo_env/video_plot_rooms.py @@ -13,11 +13,13 @@ from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom from rlberry.agents.dynprog import ValueIterationAgent -env = NRoom(nrooms=9, - remove_walls=False, - room_size=9, - initial_state_distribution='center', - include_traps=True) +env = NRoom( + nrooms=9, + remove_walls=False, + room_size=9, + initial_state_distribution="center", + include_traps=True, +) horizon = env.observation_space.n agent = ValueIterationAgent(env, gamma=0.999, horizon=horizon) diff --git a/examples/demo_examples/demo_adaptiveql.py 
b/examples/demo_examples/demo_adaptiveql.py index a314ff042..3493af262 100644 --- a/examples/demo_examples/demo_adaptiveql.py +++ b/examples/demo_examples/demo_adaptiveql.py @@ -7,29 +7,30 @@ from rlberry.agents import AdaptiveQLAgent from rlberry.agents import RSUCBVIAgent from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.manager import MultipleManagers, AgentManager, plot_writer_data, evaluate_agents +from rlberry.manager import ( + MultipleManagers, + AgentManager, + plot_writer_data, + evaluate_agents, +) import matplotlib.pyplot as plt -if __name__ == '__main__': +if __name__ == "__main__": env = (get_benchmark_env, dict(level=2)) N_EP = 1000 HORIZON = 30 params = {} - params['adaql'] = { - 'horizon': HORIZON, - 'gamma': 1.0, - 'bonus_scale_factor': 1.0 - } + params["adaql"] = {"horizon": HORIZON, "gamma": 1.0, "bonus_scale_factor": 1.0} - params['rsucbvi'] = { - 'horizon': HORIZON, - 'gamma': 1.0, - 'bonus_scale_factor': 1.0, - 'min_dist': 0.05, - 'max_repr': 800 + params["rsucbvi"] = { + "horizon": HORIZON, + "gamma": 1.0, + "bonus_scale_factor": 1.0, + "min_dist": 0.05, + "max_repr": 800, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -40,26 +41,33 @@ AdaptiveQLAgent, env, fit_budget=N_EP, - init_kwargs=params['adaql'], + init_kwargs=params["adaql"], eval_kwargs=eval_kwargs, n_fit=4, - output_dir='dev/examples/') + output_dir="dev/examples/", + ) ) multimanagers.append( AgentManager( RSUCBVIAgent, env, fit_budget=N_EP, - init_kwargs=params['rsucbvi'], n_fit=2, - output_dir='dev/examples/') + init_kwargs=params["rsucbvi"], + n_fit=2, + output_dir="dev/examples/", + ) ) multimanagers.run(save=False) evaluate_agents(multimanagers.managers) - plot_writer_data(multimanagers.managers, tag='episode_rewards', - preprocess_func=np.cumsum, title='Cumulative Rewards') + plot_writer_data( + multimanagers.managers, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + ) for stats in multimanagers.managers: agent = stats.get_agent_instances()[0] @@ -70,4 +78,4 @@ plt.show() for stats in multimanagers.managers: - print(f'Agent = {stats.agent_name}, Eval = {stats.eval_agents()}') + print(f"Agent = {stats.agent_name}, Eval = {stats.eval_agents()}") diff --git a/examples/demo_examples/demo_agent_manager.py b/examples/demo_examples/demo_agent_manager.py index 74fce9ea8..7d3ee9273 100644 --- a/examples/demo_examples/demo_agent_manager.py +++ b/examples/demo_examples/demo_agent_manager.py @@ -11,7 +11,7 @@ from rlberry.seeding import set_external_seed -if __name__ == '__main__': +if __name__ == "__main__": set_external_seed(123) # -------------------------------- @@ -46,9 +46,7 @@ "kernel_type": "gaussian", } - params_a2c = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params_a2c = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -67,7 +65,8 @@ default_writer_kwargs=dict( maxlen=N_EPISODES - 10, log_interval=5.0, - )) + ), + ) rskernel_stats = AgentManager( RSKernelUCBVIAgent, train_env, @@ -76,7 +75,8 @@ eval_kwargs=eval_kwargs, n_fit=4, seed=123, - enable_tensorboard=True) + enable_tensorboard=True, + ) a2c_stats = AgentManager( A2CAgent, train_env, @@ -85,7 +85,8 @@ eval_kwargs=eval_kwargs, n_fit=4, seed=123, - parallelization='process') + parallelization="process", + ) agent_manager_list = [rsucbvi_stats, rskernel_stats, a2c_stats] @@ -96,16 +97,17 @@ rsucbvi_stats.fit(budget=50) # learning curves - 
plot_writer_data(agent_manager_list, - tag='episode_rewards', - preprocess_func=np.cumsum, - title='cumulative rewards', - show=False) - - plot_writer_data(agent_manager_list, - tag='episode_rewards', - title='episode rewards', - show=False) + plot_writer_data( + agent_manager_list, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="cumulative rewards", + show=False, + ) + + plot_writer_data( + agent_manager_list, tag="episode_rewards", title="episode rewards", show=False + ) # compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_agent_manager_save.py b/examples/demo_examples/demo_agent_manager_save.py index c9eb49155..76439d43d 100644 --- a/examples/demo_examples/demo_agent_manager_save.py +++ b/examples/demo_examples/demo_agent_manager_save.py @@ -9,7 +9,7 @@ from rlberry.manager import AgentManager, plot_writer_data, evaluate_agents -if __name__ == '__main__': +if __name__ == "__main__": # -------------------------------- # Define train and evaluation envs # -------------------------------- @@ -25,9 +25,7 @@ BONUS_SCALE_FACTOR = 0.1 MIN_DIST = 0.1 - params_ppo = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -41,8 +39,9 @@ init_kwargs=params_ppo, eval_kwargs=eval_kwargs, n_fit=4, - output_dir='dev/', - parallelization='process') + output_dir="dev/", + parallelization="process", + ) ppo_stats.fit() # fit the 4 agents ppo_stats_fname = ppo_stats.save() del ppo_stats @@ -53,9 +52,13 @@ ppo_stats = AgentManager.load(ppo_stats_fname) # learning curves - plot_writer_data(ppo_stats, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) + plot_writer_data( + ppo_stats, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, + ) # compare final policies output = evaluate_agents([ppo_stats], n_simulations=15) diff --git a/examples/demo_examples/demo_agent_manager_set_writer.py b/examples/demo_examples/demo_agent_manager_set_writer.py index 017972976..8fea389b7 100644 --- a/examples/demo_examples/demo_agent_manager_set_writer.py +++ b/examples/demo_examples/demo_agent_manager_set_writer.py @@ -8,7 +8,7 @@ from rlberry.manager import AgentManager, evaluate_agents from torch.utils.tensorboard import SummaryWriter -if __name__ == '__main__': +if __name__ == "__main__": # -------------------------------- # Define training env # -------------------------------- @@ -21,9 +21,7 @@ GAMMA = 0.99 HORIZON = 50 - params_ppo = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -36,15 +34,18 @@ fit_budget=N_EPISODES, init_kwargs=params_ppo, eval_kwargs=eval_kwargs, - n_fit=4) + n_fit=4, + ) - ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={'comment': 'worker_0'}) - ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={'comment': 'worker_1'}) + ppo_stats.set_writer(0, SummaryWriter, writer_kwargs={"comment": "worker_0"}) + ppo_stats.set_writer(1, SummaryWriter, writer_kwargs={"comment": "worker_1"}) agent_manager_list = [ppo_stats] agent_manager_list[0].fit() - agent_manager_list[0].save() # after fit, writers are set to None to avoid pickle problems. + agent_manager_list[ + 0 + ].save() # after fit, writers are set to None to avoid pickle problems. 
# compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_avecppo.py b/examples/demo_examples/demo_avecppo.py index 15b4afca6..1a745b0fd 100644 --- a/examples/demo_examples/demo_avecppo.py +++ b/examples/demo_examples/demo_avecppo.py @@ -11,12 +11,8 @@ n_episodes = 400 horizon = 256 agent = AVECPPOAgent( - env, - horizon=horizon, - gamma=0.99, - learning_rate=0.00025, - eps_clip=0.2, - k_epochs=4) + env, horizon=horizon, gamma=0.99, learning_rate=0.00025, eps_clip=0.2, k_epochs=4 +) agent.fit(budget=n_episodes) env.enable_rendering() diff --git a/examples/demo_examples/demo_experiment/run.py b/examples/demo_examples/demo_experiment/run.py index 5f2d38979..38457401f 100644 --- a/examples/demo_examples/demo_experiment/run.py +++ b/examples/demo_examples/demo_experiment/run.py @@ -18,7 +18,7 @@ from rlberry.manager.multiple_managers import MultipleManagers -if __name__ == '__main__': +if __name__ == "__main__": multimanagers = MultipleManagers() for agent_manager in experiment_generator(): @@ -30,10 +30,10 @@ # Reading the results del multimanagers - data = load_experiment_results('results', 'params_experiment') + data = load_experiment_results("results", "params_experiment") print(data) # Fit one of the managers for a few more episodes # If tensorboard is enabled, you should see more episodes ran for 'rsucbvi_alternative' - data['manager']['rsucbvi_alternative'].fit(50) + data["manager"]["rsucbvi_alternative"].fit(50) diff --git a/examples/demo_examples/demo_from_stable_baselines.py b/examples/demo_examples/demo_from_stable_baselines.py index 870371335..bdddde3b0 100644 --- a/examples/demo_examples/demo_from_stable_baselines.py +++ b/examples/demo_examples/demo_from_stable_baselines.py @@ -9,31 +9,33 @@ class A2CAgent(AgentWithSimplePolicy): - name = 'A2C' - - def __init__(self, - env, - policy, - learning_rate=7e-4, - n_steps: int = 200, - gamma: float = 0.99, - gae_lambda: float = 1.0, - ent_coef: float = 0.0, - vf_coef: float = 0.5, - max_grad_norm: float = 0.5, - rms_prop_eps: float = 1e-5, - use_rms_prop: bool = True, - use_sde: bool = False, - sde_sample_freq: int = -1, - normalize_advantage: bool = False, - tensorboard_log=None, - create_eval_env=False, - policy_kwargs=None, - verbose: int = 0, - seed=None, - device="auto", - _init_setup_model: bool = True, - **kwargs): + name = "A2C" + + def __init__( + self, + env, + policy, + learning_rate=7e-4, + n_steps: int = 200, + gamma: float = 0.99, + gae_lambda: float = 1.0, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + rms_prop_eps: float = 1e-5, + use_rms_prop: bool = True, + use_sde: bool = False, + sde_sample_freq: int = -1, + normalize_advantage: bool = False, + tensorboard_log=None, + create_eval_env=False, + policy_kwargs=None, + verbose: int = 0, + seed=None, + device="auto", + _init_setup_model: bool = True, + **kwargs + ): # init rlberry base class AgentWithSimplePolicy.__init__(self, env, **kwargs) # rlberry accepts tuples (env_constructor, env_kwargs) as env @@ -65,7 +67,8 @@ def __init__(self, verbose, seed, device, - _init_setup_model) + _init_setup_model, + ) def fit(self, budget, **kwargs): self.wrapped.learn(total_timesteps=budget, **kwargs) @@ -79,10 +82,12 @@ def policy(self, observation): # @classmethod def sample_parameters(cls, trial): - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 
0.1) vf_coef = trial.suggest_uniform("vf_coef", 0, 1) - normalize_advantage = trial.suggest_categorical("normalize_advantage", [False, True]) + normalize_advantage = trial.suggest_categorical( + "normalize_advantage", [False, True] + ) return dict( learning_rate=learning_rate, ent_coef=ent_coef, @@ -91,12 +96,12 @@ def sample_parameters(cls, trial): ) -if __name__ == '__main__': +if __name__ == "__main__": # # Training one agent # env_ctor = gym_make - env_kwargs = dict(id='CartPole-v1') + env_kwargs = dict(id="CartPole-v1") # env = env_ctor(**env_kwargs) # agent = A2CAgent(env, 'MlpPolicy', verbose=1) # agent.fit(budget=1000) @@ -109,36 +114,39 @@ def sample_parameters(cls, trial): stats = AgentManager( A2CAgent, (env_ctor, env_kwargs), - agent_name='A2C baseline', - init_kwargs=dict(policy='MlpPolicy', verbose=1), + agent_name="A2C baseline", + init_kwargs=dict(policy="MlpPolicy", verbose=1), fit_kwargs=dict(log_interval=1000), fit_budget=2500, eval_kwargs=dict(eval_horizon=400), n_fit=4, - parallelization='process', - output_dir='dev/stable_baselines', - seed=123) + parallelization="process", + output_dir="dev/stable_baselines", + seed=123, + ) stats_alternative = AgentManager( A2CAgent, (env_ctor, env_kwargs), - agent_name='A2C optimized', - init_kwargs=dict(policy='MlpPolicy', verbose=1), + agent_name="A2C optimized", + init_kwargs=dict(policy="MlpPolicy", verbose=1), fit_kwargs=dict(log_interval=1000), fit_budget=2500, eval_kwargs=dict(eval_horizon=400), n_fit=4, - parallelization='process', - output_dir='dev/stable_baselines', - seed=456) + parallelization="process", + output_dir="dev/stable_baselines", + seed=456, + ) # Optimize hyperparams (600 seconds) stats_alternative.optimize_hyperparams( timeout=600, n_optuna_workers=2, n_fit=2, - optuna_parallelization='process', - fit_fraction=1.0) + optuna_parallelization="process", + fit_fraction=1.0, + ) # Fit everything in parallel multimanagers = MultipleManagers() diff --git a/examples/demo_examples/demo_from_stable_baselines_atari.py b/examples/demo_examples/demo_from_stable_baselines_atari.py index a4095bbd1..946ff0351 100644 --- a/examples/demo_examples/demo_from_stable_baselines_atari.py +++ b/examples/demo_examples/demo_from_stable_baselines_atari.py @@ -13,31 +13,33 @@ class A2CAgent(AgentWithSimplePolicy): - name = 'A2C' - - def __init__(self, - env, - policy, - learning_rate=7e-4, - n_steps: int = 5, - gamma: float = 0.99, - gae_lambda: float = 1.0, - ent_coef: float = 0.0, - vf_coef: float = 0.5, - max_grad_norm: float = 0.5, - rms_prop_eps: float = 1e-5, - use_rms_prop: bool = True, - use_sde: bool = False, - sde_sample_freq: int = -1, - normalize_advantage: bool = False, - tensorboard_log=None, - create_eval_env=False, - policy_kwargs=None, - verbose: int = 0, - seed=None, - device="auto", - _init_setup_model: bool = True, - **kwargs): + name = "A2C" + + def __init__( + self, + env, + policy, + learning_rate=7e-4, + n_steps: int = 5, + gamma: float = 0.99, + gae_lambda: float = 1.0, + ent_coef: float = 0.0, + vf_coef: float = 0.5, + max_grad_norm: float = 0.5, + rms_prop_eps: float = 1e-5, + use_rms_prop: bool = True, + use_sde: bool = False, + sde_sample_freq: int = -1, + normalize_advantage: bool = False, + tensorboard_log=None, + create_eval_env=False, + policy_kwargs=None, + verbose: int = 0, + seed=None, + device="auto", + _init_setup_model: bool = True, + **kwargs + ): # init rlberry base class AgentWithSimplePolicy.__init__(self, env, **kwargs) # rlberry accepts tuples (env_constructor, env_kwargs) as env @@ -69,7 
+71,8 @@ def __init__(self, verbose, seed, device, - _init_setup_model) + _init_setup_model, + ) def fit(self, budget): self.wrapped.learn(total_timesteps=budget) @@ -83,7 +86,7 @@ def policy(self, observation): # def save(self, filename): self.wrapped.save(filename) - return Path(filename).with_suffix('.zip') + return Path(filename).with_suffix(".zip") @classmethod def load(cls, filename, **kwargs): @@ -96,16 +99,16 @@ def load(cls, filename, **kwargs): # @classmethod def sample_parameters(cls, trial): - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - return {'learning_rate': learning_rate} + return {"learning_rate": learning_rate} # # Train and eval env constructors # def env_constructor(n_envs=4): - env = make_atari_env('MontezumaRevenge-v0', n_envs=n_envs) + env = make_atari_env("MontezumaRevenge-v0", n_envs=n_envs) env = VecFrameStack(env, n_stack=4) return env @@ -114,7 +117,7 @@ def eval_env_constructor(n_envs=1): """ Evaluation should be in a scalar environment. """ - env = make_atari_env('MontezumaRevenge-v0', n_envs=n_envs) + env = make_atari_env("MontezumaRevenge-v0", n_envs=n_envs) env = VecFrameStack(env, n_stack=4) env = ScalarizeEnvWrapper(env) return env @@ -125,7 +128,7 @@ def eval_env_constructor(n_envs=1): # -if __name__ == '__main__': +if __name__ == "__main__": # # Training several agents and comparing different hyperparams # @@ -135,13 +138,14 @@ def eval_env_constructor(n_envs=1): train_env=(env_constructor, None), eval_env=(eval_env_constructor, None), eval_kwargs=dict(eval_horizon=200), - agent_name='A2C baseline', + agent_name="A2C baseline", fit_budget=5000, - init_kwargs=dict(policy='CnnPolicy', verbose=10), + init_kwargs=dict(policy="CnnPolicy", verbose=10), n_fit=4, - parallelization='process', - output_dir='dev/stable_baselines_atari', - seed=123) + parallelization="process", + output_dir="dev/stable_baselines_atari", + seed=123, + ) stats.fit() stats.optimize_hyperparams(timeout=60, n_fit=2) diff --git a/examples/demo_examples/demo_gym_wrapper.py b/examples/demo_examples/demo_gym_wrapper.py index 8726e47e9..8dd4143ba 100644 --- a/examples/demo_examples/demo_gym_wrapper.py +++ b/examples/demo_examples/demo_gym_wrapper.py @@ -7,14 +7,13 @@ from rlberry.agents import RSUCBVIAgent from rlberry.wrappers import RescaleRewardWrapper -env = gym_make('Acrobot-v1') +env = gym_make("Acrobot-v1") env.reward_range = (-1.0, 0.0) # missing in gym implementation # rescake rewards to [0, 1] env = RescaleRewardWrapper(env, (0.0, 1.0)) -agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, - bonus_scale_factor=0.1, min_dist=0.2) +agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, bonus_scale_factor=0.1, min_dist=0.2) agent.fit(budget=10) state = env.reset() diff --git a/examples/demo_examples/demo_hyperparam_optim.py b/examples/demo_examples/demo_hyperparam_optim.py index 6f2c3d681..f00b2939c 100644 --- a/examples/demo_examples/demo_hyperparam_optim.py +++ b/examples/demo_examples/demo_hyperparam_optim.py @@ -7,7 +7,7 @@ from rlberry.agents.torch import REINFORCEAgent from rlberry.manager import AgentManager -if __name__ == '__main__': +if __name__ == "__main__": # -------------------------------- # Define train and evaluation envs # -------------------------------- @@ -22,9 +22,7 @@ BONUS_SCALE_FACTOR = 0.1 MIN_DIST = 0.1 - params = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} + params = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = 
dict(eval_horizon=HORIZON, n_simulations=20) @@ -32,17 +30,22 @@ # Run AgentManager and save results # -------------------------------- manager = AgentManager( - REINFORCEAgent, train_env, fit_budget=N_EPISODES, + REINFORCEAgent, + train_env, + fit_budget=N_EPISODES, init_kwargs=params, eval_kwargs=eval_kwargs, - n_fit=4) + n_fit=4, + ) # hyperparam optim with multiple threads manager.optimize_hyperparams( - n_trials=5, timeout=None, + n_trials=5, + timeout=None, n_fit=2, - sampler_method='optuna_default', - optuna_parallelization='thread') + sampler_method="optuna_default", + optuna_parallelization="thread", + ) initial_n_trials = len(manager.optuna_study.trials) @@ -55,11 +58,13 @@ # continue previous optimization, now with 120s of timeout and multiprocessing manager.optimize_hyperparams( - n_trials=512, timeout=120, + n_trials=512, + timeout=120, n_fit=8, continue_previous=True, - optuna_parallelization='process', - n_optuna_workers=4) + optuna_parallelization="process", + n_optuna_workers=4, + ) print("number of initial trials = ", initial_n_trials) print("number of trials after continuing= ", len(manager.optuna_study.trials)) diff --git a/examples/demo_examples/demo_jax_dqn.py b/examples/demo_examples/demo_jax_dqn.py index 7afc33598..76a001650 100644 --- a/examples/demo_examples/demo_jax_dqn.py +++ b/examples/demo_examples/demo_jax_dqn.py @@ -8,13 +8,13 @@ from rlberry.envs import gym_make from rlberry.manager import AgentManager, MultipleManagers, plot_writer_data -if __name__ == '__main__': +if __name__ == "__main__": # global params fit_budget = 10000 n_fit = 2 # env and algorithm params - env = (gym_make, dict(id='CartPole-v0')) + env = (gym_make, dict(id="CartPole-v0")) params = dict( chunk_size=8, batch_size=64, @@ -25,17 +25,15 @@ learning_rate=0.0015, net_constructor=nets.MLPQNetwork, net_kwargs=dict( - num_actions=env[0](**env[1]).action_space.n, - hidden_sizes=(64, 64) - ) + num_actions=env[0](**env[1]).action_space.n, hidden_sizes=(64, 64) + ), ) params_alternative = params.copy() params_alternative.update( dict( net_kwargs=dict( - num_actions=env[0](**env[1]).action_space.n, - hidden_sizes=(16, 16) + num_actions=env[0](**env[1]).action_space.n, hidden_sizes=(16, 16) ) ) ) @@ -47,8 +45,8 @@ eval_env=env, init_kwargs=params, n_fit=n_fit, - parallelization='process', - agent_name='dqn', + parallelization="process", + agent_name="dqn", ) stats_alternative = AgentManager( @@ -58,8 +56,8 @@ eval_env=env, init_kwargs=params_alternative, n_fit=n_fit, - parallelization='process', - agent_name='dqn_smaller_net' + parallelization="process", + agent_name="dqn_smaller_net", ) # fit everything in parallel @@ -68,10 +66,10 @@ multimanagers.append(stats_alternative) multimanagers.run() - plot_writer_data(multimanagers.managers, tag='episode_rewards', show=False) - plot_writer_data(multimanagers.managers, tag='dw_time_elapsed', show=False) - plot_writer_data(multimanagers.managers, tag='eval_rewards', show=False) - plot_writer_data(multimanagers.managers, tag='q_loss') + plot_writer_data(multimanagers.managers, tag="episode_rewards", show=False) + plot_writer_data(multimanagers.managers, tag="dw_time_elapsed", show=False) + plot_writer_data(multimanagers.managers, tag="eval_rewards", show=False) + plot_writer_data(multimanagers.managers, tag="q_loss") stats.save() stats.clear_output_dir() diff --git a/examples/demo_examples/demo_lsvi_ucb.py b/examples/demo_examples/demo_lsvi_ucb.py index 43289df0c..5d988c659 100644 --- a/examples/demo_examples/demo_lsvi_ucb.py +++ 
b/examples/demo_examples/demo_lsvi_ucb.py @@ -16,7 +16,9 @@ class GridWorldFeatureMap(FeatureMap): - def __init__(self, n_states, n_actions, n_rows, n_cols, index2coord, dim=15, sigma=0.25): + def __init__( + self, n_states, n_actions, n_rows, n_cols, index2coord, dim=15, sigma=0.25 + ): self.index2coord = index2coord self.n_states = n_states self.n_actions = n_actions @@ -35,7 +37,7 @@ def __init__(self, n_states, n_actions, n_rows, n_cols, index2coord, dim=15, sig x_jj = row_jj / n_rows y_jj = col_jj / n_cols dist = np.sqrt((x_jj - x_ii) ** 2.0 + (y_jj - y_ii) ** 2.0) - sim_matrix[ii, jj] = np.exp(-(dist / sigma) ** 2.0) + sim_matrix[ii, jj] = np.exp(-((dist / sigma) ** 2.0)) # factorize similarity matrix to obtain features uu, ss, vh = np.linalg.svd(sim_matrix, hermitian=True) @@ -54,16 +56,17 @@ def feature_map_fn(env): env.action_space.n, env.nrows, env.ncols, - env.index2coord) + env.index2coord, + ) -if __name__ == '__main__': +if __name__ == "__main__": # Parameters n_episodes = 750 horizon = 10 gamma = 0.99 eval_kwargs = dict(eval_horizon=10) - parallelization = 'process' + parallelization = "process" # Define environment (constructor, kwargs) env = (GridWorld, dict(nrows=5, ncols=5, walls=(), success_probability=0.95)) @@ -72,7 +75,7 @@ def feature_map_fn(env): feature_map_fn=feature_map_fn, horizon=horizon, bonus_scale_factor=0.01, - gamma=gamma + gamma=gamma, ) params_ucbvi = dict( @@ -80,20 +83,17 @@ def feature_map_fn(env): gamma=gamma, real_time_dp=False, stage_dependent=False, - bonus_scale_factor=0.01 + bonus_scale_factor=0.01, ) params_greedy = dict( feature_map_fn=feature_map_fn, horizon=horizon, bonus_scale_factor=0.0, - gamma=gamma + gamma=gamma, ) - params_oracle = dict( - horizon=horizon, - gamma=gamma - ) + params_oracle = dict(horizon=horizon, gamma=gamma) stats = AgentManager( LSVIUCBAgent, @@ -102,7 +102,8 @@ def feature_map_fn(env): fit_budget=n_episodes, eval_kwargs=eval_kwargs, n_fit=4, - parallelization=parallelization) + parallelization=parallelization, + ) # UCBVI baseline stats_ucbvi = AgentManager( @@ -112,7 +113,8 @@ def feature_map_fn(env): fit_budget=n_episodes, eval_kwargs=eval_kwargs, n_fit=4, - parallelization=parallelization) + parallelization=parallelization, + ) # Random exploration baseline stats_random = AgentManager( @@ -122,8 +124,9 @@ def feature_map_fn(env): fit_budget=n_episodes, eval_kwargs=eval_kwargs, n_fit=4, - agent_name='LSVI (random exploration)', - parallelization=parallelization) + agent_name="LSVI (random exploration)", + parallelization=parallelization, + ) # Oracle (optimal policy) oracle_stats = AgentManager( @@ -132,7 +135,8 @@ def feature_map_fn(env): init_kwargs=params_oracle, fit_budget=n_episodes, eval_kwargs=eval_kwargs, - n_fit=1) + n_fit=1, + ) # fit stats.fit() @@ -143,12 +147,12 @@ def feature_map_fn(env): # visualize results plot_writer_data( [stats, stats_ucbvi, stats_random], - tag='episode_rewards', + tag="episode_rewards", preprocess_func=np.cumsum, - title='Cumulative Rewards', - show=False) + title="Cumulative Rewards", + show=False, + ) plot_writer_data( - [stats, stats_ucbvi, stats_random], - tag='dw_time_elapsed', - show=False) + [stats, stats_ucbvi, stats_random], tag="dw_time_elapsed", show=False + ) evaluate_agents([stats, stats_ucbvi, stats_random, oracle_stats], n_simulations=20) diff --git a/examples/demo_examples/demo_network/run_client.py b/examples/demo_examples/demo_network/run_client.py index 5aa5f391e..88bb6f3e6 100644 --- a/examples/demo_examples/demo_network/run_client.py +++ 
b/examples/demo_examples/demo_network/run_client.py @@ -15,30 +15,29 @@ Message.create( command=interface.Command.AGENT_MANAGER_CREATE_INSTANCE, params=dict( - agent_class=ResourceRequest(name='ValueIterationAgent'), - train_env=ResourceRequest(name='GridWorld', kwargs=dict(nrows=35)), + agent_class=ResourceRequest(name="ValueIterationAgent"), + train_env=ResourceRequest(name="GridWorld", kwargs=dict(nrows=35)), fit_budget=100, init_kwargs=dict(gamma=0.95), eval_kwargs=dict(eval_horizon=100, n_simulations=20), n_fit=2, - seed=10 + seed=10, ), data=None, ), Message.create( - command=interface.Command.LIST_RESOURCES, - params=dict(), - data=dict() + command=interface.Command.LIST_RESOURCES, params=dict(), data=dict() ), print_response=True, ) import numpy as np + client.send( Message.create( command=interface.Command.NONE, params=dict(), - data=dict(big_list=list(1.0 * np.arange(2**8))) + data=dict(big_list=list(1.0 * np.arange(2 ** 8))), ), print_response=True, ) diff --git a/examples/demo_examples/demo_network/run_remote_manager.py b/examples/demo_examples/demo_network/run_remote_manager.py index 427712b60..c4125b176 100644 --- a/examples/demo_examples/demo_network/run_remote_manager.py +++ b/examples/demo_examples/demo_network/run_remote_manager.py @@ -15,7 +15,7 @@ from rlberry.manager.evaluation import evaluate_agents, plot_writer_data -if __name__ == '__main__': +if __name__ == "__main__": port = int(input("Select server port: ")) client = BerryClient(port=port) @@ -23,39 +23,41 @@ local_manager = AgentManager( agent_class=REINFORCEAgent, - train_env=(gym_make, dict(id='CartPole-v0')), + train_env=(gym_make, dict(id="CartPole-v0")), fit_budget=FIT_BUDGET, init_kwargs=dict(gamma=0.99), eval_kwargs=dict(eval_horizon=200, n_simulations=20), n_fit=2, seed=10, - agent_name='REINFORCE(local)', - parallelization='process' + agent_name="REINFORCE(local)", + parallelization="process", ) remote_manager = RemoteAgentManager( client, - agent_class=ResourceRequest(name='REINFORCEAgent'), - train_env=ResourceRequest(name='gym_make', kwargs=dict(id='CartPole-v0')), + agent_class=ResourceRequest(name="REINFORCEAgent"), + train_env=ResourceRequest(name="gym_make", kwargs=dict(id="CartPole-v0")), fit_budget=FIT_BUDGET, init_kwargs=dict(gamma=0.99), eval_kwargs=dict(eval_horizon=200, n_simulations=20), n_fit=3, seed=10, - agent_name='REINFORCE(remote)', - parallelization='process', + agent_name="REINFORCE(remote)", + parallelization="process", enable_tensorboard=True, ) remote_manager.set_writer( idx=0, - writer_fn=ResourceRequest(name='DefaultWriter'), - writer_kwargs=dict(name='debug_reinforce_writer') + writer_fn=ResourceRequest(name="DefaultWriter"), + writer_kwargs=dict(name="debug_reinforce_writer"), ) # Optimize hyperparams of remote agent - best_params = remote_manager.optimize_hyperparams(timeout=60, optuna_parallelization='process') - print(f'best params = {best_params}') + best_params = remote_manager.optimize_hyperparams( + timeout=60, optuna_parallelization="process" + ) + print(f"best params = {best_params}") # Test save/load fname1 = remote_manager.save() @@ -72,7 +74,7 @@ remote_manager.fit(budget=100) # plot - plot_writer_data(mmanagers.managers, tag='episode_rewards', show=False) + plot_writer_data(mmanagers.managers, tag="episode_rewards", show=False) evaluate_agents(mmanagers.managers, n_simulations=10, show=True) # Test some methods diff --git a/examples/demo_examples/demo_network/run_server.py b/examples/demo_examples/demo_network/run_server.py index 368d4f127..2426eb5b0 100644 --- 
a/examples/demo_examples/demo_network/run_server.py +++ b/examples/demo_examples/demo_network/run_server.py @@ -10,33 +10,20 @@ from rlberry.envs import GridWorld, gym_make from rlberry.utils.writers import DefaultWriter -if __name__ == '__main__': +if __name__ == "__main__": port = int(input("Select server port: ")) resources = dict( - GridWorld=ResourceItem( - obj=GridWorld, - description='GridWorld constructor' - ), - gym_make=ResourceItem( - obj=gym_make, - description='gym_make' - ), - REINFORCEAgent=ResourceItem( - obj=REINFORCEAgent, - description='REINFORCEAgent' - ), - A2CAgent=ResourceItem( - obj=A2CAgent, - description='A2CAgent' - ), + GridWorld=ResourceItem(obj=GridWorld, description="GridWorld constructor"), + gym_make=ResourceItem(obj=gym_make, description="gym_make"), + REINFORCEAgent=ResourceItem(obj=REINFORCEAgent, description="REINFORCEAgent"), + A2CAgent=ResourceItem(obj=A2CAgent, description="A2CAgent"), ValueIterationAgent=ResourceItem( obj=ValueIterationAgent, - description='ValueIterationAgent constructor' + ValueIterationAgent.__doc__ + description="ValueIterationAgent constructor" + ValueIterationAgent.__doc__, ), DefaultWriter=ResourceItem( - obj=DefaultWriter, - description='rlberry default writer' - ) + obj=DefaultWriter, description="rlberry default writer" + ), ) server = BerryServer(resources=resources, port=port, client_socket_timeout=120.0) server.start() diff --git a/examples/demo_examples/demo_ppo_benchmark.py b/examples/demo_examples/demo_ppo_benchmark.py index 1765c9cb6..d16d1f750 100644 --- a/examples/demo_examples/demo_ppo_benchmark.py +++ b/examples/demo_examples/demo_ppo_benchmark.py @@ -30,12 +30,10 @@ params_oracle = { "n_samples": 20, # samples per state-action "gamma": GAMMA, - "horizon": HORIZON + "horizon": HORIZON, } -params_ppo = {"gamma": GAMMA, - "horizon": HORIZON, - "learning_rate": 0.0003} +params_ppo = {"gamma": GAMMA, "horizon": HORIZON, "learning_rate": 0.0003} eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -43,15 +41,23 @@ # Run AgentManager # ----------------------------- oracle_stats = AgentManager( - MBQVIAgent, d_train_env, fit_budget=0, + MBQVIAgent, + d_train_env, + fit_budget=0, init_kwargs=params_oracle, eval_kwargs=eval_kwargs, - n_fit=4, agent_name="Oracle") + n_fit=4, + agent_name="Oracle", +) ppo_stats = AgentManager( - PPOAgent, train_env, fit_budget=N_EPISODES, + PPOAgent, + train_env, + fit_budget=N_EPISODES, init_kwargs=params_ppo, eval_kwargs=eval_kwargs, - n_fit=4, agent_name="PPO") + n_fit=4, + agent_name="PPO", +) agent_manager_list = [oracle_stats, ppo_stats] @@ -59,9 +65,13 @@ manager.fit() # learning curves -plot_writer_data(agent_manager_list, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + agent_manager_list, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, +) # compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_ppo_bonus.py b/examples/demo_examples/demo_ppo_bonus.py index 264e71d33..3e16304fb 100644 --- a/examples/demo_examples/demo_ppo_bonus.py +++ b/examples/demo_examples/demo_ppo_bonus.py @@ -16,9 +16,7 @@ def uncertainty_estimator_fn(obs_space, act_space): - counter = DiscreteCounter(obs_space, - act_space, - n_bins_obs=20) + counter = DiscreteCounter(obs_space, act_space, n_bins_obs=20) return counter @@ -32,26 +30,27 @@ def uncertainty_estimator_fn(obs_space, act_space): MIN_DIST = 0.1 params_ppo = { - 'gamma': 
GAMMA, - 'horizon': HORIZON, - 'batch_size': 16, - 'entr_coef': 8e-7, - 'k_epochs': 10, - 'eps_clip': 0.2, - 'learning_rate': 0.03 + "gamma": GAMMA, + "horizon": HORIZON, + "batch_size": 16, + "entr_coef": 8e-7, + "k_epochs": 10, + "eps_clip": 0.2, + "learning_rate": 0.03, } params_ppo_bonus = { - 'gamma': GAMMA, - 'horizon': HORIZON, - 'batch_size': 16, - 'entr_coef': 8e-7, - 'k_epochs': 10, - 'eps_clip': 0.2, - 'learning_rate': 0.03, - 'use_bonus': True, - 'uncertainty_estimator_kwargs': { - 'uncertainty_estimator_fn': uncertainty_estimator_fn} + "gamma": GAMMA, + "horizon": HORIZON, + "batch_size": 16, + "entr_coef": 8e-7, + "k_epochs": 10, + "eps_clip": 0.2, + "learning_rate": 0.03, + "use_bonus": True, + "uncertainty_estimator_kwargs": { + "uncertainty_estimator_fn": uncertainty_estimator_fn + }, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -60,13 +59,23 @@ def uncertainty_estimator_fn(obs_space, act_space): # Run AgentManager # ----------------------------- ppo_stats = AgentManager( - PPOAgent, env, fit_budget=N_EPISODES, - init_kwargs=params_ppo, eval_kwargs=eval_kwargs, - n_fit=4, agent_name='PPO') + PPOAgent, + env, + fit_budget=N_EPISODES, + init_kwargs=params_ppo, + eval_kwargs=eval_kwargs, + n_fit=4, + agent_name="PPO", +) ppo_bonus_stats = AgentManager( - PPOAgent, env, fit_budget=N_EPISODES, - init_kwargs=params_ppo_bonus, eval_kwargs=eval_kwargs, - n_fit=4, agent_name='PPO-Bonus') + PPOAgent, + env, + fit_budget=N_EPISODES, + init_kwargs=params_ppo_bonus, + eval_kwargs=eval_kwargs, + n_fit=4, + agent_name="PPO-Bonus", +) agent_manager_list = [ppo_bonus_stats, ppo_stats] @@ -74,9 +83,13 @@ def uncertainty_estimator_fn(obs_space, act_space): manager.fit() # learning curves -plot_writer_data(agent_manager_list, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + agent_manager_list, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, +) # compare final policies output = evaluate_agents(agent_manager_list) diff --git a/examples/demo_examples/demo_ppo_partial_fit.py b/examples/demo_examples/demo_ppo_partial_fit.py index 412a578cb..0ad22e10a 100644 --- a/examples/demo_examples/demo_ppo_partial_fit.py +++ b/examples/demo_examples/demo_ppo_partial_fit.py @@ -13,24 +13,37 @@ horizon = 100 ppo_params = {} -ppo_params['horizon'] = 100 -ppo_params['gamma'] = 0.99 -ppo_params['learning_rate'] = 0.001 -ppo_params['eps_clip'] = 0.2 -ppo_params['k_epochs'] = 4 +ppo_params["horizon"] = 100 +ppo_params["gamma"] = 0.99 +ppo_params["learning_rate"] = 0.001 +ppo_params["eps_clip"] = 0.2 +ppo_params["k_epochs"] = 4 eval_kwargs = dict(eval_horizon=horizon, n_simulations=20) ppo_stats = AgentManager( - PPOAgent, env, fit_budget=n_episodes, eval_kwargs=eval_kwargs, - init_kwargs=ppo_params, n_fit=2) + PPOAgent, + env, + fit_budget=n_episodes, + eval_kwargs=eval_kwargs, + init_kwargs=ppo_params, + n_fit=2, +) ppo_stats.fit(n_episodes // 2) -plot_writer_data(ppo_stats, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + ppo_stats, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", + show=False, +) evaluate_agents([ppo_stats], show=False) ppo_stats.fit(n_episodes // 4) -plot_writer_data(ppo_stats, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards', show=False) +plot_writer_data( + ppo_stats, + tag="episode_rewards", + preprocess_func=np.cumsum, + 
title="Cumulative Rewards", + show=False, +) evaluate_agents([ppo_stats], show=True) diff --git a/examples/demo_examples/demo_rnd.py b/examples/demo_examples/demo_rnd.py index 0b2760f8c..ef6b3c58d 100644 --- a/examples/demo_examples/demo_rnd.py +++ b/examples/demo_examples/demo_rnd.py @@ -15,7 +15,8 @@ env.action_space, learning_rate=0.1, update_period=100, - embedding_dim=2) + embedding_dim=2, +) # Test state = env.reset() @@ -28,5 +29,4 @@ if ii % 500 == 0: state = env.reset() bonus = rnd.measure(state, action) - print("it = {}, bonus = {}, loss = {}" - .format(ii, bonus, rnd.loss.item())) + print("it = {}, bonus = {}, loss = {}".format(ii, bonus, rnd.loss.item())) diff --git a/examples/demo_examples/demo_seeding.py b/examples/demo_examples/demo_seeding.py index eb2d891e0..0f85a2010 100644 --- a/examples/demo_examples/demo_seeding.py +++ b/examples/demo_examples/demo_seeding.py @@ -20,7 +20,7 @@ from rlberry.envs import gym_make from rlberry.agents import RSUCBVIAgent -env = gym_make('MountainCar-v0') +env = gym_make("MountainCar-v0") env.reseed(seeder) agent = RSUCBVIAgent(env) diff --git a/examples/demo_examples/demo_ucbvi_and_opqtl.py b/examples/demo_examples/demo_ucbvi_and_opqtl.py index 8e928a7fa..2e73413a5 100644 --- a/examples/demo_examples/demo_ucbvi_and_opqtl.py +++ b/examples/demo_examples/demo_ucbvi_and_opqtl.py @@ -18,18 +18,18 @@ params = {} -params['ucbvi'] = { - 'horizon': HORIZON, - 'stage_dependent': True, - 'gamma': GAMMA, - 'real_time_dp': True, - 'bonus_scale_factor': 1.0, +params["ucbvi"] = { + "horizon": HORIZON, + "stage_dependent": True, + "gamma": GAMMA, + "real_time_dp": True, + "bonus_scale_factor": 1.0, } -params['optql'] = { - 'horizon': HORIZON, - 'gamma': GAMMA, - 'bonus_scale_factor': 1.0, +params["optql"] = { + "horizon": HORIZON, + "gamma": GAMMA, + "bonus_scale_factor": 1.0, } eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20) @@ -37,15 +37,30 @@ multimanagers = MultipleManagers() multimanagers.append( - AgentManager(UCBVIAgent, env, fit_budget=N_EP, init_kwargs=params['ucbvi'], eval_kwargs=eval_kwargs) + AgentManager( + UCBVIAgent, + env, + fit_budget=N_EP, + init_kwargs=params["ucbvi"], + eval_kwargs=eval_kwargs, + ) ) multimanagers.append( - AgentManager(OptQLAgent, env, fit_budget=N_EP, init_kwargs=params['optql'], eval_kwargs=eval_kwargs) + AgentManager( + OptQLAgent, + env, + fit_budget=N_EP, + init_kwargs=params["optql"], + eval_kwargs=eval_kwargs, + ) ) multimanagers.run() -plot_writer_data(multimanagers.managers, tag='episode_rewards', - preprocess_func=np.cumsum, - title='Cumulative Rewards') +plot_writer_data( + multimanagers.managers, + tag="episode_rewards", + preprocess_func=np.cumsum, + title="Cumulative Rewards", +) diff --git a/examples/demo_examples/demo_vis2d.py b/examples/demo_examples/demo_vis2d.py index 0bcb03993..d90ad2f21 100644 --- a/examples/demo_examples/demo_vis2d.py +++ b/examples/demo_examples/demo_vis2d.py @@ -13,15 +13,23 @@ if CHOICE == 0: env = NRoom(nrooms=5, array_observation=False, reward_free=True) - env = Vis2dWrapper(env, n_bins_obs=20, memory_size=100, state_preprocess_fn=get_nroom_state_coord) + env = Vis2dWrapper( + env, n_bins_obs=20, memory_size=100, state_preprocess_fn=get_nroom_state_coord + ) agent = ValueIterationAgent(env.unwrapped, gamma=0.99, horizon=200, copy_env=False) else: env = MountainCar() env = Vis2dWrapper(env, n_bins_obs=20, memory_size=200) - agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, - bonus_scale_factor=0.1, copy_env=False, min_dist=0.1) + agent = RSUCBVIAgent( + env, + 
gamma=0.99, + horizon=200, + bonus_scale_factor=0.1, + copy_env=False, + min_dist=0.1, + ) agent.fit(budget=100) @@ -41,5 +49,7 @@ ylim = None # env.render() -env.plot_trajectories(n_skip=5, dot_scale_factor=15, xlim=xlim, ylim=ylim, dot_size_means='total_visits') +env.plot_trajectories( + n_skip=5, dot_scale_factor=15, xlim=xlim, ylim=ylim, dot_size_means="total_visits" +) env.plot_trajectory_actions(xlim=xlim, ylim=ylim) diff --git a/examples/plot_agent_manager.py b/examples/plot_agent_manager.py index 49e07ccb5..76f324d27 100644 --- a/examples/plot_agent_manager.py +++ b/examples/plot_agent_manager.py @@ -12,25 +12,33 @@ # -> The reward function can be accessed by: env.R[state, action] # -> And the transitions: env.P[state, action, next_state] env_ctor = GridWorld -env_kwargs =dict(nrows=3, ncols=10, - reward_at = {(1,1):0.1, (2, 9):1.0}, - walls=((1,4),(2,4), (1,5)), - success_probability=0.9) +env_kwargs = dict( + nrows=3, + ncols=10, + reward_at={(1, 1): 0.1, (2, 9): 1.0}, + walls=((1, 4), (2, 4), (1, 5)), + success_probability=0.9, +) env = env_ctor(**env_kwargs) - import numpy as np from rlberry.agents import AgentWithSimplePolicy + class ValueIterationAgent(AgentWithSimplePolicy): - name = 'ValueIterationAgent' - def __init__(self, env, gamma=0.99, epsilon=1e-5, **kwargs): # it's important to put **kwargs to ensure compatibility with the base class + name = "ValueIterationAgent" + + def __init__( + self, env, gamma=0.99, epsilon=1e-5, **kwargs + ): # it's important to put **kwargs to ensure compatibility with the base class """ gamma: discount factor episilon: precision of value iteration """ - AgentWithSimplePolicy.__init__(self, env, **kwargs) # self.env is initialized in the base class + AgentWithSimplePolicy.__init__( + self, env, **kwargs + ) # self.env is initialized in the base class self.gamma = gamma self.epsilon = epsilon @@ -48,10 +56,10 @@ def fit(self, budget=None, **kwargs): TQ = np.zeros((S, A)) for ss in range(S): for aa in range(A): - TQ[ss, aa] = env.R[ss, aa] + self.gamma*env.P[ss, aa, :].dot(V) + TQ[ss, aa] = env.R[ss, aa] + self.gamma * env.P[ss, aa, :].dot(V) V = TQ.max(axis=1) - if np.abs(TQ-Q).max() < self.epsilon: + if np.abs(TQ - Q).max() < self.epsilon: break Q = TQ self.Q = Q @@ -59,18 +67,19 @@ def fit(self, budget=None, **kwargs): def policy(self, observation): return self.Q[observation, :].argmax() - @classmethod def sample_parameters(cls, trial): - """ - Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/) - """ - gamma = trial.suggest_categorical('gamma', [0.1, 0.25, 0.5, 0.75, 0.99]) - return {'gamma':gamma} + """ + Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/) + """ + gamma = trial.suggest_categorical("gamma", [0.1, 0.25, 0.5, 0.75, 0.99]) + return {"gamma": gamma} + # Create random agent as a baseline class RandomAgent(AgentWithSimplePolicy): - name = 'RandomAgent' + name = "RandomAgent" + def __init__(self, env, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -80,10 +89,11 @@ def fit(self, budget=None, **kwargs): def policy(self, observation): return self.env.action_space.sample() + from rlberry.manager import AgentManager, evaluate_agents # Define parameters -vi_params = {'gamma':0.1, 'epsilon':1e-3} +vi_params = {"gamma": 0.1, "epsilon": 1e-3} # Create AgentManager to fit 4 agents using 1 job vi_stats = AgentManager( @@ -92,7 +102,8 @@ def policy(self, observation): fit_budget=0, eval_kwargs=dict(eval_horizon=20), init_kwargs=vi_params, - n_fit=4) + 
n_fit=4, +) vi_stats.fit() # Create AgentManager for baseline @@ -101,7 +112,8 @@ def policy(self, observation): (env_ctor, env_kwargs), fit_budget=0, eval_kwargs=dict(eval_horizon=20), - n_fit=1) + n_fit=1, +) baseline_stats.fit() # Compare policies using 10 Monte Carlo simulations diff --git a/examples/plot_kernels.py b/examples/plot_kernels.py index a13b824d3..84b2b2cfc 100644 --- a/examples/plot_kernels.py +++ b/examples/plot_kernels.py @@ -19,13 +19,13 @@ "triweight", "tricube", "cosine", - "exp-4" + "exp-4", ] z = np.linspace(-2, 2, 100) -fig, axes = plt.subplots(1, len(kernel_types),figsize=(15,5)) +fig, axes = plt.subplots(1, len(kernel_types), figsize=(15, 5)) for ii, k_type in enumerate(kernel_types): kernel_vals = kernel_func(z, k_type) axes[ii].plot(z, kernel_vals) diff --git a/rlberry/__init__.py b/rlberry/__init__.py index 8ce358f45..769508b35 100644 --- a/rlberry/__init__.py +++ b/rlberry/__init__.py @@ -1,4 +1,4 @@ -__path__ = __import__('pkgutil').extend_path(__path__, __name__) +__path__ = __import__("pkgutil").extend_path(__path__, __name__) # Initialize logging level from rlberry.utils.logging import configure_logging diff --git a/rlberry/agents/adaptiveql/adaptiveql.py b/rlberry/agents/adaptiveql/adaptiveql.py index 83b97f824..6bb87b8f7 100644 --- a/rlberry/agents/adaptiveql/adaptiveql.py +++ b/rlberry/agents/adaptiveql/adaptiveql.py @@ -43,15 +43,17 @@ class AdaptiveQLAgent(AgentWithSimplePolicy): Uses the metric induced by the l-infinity norm. """ - name = 'AdaptiveQLearning' - - def __init__(self, - env, - gamma=1.0, - horizon=50, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - **kwargs): + name = "AdaptiveQLearning" + + def __init__( + self, + env, + gamma=1.0, + horizon=50, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) assert isinstance(self.env.observation_space, spaces.Box) @@ -65,8 +67,10 @@ def __init__(self, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." 
+ ) r_range = 1.0 self.v_max = np.zeros(self.horizon) @@ -77,9 +81,9 @@ def __init__(self, self.reset() def reset(self): - self.Qtree = MDPTreePartition(self.env.observation_space, - self.env.action_space, - self.horizon) + self.Qtree = MDPTreePartition( + self.env.observation_space, self.env.action_space, self.horizon + ) # info self.episode = 0 @@ -106,7 +110,7 @@ def _update(self, node, state, action, next_state, reward, hh): if hh < self.horizon - 1: value_next_state = min( self.v_max[hh + 1], - self.Qtree.get_argmax_and_node(next_state, hh + 1)[1].qvalue + self.Qtree.get_argmax_and_node(next_state, hh + 1)[1].qvalue, ) # learning rate @@ -125,7 +129,8 @@ def _compute_bonus(self, n, hh): return bonus else: raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _run_episode(self): # interact for H steps diff --git a/rlberry/agents/adaptiveql/tree.py b/rlberry/agents/adaptiveql/tree.py index 7a1c2ec57..fd22c3ca5 100644 --- a/rlberry/agents/adaptiveql/tree.py +++ b/rlberry/agents/adaptiveql/tree.py @@ -122,18 +122,22 @@ def traverse(self, x, update=False): # return value at leaf return node - def plot(self, - fignum="tree plot", - colormap_name='cool', - max_value=10, - node=None, - root=True, ): + def plot( + self, + fignum="tree plot", + colormap_name="cool", + max_value=10, + node=None, + root=True, + ): """ Visualize the function (2d domain only). Shows the hierarchical partition. """ if root: - assert self.dim == 2, "TreePartition plot only available for 2-dimensional spaces." + assert ( + self.dim == 2 + ), "TreePartition plot only available for 2-dimensional spaces." node = self.root plt.figure(fignum) @@ -144,13 +148,20 @@ def plot(self, colormap_fn = plt.get_cmap(colormap_name) color = colormap_fn(node.qvalue / max_value) - rectangle = plt.Rectangle((x0, y0), x1 - x0, y1 - y0, ec="black", color=color) + rectangle = plt.Rectangle( + (x0, y0), x1 - x0, y1 - y0, ec="black", color=color + ) plt.gca().add_patch(rectangle) - plt.axis('scaled') + plt.axis("scaled") else: for cc in node.children: - self.plot(max_value=max_value, colormap_name=colormap_name, node=cc, root=False) + self.plot( + max_value=max_value, + colormap_name=colormap_name, + node=cc, + root=False, + ) class MDPTreePartition: @@ -167,8 +178,9 @@ def __init__(self, observation_space, action_space, horizon): for hh in range(horizon): self.trees.append({}) for aa in range(self.n_actions): - self.trees[hh][aa] = TreePartition(observation_space, - initial_value=horizon - hh) + self.trees[hh][aa] = TreePartition( + observation_space, initial_value=horizon - hh + ) self.dmax = self.trees[0][0].dmax diff --git a/rlberry/agents/agent.py b/rlberry/agents/agent.py index e5355debd..c8614e764 100644 --- a/rlberry/agents/agent.py +++ b/rlberry/agents/agent.py @@ -18,7 +18,7 @@ class Agent(ABC): - """ Basic interface for agents. + """Basic interface for agents. 
Parameters ---------- @@ -59,18 +59,19 @@ class Agent(ABC): name = "" - def __init__(self, - env: types.Env, - eval_env: Optional[types.Env] = None, - copy_env: bool = True, - seeder: Optional[types.Seed] = None, - output_dir: Optional[str] = None, - _execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, - _default_writer_kwargs: Optional[dict] = None, - **kwargs): + def __init__( + self, + env: types.Env, + eval_env: Optional[types.Env] = None, + copy_env: bool = True, + seeder: Optional[types.Seed] = None, + output_dir: Optional[str] = None, + _execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, + _default_writer_kwargs: Optional[dict] = None, + **kwargs, + ): # Check if wrong parameters have been sent to an agent. - assert kwargs == {}, \ - 'Unknown parameters sent to agent:' + str(kwargs.keys()) + assert kwargs == {}, "Unknown parameters sent to agent:" + str(kwargs.keys()) self.seeder = Seeder(seeder) self.env = process_env(env, self.seeder, copy_env=copy_env) @@ -80,14 +81,17 @@ def __init__(self, self.eval_env = process_env(eval_env, self.seeder, copy_env=True) # metadata - self._execution_metadata = _execution_metadata or metadata_utils.ExecutionMetadata() + self._execution_metadata = ( + _execution_metadata or metadata_utils.ExecutionMetadata() + ) self._unique_id = metadata_utils.get_unique_id(self) if self.name: - self._unique_id = self.name + '_' + self._unique_id + self._unique_id = self.name + "_" + self._unique_id # create writer _default_writer_kwargs = _default_writer_kwargs or dict( - name=self.name, execution_metadata=self._execution_metadata) + name=self.name, execution_metadata=self._execution_metadata + ) self._writer = DefaultWriter(**_default_writer_kwargs) # output directory for the agent instance @@ -174,7 +178,7 @@ def sample_parameters(cls, trial): @property def rng(self): - """ Random number generator. """ + """Random number generator.""" return self.seeder.rng def reseed(self, seed_seq=None): @@ -230,7 +234,7 @@ def save(self, filename): if not dill.pickles(self.writer): self.set_writer(None) # save - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") filename.parent.mkdir(parents=True, exist_ok=True) try: with filename.open("wb") as ff: @@ -256,14 +260,14 @@ def load(cls, filename, **kwargs): **kwargs: dict Arguments to required by the __init__ method of the Agent subclass. """ - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") obj = cls(**kwargs) try: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = pickle.load(ff) except Exception: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = dill.load(ff) obj.__dict__.clear() @@ -283,11 +287,7 @@ def policy(self, observation): """Returns an action, given an observation.""" pass - def eval(self, - eval_horizon=10 ** 5, - n_simulations=10, - gamma=1.0, - **kwargs): + def eval(self, eval_horizon=10 ** 5, n_simulations=10, gamma=1.0, **kwargs): """ Monte-Carlo policy evaluation [1]_ of an agent to estimate the value at the initial state. @@ -307,7 +307,7 @@ def eval(self, References ---------- .. 
[1] http://incompleteideas.net/book/first/ebook/node50.html - """ + """ del kwargs # unused episode_rewards = np.zeros(n_simulations) for sim in range(n_simulations): diff --git a/rlberry/agents/dynprog/value_iteration.py b/rlberry/agents/dynprog/value_iteration.py index c718f7774..0748f16d5 100644 --- a/rlberry/agents/dynprog/value_iteration.py +++ b/rlberry/agents/dynprog/value_iteration.py @@ -31,8 +31,9 @@ def __init__(self, env, gamma=0.95, horizon=None, epsilon=1e-6, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) # initialize base class - assert isinstance(self.env, FiniteMDP), \ - "Value iteration requires a FiniteMDP model." + assert isinstance( + self.env, FiniteMDP + ), "Value iteration requires a FiniteMDP model." # self.gamma = gamma @@ -50,15 +51,16 @@ def fit(self, budget=None, **kwargs): del kwargs info = {} if self.horizon is None: - assert self.gamma < 1.0, \ - "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration(self.env.R, self.env.P, - self.gamma, self.epsilon) + assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" + self.Q, self.V, n_it = value_iteration( + self.env.R, self.env.P, self.gamma, self.epsilon + ) info["n_iterations"] = n_it info["precision"] = self.epsilon else: - self.Q, self.V = backward_induction(self.env.R, self.env.P, - self.horizon, self.gamma) + self.Q, self.V = backward_induction( + self.env.R, self.env.P, self.horizon, self.gamma + ) info["n_iterations"] = self.horizon info["precision"] = 0.0 return info diff --git a/rlberry/agents/jax/dqn/dqn.py b/rlberry/agents/jax/dqn/dqn.py index 7f889f260..54be541f4 100644 --- a/rlberry/agents/jax/dqn/dqn.py +++ b/rlberry/agents/jax/dqn/dqn.py @@ -108,28 +108,29 @@ class DQNAgent(AgentWithSimplePolicy): max_gradient_norm : float, default: 100.0 Maximum gradient norm. 
""" + name = "JaxDqnAgent" def __init__( - self, - env: types.Env, - gamma: float = 0.99, - batch_size: int = 64, - chunk_size: int = 8, - online_update_interval: int = 1, - target_update_interval: int = 512, - learning_rate: float = 0.001, - epsilon_init: float = 1.0, - epsilon_end: float = 0.05, - epsilon_steps: int = 5000, - max_replay_size: int = 100000, - eval_interval: Optional[int] = None, - max_episode_length: Optional[int] = None, - lambda_: Optional[float] = None, - net_constructor: Optional[Callable[..., hk.Module]] = None, - net_kwargs: Optional[Mapping[str, Any]] = None, - max_gradient_norm: float = 100.0, - **kwargs + self, + env: types.Env, + gamma: float = 0.99, + batch_size: int = 64, + chunk_size: int = 8, + online_update_interval: int = 1, + target_update_interval: int = 512, + learning_rate: float = 0.001, + epsilon_init: float = 1.0, + epsilon_end: float = 0.05, + epsilon_steps: int = 5000, + max_replay_size: int = 100000, + eval_interval: Optional[int] = None, + max_episode_length: Optional[int] = None, + lambda_: Optional[float] = None, + net_constructor: Optional[Callable[..., hk.Module]] = None, + net_kwargs: Optional[Mapping[str, Any]] = None, + max_gradient_norm: float = 100.0, + **kwargs ): AgentWithSimplePolicy.__init__(self, env, **kwargs) env = self.env @@ -137,9 +138,9 @@ def __init__( # checks if not isinstance(self.env.observation_space, spaces.Box): - raise ValueError('DQN only implemented for Box observation spaces.') + raise ValueError("DQN only implemented for Box observation spaces.") if not isinstance(self.env.action_space, spaces.Discrete): - raise ValueError('DQN only implemented for Discrete action spaces.') + raise ValueError("DQN only implemented for Discrete action spaces.") # params self._gamma = gamma @@ -164,7 +165,10 @@ def __init__( try: obs_shape, obs_dtype = sample_obs.shape, sample_obs.dtype except AttributeError: # in case sample_obs has no .shape attribute - obs_shape, obs_dtype = env.observation_space.shape, env.observation_space.dtype + obs_shape, obs_dtype = ( + env.observation_space.shape, + env.observation_space.dtype, + ) action_shape, action_dtype = env.action_space.shape, env.action_space.dtype self._replay_buffer = ReplayBuffer( @@ -172,23 +176,20 @@ def __init__( self._chunk_size, self._max_replay_size, ) - self._replay_buffer.setup_entry('actions', action_shape, action_dtype) - self._replay_buffer.setup_entry('observations', obs_shape, obs_dtype) - self._replay_buffer.setup_entry('next_observations', obs_shape, obs_dtype) - self._replay_buffer.setup_entry('rewards', (), np.float32) - self._replay_buffer.setup_entry('discounts', (), np.float32) + self._replay_buffer.setup_entry("actions", action_shape, action_dtype) + self._replay_buffer.setup_entry("observations", obs_shape, obs_dtype) + self._replay_buffer.setup_entry("next_observations", obs_shape, obs_dtype) + self._replay_buffer.setup_entry("rewards", (), np.float32) + self._replay_buffer.setup_entry("discounts", (), np.float32) self._replay_buffer.build() # initialize network and params net_constructor = net_constructor or nets.MLPQNetwork net_kwargs = net_kwargs or dict( - num_actions=self.env.action_space.n, - hidden_sizes=(64, 64) + num_actions=self.env.action_space.n, hidden_sizes=(64, 64) ) net_ctor = functools.partial(net_constructor, **net_kwargs) - self._q_net = hk.without_apply_rng( - hk.transform(lambda x: net_ctor()(x)) - ) + self._q_net = hk.without_apply_rng(hk.transform(lambda x: net_ctor()(x))) self._dummy_obs = jnp.ones(self.env.observation_space.shape) 
@@ -197,13 +198,13 @@ def __init__( self._all_params = AllParams( online=self._q_net.init(subkey1, self._dummy_obs), - target=self._q_net.init(subkey2, self._dummy_obs) + target=self._q_net.init(subkey2, self._dummy_obs), ) # initialize optimizer and states self._optimizer = optax.chain( optax.clip_by_global_norm(self._max_gradient_norm), - optax.adam(learning_rate) + optax.adam(learning_rate), ) self._all_states = AllStates( optimizer=self._optimizer.init(self._all_params.online), @@ -236,11 +237,7 @@ def policy(self, observation): action = actor_out.actions.item() return action - def fit( - self, - budget: int, - **kwargs - ): + def fit(self, budget: int, **kwargs): """ Train DQN agent. @@ -273,11 +270,16 @@ def fit( # store data episode_rewards += reward buffer_writer.append( - {'actions': action, - 'observations': observation, - 'rewards': np.array(reward, dtype=np.float32), - 'discounts': np.array(self._gamma * (1.0 - done), dtype=np.float32), - 'next_observations': next_obs}) + { + "actions": action, + "observations": observation, + "rewards": np.array(reward, dtype=np.float32), + "discounts": np.array( + self._gamma * (1.0 - done), dtype=np.float32 + ), + "next_observations": next_obs, + } + ) # counters and next obs timesteps_counter += 1 @@ -291,44 +293,49 @@ def fit( if sample: batch = sample.data self._all_params, self._all_states, info = self.learner_step( - self._all_params, - self._all_states, - batch + self._all_params, self._all_states, batch ) if self.writer: - self.writer.add_scalar('q_loss', info['loss'].item(), total_timesteps) self.writer.add_scalar( - 'learner_steps', + "q_loss", info["loss"].item(), total_timesteps + ) + self.writer.add_scalar( + "learner_steps", self._all_states.learner_steps.item(), - total_timesteps) + total_timesteps, + ) # eval - if self._eval_interval is not None and total_timesteps % self._eval_interval == 0: + if ( + self._eval_interval is not None + and total_timesteps % self._eval_interval == 0 + ): eval_rewards = self.eval( eval_horizon=self._max_episode_length, n_simimulations=2, - gamma=1.0) + gamma=1.0, + ) self.writer.add_scalar( - 'eval_rewards', - eval_rewards, - total_timesteps + "eval_rewards", eval_rewards, total_timesteps ) # check if episode ended if done: if self.writer: - self.writer.add_scalar('episode_rewards', episode_rewards, total_timesteps) + self.writer.add_scalar( + "episode_rewards", episode_rewards, total_timesteps + ) buffer_writer.end_episode() episode_rewards = 0.0 episode_timesteps = 0 observation = self.env.reset() def _loss(self, all_params, batch): - obs_tm1 = batch['observations'] - a_tm1 = batch['actions'] - r_t = batch['rewards'] - discount_t = batch['discounts'] - obs_t = batch['next_observations'] + obs_tm1 = batch["observations"] + a_tm1 = batch["actions"] + r_t = batch["rewards"] + discount_t = batch["discounts"] + obs_t = batch["next_observations"] if self._lambda is None: # remove time dim (batch has shape [batch, chunk_size, ...]) @@ -348,13 +355,13 @@ def _loss(self, all_params, batch): else: batched_loss = jax.vmap(rlax.q_lambda) batch_lambda = self._lambda * jnp.ones(r_t.shape) - td_error = batched_loss(q_tm1, a_tm1, r_t, discount_t, q_t_val, batch_lambda) + td_error = batched_loss( + q_tm1, a_tm1, r_t, discount_t, q_t_val, batch_lambda + ) loss = jnp.mean(rlax.l2_loss(td_error)) - info = dict( - loss=loss - ) + info = dict(loss=loss) return loss, info def _actor_step(self, all_params, all_states, observation, rng_key, evaluation): @@ -365,13 +372,12 @@ def _actor_step(self, all_params, 
all_states, observation, rng_key, evaluation): eval_action = rlax.greedy().sample(rng_key, q_val) action = jax.lax.select(evaluation, eval_action, train_action) return ( - ActorOutput( - actions=action, - q_values=q_val), + ActorOutput(actions=action, q_values=q_val), AllStates( optimizer=all_states.optimizer, learner_steps=all_states.learner_steps, - actor_steps=all_states.actor_steps + 1), + actor_steps=all_states.actor_steps + 1, + ), ) def _learner_step(self, all_params, all_states, batch): @@ -379,30 +385,28 @@ def _learner_step(self, all_params, all_states, batch): all_params.online, all_params.target, all_states.learner_steps, - self._target_update_interval) - grad, info = jax.grad(self._loss, has_aux=True)( - all_params, - batch) + self._target_update_interval, + ) + grad, info = jax.grad(self._loss, has_aux=True)(all_params, batch) updates, optimizer_state = self._optimizer.update( - grad.online, - all_states.optimizer) + grad.online, all_states.optimizer + ) online_params = optax.apply_updates(all_params.online, updates) return ( - AllParams( - online=online_params, - target=target_params), + AllParams(online=online_params, target=target_params), AllStates( optimizer=optimizer_state, learner_steps=all_states.learner_steps + 1, - actor_steps=all_states.actor_steps), - info + actor_steps=all_states.actor_steps, + ), + info, ) # # Custom save/load methods. # def save(self, filename): - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") filename.parent.mkdir(parents=True, exist_ok=True) writer = None @@ -422,14 +426,14 @@ def save(self, filename): @classmethod def load(cls, filename, **kwargs): - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") agent = cls(**kwargs) - with filename.open('rb') as ff: + with filename.open("rb") as ff: agent_data = dill.load(ff) - agent.key = agent_data['rng_key'] - agent._all_params = agent_data['params'] - agent._all_states = agent_data['states'] - writer = agent_data['writer'] + agent.key = agent_data["rng_key"] + agent._all_params = agent_data["params"] + agent._all_states = agent_data["states"] + writer = agent_data["writer"] if writer: agent._writer = writer return agent @@ -439,13 +443,7 @@ def load(cls, filename, **kwargs): # @classmethod def sample_parameters(cls, trial): - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1) - gamma = trial.suggest_uniform('gamma', 0.95, 0.99) - lambda_ = trial.suggest_categorical( - 'lambda_', - [0.1, 0.5, 0.9, None]) - return dict( - learning_rate=learning_rate, - gamma=gamma, - lambda_=lambda_ - ) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-1) + gamma = trial.suggest_uniform("gamma", 0.95, 0.99) + lambda_ = trial.suggest_categorical("lambda_", [0.1, 0.5, 0.9, None]) + return dict(learning_rate=learning_rate, gamma=gamma, lambda_=lambda_) diff --git a/rlberry/agents/jax/nets/common.py b/rlberry/agents/jax/nets/common.py index 321a754d2..593000cdf 100644 --- a/rlberry/agents/jax/nets/common.py +++ b/rlberry/agents/jax/nets/common.py @@ -20,10 +20,7 @@ class MLPQNetwork(hk.Module): """ def __init__( - self, - num_actions: int, - hidden_sizes: Tuple[int, ...], - name: str = 'MLPQNetwork' + self, num_actions: int, hidden_sizes: Tuple[int, ...], name: str = "MLPQNetwork" ): super().__init__(name=name) self._mlp = hk.nets.MLP(output_sizes=hidden_sizes + (num_actions,)) diff --git a/rlberry/agents/jax/tests/old_test_tqn.py b/rlberry/agents/jax/tests/old_test_tqn.py index 
c48d4048b..8cbfc7ab8 100644 --- a/rlberry/agents/jax/tests/old_test_tqn.py +++ b/rlberry/agents/jax/tests/old_test_tqn.py @@ -18,12 +18,9 @@ def test_jax_dqn(lambda_): if not _IMPORT_SUCCESSFUL: return - env = (gym_make, dict(id='CartPole-v0')) + env = (gym_make, dict(id="CartPole-v0")) params = dict( - chunk_size=4, - batch_size=128, - target_update_interval=5, - lambda_=lambda_ + chunk_size=4, batch_size=128, target_update_interval=5, lambda_=lambda_ ) stats = AgentManager( @@ -33,7 +30,7 @@ def test_jax_dqn(lambda_): eval_env=env, init_kwargs=params, n_fit=1, - parallelization='thread', + parallelization="thread", ) stats.fit() stats.clear_output_dir() diff --git a/rlberry/agents/jax/utils/replay_buffer.py b/rlberry/agents/jax/utils/replay_buffer.py index 15aeb6a87..f49cf3584 100644 --- a/rlberry/agents/jax/utils/replay_buffer.py +++ b/rlberry/agents/jax/utils/replay_buffer.py @@ -14,13 +14,13 @@ import reverb except ImportError as ex: logger.error( - f'[replay_buffer] Could not import reverb: \n {ex} \n' - + ' >>> If you have issues with libpython3.7m.so.1.0, try running: \n' - + ' >>> $ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib \n' - + ' >>> in a conda environment, ' - + ' >>> or see https://github.com/deepmind/acme/issues/47 \n' - + ' >>> See also https://stackoverflow.com/a/46833531 for how to set \n' - + ' >>> LD_LIBRARY_PATH automatically when activating a conda environment.' + f"[replay_buffer] Could not import reverb: \n {ex} \n" + + " >>> If you have issues with libpython3.7m.so.1.0, try running: \n" + + " >>> $ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib \n" + + " >>> in a conda environment, " + + " >>> or see https://github.com/deepmind/acme/issues/47 \n" + + " >>> See also https://stackoverflow.com/a/46833531 for how to set \n" + + " >>> LD_LIBRARY_PATH automatically when activating a conda environment." 
) exit(1) @@ -55,13 +55,13 @@ def append(self, *args, **kwargs): for key in self.writer.history: if key not in self.entries: raise RuntimeError( - 'Cannot add to replay buffer an item that' - f' was not setup with setup_entry() method of ReplayBuffer: {key}') - trajectory[key] = self.writer.history[key][-self.chunk_size:] + "Cannot add to replay buffer an item that" + f" was not setup with setup_entry() method of ReplayBuffer: {key}" + ) + trajectory[key] = self.writer.history[key][-self.chunk_size :] self.writer.create_item( - table='replay_buffer', - priority=1.0, - trajectory=trajectory) + table="replay_buffer", priority=1.0, trajectory=trajectory + ) self.total_items += 1 @@ -77,13 +77,13 @@ class ReplayBuffer: """ def __init__( - self, - batch_size: int, - chunk_size: int, - max_replay_size: int, + self, + batch_size: int, + chunk_size: int, + max_replay_size: int, ): if chunk_size < 1: - raise ValueError('chunk_size needs to be >= 1') + raise ValueError("chunk_size needs to be >= 1") self._batch_size = batch_size self._chunk_size = chunk_size @@ -101,12 +101,14 @@ def dataset(self): return self._batched_dataset def get_writer(self): - self._chunk_writer = ChunkWriter(self._reverb_client, self._chunk_size, list(self._signature.keys())) + self._chunk_writer = ChunkWriter( + self._reverb_client, self._chunk_size, list(self._signature.keys()) + ) return self._chunk_writer def sample(self): if self._chunk_writer is None: - raise RuntimeError('Calling sample() without previous call to get_writer()') + raise RuntimeError("Calling sample() without previous call to get_writer()") if self._chunk_writer.total_items < self._batch_size: return None return next(self.dataset) @@ -125,7 +127,7 @@ def setup_entry(self, name, shape, dtype): Type of the data. Can be nested. 
""" if name in self._signature: - raise ValueError(f'Entry {name} already added to the replay buffer.') + raise ValueError(f"Entry {name} already added to the replay buffer.") self._signature[name] = tf.TensorSpec( shape=[self._chunk_size, *shape], @@ -137,7 +139,7 @@ def build(self): self._reverb_server = reverb.Server( tables=[ reverb.Table( - name='replay_buffer', + name="replay_buffer", sampler=reverb.selectors.Uniform(), remover=reverb.selectors.Fifo(), max_size=self._max_replay_size, @@ -145,12 +147,15 @@ def build(self): signature=self._signature, ), ], - port=None + port=None, ) - self._reverb_client = reverb.Client(f'localhost:{self._reverb_server.port}') + self._reverb_client = reverb.Client(f"localhost:{self._reverb_server.port}") self._reverb_dataset = reverb.TrajectoryDataset.from_table_signature( - server_address=f'localhost:{self._reverb_server.port}', - table='replay_buffer', - max_in_flight_samples_per_worker=2 * self._batch_size) - self._batched_dataset = self._reverb_dataset.batch(self._batch_size, drop_remainder=True).as_numpy_iterator() + server_address=f"localhost:{self._reverb_server.port}", + table="replay_buffer", + max_in_flight_samples_per_worker=2 * self._batch_size, + ) + self._batched_dataset = self._reverb_dataset.batch( + self._batch_size, drop_remainder=True + ).as_numpy_iterator() # logger.info(self._reverb_client.server_info()) diff --git a/rlberry/agents/kernel_based/common.py b/rlberry/agents/kernel_based/common.py index d052908d2..33757f66a 100644 --- a/rlberry/agents/kernel_based/common.py +++ b/rlberry/agents/kernel_based/common.py @@ -4,28 +4,30 @@ @numba_jit -def map_to_representative(state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr): - """Map state to representative state. 
""" +def map_to_representative( + state, + lp_metric, + representative_states, + n_representatives, + min_dist, + scaling, + accept_new_repr, +): + """Map state to representative state.""" dist_to_closest = np.inf argmin = -1 for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], - lp_metric, - scaling) + dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) if dist < dist_to_closest: dist_to_closest = dist argmin = ii max_representatives = representative_states.shape[0] - if (dist_to_closest > min_dist) \ - and (n_representatives < max_representatives) \ - and accept_new_repr: + if ( + (dist_to_closest > min_dist) + and (n_representatives < max_representatives) + and accept_new_repr + ): new_index = n_representatives representative_states[new_index, :] = state return new_index diff --git a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py b/rlberry/agents/kernel_based/rs_kernel_ucbvi.py index e376fdbab..1f573b362 100644 --- a/rlberry/agents/kernel_based/rs_kernel_ucbvi.py +++ b/rlberry/agents/kernel_based/rs_kernel_ucbvi.py @@ -15,11 +15,26 @@ @numba_jit -def update_model(repr_state, action, repr_next_state, reward, - n_representatives, repr_states, - lp_metric, scaling, bandwidth, - bonus_scale_factor, beta, v_max, bonus_type, - kernel_type, N_sa, B_sa, P_hat, R_hat): +def update_model( + repr_state, + action, + repr_next_state, + reward, + n_representatives, + repr_states, + lp_metric, + scaling, + bandwidth, + bonus_scale_factor, + beta, + v_max, + bonus_type, + kernel_type, + N_sa, + B_sa, + P_hat, + R_hat, +): """ Model update function, lots of arguments so we can use JIT :) """ @@ -29,10 +44,9 @@ def update_model(repr_state, action, repr_next_state, reward, for u_repr_state in range(n_representatives): # compute weight - dist = metric_lp(repr_states[repr_state, :], - repr_states[u_repr_state, :], - lp_metric, - scaling) + dist = metric_lp( + repr_states[repr_state, :], repr_states[u_repr_state, :], lp_metric, scaling + ) weight = kernel_func(dist / bandwidth, kernel_type=kernel_type) # aux variables @@ -43,19 +57,22 @@ def update_model(repr_state, action, repr_next_state, reward, N_sa[u_repr_state, action] += weight # update transitions - P_hat[u_repr_state, action, :n_representatives] = \ - dirac_next_s * weight / current_N_sa + \ - (prev_N_sa / current_N_sa) * \ - P_hat[u_repr_state, action, :n_representatives] + P_hat[u_repr_state, action, :n_representatives] = ( + dirac_next_s * weight / current_N_sa + + (prev_N_sa / current_N_sa) + * P_hat[u_repr_state, action, :n_representatives] + ) # update rewards - R_hat[u_repr_state, action] = weight * reward / current_N_sa + \ - (prev_N_sa / current_N_sa) * R_hat[u_repr_state, action] + R_hat[u_repr_state, action] = ( + weight * reward / current_N_sa + + (prev_N_sa / current_N_sa) * R_hat[u_repr_state, action] + ) # update bonus - B_sa[u_repr_state, action] = compute_bonus(N_sa[u_repr_state, action], - beta, bonus_scale_factor, - v_max, bonus_type) + B_sa[u_repr_state, action] = compute_bonus( + N_sa[u_repr_state, action], beta, bonus_scale_factor, v_max, bonus_type + ) @numba_jit @@ -142,19 +159,22 @@ class RSKernelUCBVIAgent(AgentWithSimplePolicy): name = "RSKernelUCBVI" - def __init__(self, env, - gamma=0.99, - horizon=None, - lp_metric=2, - kernel_type="epanechnikov", - scaling=None, - bandwidth=0.05, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - beta=0.01, - bonus_type="simplified_bernstein", - **kwargs): + def __init__( + self, + env, + gamma=0.99, + horizon=None, 
+ lp_metric=2, + kernel_type="epanechnikov", + scaling=None, + bandwidth=0.05, + min_dist=0.1, + max_repr=1000, + bonus_scale_factor=1.0, + beta=0.01, + bonus_type="simplified_bernstein", + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -175,8 +195,7 @@ def __init__(self, env, # other checks assert gamma >= 0 and gamma <= 1.0 if self.horizon is None: - assert gamma < 1.0, \ - "If no horizon is given, gamma must be smaller than 1." + assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) # state dimension @@ -185,10 +204,12 @@ def __init__(self, env, # compute scaling, if it is None if scaling is None: # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 \ - and (self.env.observation_space.low == -np.inf).sum() == 0: - scaling = self.env.observation_space.high \ - - self.env.observation_space.low + if (self.env.observation_space.high == np.inf).sum() == 0 and ( + self.env.observation_space.low == -np.inf + ).sum() == 0: + scaling = ( + self.env.observation_space.high - self.env.observation_space.low + ) # if high or low are unbounded else: scaling = np.ones(self.state_dim) @@ -200,19 +221,28 @@ def __init__(self, env, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." + ) r_range = 1.0 if self.gamma == 1.0: self.v_max = r_range * horizon else: - self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon)) / (1.0 - self.gamma) + self.v_max = ( + r_range + * (1.0 - np.power(self.gamma, self.horizon)) + / (1.0 - self.gamma) + ) # number of representative states and number of actions if max_repr is None: - max_repr = int(np.ceil((1.0 * np.sqrt(self.state_dim) - / self.min_dist) ** self.state_dim)) + max_repr = int( + np.ceil( + (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim + ) + ) self.max_repr = max_repr # current number of representative states @@ -261,18 +291,23 @@ def fit(self, budget: int, **kwargs): self._run_episode() # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma) + self.Q_policy, _ = backward_induction( + self.R_hat[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + ) def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative(state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr) + repr_state = map_to_representative( + state, + self.lp_metric, + self.representative_states, + self.M, + self.min_dist, + self.scaling, + accept_new_repr, + ) # check if new representative state if repr_state == self.M: self.M += 1 @@ -282,21 +317,26 @@ def _update(self, state, action, next_state, reward): repr_state = self._map_to_repr(state) repr_next_state = self._map_to_repr(next_state) - update_model(repr_state, action, repr_next_state, reward, - self.M, - self.representative_states, - self.lp_metric, - self.scaling, - self.bandwidth, - self.bonus_scale_factor, - self.beta, - self.v_max, - self.bonus_type, - self.kernel_type, - self.N_sa, - self.B_sa, - self.P_hat, - self.R_hat) + update_model( + repr_state, + action, + 
repr_next_state, + reward, + self.M, + self.representative_states, + self.lp_metric, + self.scaling, + self.bandwidth, + self.bonus_scale_factor, + self.beta, + self.v_max, + self.bonus_type, + self.kernel_type, + self.N_sa, + self.B_sa, + self.P_hat, + self.R_hat, + ) def _get_action(self, state, hh=0): assert self.Q is not None @@ -319,10 +359,14 @@ def _run_episode(self): # run backward induction backward_induction_in_place( - self.Q[:, :self.M, :], self.V[:, :self.M], - self.R_hat[:self.M, :] + self.B_sa[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma, self.v_max) + self.Q[:, : self.M, :], + self.V[:, : self.M], + self.R_hat[: self.M, :] + self.B_sa[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + self.v_max, + ) self.episode += 1 # diff --git a/rlberry/agents/kernel_based/rs_ucbvi.py b/rlberry/agents/kernel_based/rs_ucbvi.py index a9908c0e1..ec7bfbb96 100644 --- a/rlberry/agents/kernel_based/rs_ucbvi.py +++ b/rlberry/agents/kernel_based/rs_ucbvi.py @@ -84,17 +84,20 @@ class RSUCBVIAgent(AgentWithSimplePolicy): name = "RSUCBVI" - def __init__(self, env, - gamma=0.99, - horizon=100, - lp_metric=2, - scaling=None, - min_dist=0.1, - max_repr=1000, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - **kwargs): + def __init__( + self, + env, + gamma=0.99, + horizon=100, + lp_metric=2, + scaling=None, + min_dist=0.1, + max_repr=1000, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + reward_free=False, + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -113,8 +116,7 @@ def __init__(self, env, # other checks assert gamma >= 0 and gamma <= 1.0 if self.horizon is None: - assert gamma < 1.0, \ - "If no horizon is given, gamma must be smaller than 1." + assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) # state dimension @@ -123,10 +125,12 @@ def __init__(self, env, # compute scaling, if it is None if scaling is None: # if high and low are bounded - if (self.env.observation_space.high == np.inf).sum() == 0 \ - and (self.env.observation_space.low == -np.inf).sum() == 0: - scaling = self.env.observation_space.high \ - - self.env.observation_space.low + if (self.env.observation_space.high == np.inf).sum() == 0 and ( + self.env.observation_space.low == -np.inf + ).sum() == 0: + scaling = ( + self.env.observation_space.high - self.env.observation_space.low + ) # if high or low are unbounded else: scaling = np.ones(self.state_dim) @@ -138,20 +142,28 @@ def __init__(self, env, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." 
+ ) r_range = 1.0 if self.gamma == 1.0: self.v_max = r_range * horizon else: - self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon)) \ + self.v_max = ( + r_range + * (1.0 - np.power(self.gamma, self.horizon)) / (1.0 - self.gamma) + ) # number of representative states and number of actions if max_repr is None: - max_repr = int(np.ceil((1.0 * np.sqrt(self.state_dim) / - self.min_dist) ** self.state_dim)) + max_repr = int( + np.ceil( + (1.0 * np.sqrt(self.state_dim) / self.min_dist) ** self.state_dim + ) + ) self.max_repr = max_repr # current number of representative states @@ -205,18 +217,23 @@ def fit(self, budget: int, **kwargs): count += 1 # compute Q function for the recommended policy - self.Q_policy, _ = backward_induction(self.R_hat[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma) + self.Q_policy, _ = backward_induction( + self.R_hat[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + ) def _map_to_repr(self, state, accept_new_repr=True): - repr_state = map_to_representative(state, - self.lp_metric, - self.representative_states, - self.M, - self.min_dist, - self.scaling, - accept_new_repr) + repr_state = map_to_representative( + state, + self.lp_metric, + self.representative_states, + self.M, + self.min_dist, + self.scaling, + accept_new_repr, + ) # check if new representative state if repr_state == self.M: self.M += 1 @@ -230,12 +247,15 @@ def _update(self, state, action, next_state, reward): self.N_sas[repr_state, action, repr_next_state] += 1 self.S_sa[repr_state, action] += reward - self.R_hat[repr_state, action] = self.S_sa[repr_state, action] \ - / self.N_sa[repr_state, action] - self.P_hat[repr_state, action, :] = self.N_sas[repr_state, action, :] \ - / self.N_sa[repr_state, action] - self.B_sa[repr_state, action] = \ - self._compute_bonus(self.N_sa[repr_state, action]) + self.R_hat[repr_state, action] = ( + self.S_sa[repr_state, action] / self.N_sa[repr_state, action] + ) + self.P_hat[repr_state, action, :] = ( + self.N_sas[repr_state, action, :] / self.N_sa[repr_state, action] + ) + self.B_sa[repr_state, action] = self._compute_bonus( + self.N_sa[repr_state, action] + ) def _compute_bonus(self, n): # reward-free @@ -250,7 +270,8 @@ def _compute_bonus(self, n): return bonus else: raise NotImplementedError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _get_action(self, state, hh=0): assert self.Q is not None @@ -277,10 +298,14 @@ def _run_episode(self): # run backward induction backward_induction_in_place( - self.Q[:, :self.M, :], self.V[:, :self.M], - self.R_hat[:self.M, :] + self.B_sa[:self.M, :], - self.P_hat[:self.M, :, :self.M], - self.horizon, self.gamma, self.v_max) + self.Q[:, : self.M, :], + self.V[:, : self.M], + self.R_hat[: self.M, :] + self.B_sa[: self.M, :], + self.P_hat[: self.M, :, : self.M], + self.horizon, + self.gamma, + self.v_max, + ) self.episode += 1 # diff --git a/rlberry/agents/linear/lsvi_ucb.py b/rlberry/agents/linear/lsvi_ucb.py index 438b39e3c..36706116f 100644 --- a/rlberry/agents/linear/lsvi_ucb.py +++ b/rlberry/agents/linear/lsvi_ucb.py @@ -9,16 +9,18 @@ @numba_jit def run_lsvi_jit( - dim, horizon, - bonus_factor, - lambda_mat_inv, - reward_hist, - gamma, - feat_hist, - n_actions, - feat_ns_all_actions, - v_max, - total_time_steps): + dim, + horizon, + bonus_factor, + lambda_mat_inv, + reward_hist, + gamma, + feat_hist, + n_actions, + feat_ns_all_actions, + v_max, + total_time_steps, 
+): """ Jit version of Least-Squares Value Iteration. @@ -61,10 +63,10 @@ def run_lsvi_jit( for aa in range(n_actions): # feat_ns_aa = feat_ns_all_actions[tt, aa, :] - inverse_counts = \ - feat_ns_aa.dot(lambda_mat_inv.T.dot(feat_ns_aa)) - bonus = bonus_factor * np.sqrt(inverse_counts) \ - + v_max * inverse_counts * (bonus_factor > 0.0) + inverse_counts = feat_ns_aa.dot(lambda_mat_inv.T.dot(feat_ns_aa)) + bonus = bonus_factor * np.sqrt( + inverse_counts + ) + v_max * inverse_counts * (bonus_factor > 0.0) # q_ns[aa] = feat_ns_aa.dot(q_w[hh + 1, :]) + bonus q_ns[aa] = min(q_ns[aa], v_max) @@ -119,17 +121,19 @@ class LSVIUCBAgent(AgentWithSimplePolicy): function approximation. In Conference on Learning Theory (pp. 2137-2143). """ - name = 'LSVI-UCB' - - def __init__(self, - env, - horizon, - feature_map_fn, - feature_map_kwargs=None, - gamma=0.99, - bonus_scale_factor=1.0, - reg_factor=0.1, - **kwargs): + name = "LSVI-UCB" + + def __init__( + self, + env, + horizon, + feature_map_fn, + feature_map_kwargs=None, + gamma=0.99, + bonus_scale_factor=1.0, + reg_factor=0.1, + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.n_episodes = None @@ -142,23 +146,29 @@ def __init__(self, # if self.bonus_scale_factor == 0.0: - self.name = 'LSVI-Random-Expl' + self.name = "LSVI-Random-Expl" # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf: - logger.warning("{}: Reward range is infinity. ".format(self.name) - + "Clipping it to 1.") + logger.warning( + "{}: Reward range is infinity. ".format(self.name) + "Clipping it to 1." + ) r_range = 1.0 if self.gamma == 1.0: self.v_max = r_range * horizon else: - self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon)) / (1.0 - self.gamma) + self.v_max = ( + r_range + * (1.0 - np.power(self.gamma, self.horizon)) + / (1.0 - self.gamma) + ) # - assert isinstance(self.env.action_space, Discrete), \ - "LSVI-UCB requires discrete actions." + assert isinstance( + self.env.action_space, Discrete + ), "LSVI-UCB requires discrete actions." # assert len(self.feature_map.shape) == 1 @@ -196,9 +206,9 @@ def reset(self): self._rewards = np.zeros(self.n_episodes) # self.feat_hist = np.zeros((self.n_episodes * self.horizon, self.dim)) - self.feat_ns_all_actions = np.zeros((self.n_episodes * self.horizon, - self.env.action_space.n, - self.dim)) + self.feat_ns_all_actions = np.zeros( + (self.n_episodes * self.horizon, self.env.action_space.n, self.dim) + ) # self.w_policy = None @@ -210,7 +220,8 @@ def fit(self, budget, **kwargs): if self.n_episodes is not None: logger.warning( "[LSVI-UCB]: Calling fit() more than once will reset the algorithm" - + " (to realocate memory according to the number of episodes).") + + " (to realocate memory according to the number of episodes)." 
+ ) self.n_episodes = budget self.reset() @@ -252,8 +263,7 @@ def run_episode(self): # self.lambda_mat += np.outer(feat, feat) # update inverse - self.lambda_mat_inv -= \ - (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) + self.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) # update history self.reward_hist[self.total_time_steps] = reward @@ -265,8 +275,9 @@ def run_episode(self): tt = self.total_time_steps self.feat_hist[tt, :] = self.feature_map.map(state, action) for aa in range(self.env.action_space.n): - self.feat_ns_all_actions[tt, aa, :] = \ - self.feature_map.map(next_state, aa) + self.feat_ns_all_actions[tt, aa, :] = self.feature_map.map( + next_state, aa + ) # increments self.total_time_steps += 1 @@ -293,8 +304,9 @@ def _compute_q(self, q_w, state, action, bonus_factor): """q_w is the vector representation of the Q function.""" feat = self.feature_map.map(state, action) inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt(inverse_counts) \ - + self.v_max * inverse_counts * (bonus_factor > 0.0) + bonus = bonus_factor * np.sqrt(inverse_counts) + self.v_max * inverse_counts * ( + bonus_factor > 0.0 + ) q = feat.dot(q_w) + bonus return q @@ -305,23 +317,26 @@ def _compute_q_vec(self, q_w, state, bonus_factor): # q_vec[aa] = self._compute_q(q_w, state, aa, bonus_factor) feat = self.feature_map.map(state, aa) inverse_counts = feat @ (self.lambda_mat_inv.T @ feat) - bonus = bonus_factor * np.sqrt(inverse_counts) \ - + self.v_max * inverse_counts * (bonus_factor > 0.0) + bonus = bonus_factor * np.sqrt( + inverse_counts + ) + self.v_max * inverse_counts * (bonus_factor > 0.0) q_vec[aa] = feat.dot(q_w) + bonus # q_vec[aa] = min(q_vec[aa], self.v_max) # !!!!!!!!! return q_vec def _run_lsvi(self, bonus_factor): # run value iteration - q_w = run_lsvi_jit(self.dim, - self.horizon, - bonus_factor, - self.lambda_mat_inv, - self.reward_hist, - self.gamma, - self.feat_hist, - self.env.action_space.n, - self.feat_ns_all_actions, - self.v_max, - self.total_time_steps) + q_w = run_lsvi_jit( + self.dim, + self.horizon, + bonus_factor, + self.lambda_mat_inv, + self.reward_hist, + self.gamma, + self.feat_hist, + self.env.action_space.n, + self.feat_ns_all_actions, + self.v_max, + self.total_time_steps, + ) return q_w diff --git a/rlberry/agents/mbqvi/mbqvi.py b/rlberry/agents/mbqvi/mbqvi.py index e32d8322c..4b1e3c92a 100644 --- a/rlberry/agents/mbqvi/mbqvi.py +++ b/rlberry/agents/mbqvi/mbqvi.py @@ -41,21 +41,19 @@ class MBQVIAgent(AgentWithSimplePolicy): name = "MBQVI" - def __init__(self, env, - n_samples=10, - gamma=0.99, - horizon=None, - epsilon=1e-6, - **kwargs): + def __init__( + self, env, n_samples=10, gamma=0.99, horizon=None, epsilon=1e-6, **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) # initialize base class - assert self.env.is_generative(), \ - "MBQVI requires a generative model." - assert isinstance(self.env.observation_space, Discrete), \ - "MBQVI requires a finite state space." - assert isinstance(self.env.action_space, Discrete), \ - "MBQVI requires a finite action space." + assert self.env.is_generative(), "MBQVI requires a generative model." + assert isinstance( + self.env.observation_space, Discrete + ), "MBQVI requires a finite state space." + assert isinstance( + self.env.action_space, Discrete + ), "MBQVI requires a finite action space." 
# self.n_samples = n_samples @@ -102,15 +100,14 @@ def fit(self, budget=None, **kwargs): count += 1 if count % 10000 == 0: completed = 100 * count / total_samples - logger.debug("[{}] ... {}/{} ({:0.0f}%)".format( - self.name, - count, - total_samples, - completed)) + logger.debug( + "[{}] ... {}/{} ({:0.0f}%)".format( + self.name, count, total_samples, completed + ) + ) # build model and run VI - logger.debug( - f"{self.name} building model and running backward induction...") + logger.debug(f"{self.name} building model and running backward induction...") N_sa = np.maximum(self.N_sa, 1) self.R_hat = self.S_sa / N_sa @@ -122,15 +119,16 @@ def fit(self, budget=None, **kwargs): info["n_samples"] = self.n_samples info["total_samples"] = total_samples if self.horizon is None: - assert self.gamma < 1.0, \ - "The discounted setting requires gamma < 1.0" - self.Q, self.V, n_it = value_iteration(self.R_hat, self.P_hat, - self.gamma, self.epsilon) + assert self.gamma < 1.0, "The discounted setting requires gamma < 1.0" + self.Q, self.V, n_it = value_iteration( + self.R_hat, self.P_hat, self.gamma, self.epsilon + ) info["n_iterations"] = n_it info["precision"] = self.epsilon else: - self.Q, self.V = backward_induction(self.R_hat, self.P_hat, - self.horizon, self.gamma) + self.Q, self.V = backward_induction( + self.R_hat, self.P_hat, self.horizon, self.gamma + ) info["n_iterations"] = self.horizon info["precision"] = 0.0 return info diff --git a/rlberry/agents/optql/optql.py b/rlberry/agents/optql/optql.py index f0fc66750..d4e4d91df 100644 --- a/rlberry/agents/optql/optql.py +++ b/rlberry/agents/optql/optql.py @@ -36,16 +36,19 @@ class OptQLAgent(AgentWithSimplePolicy): Is Q-Learning Provably Efficient? https://arxiv.org/abs/1807.03765 """ + name = "OptQL" - def __init__(self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - add_bonus_after_update=False, - **kwargs): + def __init__( + self, + env, + gamma=1.0, + horizon=100, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + add_bonus_after_update=False, + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -62,8 +65,10 @@ def __init__(self, # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." + ) r_range = 1.0 self.v_max = np.zeros(self.horizon) @@ -88,9 +93,9 @@ def reset(self, **kwargs): self.Q = np.ones((H, S, A)) self.Q_bar = np.ones((H, S, A)) for hh in range(self.horizon): - self.V[hh, :] *= (self.horizon - hh) - self.Q[hh, :, :] *= (self.horizon - hh) - self.Q_bar[hh, :, :] *= (self.horizon - hh) + self.V[hh, :] *= self.horizon - hh + self.Q[hh, :, :] *= self.horizon - hh + self.Q_bar[hh, :, :] *= self.horizon - hh if self.add_bonus_after_update: self.Q *= 0.0 @@ -99,16 +104,17 @@ def reset(self, **kwargs): self.episode = 0 # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter(self.env.observation_space, - self.env.action_space) + self.counter = DiscreteCounter( + self.env.observation_space, self.env.action_space + ) def policy(self, observation): - """ Recommended policy. 
""" + """Recommended policy.""" state = observation return self.Q_bar[0, state, :].argmax() def _get_action(self, state, hh=0): - """ Sampling policy. """ + """Sampling policy.""" return self.Q_bar[hh, state, :].argmax() def _compute_bonus(self, n, hh): @@ -118,7 +124,8 @@ def _compute_bonus(self, n, hh): return bonus else: raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _update(self, state, action, next_state, reward, hh): self.N_sa[hh, state, action] += 1 @@ -131,14 +138,20 @@ def _update(self, state, action, next_state, reward, hh): # bonus in the update if not self.add_bonus_after_update: target = reward + bonus + self.gamma * self.V[hh + 1, next_state] - self.Q[hh, state, action] = (1 - alpha) * self.Q[hh, state, action] + alpha * target + self.Q[hh, state, action] = (1 - alpha) * self.Q[ + hh, state, action + ] + alpha * target self.V[hh, state] = min(self.v_max[hh], self.Q[hh, state, :].max()) self.Q_bar[hh, state, action] = self.Q[hh, state, action] # bonus outside the update else: target = reward + self.gamma * self.V[hh + 1, next_state] # bonus not here - self.Q[hh, state, action] = (1 - alpha) * self.Q[hh, state, action] + alpha * target - self.Q_bar[hh, state, action] = self.Q[hh, state, action] + bonus # bonus here + self.Q[hh, state, action] = (1 - alpha) * self.Q[ + hh, state, action + ] + alpha * target + self.Q_bar[hh, state, action] = ( + self.Q[hh, state, action] + bonus + ) # bonus here self.V[hh, state] = min(self.v_max[hh], self.Q_bar[hh, state, :].max()) def _run_episode(self): @@ -164,7 +177,9 @@ def _run_episode(self): # writer if self.writer is not None: self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("n_visited_states", self.counter.get_n_visited_states(), self.episode) + self.writer.add_scalar( + "n_visited_states", self.counter.get_n_visited_states(), self.episode + ) # return sum of rewards collected in the episode return episode_rewards diff --git a/rlberry/agents/tests/test_dynprog.py b/rlberry/agents/tests/test_dynprog.py index f8f4bc6b4..feb3e9f4e 100644 --- a/rlberry/agents/tests/test_dynprog.py +++ b/rlberry/agents/tests/test_dynprog.py @@ -22,24 +22,26 @@ def get_random_mdp(S, A): return R, P -@pytest.mark.parametrize("gamma, S, A", - [ - (0.001, 2, 1), - (0.25, 2, 1), - (0.5, 2, 1), - (0.75, 2, 1), - (0.999, 2, 1), - (0.001, 4, 2), - (0.25, 4, 2), - (0.5, 4, 2), - (0.75, 4, 2), - (0.999, 4, 2), - (0.001, 20, 4), - (0.25, 20, 4), - (0.5, 20, 4), - (0.75, 20, 4), - (0.999, 20, 4) - ]) +@pytest.mark.parametrize( + "gamma, S, A", + [ + (0.001, 2, 1), + (0.25, 2, 1), + (0.5, 2, 1), + (0.75, 2, 1), + (0.999, 2, 1), + (0.001, 4, 2), + (0.25, 4, 2), + (0.5, 4, 2), + (0.75, 4, 2), + (0.999, 4, 2), + (0.001, 20, 4), + (0.25, 20, 4), + (0.5, 20, 4), + (0.75, 20, 4), + (0.999, 20, 4), + ], +) def test_bellman_operator_monotonicity_and_contraction(gamma, S, A): rng = seeding.Seeder(123).rng vmax = 1.0 / (1.0 - gamma) @@ -67,14 +69,10 @@ def test_bellman_operator_monotonicity_and_contraction(gamma, S, A): assert np.greater(TQ2, TQ3).sum() == 0 -@pytest.mark.parametrize("gamma, S, A", - [ - (0.01, 10, 4), - (0.25, 10, 4), - (0.5, 10, 4), - (0.75, 10, 4), - (0.99, 10, 4) - ]) +@pytest.mark.parametrize( + "gamma, S, A", + [(0.01, 10, 4), (0.25, 10, 4), (0.5, 10, 4), (0.75, 10, 4), (0.99, 10, 4)], +) def test_value_iteration(gamma, S, A): for epsilon in np.logspace(-1, -6, num=5): for sim in range(5): @@ -88,11 
+86,7 @@ def test_value_iteration(gamma, S, A): assert np.abs(TQ - Q).max() <= epsilon -@pytest.mark.parametrize("horizon, S, A", - [ - (10, 5, 4), - (20, 10, 4) - ]) +@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) def test_backward_induction(horizon, S, A): for sim in range(5): # generate random MDP @@ -116,11 +110,7 @@ def test_backward_induction(horizon, S, A): assert np.array_equal(V, V2) -@pytest.mark.parametrize("horizon, S, A", - [ - (10, 5, 4), - (20, 10, 4) - ]) +@pytest.mark.parametrize("horizon, S, A", [(10, 5, 4), (20, 10, 4)]) def test_backward_induction_sd(horizon, S, A): """ Test stage-dependent MDPs @@ -146,11 +136,7 @@ def test_backward_induction_sd(horizon, S, A): assert np.array_equal(V, Vstat) -@pytest.mark.parametrize("horizon, gamma, S, A", - [ - (None, 0.5, 10, 4), - (10, 1.0, 10, 4) - ]) +@pytest.mark.parametrize("horizon, gamma, S, A", [(None, 0.5, 10, 4), (10, 1.0, 10, 4)]) def test_value_iteration_agent(horizon, gamma, S, A): for sim in range(5): # generate random MDP diff --git a/rlberry/agents/tests/test_kernel_based.py b/rlberry/agents/tests/test_kernel_based.py index 4edefaa42..65abac706 100644 --- a/rlberry/agents/tests/test_kernel_based.py +++ b/rlberry/agents/tests/test_kernel_based.py @@ -5,17 +5,20 @@ from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -@pytest.mark.parametrize("kernel_type", [ - "uniform", - "triangular", - "gaussian", - "epanechnikov", - "quartic", - "triweight", - "tricube", - "cosine", - "exp-2" -]) +@pytest.mark.parametrize( + "kernel_type", + [ + "uniform", + "triangular", + "gaussian", + "epanechnikov", + "quartic", + "triweight", + "tricube", + "cosine", + "exp-2", + ], +) def test_rs_kernel_ucbvi(kernel_type): for horizon in [None, 30]: env = get_benchmark_env(level=1) @@ -27,7 +30,8 @@ def test_rs_kernel_ucbvi(kernel_type): min_dist=0.2, bandwidth=0.05, beta=1.0, - kernel_type=kernel_type) + kernel_type=kernel_type, + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) @@ -39,21 +43,16 @@ def test_str_to_int(): def test_rs_ucbvi(): env = get_benchmark_env(level=1) - agent = RSUCBVIAgent(env, - gamma=0.99, - horizon=30, - bonus_scale_factor=0.1) + agent = RSUCBVIAgent(env, gamma=0.99, horizon=30, bonus_scale_factor=0.1) agent.fit(budget=5) agent.policy(env.observation_space.sample()) def test_rs_ucbvi_reward_free(): env = get_benchmark_env(level=1) - agent = RSUCBVIAgent(env, - gamma=0.99, - horizon=30, - bonus_scale_factor=0.1, - reward_free=True) + agent = RSUCBVIAgent( + env, gamma=0.99, horizon=30, bonus_scale_factor=0.1, reward_free=True + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) assert agent.R_hat.sum() == 0.0 diff --git a/rlberry/agents/tests/test_lsvi_ucb.py b/rlberry/agents/tests/test_lsvi_ucb.py index 9dc3abeaa..31480e1d3 100644 --- a/rlberry/agents/tests/test_lsvi_ucb.py +++ b/rlberry/agents/tests/test_lsvi_ucb.py @@ -37,10 +37,9 @@ def feature_map_fn(_env): return FeatMapClass(_env.observation_space.n, _env.action_space.n) reg_factor = 0.1 - agent = LSVIUCBAgent(env, - feature_map_fn=feature_map_fn, - horizon=10, - reg_factor=reg_factor) + agent = LSVIUCBAgent( + env, feature_map_fn=feature_map_fn, horizon=10, reg_factor=reg_factor + ) agent.reseed(123) agent.fit(budget=50) assert np.allclose(np.linalg.inv(agent.lambda_mat), agent.lambda_mat_inv) @@ -57,14 +56,17 @@ def feature_map_fn(_env): for state, action in zip(agent.state_hist, agent.action_hist): N_sa[state, action] += 1.0 - assert 
np.allclose(agent.lambda_mat_inv.diagonal(), - 1.0 / (N_sa.flatten() + reg_factor)) + assert np.allclose( + agent.lambda_mat_inv.diagonal(), 1.0 / (N_sa.flatten() + reg_factor) + ) for ss in range(S): for aa in range(A): feat = agent.feature_map.map(ss, aa) - assert np.allclose(feat @ (agent.lambda_mat_inv.T @ feat), - 1.0 / (N_sa[ss, aa] + reg_factor)) + assert np.allclose( + feat @ (agent.lambda_mat_inv.T @ feat), + 1.0 / (N_sa[ss, aa] + reg_factor), + ) def test_lsvi_without_bonus(): @@ -88,8 +90,7 @@ def lsvi_debug_gather_data(agent): # agent.lambda_mat += np.outer(feat, feat) # update inverse - agent.lambda_mat_inv -= \ - (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) + agent.lambda_mat_inv -= (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat) # update history agent.reward_hist[count] = reward @@ -101,8 +102,9 @@ def lsvi_debug_gather_data(agent): tt = agent.total_time_steps agent.feat_hist[tt, :] = agent.feature_map.map(state, action) for aa in range(agent.env.action_space.n): - agent.feat_ns_all_actions[tt, aa, :] = \ - agent.feature_map.map(next_state, aa) + agent.feat_ns_all_actions[tt, aa, :] = agent.feature_map.map( + next_state, aa + ) # increments agent.total_time_steps += 1 @@ -114,11 +116,9 @@ def lsvi_debug_gather_data(agent): def feature_map_fn(_env): return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - agent = LSVIUCBAgent(env, - feature_map_fn=feature_map_fn, - horizon=20, - gamma=0.99, - reg_factor=1e-5) + agent = LSVIUCBAgent( + env, feature_map_fn=feature_map_fn, horizon=20, gamma=0.99, reg_factor=1e-5 + ) agent.reseed(123) agent.n_episodes = 100 agent.reset() @@ -150,12 +150,14 @@ def test_lsvi_random_exploration(): def feature_map_fn(_env): return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - agent = LSVIUCBAgent(env, - feature_map_fn=feature_map_fn, - horizon=20, - gamma=0.99, - reg_factor=1e-5, - bonus_scale_factor=0.0) + agent = LSVIUCBAgent( + env, + feature_map_fn=feature_map_fn, + horizon=20, + gamma=0.99, + reg_factor=1e-5, + bonus_scale_factor=0.0, + ) agent.reseed(123) agent.fit(budget=250) @@ -184,11 +186,14 @@ def test_lsvi_optimism(): def feature_map_fn(_env): return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n) - agent = LSVIUCBAgent(env, gamma=0.99, - feature_map_fn=feature_map_fn, - horizon=3, - bonus_scale_factor=3, - reg_factor=0.000001) + agent = LSVIUCBAgent( + env, + gamma=0.99, + feature_map_fn=feature_map_fn, + horizon=3, + bonus_scale_factor=3, + reg_factor=0.000001, + ) agent.fit(budget=250) # near optimal Q @@ -202,9 +207,8 @@ def feature_map_fn(_env): Q_optimistic = np.zeros((S, A)) for ss in range(S): Q_optimistic[ss, :] = agent._compute_q_vec( - agent.w_vec[0, :], - ss, - agent.bonus_scale_factor) + agent.w_vec[0, :], ss, agent.bonus_scale_factor + ) print(Q) print(Q_optimistic) diff --git a/rlberry/agents/tests/test_optql.py b/rlberry/agents/tests/test_optql.py index 381c30b4a..35adf21d6 100644 --- a/rlberry/agents/tests/test_optql.py +++ b/rlberry/agents/tests/test_optql.py @@ -4,9 +4,6 @@ def test_optql(): env = GridWorld(walls=(), nrows=5, ncols=5) - agent = OptQLAgent(env, - horizon=11, - gamma=0.99, - bonus_scale_factor=0.1) + agent = OptQLAgent(env, horizon=11, gamma=0.99, bonus_scale_factor=0.1) agent.fit(budget=50) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/tests/test_ucbvi.py b/rlberry/agents/tests/test_ucbvi.py index 75cd4f0f6..641fe0c02 100644 --- a/rlberry/agents/tests/test_ucbvi.py +++ b/rlberry/agents/tests/test_ucbvi.py @@ -3,24 
+3,28 @@ from rlberry.envs.finite import GridWorld -@pytest.mark.parametrize("gamma, stage_dependent, real_time_dp", - [ - (1.0, True, True), - (1.0, True, False), - (1.0, False, True), - (1.0, False, False), - (0.9, True, True), - (0.9, True, False), - (0.9, False, True), - (0.9, False, False), - ]) +@pytest.mark.parametrize( + "gamma, stage_dependent, real_time_dp", + [ + (1.0, True, True), + (1.0, True, False), + (1.0, False, True), + (1.0, False, False), + (0.9, True, True), + (0.9, True, False), + (0.9, False, True), + (0.9, False, False), + ], +) def test_ucbvi(gamma, stage_dependent, real_time_dp): env = GridWorld(walls=(), nrows=5, ncols=5) - agent = UCBVIAgent(env, - horizon=11, - stage_dependent=stage_dependent, - gamma=gamma, - real_time_dp=real_time_dp, - bonus_scale_factor=0.1) + agent = UCBVIAgent( + env, + horizon=11, + stage_dependent=stage_dependent, + gamma=gamma, + real_time_dp=real_time_dp, + bonus_scale_factor=0.1, + ) agent.fit(budget=50) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/a2c/a2c.py b/rlberry/agents/torch/a2c/a2c.py index 6fe9b475e..75df7875a 100644 --- a/rlberry/agents/torch/a2c/a2c.py +++ b/rlberry/agents/torch/a2c/a2c.py @@ -62,29 +62,33 @@ class A2CAgent(AgentWithSimplePolicy): name = "A2C" - def __init__(self, env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.01, - optimizer_type='ADAM', - k_epochs=5, - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - use_bonus=False, - uncertainty_estimator_kwargs=None, - device="cuda:best", - **kwargs): + def __init__( + self, + env, + batch_size=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + learning_rate=0.01, + optimizer_type="ADAM", + k_epochs=5, + policy_net_fn=None, + value_net_fn=None, + policy_net_kwargs=None, + value_net_kwargs=None, + use_bonus=False, + uncertainty_estimator_kwargs=None, + device="cuda:best", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.use_bonus = use_bonus if self.use_bonus: self.env = UncertaintyEstimatorWrapper( - self.env, **uncertainty_estimator_kwargs) + self.env, **uncertainty_estimator_kwargs + ) self.batch_size = batch_size self.horizon = horizon @@ -104,8 +108,7 @@ def __init__(self, env, self.policy_net_fn = policy_net_fn or default_policy_net_fn self.value_net_fn = value_net_fn or default_value_net_fn - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -117,24 +120,24 @@ def __init__(self, env, self.reset() def reset(self, **kwargs): - self.cat_policy = self.policy_net_fn( - self.env, - **self.policy_net_kwargs).to(self.device) + self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.policy_optimizer = optimizer_factory( - self.cat_policy.parameters(), - **self.optimizer_kwargs) + self.cat_policy.parameters(), **self.optimizer_kwargs + ) - self.value_net = self.value_net_fn( - self.env, - **self.value_net_kwargs).to(self.device) + self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( + self.device + ) self.value_optimizer = optimizer_factory( - self.value_net.parameters(), - **self.optimizer_kwargs) + self.value_net.parameters(), **self.optimizer_kwargs + ) - self.cat_policy_old = self.policy_net_fn( - self.env, - **self.policy_net_kwargs).to(self.device) + self.cat_policy_old 
= self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) self.MseLoss = nn.MSELoss() @@ -183,8 +186,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save in batch self.memory.rewards.append(reward + bonus) # add bonus here @@ -214,8 +217,9 @@ def _update(self): # monte carlo estimate of rewards rewards = [] discounted_reward = 0 - for reward, is_terminal in zip(reversed(self.memory.rewards), - reversed(self.memory.is_terminals)): + for reward, is_terminal in zip( + reversed(self.memory.rewards), reversed(self.memory.is_terminals) + ): if is_terminal: discounted_reward = 0 discounted_reward = reward + (self.gamma * discounted_reward) @@ -239,13 +243,14 @@ def _update(self): # normalize the advantages advantages = rewards - state_values.detach() - advantages = (advantages - advantages.mean()) \ - / (advantages.std() + 1e-8) + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # find pg loss - pg_loss = - logprobs * advantages - loss = pg_loss \ - + 0.5 * self.MseLoss(state_values, rewards) \ + pg_loss = -logprobs * advantages + loss = ( + pg_loss + + 0.5 * self.MseLoss(state_values, rewards) - self.entr_coef * dist_entropy + ) # take gradient step self.policy_optimizer.zero_grad() @@ -264,21 +269,18 @@ def _update(self): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) - k_epochs = trial.suggest_categorical('k_epochs', - [1, 5, 10, 20]) + k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, - 'k_epochs': k_epochs, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, + "k_epochs": k_epochs, } diff --git a/rlberry/agents/torch/avec/avec_ppo.py b/rlberry/agents/torch/avec/avec_ppo.py index a3176f34a..40e082280 100644 --- a/rlberry/agents/torch/avec/avec_ppo.py +++ b/rlberry/agents/torch/avec/avec_ppo.py @@ -93,32 +93,36 @@ class AVECPPOAgent(AgentWithSimplePolicy): name = "AVECPPO" - def __init__(self, env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - vf_coef=0., - avec_coef=1., - learning_rate=0.0003, - optimizer_type='ADAM', - eps_clip=0.2, - k_epochs=10, - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - use_bonus=False, - uncertainty_estimator_kwargs=None, - device="cuda:best", - **kwargs): + def __init__( + self, + env, + batch_size=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + vf_coef=0.0, + avec_coef=1.0, + learning_rate=0.0003, + optimizer_type="ADAM", + eps_clip=0.2, + k_epochs=10, + policy_net_fn=None, + value_net_fn=None, + policy_net_kwargs=None, + value_net_kwargs=None, + 
use_bonus=False, + uncertainty_estimator_kwargs=None, + device="cuda:best", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.use_bonus = use_bonus if self.use_bonus: self.env = UncertaintyEstimatorWrapper( - self.env, **uncertainty_estimator_kwargs) + self.env, **uncertainty_estimator_kwargs + ) self.learning_rate = learning_rate self.gamma = gamma @@ -141,8 +145,7 @@ def __init__(self, env, self.policy_net_fn = policy_net_fn or default_policy_net_fn self.value_net_fn = value_net_fn or default_value_net_fn - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -154,26 +157,23 @@ def __init__(self, env, self.reset() def reset(self, **kwargs): - self.cat_policy = self.policy_net_fn( - self.env, - **self.policy_net_kwargs - ).to(self.device) + self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.policy_optimizer = optimizer_factory( - self.cat_policy.parameters(), - **self.optimizer_kwargs) + self.cat_policy.parameters(), **self.optimizer_kwargs + ) - self.value_net = self.value_net_fn( - self.env, - **self.value_net_kwargs - ).to(self.device) + self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( + self.device + ) self.value_optimizer = optimizer_factory( - self.value_net.parameters(), - **self.optimizer_kwargs) + self.value_net.parameters(), **self.optimizer_kwargs + ) - self.cat_policy_old = self.policy_net_fn( - self.env, - **self.policy_net_kwargs - ).to(self.device) + self.cat_policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) self.MseLoss = nn.MSELoss() @@ -223,8 +223,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save in batch self.memory.rewards.append(reward + bonus) # add bonus here @@ -255,8 +255,9 @@ def _update(self): # monte carlo estimate of rewards rewards = [] discounted_reward = 0 - for reward, is_terminal in zip(reversed(self.memory.rewards), - reversed(self.memory.is_terminals)): + for reward, is_terminal in zip( + reversed(self.memory.rewards), reversed(self.memory.is_terminals) + ): if is_terminal: discounted_reward = 0 discounted_reward = reward + (self.gamma * discounted_reward) @@ -284,16 +285,18 @@ def _update(self): # normalize the advantages advantages = rewards - state_values.detach() - advantages = (advantages - advantages.mean()) / \ - (advantages.std() + 1e-8) + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) # find surrogate loss surr1 = ratios * advantages - surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 - + self.eps_clip) * advantages - loss = -torch.min(surr1, surr2) \ - + self.avec_coef * self._avec_loss(state_values, rewards) \ - + self.vf_coef * self.MseLoss(state_values, rewards) \ + surr2 = ( + torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages + ) + loss = ( + -torch.min(surr1, surr2) + + self.avec_coef * self._avec_loss(state_values, rewards) + + self.vf_coef * self.MseLoss(state_values, rewards) - self.entr_coef * dist_entropy + ) # take gradient step self.policy_optimizer.zero_grad() @@ -328,25 
+331,21 @@ def _avec_loss(self, y_pred, y_true): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) - eps_clip = trial.suggest_categorical('eps_clip', - [0.1, 0.2, 0.3]) + eps_clip = trial.suggest_categorical("eps_clip", [0.1, 0.2, 0.3]) - k_epochs = trial.suggest_categorical('k_epochs', - [1, 5, 10, 20]) + k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, - 'eps_clip': eps_clip, - 'k_epochs': k_epochs, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, + "eps_clip": eps_clip, + "k_epochs": k_epochs, } diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index df8ada635..978fc9fd9 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -4,13 +4,24 @@ import numpy as np from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.utils.memories import Transition, PrioritizedReplayMemory, TransitionReplayMemory +from rlberry.agents.utils.memories import ( + Transition, + PrioritizedReplayMemory, + TransitionReplayMemory, +) from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter +from rlberry.exploration_tools.online_discretization_counter import ( + OnlineDiscretizationCounter, +) from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper from rlberry.agents.torch.dqn.exploration import exploration_factory -from rlberry.agents.torch.utils.training import loss_function_factory, model_factory, size_model_config, \ - trainable_parameters, optimizer_factory +from rlberry.agents.torch.utils.training import ( + loss_function_factory, + model_factory, + size_model_config, + trainable_parameters, + optimizer_factory, +) from rlberry.seeding import Seeder from rlberry.utils.factory import load from rlberry.utils.torch import choose_device @@ -75,47 +86,47 @@ class DQNAgent(AgentWithSimplePolicy): prioritized_replay: bool Use prioritized replay. 
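+        If False, a plain (non-prioritized) TransitionReplayMemory is used instead.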
""" - name = 'DQN' - - def __init__(self, - env, - horizon=256, - gamma=0.99, - loss_function="l2", - batch_size=100, - device="cuda:best", - target_update=1, - learning_rate=0.001, - epsilon_init=1.0, - epsilon_final=0.1, - epsilon_decay=5000, - optimizer_type='ADAM', - qvalue_net_fn=None, - qvalue_net_kwargs=None, - double=True, - memory_capacity=10000, - use_bonus=False, - uncertainty_estimator_kwargs=None, - prioritized_replay=True, - update_frequency=1, - **kwargs): + + name = "DQN" + + def __init__( + self, + env, + horizon=256, + gamma=0.99, + loss_function="l2", + batch_size=100, + device="cuda:best", + target_update=1, + learning_rate=0.001, + epsilon_init=1.0, + epsilon_final=0.1, + epsilon_decay=5000, + optimizer_type="ADAM", + qvalue_net_fn=None, + qvalue_net_kwargs=None, + double=True, + memory_capacity=10000, + use_bonus=False, + uncertainty_estimator_kwargs=None, + prioritized_replay=True, + update_frequency=1, + **kwargs, + ): # Wrap arguments and initialize base class - memory_kwargs = { - 'capacity': memory_capacity, - 'n_steps': 1, - 'gamma': gamma - } + memory_kwargs = {"capacity": memory_capacity, "n_steps": 1, "gamma": gamma} exploration_kwargs = { - 'method': "EpsilonGreedy", - 'temperature': epsilon_init, - 'final_temperature': epsilon_final, - 'tau': epsilon_decay, + "method": "EpsilonGreedy", + "temperature": epsilon_init, + "final_temperature": epsilon_final, + "tau": epsilon_decay, } AgentWithSimplePolicy.__init__(self, env, **kwargs) self.use_bonus = use_bonus if self.use_bonus: self.env = UncertaintyEstimatorWrapper( - self.env, **uncertainty_estimator_kwargs) + self.env, **uncertainty_estimator_kwargs + ) self.horizon = horizon self.exploration_kwargs = exploration_kwargs or {} self.memory_kwargs = memory_kwargs or {} @@ -123,40 +134,49 @@ def __init__(self, self.target_update = target_update self.double = double - assert isinstance(env.action_space, spaces.Discrete), \ - "Only compatible with Discrete action spaces." + assert isinstance( + env.action_space, spaces.Discrete + ), "Only compatible with Discrete action spaces." 
self.prioritized_replay = prioritized_replay - memory_class = PrioritizedReplayMemory if prioritized_replay else TransitionReplayMemory + memory_class = ( + PrioritizedReplayMemory if prioritized_replay else TransitionReplayMemory + ) self.memory = memory_class(**self.memory_kwargs) - self.exploration_policy = \ - exploration_factory(self.env.action_space, - **self.exploration_kwargs) + self.exploration_policy = exploration_factory( + self.env.action_space, **self.exploration_kwargs + ) self.training = True self.steps = 0 self.episode = 0 - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} self.device = choose_device(device) self.loss_function = loss_function self.gamma = gamma qvalue_net_kwargs = qvalue_net_kwargs or {} - qvalue_net_fn = load(qvalue_net_fn) if isinstance(qvalue_net_fn, str) else \ - qvalue_net_fn or default_qvalue_net_fn + qvalue_net_fn = ( + load(qvalue_net_fn) + if isinstance(qvalue_net_fn, str) + else qvalue_net_fn or default_qvalue_net_fn + ) self.value_net = qvalue_net_fn(self.env, **qvalue_net_kwargs) self.target_net = qvalue_net_fn(self.env, **qvalue_net_kwargs) self.target_net.load_state_dict(self.value_net.state_dict()) self.target_net.eval() - logger.info("Number of trainable parameters: {}" - .format(trainable_parameters(self.value_net))) + logger.info( + "Number of trainable parameters: {}".format( + trainable_parameters(self.value_net) + ) + ) self.value_net.to(self.device) self.target_net.to(self.device) self.loss_function = loss_function_factory(self.loss_function) - self.optimizer = optimizer_factory(self.value_net.parameters(), - **self.optimizer_kwargs) + self.optimizer = optimizer_factory( + self.value_net.parameters(), **self.optimizer_kwargs + ) self.update_frequency = update_frequency self.steps = 0 @@ -167,21 +187,36 @@ def fit(self, budget: int, **kwargs): state = self.env.reset() values = self.get_state_action_values(state) for i, value in enumerate(values): - self.writer.add_scalar(f"agent/action_value_{i}", value, self.episode) + self.writer.add_scalar( + f"agent/action_value_{i}", value, self.episode + ) total_reward, total_bonus, total_success, length = self._run_episode() if self.episode % 20 == 0: - logger.info(f"Episode {self.episode + 1}/{budget}, total reward {total_reward}") + logger.info( + f"Episode {self.episode + 1}/{budget}, total reward {total_reward}" + ) if self.writer: self.writer.add_scalar("episode_rewards", total_reward, self.episode) - self.writer.add_scalar("episode/total_reward", total_reward, self.episode) + self.writer.add_scalar( + "episode/total_reward", total_reward, self.episode + ) self.writer.add_scalar("episode/total_bonus", total_bonus, self.episode) - self.writer.add_scalar("episode/total_success", total_success, self.episode) + self.writer.add_scalar( + "episode/total_success", total_success, self.episode + ) self.writer.add_scalar("episode/length", length, self.episode) - if self.use_bonus and \ - (isinstance(self.env.uncertainty_estimator, OnlineDiscretizationCounter) or - isinstance(self.env.uncertainty_estimator, DiscreteCounter)): - n_visited_states = (self.env.uncertainty_estimator.N_sa.sum(axis=1) > 0).sum() - self.writer.add_scalar("debug/n_visited_states", n_visited_states, self.episode) + if self.use_bonus and ( + isinstance( + self.env.uncertainty_estimator, OnlineDiscretizationCounter + ) + or isinstance(self.env.uncertainty_estimator, DiscreteCounter) + ): + n_visited_states = ( + 
self.env.uncertainty_estimator.N_sa.sum(axis=1) > 0 + ).sum() + self.writer.add_scalar( + "debug/n_visited_states", n_visited_states, self.episode + ) def _run_episode(self): total_reward = total_bonus = total_success = time = 0 @@ -194,8 +229,8 @@ def _run_episode(self): # bonus used only for logging, here bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] self.record(state, action, reward, next_state, done, info) state = next_state @@ -295,52 +330,60 @@ def compute_bellman_residual(self, batch): The residuals over the batch, and the computed target. """ # Concatenate the batch elements - state = torch.cat(tuple(torch.tensor([batch.state], - dtype=torch.float))).to(self.device) - action = torch.tensor(batch.action, - dtype=torch.long).to(self.device) - reward = torch.tensor(batch.reward, - dtype=torch.float).to(self.device) + state = torch.cat(tuple(torch.tensor([batch.state], dtype=torch.float))).to( + self.device + ) + action = torch.tensor(batch.action, dtype=torch.long).to(self.device) + reward = torch.tensor(batch.reward, dtype=torch.float).to(self.device) if self.use_bonus: - bonus = self.env.bonus_batch(state, action).to(self.device) * self.exploration_policy.epsilon + bonus = ( + self.env.bonus_batch(state, action).to(self.device) + * self.exploration_policy.epsilon + ) if self.writer: - self.writer.add_scalar("debug/minibatch_mean_bonus", bonus.mean().item(), self.episode) - self.writer.add_scalar("debug/minibatch_mean_reward", reward.mean().item(), self.episode) + self.writer.add_scalar( + "debug/minibatch_mean_bonus", bonus.mean().item(), self.episode + ) + self.writer.add_scalar( + "debug/minibatch_mean_reward", reward.mean().item(), self.episode + ) reward += bonus - next_state = torch.cat(tuple(torch.tensor([batch.next_state], - dtype=torch.float))).to(self.device) - terminal = torch.tensor(batch.terminal, - dtype=torch.bool).to(self.device) + next_state = torch.cat( + tuple(torch.tensor([batch.next_state], dtype=torch.float)) + ).to(self.device) + terminal = torch.tensor(batch.terminal, dtype=torch.bool).to(self.device) batch = Transition(state, action, reward, next_state, terminal, batch.info) # Compute Q(s_t, a) - the model computes Q(s_t), then we select the # columns of actions taken state_action_values = self.value_net(batch.state) - state_action_values = \ - state_action_values.gather(1, batch.action.unsqueeze(1)).squeeze(1) + state_action_values = state_action_values.gather( + 1, batch.action.unsqueeze(1) + ).squeeze(1) with torch.no_grad(): # Compute V(s_{t+1}) for all next states. 
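+            # (kept at zero for terminal states; overwritten below for non-terminal ones)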
- next_state_values = \ - torch.zeros(batch.reward.shape).to(self.device) + next_state_values = torch.zeros(batch.reward.shape).to(self.device) if self.double: # Double Q-learning: pick best actions from policy network _, best_actions = self.value_net(batch.next_state).max(1) # Double Q-learning: estimate action values # from target network - best_values = self.target_net( - batch.next_state - ).gather(1, best_actions.unsqueeze(1)) \ + best_values = ( + self.target_net(batch.next_state) + .gather(1, best_actions.unsqueeze(1)) .squeeze(1) + ) else: best_values, _ = self.target_net(batch.next_state).max(1) - next_state_values[~batch.terminal] \ - = best_values[~batch.terminal] + next_state_values[~batch.terminal] = best_values[~batch.terminal] # Compute the expected Q values target_state_action_value = batch.reward + self.gamma * next_state_values # Compute residuals - residuals = self.loss_function(state_action_values, target_state_action_value, reduction='none') + residuals = self.loss_function( + state_action_values, target_state_action_value, reduction="none" + ) return residuals, target_state_action_value def get_batch_state_values(self, states): @@ -359,9 +402,9 @@ def get_batch_state_values(self, states): * [a1*; ...; aN*] the array of corresponding optimal action indexes for each state """ - values, actions = self.value_net(torch.tensor(states, - dtype=torch.float) - .to(self.device)).max(1) + values, actions = self.value_net( + torch.tensor(states, dtype=torch.float).to(self.device) + ).max(1) return values.data.cpu().numpy(), actions.data.cpu().numpy() def get_batch_state_action_values(self, states): @@ -378,9 +421,11 @@ def get_batch_state_action_values(self, states): values:[[Q11, ..., Q1n]; ...] the array of all action values for each state """ - return self.value_net(torch.tensor(states, - dtype=torch.float) - .to(self.device)).data.cpu().numpy() + return ( + self.value_net(torch.tensor(states, dtype=torch.float).to(self.device)) + .data.cpu() + .numpy() + ) def get_state_value(self, state): """ @@ -442,22 +487,24 @@ def set_time(self, time): def eval_mode(self): self.training = False - self.exploration_kwargs['method'] = "Greedy" - self.exploration_policy = \ - exploration_factory(self.env.action_space, - **self.exploration_kwargs) + self.exploration_kwargs["method"] = "Greedy" + self.exploration_policy = exploration_factory( + self.env.action_space, **self.exploration_kwargs + ) def save(self, filename, **kwargs): - state = {'state_dict': self.value_net.state_dict(), - 'optimizer': self.optimizer.state_dict()} + state = { + "state_dict": self.value_net.state_dict(), + "optimizer": self.optimizer.state_dict(), + } torch.save(state, filename) return filename def load(self, filename, **kwargs): checkpoint = torch.load(filename, map_location=self.device) - self.value_net.load_state_dict(checkpoint['state_dict']) - self.target_net.load_state_dict(checkpoint['state_dict']) - self.optimizer.load_state_dict(checkpoint['optimizer']) + self.value_net.load_state_dict(checkpoint["state_dict"]) + self.target_net.load_state_dict(checkpoint["state_dict"]) + self.optimizer.load_state_dict(checkpoint["optimizer"]) return filename def initialize_model(self): @@ -470,39 +517,39 @@ def set_writer(self, writer): except AttributeError: pass if self.writer: - obs_shape = self.env.observation_space.shape \ - if isinstance(self.env.observation_space, spaces.Box) else \ - self.env.observation_space.spaces[0].shape - model_input = torch.zeros((1, *obs_shape), dtype=torch.float, - device=self.device) + 
obs_shape = ( + self.env.observation_space.shape + if isinstance(self.env.observation_space, spaces.Box) + else self.env.observation_space.spaces[0].shape + ) + model_input = torch.zeros( + (1, *obs_shape), dtype=torch.float, device=self.device + ) self.writer.add_graph(self.value_net, input_to_model=(model_input,)) - self.writer.add_scalar("agent/trainable_parameters", - trainable_parameters(self.value_net), 0) + self.writer.add_scalar( + "agent/trainable_parameters", trainable_parameters(self.value_net), 0 + ) # # For hyperparameter optimization # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [32, 64, 128, 256, 512]) - gamma = trial.suggest_categorical('gamma', - [0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256, 512]) + gamma = trial.suggest_categorical("gamma", [0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - target_update = trial.suggest_categorical('target_update', - [1, 250, 500, 1000]) + target_update = trial.suggest_categorical("target_update", [1, 250, 500, 1000]) - epsilon_final = trial.suggest_loguniform('epsilon_final', 1e-2, 1e-1) + epsilon_final = trial.suggest_loguniform("epsilon_final", 1e-2, 1e-1) - epsilon_decay = trial.suggest_categorical('target_update', - [1000, 5000, 10000]) + epsilon_decay = trial.suggest_categorical("target_update", [1000, 5000, 10000]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'target_update': target_update, - 'epsilon_final': epsilon_final, - 'epsilon_decay': epsilon_decay, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "target_update": target_update, + "epsilon_final": epsilon_final, + "epsilon_decay": epsilon_decay, } diff --git a/rlberry/agents/torch/dqn/exploration.py b/rlberry/agents/torch/dqn/exploration.py index 4a3b24150..70d5ac70e 100644 --- a/rlberry/agents/torch/dqn/exploration.py +++ b/rlberry/agents/torch/dqn/exploration.py @@ -25,8 +25,8 @@ def sample(self): """ distribution = self.get_distribution() return self.np_random.choice( - list(distribution.keys()), 1, - p=np.array(list(distribution.values())))[0] + list(distribution.keys()), 1, p=np.array(list(distribution.values())) + )[0] def seed(self, seeder=None): """ @@ -58,12 +58,9 @@ class EpsilonGreedy(DiscreteDistribution): probability 1-epsilon. 
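+    Epsilon is annealed exponentially from `temperature` to `final_temperature`
+    with time constant `tau`.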
""" - def __init__(self, - action_space, - temperature=1.0, - final_temperature=0.1, - tau=5000, - **kwargs): + def __init__( + self, action_space, temperature=1.0, final_temperature=0.1, tau=5000, **kwargs + ): super().__init__(**kwargs) self.action_space = action_space self.temperature = temperature @@ -81,8 +78,10 @@ def __init__(self, self.seed() def get_distribution(self): - distribution = {action: self.epsilon / self.action_space.n - for action in range(self.action_space.n)} + distribution = { + action: self.epsilon / self.action_space.n + for action in range(self.action_space.n) + } distribution[self.optimal_action] += 1 - self.epsilon return distribution @@ -98,13 +97,11 @@ def update(self, values): Whether to update epsilon schedule """ self.optimal_action = np.argmax(values) - self.epsilon = self.final_temperature \ - + (self.temperature - self.final_temperature) * \ - np.exp(- self.time / self.tau) + self.epsilon = self.final_temperature + ( + self.temperature - self.final_temperature + ) * np.exp(-self.time / self.tau) if self.writer: - self.writer.add_scalar('exploration/epsilon', - self.epsilon, - self.time) + self.writer.add_scalar("exploration/epsilon", self.epsilon, self.time) def step_time(self): self.time += 1 @@ -133,8 +130,10 @@ def __init__(self, action_space, **kwargs): def get_distribution(self): optimal_action = np.argmax(self.values) - return {action: 1 if action == optimal_action - else 0 for action in range(self.action_space.n)} + return { + action: 1 if action == optimal_action else 0 + for action in range(self.action_space.n) + } def update(self, values): self.values = values @@ -155,9 +154,9 @@ def exploration_factory(action_space, method="EpsilonGreedy", **kwargs): ------- A new exploration policy. """ - if method == 'Greedy': + if method == "Greedy": return Greedy(action_space, **kwargs) - elif method == 'EpsilonGreedy': + elif method == "EpsilonGreedy": return EpsilonGreedy(action_space, **kwargs) else: raise ValueError("Unknown exploration method") diff --git a/rlberry/agents/torch/ppo/ppo.py b/rlberry/agents/torch/ppo/ppo.py index 5b32c5b97..2292059d9 100644 --- a/rlberry/agents/torch/ppo/ppo.py +++ b/rlberry/agents/torch/ppo/ppo.py @@ -71,34 +71,39 @@ class PPOAgent(AgentWithSimplePolicy): name = "PPO" - def __init__(self, env, - batch_size=64, - update_frequency=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - vf_coef=0.5, - learning_rate=0.01, - optimizer_type='ADAM', - eps_clip=0.2, - k_epochs=5, - use_gae=True, - gae_lambda=0.95, - policy_net_fn=None, - value_net_fn=None, - policy_net_kwargs=None, - value_net_kwargs=None, - device="cuda:best", - use_bonus=False, - uncertainty_estimator_kwargs=None, - **kwargs): # TODO: sort arguments + def __init__( + self, + env, + batch_size=64, + update_frequency=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + vf_coef=0.5, + learning_rate=0.01, + optimizer_type="ADAM", + eps_clip=0.2, + k_epochs=5, + use_gae=True, + gae_lambda=0.95, + policy_net_fn=None, + value_net_fn=None, + policy_net_kwargs=None, + value_net_kwargs=None, + device="cuda:best", + use_bonus=False, + uncertainty_estimator_kwargs=None, + **kwargs + ): # TODO: sort arguments AgentWithSimplePolicy.__init__(self, env, **kwargs) # bonus self.use_bonus = use_bonus if self.use_bonus: - self.env = UncertaintyEstimatorWrapper(self.env, **uncertainty_estimator_kwargs) + self.env = UncertaintyEstimatorWrapper( + self.env, **uncertainty_estimator_kwargs + ) # algorithm parameters self.gamma = gamma @@ -137,8 +142,7 @@ def __init__(self, env, 
self.device = choose_device(device) - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -156,13 +160,23 @@ def from_config(cls, **kwargs): return cls(**kwargs) def reset(self, **kwargs): - self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to(self.device) - self.policy_optimizer = optimizer_factory(self.cat_policy.parameters(), **self.optimizer_kwargs) - - self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to(self.device) - self.value_optimizer = optimizer_factory(self.value_net.parameters(), **self.optimizer_kwargs) - - self.cat_policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to(self.device) + self.cat_policy = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) + self.policy_optimizer = optimizer_factory( + self.cat_policy.parameters(), **self.optimizer_kwargs + ) + + self.value_net = self.value_net_fn(self.env, **self.value_net_kwargs).to( + self.device + ) + self.value_optimizer = optimizer_factory( + self.value_net.parameters(), **self.optimizer_kwargs + ) + + self.cat_policy_old = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) self.MseLoss = nn.MSELoss() # TODO: turn into argument @@ -215,8 +229,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save transition states.append(state) @@ -238,7 +252,9 @@ def _run_episode(self): state_values = torch.squeeze(state_values).tolist() # TODO: add the option to normalize before computing returns/advantages? 
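+        # returns and advantages are computed backwards over the horizon (GAE when use_gae=True)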
- returns, advantages = self._compute_returns_avantages(rewards, is_terminals, state_values) + returns, advantages = self._compute_returns_avantages( + rewards, is_terminals, state_values + ) # save in batch self.memory.states.extend(states) @@ -258,7 +274,9 @@ def _run_episode(self): self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) # update - if self.episode % self.update_frequency == 0: # TODO: maybe change to update in function of n_steps instead + if ( + self.episode % self.update_frequency == 0 + ): # TODO: maybe change to update in function of n_steps instead self._update() self.memory.clear_memory() del self.returns[:] # TODO: add to memory (cf reset) @@ -292,7 +310,9 @@ def _update(self): for k in range(n_batches): # sample batch - batch_idx = np.arange(k * self.batch_size, min((k + 1) * self.batch_size, n_samples)) + batch_idx = np.arange( + k * self.batch_size, min((k + 1) * self.batch_size, n_samples) + ) old_states = shuffled_states[batch_idx] old_actions = shuffled_actions[batch_idx] old_logprobs = shuffled_logprobs[batch_idx] @@ -313,14 +333,21 @@ def _update(self): # rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5) # normalize the advantages - old_advantages = old_advantages.view(-1, ) + old_advantages = old_advantages.view( + -1, + ) if self.normalize_advantages: - old_advantages = (old_advantages - old_advantages.mean()) / (old_advantages.std() + 1e-10) + old_advantages = (old_advantages - old_advantages.mean()) / ( + old_advantages.std() + 1e-10 + ) # compute surrogate loss surr1 = ratios * old_advantages - surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * old_advantages + surr2 = ( + torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) + * old_advantages + ) surr_loss = torch.min(surr1, surr2) # compute value function loss @@ -330,7 +357,7 @@ def _update(self): loss_entropy = self.entr_coef * dist_entropy # compute total loss - loss = - surr_loss + loss_vf - loss_entropy + loss = -surr_loss + loss_vf - loss_entropy # take gradient step self.policy_optimizer.zero_grad() @@ -343,8 +370,16 @@ def _update(self): # log if self.writer: - self.writer.add_scalar("fit/surrogate_loss", surr_loss.mean().cpu().detach().numpy(), self.episode) - self.writer.add_scalar("fit/entropy_loss", dist_entropy.mean().cpu().detach().numpy(), self.episode) + self.writer.add_scalar( + "fit/surrogate_loss", + surr_loss.mean().cpu().detach().numpy(), + self.episode, + ) + self.writer.add_scalar( + "fit/entropy_loss", + dist_entropy.mean().cpu().detach().numpy(), + self.episode, + ) # copy new weights into old policy self.cat_policy_old.load_state_dict(self.cat_policy.state_dict()) @@ -357,22 +392,39 @@ def _compute_returns_avantages(self, rewards, is_terminals, state_values): if not self.use_gae: for t in reversed(range(self.horizon)): if t == self.horizon - 1: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * state_values[-1] + returns[t] = ( + rewards[t] + + self.gamma * (1 - is_terminals[t]) * state_values[-1] + ) else: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] + returns[t] = ( + rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] + ) advantages[t] = returns[t] - state_values[t] else: last_adv = 0 for t in reversed(range(self.horizon)): if t == self.horizon - 1: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * state_values[-1] + returns[t] = ( + rewards[t] + + self.gamma * (1 - is_terminals[t]) * state_values[-1] + ) td_error = returns[t] - state_values[t] 
else: - returns[t] = rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] - td_error = rewards[t] + self.gamma * (1 - is_terminals[t]) * state_values[t + 1] - state_values[t] - - last_adv = self.gae_lambda * self.gamma * (1 - is_terminals[t]) * last_adv + td_error + returns[t] = ( + rewards[t] + self.gamma * (1 - is_terminals[t]) * returns[t + 1] + ) + td_error = ( + rewards[t] + + self.gamma * (1 - is_terminals[t]) * state_values[t + 1] + - state_values[t] + ) + + last_adv = ( + self.gae_lambda * self.gamma * (1 - is_terminals[t]) * last_adv + + td_error + ) advantages[t] = last_adv return returns, advantages @@ -382,25 +434,21 @@ def _compute_returns_avantages(self, rewards, is_terminals, state_values): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) - eps_clip = trial.suggest_categorical('eps_clip', - [0.1, 0.2, 0.3]) + eps_clip = trial.suggest_categorical("eps_clip", [0.1, 0.2, 0.3]) - k_epochs = trial.suggest_categorical('k_epochs', - [1, 5, 10, 20]) + k_epochs = trial.suggest_categorical("k_epochs", [1, 5, 10, 20]) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, - 'eps_clip': eps_clip, - 'k_epochs': k_epochs, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, + "eps_clip": eps_clip, + "k_epochs": k_epochs, } diff --git a/rlberry/agents/torch/reinforce/reinforce.py b/rlberry/agents/torch/reinforce/reinforce.py index 80255877c..460d6720b 100644 --- a/rlberry/agents/torch/reinforce/reinforce.py +++ b/rlberry/agents/torch/reinforce/reinforce.py @@ -54,19 +54,22 @@ class REINFORCEAgent(AgentWithSimplePolicy): name = "REINFORCE" - def __init__(self, env, - batch_size=8, - horizon=256, - gamma=0.99, - entr_coef=0.01, - learning_rate=0.0001, - normalize=True, - optimizer_type='ADAM', - policy_net_fn=None, - policy_net_kwargs=None, - use_bonus_if_available=False, - device="cuda:best", - **kwargs): + def __init__( + self, + env, + batch_size=8, + horizon=256, + gamma=0.99, + entr_coef=0.01, + learning_rate=0.0001, + normalize=True, + optimizer_type="ADAM", + policy_net_fn=None, + policy_net_kwargs=None, + use_bonus_if_available=False, + device="cuda:best", + **kwargs + ): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.batch_size = batch_size @@ -86,8 +89,7 @@ def __init__(self, env, # self.policy_net_fn = policy_net_fn or default_policy_net_fn - self.optimizer_kwargs = {'optimizer_type': optimizer_type, - 'lr': learning_rate} + self.optimizer_kwargs = {"optimizer_type": optimizer_type, "lr": learning_rate} # check environment assert isinstance(self.env.observation_space, spaces.Box) @@ -99,14 +101,13 @@ def __init__(self, env, self.reset() def reset(self, **kwargs): - self.policy_net = self.policy_net_fn( - self.env, - **self.policy_net_kwargs - ).to(self.device) + self.policy_net = self.policy_net_fn(self.env, **self.policy_net_kwargs).to( + self.device + ) self.policy_optimizer = optimizer_factory( - 
self.policy_net.parameters(), - **self.optimizer_kwargs) + self.policy_net.parameters(), **self.optimizer_kwargs + ) self.memory = Memory() @@ -140,8 +141,8 @@ def _run_episode(self): # check whether to use bonus bonus = 0.0 if self.use_bonus_if_available: - if info is not None and 'exploration_bonus' in info: - bonus = info['exploration_bonus'] + if info is not None and "exploration_bonus" in info: + bonus = info["exploration_bonus"] # save in batch self.memory.states.append(state) @@ -177,8 +178,9 @@ def _update(self): # monte carlo estimate of rewards rewards = [] discounted_reward = 0 - for reward, is_terminal in zip(reversed(self.memory.rewards), - reversed(self.memory.is_terminals)): + for reward, is_terminal in zip( + reversed(self.memory.rewards), reversed(self.memory.is_terminals) + ): if is_terminal: discounted_reward = 0 discounted_reward = reward + (self.gamma * discounted_reward) @@ -211,17 +213,15 @@ def _update(self): # @classmethod def sample_parameters(cls, trial): - batch_size = trial.suggest_categorical('batch_size', - [1, 4, 8, 16, 32]) - gamma = trial.suggest_categorical('gamma', - [0.9, 0.95, 0.99]) - learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1) + batch_size = trial.suggest_categorical("batch_size", [1, 4, 8, 16, 32]) + gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.99]) + learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1) - entr_coef = trial.suggest_loguniform('entr_coef', 1e-8, 0.1) + entr_coef = trial.suggest_loguniform("entr_coef", 1e-8, 0.1) return { - 'batch_size': batch_size, - 'gamma': gamma, - 'learning_rate': learning_rate, - 'entr_coef': entr_coef, + "batch_size": batch_size, + "gamma": gamma, + "learning_rate": learning_rate, + "entr_coef": entr_coef, } diff --git a/rlberry/agents/torch/tests/test_actor_critic_algos.py b/rlberry/agents/torch/tests/test_actor_critic_algos.py index 989cf63dd..c76b83728 100644 --- a/rlberry/agents/torch/tests/test_actor_critic_algos.py +++ b/rlberry/agents/torch/tests/test_actor_critic_algos.py @@ -11,21 +11,20 @@ def test_a2c_agent(): horizon = 30 def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, - action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return counter - agent = A2CAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - k_epochs=4, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent = A2CAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + k_epochs=4, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=n_episodes) agent.policy(env.observation_space.sample()) @@ -35,12 +34,14 @@ def test_a2c_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = A2CAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - k_epochs=4, - use_bonus=False) + agent = A2CAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + k_epochs=4, + use_bonus=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) @@ -56,22 +57,21 @@ def test_ppo_agent(): horizon = 30 def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, - action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return 
counter - agent = PPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1 - )) + agent = PPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1 + ), + ) agent.fit(budget=n_episodes) agent.policy(env.observation_space.sample()) @@ -81,13 +81,15 @@ def test_ppo_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = PPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - use_bonus=False) + agent = PPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + use_bonus=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) @@ -104,23 +106,22 @@ def test_avec_ppo_agent(): # def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, - action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return counter - agent = AVECPPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - batch_size=1, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0) - ) + agent = AVECPPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + batch_size=1, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) @@ -130,14 +131,16 @@ def test_avec_ppo_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = AVECPPOAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - eps_clip=0.2, - k_epochs=4, - batch_size=1, - use_bonus=False) + agent = AVECPPOAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + eps_clip=0.2, + k_epochs=4, + batch_size=1, + use_bonus=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py index 293934951..07760e438 100644 --- a/rlberry/agents/torch/tests/test_dqn.py +++ b/rlberry/agents/torch/tests/test_dqn.py @@ -1,6 +1,8 @@ from rlberry.envs import gym_make from rlberry.agents.torch.dqn import DQNAgent -from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter +from rlberry.exploration_tools.online_discretization_counter import ( + OnlineDiscretizationCounter, +) from rlberry.exploration_tools.torch.rnd import RandomNetworkDistillation from rlberry.seeding import Seeder import numpy as np @@ -11,27 +13,28 @@ def test_dqn_agent(): def uncertainty_estimator_fn(observation_space, action_space): counter = OnlineDiscretizationCounter( - observation_space, - action_space, - min_dist=0.25) + observation_space, action_space, min_dist=0.25 + ) return counter - agent = DQNAgent(env, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent = DQNAgent( + env, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + 
uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) # test seeding of exploration policy - agent2 = DQNAgent(env, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent2 = DQNAgent( + env, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.reseed(Seeder(123)) agent2.reseed(Seeder(123)) @@ -47,11 +50,12 @@ def uncertainty_estimator_fn(observation_space, action_space): counter = RandomNetworkDistillation(observation_space, action_space) return counter - agent = DQNAgent(env, - use_bonus=True, - uncertainty_estimator_kwargs=dict( - uncertainty_estimator_fn=uncertainty_estimator_fn, - bonus_scale_factor=1.0 - )) + agent = DQNAgent( + env, + use_bonus=True, + uncertainty_estimator_kwargs=dict( + uncertainty_estimator_fn=uncertainty_estimator_fn, bonus_scale_factor=1.0 + ), + ) agent.fit(budget=5) agent.policy(env.observation_space.sample()) diff --git a/rlberry/agents/torch/tests/test_reinforce.py b/rlberry/agents/torch/tests/test_reinforce.py index 56640ad6a..5df650288 100644 --- a/rlberry/agents/torch/tests/test_reinforce.py +++ b/rlberry/agents/torch/tests/test_reinforce.py @@ -1,8 +1,7 @@ from rlberry.agents.torch import REINFORCEAgent from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.wrappers.uncertainty_estimator_wrapper import \ - UncertaintyEstimatorWrapper +from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper def test_reinforce_agent(): @@ -12,19 +11,20 @@ def test_reinforce_agent(): # def uncertainty_estimator_fn(observation_space, action_space): - counter = DiscreteCounter(observation_space, action_space, - n_bins_obs=20) + counter = DiscreteCounter(observation_space, action_space, n_bins_obs=20) return counter - env = UncertaintyEstimatorWrapper(_env, - uncertainty_estimator_fn, - bonus_scale_factor=1.0) + env = UncertaintyEstimatorWrapper( + _env, uncertainty_estimator_fn, bonus_scale_factor=1.0 + ) # - agent = REINFORCEAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=True) + agent = REINFORCEAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + use_bonus_if_available=True, + ) agent.fit(budget=n_episodes) agent.policy(env.observation_space.sample()) @@ -34,11 +34,13 @@ def test_reinforce_agent_partial_fit(): n_episodes = 10 horizon = 30 - agent = REINFORCEAgent(env, - horizon=horizon, - gamma=0.99, - learning_rate=0.001, - use_bonus_if_available=False) + agent = REINFORCEAgent( + env, + horizon=horizon, + gamma=0.99, + learning_rate=0.001, + use_bonus_if_available=False, + ) agent.fit(budget=n_episodes // 2) agent.policy(env.observation_space.sample()) assert agent.episode == 5 diff --git a/rlberry/agents/torch/tests/test_torch_models.py b/rlberry/agents/torch/tests/test_torch_models.py index 5a0330d12..8ba3f0639 100644 --- a/rlberry/agents/torch/tests/test_torch_models.py +++ b/rlberry/agents/torch/tests/test_torch_models.py @@ -10,42 +10,34 @@ def test_mlp(): - model = MultiLayerPerceptron(in_size=5, - layer_sizes=[10, 10, 10], - out_size=10, - reshape=False) + model = MultiLayerPerceptron( + in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False + ) x = torch.rand(1, 5) y = 
model.forward(x) assert y.shape[1] == 10 def test_mlp_policy(): - model = MultiLayerPerceptron(in_size=5, - layer_sizes=[10, 10, 10], - out_size=10, - reshape=False, - is_policy=True) + model = MultiLayerPerceptron( + in_size=5, layer_sizes=[10, 10, 10], out_size=10, reshape=False, is_policy=True + ) x = torch.rand(1, 5) scores = model.action_scores(x) assert scores.shape[1] == 10 def test_cnn(): - model = ConvolutionalNetwork(in_channels=10, - in_height=20, - in_width=30, - out_size=15) + model = ConvolutionalNetwork(in_channels=10, in_height=20, in_width=30, out_size=15) x = torch.rand(1, 10, 20, 30) y = model.forward(x) assert y.shape[1] == 15 def test_cnn_policy(): - model = ConvolutionalNetwork(in_channels=10, - in_height=20, - in_width=30, - out_size=15, - is_policy=True) + model = ConvolutionalNetwork( + in_channels=10, in_height=20, in_width=30, out_size=15, is_policy=True + ) x = torch.rand(1, 10, 20, 30) scores = model.action_scores(x) assert scores.shape[1] == 15 @@ -58,4 +50,3 @@ def test_ego_attention(): def test_self_attention(): _ = SelfAttention() - diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 4fe78526e..33233de04 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,12 +6,31 @@ # loss_function_factory assert isinstance(loss_function_factory("l2").__name__, type(F.mse_loss.__name__)) assert isinstance(loss_function_factory("l1").__name__, type(F.l1_loss.__name__)) -assert isinstance(loss_function_factory("smooth_l1").__name__, type(F.smooth_l1_loss.__name__)) -assert isinstance(loss_function_factory("bce").__name__, type(F.binary_cross_entropy.__name__)) +assert isinstance( + loss_function_factory("smooth_l1").__name__, type(F.smooth_l1_loss.__name__) +) +assert isinstance( + loss_function_factory("bce").__name__, type(F.binary_cross_entropy.__name__) +) # optimizer_factory env = get_benchmark_env(level=1) -assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] == 0.001 -assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["betas"] == (0.9, 0.999) -assert optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults["lr"] == 0.01 -assert optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults["alpha"] == 0.99 +assert ( + optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] + == 0.001 +) +assert optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults[ + "betas" +] == (0.9, 0.999) +assert ( + optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ + "lr" + ] + == 0.01 +) +assert ( + optimizer_factory(default_policy_net_fn(env).parameters(), "RMS_PROP").defaults[ + "alpha" + ] + == 0.99 +) diff --git a/rlberry/agents/torch/utils/attention_models.py b/rlberry/agents/torch/utils/attention_models.py index 8145441e5..47d984487 100644 --- a/rlberry/agents/torch/utils/attention_models.py +++ b/rlberry/agents/torch/utils/attention_models.py @@ -10,138 +10,122 @@ class EgoAttention(BaseModule): - def __init__(self, - feature_size=64, - heads=4, - dropout_factor=0): + def __init__(self, feature_size=64, heads=4, dropout_factor=0): super().__init__() self.feature_size = feature_size self.heads = heads self.dropout_factor = dropout_factor self.features_per_head = int(self.feature_size / self.heads) - self.value_all = nn.Linear(self.feature_size, - self.feature_size, 
- bias=False) - self.key_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.query_ego = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.attention_combine = nn.Linear(self.feature_size, - self.feature_size, - bias=False) + self.value_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.key_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.query_ego = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.attention_combine = nn.Linear( + self.feature_size, self.feature_size, bias=False + ) @classmethod def default_config(cls): - return { - } + return {} def forward(self, ego, others, mask=None): batch_size = others.shape[0] n_entities = others.shape[1] + 1 - input_all = torch.cat((ego.view(batch_size, 1, - self.feature_size), others), dim=1) + input_all = torch.cat( + (ego.view(batch_size, 1, self.feature_size), others), dim=1 + ) # Dimensions: Batch, entity, head, feature_per_head - key_all = self.key_all(input_all).view(batch_size, - n_entities, - self.heads, - self.features_per_head) - value_all = self.value_all(input_all).view(batch_size, - n_entities, - self.heads, - self.features_per_head) - query_ego = self.query_ego(ego).view(batch_size, 1, - self.heads, - self.features_per_head) + key_all = self.key_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + value_all = self.value_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + query_ego = self.query_ego(ego).view( + batch_size, 1, self.heads, self.features_per_head + ) # Dimensions: Batch, head, entity, feature_per_head key_all = key_all.permute(0, 2, 1, 3) value_all = value_all.permute(0, 2, 1, 3) query_ego = query_ego.permute(0, 2, 1, 3) if mask is not None: - mask = mask.view((batch_size, 1, 1, - n_entities)).repeat((1, self.heads, 1, 1)) - value, attention_matrix = attention(query_ego, - key_all, - value_all, - mask, - nn.Dropout(self.dropout_factor)) - result = (self.attention_combine( - value.reshape((batch_size, - self.feature_size))) + ego.squeeze(1)) / 2 + mask = mask.view((batch_size, 1, 1, n_entities)).repeat( + (1, self.heads, 1, 1) + ) + value, attention_matrix = attention( + query_ego, key_all, value_all, mask, nn.Dropout(self.dropout_factor) + ) + result = ( + self.attention_combine(value.reshape((batch_size, self.feature_size))) + + ego.squeeze(1) + ) / 2 return result, attention_matrix class SelfAttention(BaseModule): - def __init__(self, - feature_size=64, - heads=4, - dropout_factor=0, - **kwargs): + def __init__(self, feature_size=64, heads=4, dropout_factor=0, **kwargs): super().__init__(**kwargs) self.feature_size = feature_size self.heads = heads self.dropout_factor = dropout_factor self.features_per_head = int(self.feature_size / self.heads) - self.value_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.key_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.query_all = nn.Linear(self.feature_size, - self.feature_size, - bias=False) - self.attention_combine = nn.Linear(self.feature_size, - self.feature_size, - bias=False) + self.value_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.key_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.query_all = nn.Linear(self.feature_size, self.feature_size, bias=False) + self.attention_combine = nn.Linear( + self.feature_size, self.feature_size, bias=False + ) def forward(self, ego, others, mask=None): 
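+        # concatenate the ego entity with the others, then apply multi-head self-attention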
batch_size = others.shape[0] n_entities = others.shape[1] + 1 - input_all = torch.cat((ego.view(batch_size, 1, - self.feature_size), - others), dim=1) + input_all = torch.cat( + (ego.view(batch_size, 1, self.feature_size), others), dim=1 + ) # Dimensions: Batch, entity, head, feature_per_head - key_all = self.key_all(input_all).view(batch_size, n_entities, - self.heads, - self.features_per_head) - value_all = self.value_all(input_all).view(batch_size, n_entities, - self.heads, - self.features_per_head) - query_all = self.query_all(input_all).view(batch_size, - n_entities, - self.heads, - self.features_per_head) + key_all = self.key_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + value_all = self.value_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) + query_all = self.query_all(input_all).view( + batch_size, n_entities, self.heads, self.features_per_head + ) # Dimensions: Batch, head, entity, feature_per_head key_all = key_all.permute(0, 2, 1, 3) value_all = value_all.permute(0, 2, 1, 3) query_all = query_all.permute(0, 2, 1, 3) if mask is not None: - mask = mask.view((batch_size, 1, 1, - n_entities)).repeat((1, self.heads, 1, 1)) - value, attention_matrix = attention(query_all, key_all, value_all, - mask, - nn.Dropout(self.dropout_factor)) - result = (self.attention_combine( - value.reshape((batch_size, n_entities, self.feature_size))) - + input_all) / 2 + mask = mask.view((batch_size, 1, 1, n_entities)).repeat( + (1, self.heads, 1, 1) + ) + value, attention_matrix = attention( + query_all, key_all, value_all, mask, nn.Dropout(self.dropout_factor) + ) + result = ( + self.attention_combine( + value.reshape((batch_size, n_entities, self.feature_size)) + ) + + input_all + ) / 2 return result, attention_matrix class EgoAttentionNetwork(BaseModule): - def __init__(self, - in_size=None, - out_size=None, - presence_feature_idx=0, - embedding_layer_kwargs=None, - attention_layer_kwargs=None, - output_layer_kwargs=None, - **kwargs): + def __init__( + self, + in_size=None, + out_size=None, + presence_feature_idx=0, + embedding_layer_kwargs=None, + attention_layer_kwargs=None, + output_layer_kwargs=None, + **kwargs + ): super().__init__(**kwargs) self.out_size = out_size self.presence_feature_idx = presence_feature_idx @@ -171,7 +155,7 @@ def split_input(self, x, mask=None): others = x[:, 1:, :] if mask is None: aux = self.presence_feature_idx - mask = x[:, :, aux:aux + 1] < 0.5 + mask = x[:, :, aux : aux + 1] < 0.5 return ego, others, mask def forward_attention(self, x): diff --git a/rlberry/agents/torch/utils/models.py b/rlberry/agents/torch/utils/models.py index f93dc4de6..729969c81 100644 --- a/rlberry/agents/torch/utils/models.py +++ b/rlberry/agents/torch/utils/models.py @@ -23,35 +23,50 @@ def default_policy_net_fn(env): elif isinstance(env.observation_space, spaces.Tuple): obs_shape = env.observation_space.spaces[0].shape else: - raise ValueError("Incompatible observation space: {}".format(env.observation_space)) + raise ValueError( + "Incompatible observation space: {}".format(env.observation_space) + ) if len(obs_shape) == 3: if obs_shape[0] < obs_shape[1] and obs_shape[0] < obs_shape[1]: # Assume CHW observation space - model_config = {"type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2])} + model_config = { + "type": "ConvolutionalNetwork", + "is_policy": True, + "in_channels": int(obs_shape[0]), + "in_height": 
int(obs_shape[1]), + "in_width": int(obs_shape[2]), + } elif obs_shape[2] < obs_shape[0] and obs_shape[2] < obs_shape[1]: # Assume WHC observation space - model_config = {"type": "ConvolutionalNetwork", - "is_policy": True, - "transpose_obs": True, - "in_channels": int(obs_shape[2]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[0])} + model_config = { + "type": "ConvolutionalNetwork", + "is_policy": True, + "transpose_obs": True, + "in_channels": int(obs_shape[2]), + "in_height": int(obs_shape[1]), + "in_width": int(obs_shape[0]), + } elif len(obs_shape) == 2: - model_config = {"type": "ConvolutionalNetwork", - "is_policy": True, - "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1])} + model_config = { + "type": "ConvolutionalNetwork", + "is_policy": True, + "in_channels": int(1), + "in_height": int(obs_shape[0]), + "in_width": int(obs_shape[1]), + } elif len(obs_shape) == 1: - model_config = {"type": "MultiLayerPerceptron", "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64], "reshape": False, "is_policy": True} + model_config = { + "type": "MultiLayerPerceptron", + "in_size": int(obs_shape[0]), + "layer_sizes": [64, 64], + "reshape": False, + "is_policy": True, + } else: - raise ValueError("Incompatible observation shape: {}".format(env.observation_space.shape)) + raise ValueError( + "Incompatible observation shape: {}".format(env.observation_space.shape) + ) if isinstance(env.action_space, spaces.Discrete): model_config["out_size"] = env.action_space.n @@ -70,21 +85,34 @@ def default_value_net_fn(env): elif isinstance(env.observation_space, spaces.Tuple): obs_shape = env.observation_space.spaces[0].shape else: - raise ValueError("Incompatible observation space: {}".format(env.observation_space)) + raise ValueError( + "Incompatible observation space: {}".format(env.observation_space) + ) # Assume CHW observation space if len(obs_shape) == 3: - model_config = {"type": "ConvolutionalNetwork", "in_channels": int(obs_shape[0]), - "in_height": int(obs_shape[1]), - "in_width": int(obs_shape[2])} + model_config = { + "type": "ConvolutionalNetwork", + "in_channels": int(obs_shape[0]), + "in_height": int(obs_shape[1]), + "in_width": int(obs_shape[2]), + } elif len(obs_shape) == 2: - model_config = {"type": "ConvolutionalNetwork", "in_channels": int(1), - "in_height": int(obs_shape[0]), - "in_width": int(obs_shape[1])} + model_config = { + "type": "ConvolutionalNetwork", + "in_channels": int(1), + "in_height": int(obs_shape[0]), + "in_width": int(obs_shape[1]), + } elif len(obs_shape) == 1: - model_config = {"type": "MultiLayerPerceptron", "in_size": int(obs_shape[0]), - "layer_sizes": [64, 64]} + model_config = { + "type": "MultiLayerPerceptron", + "in_size": int(obs_shape[0]), + "layer_sizes": [64, 64], + } else: - raise ValueError("Incompatible observation shape: {}".format(env.observation_space.shape)) + raise ValueError( + "Incompatible observation shape: {}".format(env.observation_space.shape) + ) model_config["out_size"] = 1 @@ -97,7 +125,7 @@ def __init__(self, obs_size, hidden_size, n_actions): self.net = nn.Sequential( nn.Linear(obs_size, hidden_size), nn.ReLU(), - nn.Linear(hidden_size, n_actions) + nn.Linear(hidden_size, n_actions), ) def forward(self, x): @@ -117,15 +145,15 @@ def __init__(self, activation_type="RELU", reset_type="XAVIER"): self.reset_type = reset_type def _init_weights(self, m): - if hasattr(m, 'weight'): + if hasattr(m, "weight"): if self.reset_type == "XAVIER": torch.nn.init.xavier_uniform_(m.weight.data) 
elif self.reset_type == "ZEROS": - torch.nn.init.constant_(m.weight.data, 0.) + torch.nn.init.constant_(m.weight.data, 0.0) else: raise ValueError("Unknown reset type") - if hasattr(m, 'bias') and m.bias is not None: - torch.nn.init.constant_(m.bias.data, 0.) + if hasattr(m, "bias") and m.bias is not None: + torch.nn.init.constant_(m.bias.data, 0.0) def reset(self): self.apply(self._init_weights) @@ -134,7 +162,9 @@ def reset(self): class Table(torch.nn.Module): def __init__(self, state_size, action_size): super().__init__() - self.policy = nn.Embedding.from_pretrained(torch.zeros(state_size, action_size), freeze=False) + self.policy = nn.Embedding.from_pretrained( + torch.zeros(state_size, action_size), freeze=False + ) self.softmax = nn.Softmax(dim=-1) def forward(self, x): @@ -146,14 +176,16 @@ def action_scores(self, x): class MultiLayerPerceptron(BaseModule): - def __init__(self, - in_size=None, - layer_sizes=None, - reshape=True, - out_size=None, - activation="RELU", - is_policy=False, - **kwargs): + def __init__( + self, + in_size=None, + layer_sizes=None, + reshape=True, + out_size=None, + activation="RELU", + is_policy=False, + **kwargs + ): super().__init__(**kwargs) self.reshape = reshape self.layer_sizes = layer_sizes or [64, 64] @@ -162,8 +194,7 @@ def __init__(self, self.is_policy = is_policy self.softmax = nn.Softmax(dim=-1) sizes = [in_size] + self.layer_sizes - layers_list = [nn.Linear(sizes[i], sizes[i + 1]) - for i in range(len(sizes) - 1)] + layers_list = [nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)] self.layers = nn.ModuleList(layers_list) if out_size: self.predict = nn.Linear(sizes[-1], out_size) @@ -193,12 +224,14 @@ def action_scores(self, x): class DuelingNetwork(BaseModule): - def __init__(self, - in_size=None, - base_module_kwargs=None, - value_kwargs=None, - advantage_kwargs=None, - out_size=None): + def __init__( + self, + in_size=None, + base_module_kwargs=None, + value_kwargs=None, + advantage_kwargs=None, + out_size=None, + ): super().__init__() self.out_size = out_size base_module_kwargs = base_module_kwargs or {} @@ -217,21 +250,24 @@ def forward(self, x): x = self.base_module(x) value = self.value(x).expand(-1, self.out_size) advantage = self.advantage(x) - return value + advantage \ - - advantage.mean(1).unsqueeze(1).expand(-1, self.out_size) + return ( + value + advantage - advantage.mean(1).unsqueeze(1).expand(-1, self.out_size) + ) class ConvolutionalNetwork(nn.Module): - def __init__(self, - activation="RELU", - in_channels=None, - in_height=None, - in_width=None, - head_mlp_kwargs=None, - out_size=None, - is_policy=False, - transpose_obs=False, - **kwargs): + def __init__( + self, + activation="RELU", + in_channels=None, + in_height=None, + in_width=None, + head_mlp_kwargs=None, + out_size=None, + is_policy=False, + transpose_obs=False, + **kwargs + ): super().__init__() self.activation = activation_factory(activation) self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=2, stride=2) diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index b175c963a..1a5080cfb 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -29,8 +29,13 @@ def optimizer_factory(params, optimizer_type="ADAM", **kwargs): def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: from rlberry.agents.torch.utils.attention_models import EgoAttentionNetwork - from rlberry.agents.torch.utils.models import MultiLayerPerceptron, DuelingNetwork, ConvolutionalNetwork, \ 
- Table + from rlberry.agents.torch.utils.models import ( + MultiLayerPerceptron, + DuelingNetwork, + ConvolutionalNetwork, + Table, + ) + if type == "MultiLayerPerceptron": return MultiLayerPerceptron(**kwargs) elif type == "DuelingNetwork": @@ -50,8 +55,7 @@ def model_factory_from_env(env, **kwargs): return model_factory(**kwargs) -def size_model_config(env, - **model_config): +def size_model_config(env, **model_config): """ Update the configuration of a model depending on the environment observation/action spaces. diff --git a/rlberry/agents/ucbvi/ucbvi.py b/rlberry/agents/ucbvi/ucbvi.py index 99476b5f4..424fd7e0c 100644 --- a/rlberry/agents/ucbvi/ucbvi.py +++ b/rlberry/agents/ucbvi/ucbvi.py @@ -3,7 +3,10 @@ import gym.spaces as spaces from rlberry.agents import AgentWithSimplePolicy -from rlberry.agents.ucbvi.utils import update_value_and_get_action, update_value_and_get_action_sd +from rlberry.agents.ucbvi.utils import ( + update_value_and_get_action, + update_value_and_get_action_sd, +) from rlberry.exploration_tools.discrete_counter import DiscreteCounter from rlberry.agents.dynprog.utils import backward_induction_sd from rlberry.agents.dynprog.utils import backward_induction_in_place @@ -56,18 +59,21 @@ class UCBVIAgent(AgentWithSimplePolicy): Advances in Neural Information Processing Systems. 2019. https://papers.nips.cc/paper/2019/file/25caef3a545a1fff2ff4055484f0e758-Paper.pdf """ + name = "UCBVI" - def __init__(self, - env, - gamma=1.0, - horizon=100, - bonus_scale_factor=1.0, - bonus_type="simplified_bernstein", - reward_free=False, - stage_dependent=False, - real_time_dp=False, - **kwargs): + def __init__( + self, + env, + gamma=1.0, + horizon=100, + bonus_scale_factor=1.0, + bonus_type="simplified_bernstein", + reward_free=False, + stage_dependent=False, + real_time_dp=False, + **kwargs + ): # init base class AgentWithSimplePolicy.__init__(self, env, **kwargs) @@ -86,15 +92,16 @@ def __init__(self, # other checks assert gamma >= 0 and gamma <= 1.0 if self.horizon is None: - assert gamma < 1.0, \ - "If no horizon is given, gamma must be smaller than 1." + assert gamma < 1.0, "If no horizon is given, gamma must be smaller than 1." self.horizon = int(np.ceil(1.0 / (1.0 - gamma))) # maximum value r_range = self.env.reward_range[1] - self.env.reward_range[0] if r_range == np.inf or r_range == 0.0: - logger.warning("{}: Reward range is zero or infinity. ".format(self.name) - + "Setting it to 1.") + logger.warning( + "{}: Reward range is zero or infinity. ".format(self.name) + + "Setting it to 1." + ) r_range = 1.0 self.v_max = np.zeros(self.horizon) @@ -146,12 +153,13 @@ def reset(self, **kwargs): self.episode = 0 # useful object to compute total number of visited states & entropy of visited states - self.counter = DiscreteCounter(self.env.observation_space, - self.env.action_space) + self.counter = DiscreteCounter( + self.env.observation_space, self.env.action_space + ) # update name if self.real_time_dp: - self.name = 'UCBVI-RTDP' + self.name = "UCBVI-RTDP" def policy(self, observation): state = observation @@ -159,7 +167,7 @@ def policy(self, observation): return self.Q_policy[0, state, :].argmax() def _get_action(self, state, hh=0): - """ Sampling policy. 
""" + """Sampling policy.""" if not self.real_time_dp: assert self.Q is not None return self.Q[hh, state, :].argmax() @@ -176,7 +184,8 @@ def _get_action(self, state, hh=0): self.P_hat, self.B_sa, self.gamma, - self.v_max) + self.v_max, + ) def _compute_bonus(self, n, hh): # reward-free @@ -191,7 +200,8 @@ def _compute_bonus(self, n, hh): return bonus else: raise ValueError( - "Error: bonus type {} not implemented".format(self.bonus_type)) + "Error: bonus type {} not implemented".format(self.bonus_type) + ) def _update(self, state, action, next_state, reward, hh): if self.stage_dependent: @@ -201,7 +211,9 @@ def _update(self, state, action, next_state, reward, hh): prev_r = self.R_hat[hh, state, action] prev_p = self.P_hat[hh, state, action, :] - self.R_hat[hh, state, action] = (1.0 - 1.0 / nn) * prev_r + reward * 1.0 / nn + self.R_hat[hh, state, action] = ( + 1.0 - 1.0 / nn + ) * prev_r + reward * 1.0 / nn self.P_hat[hh, state, action, :] = (1.0 - 1.0 / nn) * prev_p self.P_hat[hh, state, action, next_state] += 1.0 / nn @@ -251,7 +263,8 @@ def _run_episode(self): self.R_hat + self.B_sa, self.P_hat, self.gamma, - self.v_max[0]) + self.v_max[0], + ) else: backward_induction_in_place( self.Q, @@ -260,7 +273,8 @@ def _run_episode(self): self.P_hat, self.horizon, self.gamma, - self.v_max[0]) + self.v_max[0], + ) # update info self.episode += 1 @@ -268,7 +282,9 @@ def _run_episode(self): # writer if self.writer is not None: self.writer.add_scalar("episode_rewards", episode_rewards, self.episode) - self.writer.add_scalar("n_visited_states", self.counter.get_n_visited_states(), self.episode) + self.writer.add_scalar( + "n_visited_states", self.counter.get_n_visited_states(), self.episode + ) # return sum of rewards collected in the episode return episode_rewards @@ -289,7 +305,8 @@ def fit(self, budget: int, **kwargs): self.R_hat, self.P_hat, self.gamma, - self.v_max[0]) + self.v_max[0], + ) else: backward_induction_in_place( self.Q_policy, @@ -298,4 +315,5 @@ def fit(self, budget: int, **kwargs): self.P_hat, self.horizon, self.gamma, - self.v_max[0]) + self.v_max[0], + ) diff --git a/rlberry/agents/ucbvi/utils.py b/rlberry/agents/ucbvi/utils.py index dd530e809..255affeed 100644 --- a/rlberry/agents/ucbvi/utils.py +++ b/rlberry/agents/ucbvi/utils.py @@ -2,14 +2,7 @@ @numba_jit -def update_value_and_get_action(state, - hh, - V, - R_hat, - P_hat, - B_sa, - gamma, - v_max): +def update_value_and_get_action(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): """ state : int hh : int @@ -50,14 +43,7 @@ def update_value_and_get_action(state, @numba_jit -def update_value_and_get_action_sd(state, - hh, - V, - R_hat, - P_hat, - B_sa, - gamma, - v_max): +def update_value_and_get_action_sd(state, hh, V, R_hat, P_hat, B_sa, gamma, v_max): """ state : int hh : int diff --git a/rlberry/agents/utils/memories.py b/rlberry/agents/utils/memories.py index 2787190ec..6ec890402 100644 --- a/rlberry/agents/utils/memories.py +++ b/rlberry/agents/utils/memories.py @@ -2,8 +2,9 @@ import operator from collections import namedtuple -Transition = namedtuple('Transition', - ('state', 'action', 'reward', 'next_state', 'terminal', 'info')) +Transition = namedtuple( + "Transition", ("state", "action", "reward", "next_state", "terminal", "info") +) class ReplayMemory(object): @@ -11,9 +12,7 @@ class ReplayMemory(object): Container that stores and samples transitions. 
""" - def __init__(self, - capacity=10000, - **kwargs): + def __init__(self, capacity=10000, **kwargs): self.capacity = int(capacity) self.memory = [] self.position = 0 @@ -59,22 +58,20 @@ def _encode_sample(self, idxes): rewards.append(reward) next_states.append(np.array(next_state, copy=False)) dones.append(done) - return Transition(np.array(states), - np.array(actions), - np.array(rewards), - np.array(next_states), - np.array(dones), - {}) + return Transition( + np.array(states), + np.array(actions), + np.array(rewards), + np.array(next_states), + np.array(dones), + {}, + ) class PrioritizedReplayMemory(TransitionReplayMemory): """Code from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py""" - def __init__(self, - capacity=10000, - alpha=0.5, - beta=0.5, - **kwargs): + def __init__(self, capacity=10000, alpha=0.5, beta=0.5, **kwargs): """Create Prioritized Replay buffer. Parameters ---------- @@ -251,7 +248,9 @@ def __init__(self, capacity, operation, neutral_element): neutral element for the operation above. eg. float('-inf') for max and 0 for sum. """ - assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." + assert ( + capacity > 0 and capacity & (capacity - 1) == 0 + ), "capacity must be positive and a power of 2." self._capacity = capacity self._value = [neutral_element for _ in range(2 * capacity)] self._operation = operation @@ -268,7 +267,7 @@ def _reduce_helper(self, start, end, node, node_start, node_end): else: return self._operation( self._reduce_helper(start, mid, 2 * node, node_start, mid), - self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) + self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end), ) def reduce(self, start=0, end=None): @@ -303,8 +302,7 @@ def __setitem__(self, idx, val): idx //= 2 while idx >= 1: self._value[idx] = self._operation( - self._value[2 * idx], - self._value[2 * idx + 1] + self._value[2 * idx], self._value[2 * idx + 1] ) idx //= 2 @@ -316,9 +314,7 @@ def __getitem__(self, idx): class SumSegmentTree(SegmentTree): def __init__(self, capacity): super(SumSegmentTree, self).__init__( - capacity=capacity, - operation=operator.add, - neutral_element=0.0 + capacity=capacity, operation=operator.add, neutral_element=0.0 ) def sum(self, start=0, end=None): @@ -357,9 +353,7 @@ def find_prefixsum_idx(self, prefixsum): class MinSegmentTree(SegmentTree): def __init__(self, capacity): super(MinSegmentTree, self).__init__( - capacity=capacity, - operation=min, - neutral_element=float('inf') + capacity=capacity, operation=min, neutral_element=float("inf") ) def min(self, start=0, end=None): diff --git a/rlberry/colab_utils/display_setup.py b/rlberry/colab_utils/display_setup.py index 583a8fbbd..302e589eb 100644 --- a/rlberry/colab_utils/display_setup.py +++ b/rlberry/colab_utils/display_setup.py @@ -5,26 +5,31 @@ import base64 from pyvirtualdisplay import Display from IPython import display as ipythondisplay + # from IPython.display import clear_output from pathlib import Path -def show_video(filename=None, directory='./videos'): +def show_video(filename=None, directory="./videos"): """ Either show all videos in a directory (if filename is None) or show video corresponding to filename. 
""" html = [] if filename is not None: - files = Path('./').glob(filename) + files = Path("./").glob(filename) else: files = Path(directory).glob("*.mp4") for mp4 in files: video_b64 = base64.b64encode(mp4.read_bytes()) - html.append(''''''.format(mp4, video_b64.decode('ascii'))) + """.format( + mp4, video_b64.decode("ascii") + ) + ) ipythondisplay.display(ipythondisplay.HTML(data="
".join(html))) diff --git a/rlberry/envs/basewrapper.py b/rlberry/envs/basewrapper.py index f0bb3c6e4..597f36208 100644 --- a/rlberry/envs/basewrapper.py +++ b/rlberry/envs/basewrapper.py @@ -63,7 +63,7 @@ def __getattr__(self, attr): The first condition is to avoid infinite recursion when deep copying. See https://stackoverflow.com/a/47300262 """ - if attr[:2] == '__': + if attr[:2] == "__": raise AttributeError(attr) if attr in self.__dict__: return getattr(self, attr) @@ -94,7 +94,7 @@ def step(self, action): def sample(self, state, action): return self.env.sample(state, action) - def render(self, mode='human', **kwargs): + def render(self, mode="human", **kwargs): return self.env.render(mode=mode, **kwargs) def close(self): @@ -116,8 +116,9 @@ def is_online(self): def is_generative(self): try: - self.env.sample(self.env.observation_space.sample(), - self.env.action_space.sample()) + self.env.sample( + self.env.observation_space.sample(), self.env.action_space.sample() + ) return True except Exception: return False @@ -126,4 +127,4 @@ def __repr__(self): return str(self) def __str__(self): - return '<{}{}>'.format(type(self).__name__, self.env) + return "<{}{}>".format(type(self).__name__, self.env) diff --git a/rlberry/envs/benchmarks/ball_exploration/ball2d.py b/rlberry/envs/benchmarks/ball_exploration/ball2d.py index deb2070c2..bb1b43d7e 100644 --- a/rlberry/envs/benchmarks/ball_exploration/ball2d.py +++ b/rlberry/envs/benchmarks/ball_exploration/ball2d.py @@ -55,11 +55,13 @@ def __init__(self): self.horizon = 30 # self.p = 2 - self.action_list = [np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0])] + self.action_list = [ + np.array([0.0, 0.0]), + 0.05 * np.array([1.0, 0.0]), + -0.05 * np.array([1.0, 0.0]), + 0.05 * np.array([0.0, 1.0]), + -0.05 * np.array([0.0, 1.0]), + ] self.reward_amplitudes = [] self.reward_smoothness = [] @@ -70,17 +72,19 @@ def __init__(self): self.sigma_init = 0.001 self.mu_init = np.array([0.0, 0.0]) - PBall2D.__init__(self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init) + PBall2D.__init__( + self, + self.p, + self.action_list, + self.reward_amplitudes, + self.reward_smoothness, + self.reward_centers, + self.A, + self.B, + self.sigma, + self.sigma_init, + self.mu_init, + ) self.name = "Ball Exploration Benchmark - Level 0 (Reward-Free)" @@ -88,6 +92,7 @@ def __init__(self): # Level 1 # + class BallLevel1(PBall2D): """ Dense rewards @@ -97,11 +102,13 @@ def __init__(self): self.horizon = 30 # self.p = 2 - self.action_list = [np.array([0.0, 0.0]), - 0.05 * np.array([1.0, 0.0]), - -0.05 * np.array([1.0, 0.0]), - 0.05 * np.array([0.0, 1.0]), - -0.05 * np.array([0.0, 1.0])] + self.action_list = [ + np.array([0.0, 0.0]), + 0.05 * np.array([1.0, 0.0]), + -0.05 * np.array([1.0, 0.0]), + 0.05 * np.array([0.0, 1.0]), + -0.05 * np.array([0.0, 1.0]), + ] self.reward_amplitudes = np.array([1.0]) self.reward_smoothness = np.array([0.5 * np.sqrt(2)]) @@ -112,17 +119,19 @@ def __init__(self): self.sigma_init = 0.001 self.mu_init = np.array([0.0, 0.0]) - PBall2D.__init__(self, - self.p, - self.action_list, - self.reward_amplitudes, - self.reward_smoothness, - self.reward_centers, - self.A, - self.B, - self.sigma, - self.sigma_init, - self.mu_init) + PBall2D.__init__( + self, + self.p, + self.action_list, + self.reward_amplitudes, + self.reward_smoothness, + 
self.reward_centers, + self.A, + self.B, + self.sigma, + self.sigma_init, + self.mu_init, + ) self.name = "Ball Exploration Benchmark - Level 1" @@ -130,6 +139,7 @@ def __init__(self): # Level 2 # + class BallLevel2(BallLevel1): """ Sparse rewards @@ -174,8 +184,10 @@ def __init__(self): self.reward_amplitudes = np.array([1.0, 0.1]) self.reward_smoothness = np.array([0.2, 0.5 * np.sqrt(2)]) - self.reward_centers = [np.array([-0.5, -0.5]), # far sparse - np.array([0.5, 0.5])] # dense + self.reward_centers = [ + np.array([-0.5, -0.5]), # far sparse + np.array([0.5, 0.5]), + ] # dense self.name = "Ball Exploration Benchmark - Level 4" @@ -183,6 +195,7 @@ def __init__(self): # Level 5 # + class BallLevel5(BallLevel4): """ Far sparse reward (as lvl 2) + dense suboptimal rewards, noisier @@ -193,6 +206,7 @@ def __init__(self): self.sigma = 0.025 self.name = "Ball Exploration Benchmark - Level 5" + # if __name__ == '__main__': # env = get_benchmark_env(1) # env.enable_rendering() diff --git a/rlberry/envs/benchmarks/ball_exploration/pball.py b/rlberry/envs/benchmarks/ball_exploration/pball.py index ff554a481..a196805cf 100644 --- a/rlberry/envs/benchmarks/ball_exploration/pball.py +++ b/rlberry/envs/benchmarks/ball_exploration/pball.py @@ -81,17 +81,19 @@ class PBall(Model): name = "LP-Ball" - def __init__(self, - p, - action_list, - reward_amplitudes, - reward_smoothness, - reward_centers, - A, - B, - sigma, - sigma_init, - mu_init): + def __init__( + self, + p, + action_list, + reward_amplitudes, + reward_smoothness, + reward_centers, + A, + B, + sigma, + sigma_init, + mu_init, + ): """ Parameters ----------- @@ -121,8 +123,10 @@ def __init__(self, assert p >= 1, "PBall requires p>=1" if p not in [2, np.inf]: - logger.warning("For p!=2 or p!=np.inf, PBall \ -does not make true projections onto the lp ball.") + logger.warning( + "For p!=2 or p!=np.inf, PBall \ +does not make true projections onto the lp ball." 
+ ) self.p = p self.d, self.dp = B.shape # d and d' self.m = len(action_list) @@ -146,11 +150,13 @@ def __init__(self, assert len(self.reward_amplitudes) == len(self.reward_smoothness) assert len(self.reward_amplitudes) == len(self.reward_centers) if len(self.reward_amplitudes) > 0: - assert self.reward_amplitudes.max() <= 1.0 and \ - self.reward_amplitudes.min() >= 0.0, \ - "reward amplitudes b_i must be in [0, 1]" - assert self.reward_smoothness.min() > 0.0, \ - "reward smoothness c_i must be > 0" + assert ( + self.reward_amplitudes.max() <= 1.0 + and self.reward_amplitudes.min() >= 0.0 + ), "reward amplitudes b_i must be in [0, 1]" + assert ( + self.reward_smoothness.min() > 0.0 + ), "reward smoothness c_i must be > 0" self.reward_range = (0, 1.0) # @@ -163,8 +169,9 @@ def reset(self, state=None): if state is not None: self.state = state else: - self.state = self.mu_init \ - + self.sigma_init * self.seeder.rng.normal(size=self.d) + self.state = self.mu_init + self.sigma_init * self.seeder.rng.normal( + size=self.d + ) # projection to unit ball self.state = projection_to_pball(self.state, self.p) return self.state.copy() @@ -175,8 +182,11 @@ def sample(self, state, action): # next state action_vec = self.action_list[action] - next_s = self.A.dot(state) + self.B.dot(action_vec) \ - + self.sigma * self.rng.normal(size=self.d) + next_s = ( + self.A.dot(state) + + self.B.dot(action_vec) + + self.sigma * self.rng.normal(size=self.d) + ) next_s = projection_to_pball(next_s, self.p) # done and reward @@ -220,31 +230,42 @@ def get_transitions_lipschitz_constant(self): return np.linalg.norm(self.A, ord=order) # If p!=1, p!=2 or p!=np.inf, return upper bound on the induced norm. - return np.power(self.d, 1.0 / self.p) * np.linalg.norm(self.A, - ord=np.inf) + return np.power(self.d, 1.0 / self.p) * np.linalg.norm(self.A, ord=np.inf) class PBall2D(RenderInterface2D, PBall): - def __init__(self, - p=2, - action_list=[0.05 * np.array([1, 0]), - -0.05 * np.array([1, 0]), - 0.05 * np.array([0, 1]), - -0.05 * np.array([0, 1])], - reward_amplitudes=np.array([1.0]), - reward_smoothness=np.array([0.25]), - reward_centers=[np.array([0.75, 0.0])], - A=np.eye(2), - B=np.eye(2), - sigma=0.01, - sigma_init=0.001, - mu_init=np.array([0.0, 0.0]) - ): + def __init__( + self, + p=2, + action_list=[ + 0.05 * np.array([1, 0]), + -0.05 * np.array([1, 0]), + 0.05 * np.array([0, 1]), + -0.05 * np.array([0, 1]), + ], + reward_amplitudes=np.array([1.0]), + reward_smoothness=np.array([0.25]), + reward_centers=[np.array([0.75, 0.0])], + A=np.eye(2), + B=np.eye(2), + sigma=0.01, + sigma_init=0.001, + mu_init=np.array([0.0, 0.0]), + ): # Initialize PBall - PBall.__init__(self, p, action_list, reward_amplitudes, - reward_smoothness, - reward_centers, - A, B, sigma, sigma_init, mu_init) + PBall.__init__( + self, + p, + action_list, + reward_amplitudes, + reward_smoothness, + reward_centers, + A, + B, + sigma, + sigma_init, + mu_init, + ) # Render interface RenderInterface2D.__init__(self) @@ -285,8 +306,9 @@ def get_background(self): # reward position for ii, ampl in enumerate(self.reward_amplitudes): - contour = self._get_ball_shape(self.reward_centers[ii], - self.reward_smoothness[ii]) + contour = self._get_ball_shape( + self.reward_centers[ii], self.reward_smoothness[ii] + ) ampl = 1.0 - ampl # dark violet = more reward contour.set_color((0.5, 0.0, 0.5 * (1.0 + ampl))) bg.add_shape(contour) @@ -320,15 +342,16 @@ class SimplePBallND(PBall): PBall environment in d dimensions with simple dynamics. 
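The dynamics above repeatedly project the state back onto the unit lp ball; as
the warning notes, only p=2 and p=np.inf give exact projections, which reduce to
a radial shrink and a coordinate-wise clip respectively. A minimal sketch
(function name illustrative, not the module's own helper):

    import numpy as np

    def project_to_unit_ball(x, p=2):
        if p == 2:
            norm = np.linalg.norm(x)
            return x if norm <= 1.0 else x / norm  # radial shrink onto the sphere
        if p == np.inf:
            return np.clip(x, -1.0, 1.0)           # clip each coordinate
        raise NotImplementedError("only p=2 and p=inf are handled in this sketch")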
""" - def __init__(self, - p=2, - dim=2, - action_amplitude=0.05, - r_smoothness=0.25, - sigma=0.01, - sigma_init=0.001, - mu_init=None - ): + def __init__( + self, + p=2, + dim=2, + action_amplitude=0.05, + r_smoothness=0.25, + sigma=0.01, + sigma_init=0.001, + mu_init=None, + ): # Action list action_list = [] for dd in range(dim): @@ -352,10 +375,20 @@ def __init__(self, mu_init = np.zeros(dim) # Initialize PBall - PBall.__init__(self, p, action_list, reward_amplitudes, - reward_smoothness, - reward_centers, - A, B, sigma, sigma_init, mu_init) + PBall.__init__( + self, + p, + action_list, + reward_amplitudes, + reward_smoothness, + reward_centers, + A, + B, + sigma, + sigma_init, + mu_init, + ) + # if __name__ == '__main__': # env = PBall2D(p=5) diff --git a/rlberry/envs/benchmarks/generalization/twinrooms.py b/rlberry/envs/benchmarks/generalization/twinrooms.py index 693b3f568..6d444b876 100644 --- a/rlberry/envs/benchmarks/generalization/twinrooms.py +++ b/rlberry/envs/benchmarks/generalization/twinrooms.py @@ -33,11 +33,10 @@ class TwinRooms(RenderInterface2D, Model): when array_observation is True. Only the functions env.reset() and env.step() are covered. """ + name = "TwinRooms" - def __init__(self, - noise_room1=0.01, - noise_room2=0.01): + def __init__(self, noise_room1=0.01, noise_room2=0.01): Model.__init__(self) RenderInterface2D.__init__(self) @@ -60,7 +59,7 @@ def __init__(self, # rendering info self.set_clipping_area((0, 2, 0, 1)) self.set_refresh_interval(100) # in milliseconds - self.renderer_type = 'opengl' + self.renderer_type = "opengl" # reset self.reset() @@ -119,8 +118,11 @@ def sample(self, state, action): else: raise ValueError("Invalid action") - next_state = state + displacement \ - + self.room_noises[self.current_room] * self.rng.normal(size=2) + next_state = ( + state + + displacement + + self.room_noises[self.current_room] * self.rng.normal(size=2) + ) # clip to room next_state = self._clip_to_room(next_state) @@ -152,7 +154,10 @@ def get_background(self): bg.add_shape(shape) # rewards - for (x, y) in [self.base_reward_pos, self.base_reward_pos + np.array([1.0, 0.0])]: + for (x, y) in [ + self.base_reward_pos, + self.base_reward_pos + np.array([1.0, 0.0]), + ]: reward = circle_shape((x, y), 0.1, n_points=50) reward.type = "POLYGON" reward.set_color((0.0, 0.5, 0.0)) diff --git a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py b/rlberry/envs/benchmarks/grid_exploration/apple_gold.py index 3517225ab..e444cb71f 100644 --- a/rlberry/envs/benchmarks/grid_exploration/apple_gold.py +++ b/rlberry/envs/benchmarks/grid_exploration/apple_gold.py @@ -34,6 +34,7 @@ class AppleGold(GridWorld): for Hard-Exploration Tasks arXiv preprint arXiv:1907.10247 """ + name = "AppleGold" def __init__(self, reward_free=False, array_observation=False): @@ -70,26 +71,24 @@ def __init__(self, reward_free=False, array_observation=False): if self.reward_free: reward_at = {} else: - reward_at = { - (7, 7): 10.0, - (8, 2): 1.0, - (10, 3): 1.0 - } + reward_at = {(7, 7): 10.0, (8, 2): 1.0, (10, 3): 1.0} for jj in range(7, 16): for ii in range(1, 12): if (ii, jj) not in walls and (ii, jj) != (7, 7): reward_at[(ii, jj)] = -0.05 # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + 
terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=default_reward, + ) # spaces if self.array_observation: diff --git a/rlberry/envs/benchmarks/grid_exploration/four_room.py b/rlberry/envs/benchmarks/grid_exploration/four_room.py index 6f96775d6..b40cf208b 100644 --- a/rlberry/envs/benchmarks/grid_exploration/four_room.py +++ b/rlberry/envs/benchmarks/grid_exploration/four_room.py @@ -30,12 +30,10 @@ class FourRoom(GridWorld): when array_observation is True. Only the functions env.reset() and env.step() are covered. """ + name = "FourRoom" - def __init__(self, - reward_free=False, - difficulty=0, - array_observation=False): + def __init__(self, reward_free=False, difficulty=0, array_observation=False): self.reward_free = reward_free self.difficulty = difficulty self.array_observation = array_observation @@ -77,15 +75,17 @@ def __init__(self, } # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=default_reward, + ) # spaces if self.array_observation: diff --git a/rlberry/envs/benchmarks/grid_exploration/nroom.py b/rlberry/envs/benchmarks/grid_exploration/nroom.py index 04ef975c1..f94079123 100644 --- a/rlberry/envs/benchmarks/grid_exploration/nroom.py +++ b/rlberry/envs/benchmarks/grid_exploration/nroom.py @@ -55,20 +55,23 @@ class NRoom(GridWorld): when array_observation is True. Only the functions env.reset() and env.step() are covered. 
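As with the other grid benchmarks, interaction goes through the usual
reset/step loop. A short usage sketch (constructor arguments as in this file;
the 4-tuple step() return is assumed, matching the gym-style API used in the
rest of the code base):

    from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom

    env = NRoom(nrooms=3, array_observation=False)
    observation = env.reset()
    total_reward = 0.0
    for _ in range(100):
        action = env.action_space.sample()  # random exploration policy
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break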
""" + name = "N-Room" - def __init__(self, - nrooms=7, - reward_free=False, - array_observation=False, - room_size=5, - success_probability=0.95, - remove_walls=False, - initial_state_distribution='center', - include_traps=False): + def __init__( + self, + nrooms=7, + reward_free=False, + array_observation=False, + room_size=5, + success_probability=0.95, + remove_walls=False, + initial_state_distribution="center", + include_traps=False, + ): assert nrooms > 0, "nrooms must be > 0" - assert initial_state_distribution in ('center', 'uniform') + assert initial_state_distribution in ("center", "uniform") self.reward_free = reward_free self.array_observation = array_observation @@ -116,12 +119,13 @@ def __init__(self, # existing rooms if count < self.nrooms: # remove top wall - if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) \ - or ((room_c == 0) and (room_r % 2 == 1)): + if ((room_c == self.room_ncols - 1) and (room_r % 2 == 0)) or ( + (room_c == 0) and (room_r % 2 == 1) + ): if room_r != self.room_nrows - 1: wall_to_remove = self._convert_room_coord_to_global( - room_r, room_c, - self.room_size, self.room_size // 2) + room_r, room_c, self.room_size, self.room_size // 2 + ) if wall_to_remove in walls: walls.remove(wall_to_remove) # rooms to remove @@ -129,30 +133,37 @@ def __init__(self, for ii in range(-1, self.room_size + 1): for jj in range(-1, self.room_size + 1): wall_to_include = self._convert_room_coord_to_global( - room_r, room_c, - ii, jj) - if wall_to_include[0] >= 0 and wall_to_include[0] < nrows \ - and wall_to_include[1] >= 0 and wall_to_include[1] < ncols \ - and (wall_to_include not in walls): + room_r, room_c, ii, jj + ) + if ( + wall_to_include[0] >= 0 + and wall_to_include[0] < nrows + and wall_to_include[1] >= 0 + and wall_to_include[1] < ncols + and (wall_to_include not in walls) + ): walls.append(wall_to_include) pass # start coord if count == nrooms // 2: start_coord = self._convert_room_coord_to_global( - room_r, room_c, - self.room_size // 2, self.room_size // 2) + room_r, room_c, self.room_size // 2, self.room_size // 2 + ) # terminal state if count == nrooms - 1: terminal_state = self._convert_room_coord_to_global( - room_r, room_c, - self.room_size // 2, self.room_size // 2) + room_r, room_c, self.room_size // 2, self.room_size // 2 + ) # trap if include_traps: self.traps.append( self._convert_room_coord_to_global( - room_r, room_c, - self.room_size // 2 + 1, self.room_size // 2 + 1) + room_r, + room_c, + self.room_size // 2 + 1, + self.room_size // 2 + 1, + ) ) count += 1 @@ -164,7 +175,7 @@ def __init__(self, reward_at = { terminal_state: 1.0, start_coord: 0.01, - (self.room_size // 2, self.room_size // 2): 0.1 + (self.room_size // 2, self.room_size // 2): 0.1, } # Check remove_walls @@ -172,18 +183,20 @@ def __init__(self, walls = () # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=0.0) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=0.0, + ) # Check initial distribution - if initial_state_distribution == 'uniform': + if initial_state_distribution == "uniform": distr = np.ones(self.observation_space.n) / self.observation_space.n self.set_initial_state_distribution(distr) @@ -192,7 +205,9 @@ def 
__init__(self, self.discrete_observation_space = self.observation_space self.observation_space = spaces.Box(0.0, 1.0, shape=(2,)) - def _convert_room_coord_to_global(self, room_row, room_col, room_coord_row, room_coord_col): + def _convert_room_coord_to_global( + self, room_row, room_col, room_coord_row, room_coord_col + ): col_offset = (self.room_size + 1) * room_col row_offset = (self.room_size + 1) * room_row diff --git a/rlberry/envs/benchmarks/grid_exploration/six_room.py b/rlberry/envs/benchmarks/grid_exploration/six_room.py index 131c3a3ef..b8b9232a4 100644 --- a/rlberry/envs/benchmarks/grid_exploration/six_room.py +++ b/rlberry/envs/benchmarks/grid_exploration/six_room.py @@ -25,6 +25,7 @@ class SixRoom(GridWorld): when array_observation is True. Only the functions env.reset() and env.step() are covered. """ + name = "SixRoom" def __init__(self, reward_free=False, array_observation=False): @@ -60,15 +61,17 @@ def __init__(self, reward_free=False, array_observation=False): } # Init base class - GridWorld.__init__(self, - nrows=nrows, - ncols=ncols, - start_coord=start_coord, - terminal_states=terminal_states, - success_probability=success_probability, - reward_at=reward_at, - walls=walls, - default_reward=default_reward) + GridWorld.__init__( + self, + nrows=nrows, + ncols=ncols, + start_coord=start_coord, + terminal_states=terminal_states, + success_probability=success_probability, + reward_at=reward_at, + walls=walls, + default_reward=default_reward, + ) # spaces if self.array_observation: diff --git a/rlberry/envs/bullet3/pybullet_envs/__init__.py b/rlberry/envs/bullet3/pybullet_envs/__init__.py index 533cc6a97..796a8424a 100644 --- a/rlberry/envs/bullet3/pybullet_envs/__init__.py +++ b/rlberry/envs/bullet3/pybullet_envs/__init__.py @@ -12,29 +12,29 @@ def register(id, *args, **kvargs): # ------------bullet------------- register( - id='PendulumBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumBulletEnv', + id="PendulumBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumBulletEnv", max_episode_steps=1000, reward_threshold=950.0, ) register( - id='PendulumSwingupBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumSwingupBulletEnv', + id="PendulumSwingupBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:PendulumSwingupBulletEnv", max_episode_steps=1000, reward_threshold=800.0, ) register( - id='DiscretePendulumBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumBulletEnv', + id="DiscretePendulumBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumBulletEnv", max_episode_steps=1000, reward_threshold=950.0, ) register( - id='DiscretePendulumSwingupBulletEnv-v0', - entry_point='rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumSwingupBulletEnv', + id="DiscretePendulumSwingupBulletEnv-v0", + entry_point="rlberry.envs.bullet3.pybullet_envs.gym_pendulum_envs:DiscretePendulumSwingupBulletEnv", max_episode_steps=1000, reward_threshold=800.0, ) diff --git a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py b/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py index 0288c40f6..9e9e2d8fa 100644 --- a/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py +++ b/rlberry/envs/bullet3/pybullet_envs/gym_pendulum_envs.py @@ -1,6 +1,9 @@ from gym import spaces from pybullet_envs.env_bases import MJCFBaseBulletEnv -from 
pybullet_envs.gym_pendulum_envs import InvertedPendulumBulletEnv, InvertedPendulumSwingupBulletEnv +from pybullet_envs.gym_pendulum_envs import ( + InvertedPendulumBulletEnv, + InvertedPendulumSwingupBulletEnv, +) from pybullet_envs.scene_abstract import SingleRobotEmptyScene from rlberry.envs.bullet3.pybullet_envs.robot_pendula import Pendulum, PendulumSwingup @@ -16,7 +19,9 @@ def __init__(self): self.stateId = -1 def create_single_player_scene(self, bullet_client): - return SingleRobotEmptyScene(bullet_client, gravity=9.81, timestep=0.02, frame_skip=1) + return SingleRobotEmptyScene( + bullet_client, gravity=9.81, timestep=0.02, frame_skip=1 + ) def step(self, a): self.robot.apply_action(a) @@ -28,7 +33,7 @@ def step(self, a): done = False else: reward = 1.0 - done = np.abs(self.robot.theta) > .2 + done = np.abs(self.robot.theta) > 0.2 self.rewards = [float(reward)] self.HUD(state, a, done) return state, sum(self.rewards), done, {} diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py b/rlberry/envs/bullet3/pybullet_envs/robot_bases.py index e6c3190d4..d2dc50e75 100644 --- a/rlberry/envs/bullet3/pybullet_envs/robot_bases.py +++ b/rlberry/envs/bullet3/pybullet_envs/robot_bases.py @@ -10,69 +10,113 @@ class MJCFBasedRobot2(MJCFBasedRobot): def reset(self, bullet_client): self._p = bullet_client # print("Created bullet_client with id=", self._p._client) - if (self.doneLoading == 0): + if self.doneLoading == 0: self.ordered_joints = [] self.doneLoading = 1 if self.self_collision: - self.objects = self._p.loadMJCF(os.path.join(data.getDataPath(), "mjcf", - self.model_xml), - flags=pybullet.URDF_USE_SELF_COLLISION | - pybullet.URDF_USE_SELF_COLLISION_EXCLUDE_ALL_PARENTS | - pybullet.URDF_GOOGLEY_UNDEFINED_COLORS) - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( - self._p, self.objects) + self.objects = self._p.loadMJCF( + os.path.join(data.getDataPath(), "mjcf", self.model_xml), + flags=pybullet.URDF_USE_SELF_COLLISION + | pybullet.URDF_USE_SELF_COLLISION_EXCLUDE_ALL_PARENTS + | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ) + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene(self._p, self.objects) else: self.objects = self._p.loadMJCF( - os.path.join(data.getDataPath(), "mjcf", self.model_xml, - flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS)) - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( - self._p, self.objects) + os.path.join( + data.getDataPath(), + "mjcf", + self.model_xml, + flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ) + ) + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene(self._p, self.objects) self.robot_specific_reset(self._p) - s = self.calc_state( + s = ( + self.calc_state() ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use return s class URDFBasedRobot2(URDFBasedRobot): - def __init__(self, - model_urdf, - robot_name, - action_dim, - obs_dim, - basePosition=[0, 0, 0], - baseOrientation=[0, 0, 0, 1], - fixed_base=False, - self_collision=False): - super().__init__(model_urdf, robot_name, action_dim, obs_dim, basePosition, baseOrientation, fixed_base, - self_collision) + def __init__( + self, + model_urdf, + robot_name, + action_dim, + obs_dim, + basePosition=[0, 0, 0], + baseOrientation=[0, 0, 0, 1], + fixed_base=False, + self_collision=False, + ): + super().__init__( + model_urdf, + robot_name, + action_dim, + obs_dim, + basePosition, + baseOrientation, + fixed_base, + 
self_collision, + ) self.doneLoading = 0 def reset(self, bullet_client): self._p = bullet_client - if (self.doneLoading == 0): + if self.doneLoading == 0: self.ordered_joints = [] self.doneLoading = 1 if self.self_collision: - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene( self._p, - self._p.loadURDF(os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, - flags=pybullet.URDF_USE_SELF_COLLISION | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS)) + self._p.loadURDF( + os.path.join(data.getDataPath(), self.model_urdf), + basePosition=self.basePosition, + baseOrientation=self.baseOrientation, + useFixedBase=self.fixed_base, + flags=pybullet.URDF_USE_SELF_COLLISION + | pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ), + ) else: - self.parts, self.jdict, self.ordered_joints, self.robot_body = self.addToScene( + ( + self.parts, + self.jdict, + self.ordered_joints, + self.robot_body, + ) = self.addToScene( self._p, - self._p.loadURDF(os.path.join(data.getDataPath(), self.model_urdf), - basePosition=self.basePosition, - baseOrientation=self.baseOrientation, - useFixedBase=self.fixed_base, flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS)) + self._p.loadURDF( + os.path.join(data.getDataPath(), self.model_urdf), + basePosition=self.basePosition, + baseOrientation=self.baseOrientation, + useFixedBase=self.fixed_base, + flags=pybullet.URDF_GOOGLEY_UNDEFINED_COLORS, + ), + ) self.robot_specific_reset(self._p) - s = self.calc_state( + s = ( + self.calc_state() ) # optimization: calc_state() can calculate something in self.* for calc_potential() to use self.potential = self.calc_potential() diff --git a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py b/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py index b95a86136..871fe45c8 100644 --- a/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py +++ b/rlberry/envs/bullet3/pybullet_envs/robot_pendula.py @@ -1,7 +1,10 @@ import gym import numpy as np -from rlberry.envs.bullet3.pybullet_envs.robot_bases import MJCFBasedRobot2, URDFBasedRobot2 +from rlberry.envs.bullet3.pybullet_envs.robot_bases import ( + MJCFBasedRobot2, + URDFBasedRobot2, +) class Pendulum(URDFBasedRobot2): @@ -9,23 +12,25 @@ class Pendulum(URDFBasedRobot2): def __init__(self): # MJCFBasedRobot2.__init__(self, 'pendulum.xml', 'pole', action_dim=1, obs_dim=2) - URDFBasedRobot2.__init__(self, 'pendulum.urdf', 'pole', action_dim=1, obs_dim=2) + URDFBasedRobot2.__init__(self, "pendulum.urdf", "pole", action_dim=1, obs_dim=2) self.action_space = gym.spaces.Box(shape=(1,), low=-20, high=20) def robot_specific_reset(self, bullet_client): self._p = bullet_client self.pole = self.parts["pole"] self.j1 = self.jdict["hinge"] - u = self.np_random.uniform(low=-.1, high=.1) + u = self.np_random.uniform(low=-0.1, high=0.1) self.j1.reset_current_position(u if not self.swingup else np.pi + u, 0) self.j1.set_motor_torque(0) def apply_action(self, a): - assert (np.isfinite(a).all()) + assert np.isfinite(a).all() if not np.isfinite(a).all(): print("a is inf") a[0] = 0 - self.j1.set_motor_torque(np.clip(a[0], self.action_space.low, self.action_space.high)) + self.j1.set_motor_torque( + np.clip(a[0], self.action_space.low, self.action_space.high) + ) def calc_state(self): self.theta, theta_dot = self.j1.current_position() diff --git a/rlberry/envs/classic_control/acrobot.py 
b/rlberry/envs/classic_control/acrobot.py index c15069e0a..41d16cbd7 100644 --- a/rlberry/envs/classic_control/acrobot.py +++ b/rlberry/envs/classic_control/acrobot.py @@ -16,8 +16,13 @@ from rlberry.rendering.common_shapes import bar_shape, circle_shape __copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" -__credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann", - "William Dabney", "Jonathan P. How"] +__credits__ = [ + "Alborz Geramifard", + "Robert H. Klein", + "Christoph Dann", + "William Dabney", + "Jonathan P. How", +] __license__ = "BSD 3-Clause" __author__ = "Christoph Dann " @@ -71,24 +76,25 @@ class Acrobot(RenderInterface2D, Model): than the original version which employs Euler integration, see the AcrobotLegacy class. """ + name = "Acrobot" - dt = .2 + dt = 0.2 - LINK_LENGTH_1 = 1. # [m] - LINK_LENGTH_2 = 1. # [m] - LINK_MASS_1 = 1. #: [kg] mass of link 1 - LINK_MASS_2 = 1. #: [kg] mass of link 2 + LINK_LENGTH_1 = 1.0 # [m] + LINK_LENGTH_2 = 1.0 # [m] + LINK_MASS_1 = 1.0 #: [kg] mass of link 1 + LINK_MASS_2 = 1.0 #: [kg] mass of link 2 LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 - LINK_MOI = 1. #: moments of inertia for both links + LINK_MOI = 1.0 #: moments of inertia for both links MAX_VEL_1 = 4 * np.pi MAX_VEL_2 = 9 * np.pi - AVAIL_TORQUE = [-1., 0., +1] + AVAIL_TORQUE = [-1.0, 0.0, +1] - torque_noise_max = 0. + torque_noise_max = 0.0 #: use dynamics equations from the nips paper or the book book_or_nips = "book" @@ -123,8 +129,10 @@ def reset(self): return self._get_ob() def step(self, action): - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) # save state for rendering if self.is_render_enabled(): @@ -135,8 +143,7 @@ def step(self, action): # Add noise to the force action if self.torque_noise_max > 0: - torque += self.rng.uniform(-self.torque_noise_max, - self.torque_noise_max) + torque += self.rng.uniform(-self.torque_noise_max, self.torque_noise_max) # Now, augment the state with our force action so it can be passed to # _dsdt @@ -158,17 +165,18 @@ def step(self, action): ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) self.state = ns terminal = self._terminal() - reward = -1. if not terminal else 0. + reward = -1.0 if not terminal else 0.0 return self._get_ob(), reward, terminal, {} def _get_ob(self): s = self.state - return np.array([np.cos(s[0]), np.sin(s[0]), np.cos(s[1]), - np.sin(s[1]), s[2], s[3]]) + return np.array( + [np.cos(s[0]), np.sin(s[0]), np.cos(s[1]), np.sin(s[1]), s[2], s[3]] + ) def _terminal(self): s = self.state - return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.) + return bool(-np.cos(s[0]) - np.cos(s[1] + s[0]) > 1.0) def _dsdt(self, s_augmented, t): m1 = self.LINK_MASS_1 @@ -185,26 +193,35 @@ def _dsdt(self, s_augmented, t): theta2 = s[1] dtheta1 = s[2] dtheta2 = s[3] - d1 = m1 * lc1 ** 2 + m2 * \ - (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * np.cos(theta2)) + I1 + I2 + d1 = ( + m1 * lc1 ** 2 + + m2 * (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * np.cos(theta2)) + + I1 + + I2 + ) d2 = m2 * (lc2 ** 2 + l1 * lc2 * np.cos(theta2)) + I2 - phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.) 
- phi1 = - m2 * l1 * lc2 * dtheta2 ** 2 * np.sin(theta2) \ - - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) \ - + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) + phi2 + phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.0) + phi1 = ( + -m2 * l1 * lc2 * dtheta2 ** 2 * np.sin(theta2) + - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) + + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) + + phi2 + ) if self.book_or_nips == "nips": # the following line is consistent with the description in the # paper - ddtheta2 = (a + d2 / d1 * phi1 - phi2) / \ - (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) + ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) else: # the following line is consistent with the java implementation # and the book - ddtheta2 = (a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * - np.sin(theta2) - phi2) \ - / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) + ddtheta2 = ( + a + + d2 / d1 * phi1 + - m2 * l1 * lc2 * dtheta1 ** 2 * np.sin(theta2) + - phi2 + ) / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 - return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.) + return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0) # # Below: code for rendering @@ -219,10 +236,14 @@ def get_scene(self, state): p0 = (0.0, 0.0) - p1 = (self.LINK_LENGTH_1 * np.sin(state[0]), - -self.LINK_LENGTH_1 * np.cos(state[0])) - p2 = (p1[0] + self.LINK_LENGTH_2 * np.sin(state[0] + state[1]), - p1[1] - self.LINK_LENGTH_2 * np.cos(state[0] + state[1])) + p1 = ( + self.LINK_LENGTH_1 * np.sin(state[0]), + -self.LINK_LENGTH_1 * np.cos(state[0]), + ) + p2 = ( + p1[0] + self.LINK_LENGTH_2 * np.sin(state[0] + state[1]), + p1[1] - self.LINK_LENGTH_2 * np.cos(state[0] + state[1]), + ) link1 = bar_shape(p0, p1, 0.1) link1.set_color((255 / 255, 140 / 255, 0 / 255)) diff --git a/rlberry/envs/classic_control/mountain_car.py b/rlberry/envs/classic_control/mountain_car.py index 6692541e1..6f8fa0589 100644 --- a/rlberry/envs/classic_control/mountain_car.py +++ b/rlberry/envs/classic_control/mountain_car.py @@ -59,6 +59,7 @@ class MountainCar(RenderInterface2D, Model): Episode Termination: The car position is more than 0.5 """ + name = "MountainCar" def __init__(self, goal_velocity=0): @@ -91,8 +92,10 @@ def __init__(self, goal_velocity=0): self.reset() def step(self, action): - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) # save state for rendering if self.is_render_enabled(): @@ -110,23 +113,24 @@ def reset(self): def sample(self, state, action): if not isinstance(state, np.ndarray): state = np.array(state) - assert self.observation_space.contains(state), \ - "Invalid state as argument of reset()." - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.observation_space.contains( + state + ), "Invalid state as argument of reset()." 
+ assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) position = state[0] velocity = state[1] - velocity += (action - 1) * self.force \ - + math.cos(3 * position) * (-self.gravity) + velocity += (action - 1) * self.force + math.cos(3 * position) * (-self.gravity) velocity = np.clip(velocity, -self.max_speed, self.max_speed) position += velocity position = np.clip(position, self.min_position, self.max_position) - if (position == self.min_position and velocity < 0): + if position == self.min_position and velocity < 0: velocity = 0 - done = bool(position >= self.goal_position and - velocity >= self.goal_velocity) + done = bool(position >= self.goal_position and velocity >= self.goal_velocity) reward = 0.0 if done: reward = 1.0 @@ -136,7 +140,7 @@ def sample(self, state, action): @staticmethod def _height(xs): - return np.sin(3 * xs) * .45 + .55 + return np.sin(3 * xs) * 0.45 + 0.55 # # Below: code for rendering @@ -154,8 +158,7 @@ def get_background(self): mountain.add_vertex((0.6, -1.0)) n_points = 50 - obs_range = self.observation_space.high[0] \ - - self.observation_space.low[0] + obs_range = self.observation_space.high[0] - self.observation_space.low[0] eps = obs_range / (n_points - 1) for ii in reversed(range(n_points)): x = self.observation_space.low[0] + ii * eps diff --git a/rlberry/envs/classic_control/pendulum.py b/rlberry/envs/classic_control/pendulum.py index e068da35f..26d45f0e3 100644 --- a/rlberry/envs/classic_control/pendulum.py +++ b/rlberry/envs/classic_control/pendulum.py @@ -22,6 +22,7 @@ class Pendulum(RenderInterface2D, Model): the pendulum starts in a random position, and the goal is to swing it up so it stays upright. """ + name = "Pendulum" def __init__(self): @@ -30,23 +31,23 @@ def __init__(self): RenderInterface2D.__init__(self) # environment parameters - self.max_speed = 8. - self.max_torque = 2. + self.max_speed = 8.0 + self.max_torque = 2.0 self.dt = 0.5 - self.gravity = 10. - self.mass = 1. - self.length = 1. + self.gravity = 10.0 + self.mass = 1.0 + self.length = 1.0 # rendering info self.set_clipping_area((-2.2, 2.2, -2.2, 2.2)) self.set_refresh_interval(10) # observation and action spaces - high = np.array([1., 1., self.max_speed]) + high = np.array([1.0, 1.0, self.max_speed]) low = -high - self.action_space = spaces.Box(low=-self.max_torque, - high=self.max_torque, - shape=(1,)) + self.action_space = spaces.Box( + low=-self.max_torque, high=self.max_torque, shape=(1,) + ) self.observation_space = spaces.Box(low=low, high=high) # initialize @@ -60,8 +61,10 @@ def reset(self): return self._get_ob() def step(self, action): - assert self.action_space.contains(action), \ - "%r (%s) invalid" % (action, type(action)) + assert self.action_space.contains(action), "%r (%s) invalid" % ( + action, + type(action), + ) # save state for rendering if self.is_render_enabled(): @@ -75,11 +78,19 @@ def step(self, action): action = np.clip(action, -self.max_torque, self.max_torque)[0] self.last_action = action # for rendering - costs = angle_normalize(theta) ** 2 + .1 * thetadot ** 2 + .001 * (action ** 2) + costs = ( + angle_normalize(theta) ** 2 + 0.1 * thetadot ** 2 + 0.001 * (action ** 2) + ) # compute the next state after action - newthetadot = thetadot + (-3 * gravity / (2 * length) * np.sin(theta + np.pi) + - 3. 
/ (mass * length ** 2) * action) * dt + newthetadot = ( + thetadot + + ( + -3 * gravity / (2 * length) * np.sin(theta + np.pi) + + 3.0 / (mass * length ** 2) * action + ) + * dt + ) newtheta = theta + newthetadot * dt newthetadot = np.clip(newthetadot, -self.max_speed, self.max_speed) @@ -102,8 +113,7 @@ def get_scene(self, state): scene = Scene() p0 = (0.0, 0.0) - p1 = (self.length * np.sin(state[0]), - -self.length * np.cos(state[0])) + p1 = (self.length * np.sin(state[0]), -self.length * np.cos(state[0])) link = bar_shape(p0, p1, 0.1) link.set_color((255 / 255, 105 / 255, 30 / 255)) @@ -118,4 +128,4 @@ def get_scene(self, state): def angle_normalize(x): - return (((x + np.pi) % (2 * np.pi)) - np.pi) + return ((x + np.pi) % (2 * np.pi)) - np.pi diff --git a/rlberry/envs/finite/finite_mdp.py b/rlberry/envs/finite/finite_mdp.py index ff092b7f5..57649a15b 100644 --- a/rlberry/envs/finite/finite_mdp.py +++ b/rlberry/envs/finite/finite_mdp.py @@ -62,8 +62,9 @@ def reset(self): Reset the environment to a default state. """ if isinstance(self.initial_state_distribution, np.ndarray): - self.state = self.rng.choice(self._states, - p=self.initial_state_distribution) + self.state = self.rng.choice( + self._states, p=self.initial_state_distribution + ) else: self.state = self.initial_state_distribution return self.state @@ -159,17 +160,20 @@ def log(self): """ Print the structure of the MDP. """ - indent = ' ' + indent = " " for s in self._states: logger.info(f"State {s} {indent}") for a in self._actions: logger.info(f"{indent} Action {a}") for ss in self._states: if self.P[s, a, ss] > 0.0: - logger.info(f'{2 * indent} transition to {ss} ' - f'with prob {self.P[s, a, ss]: .2f}') + logger.info( + f"{2 * indent} transition to {ss} " + f"with prob {self.P[s, a, ss]: .2f}" + ) logger.info("~~~~~~~~~~~~~~~~~~~~") + # if __name__ == '__main__': # S = 3 # A = 2 diff --git a/rlberry/envs/finite/gridworld.py b/rlberry/envs/finite/gridworld.py index dfc036199..3088b8198 100644 --- a/rlberry/envs/finite/gridworld.py +++ b/rlberry/envs/finite/gridworld.py @@ -37,17 +37,20 @@ class GridWorld(RenderInterface2D, FiniteMDP): reward received at states not in 'reward_at' """ + name = "GridWorld" - def __init__(self, - nrows=5, - ncols=5, - start_coord=(0, 0), - terminal_states=None, - success_probability=0.9, - reward_at=None, - walls=((1, 1), (2, 2)), - default_reward=0.0): + def __init__( + self, + nrows=5, + ncols=5, + start_coord=(0, 0), + terminal_states=None, + success_probability=0.9, + reward_at=None, + walls=((1, 1), (2, 2)), + default_reward=0.0, + ): # Grid dimensions self.nrows = nrows self.ncols = ncols @@ -79,8 +82,8 @@ def __init__(self, self.start_coord = tuple(start_coord) # Actions (string to index & index to string) - self.a_str2idx = {'left': 0, 'right': 1, 'down': 2, 'up': 3} - self.a_idx2str = {0: 'left', 1: 'right', 2: 'down', 3: 'up'} + self.a_str2idx = {"left": 0, "right": 1, "down": 2, "up": 3} + self.a_idx2str = {0: "left", 1: "right", 2: "down", 3: "up"} # -------------------------------------------- # The variables below are defined in _build() @@ -99,8 +102,9 @@ def __init__(self, # Build self._build() init_state_idx = self.coord2index[start_coord] - FiniteMDP.__init__(self, self.R, self.P, - initial_state_distribution=init_state_idx) + FiniteMDP.__init__( + self, self.R, self.P, initial_state_distribution=init_state_idx + ) RenderInterface2D.__init__(self) self.reset() self.reward_range = (self.R.min(), self.R.max()) @@ -108,7 +112,7 @@ def __init__(self, # rendering info 
self.set_clipping_area((0, self.ncols, 0, self.nrows)) self.set_refresh_interval(100) # in milliseconds - self.renderer_type = 'pygame' + self.renderer_type = "pygame" def is_terminal(self, state): state_coord = self.index2coord[state] @@ -158,8 +162,7 @@ def _build_transition_probabilities(self): for s in range(Ns): s_coord = self.index2coord[s] neighbors = self._get_neighbors(*s_coord) - valid_neighbors = [neighbors[nn][0] for nn in neighbors - if neighbors[nn][1]] + valid_neighbors = [neighbors[nn][0] for nn in neighbors if neighbors[nn][1]] n_valid = len(valid_neighbors) for a in range(Na): # each action corresponds to a direction for nn in neighbors: @@ -167,23 +170,23 @@ def _build_transition_probabilities(self): if next_s_coord in valid_neighbors: next_s = self.coord2index[next_s_coord] if a == nn: # action is successful - self.P[s, a, next_s] = self.success_probability \ - + (1 - self.success_probability) \ - * (n_valid == 1) + self.P[s, a, next_s] = self.success_probability + ( + 1 - self.success_probability + ) * (n_valid == 1) elif neighbors[a][0] not in valid_neighbors: self.P[s, a, s] = 1.0 else: if n_valid > 1: - self.P[s, a, next_s] = \ - (1.0 - self.success_probability) \ - / (n_valid - 1) + self.P[s, a, next_s] = ( + 1.0 - self.success_probability + ) / (n_valid - 1) def _get_neighbors(self, row, col): aux = {} - aux['left'] = (row, col - 1) # left - aux['right'] = (row, col + 1) # right - aux['up'] = (row - 1, col) # up - aux['down'] = (row + 1, col) # down + aux["left"] = (row, col - 1) # left + aux["right"] = (row, col + 1) # right + aux["up"] = (row - 1, col) # up + aux["down"] = (row + 1, col) # down neighbors = {} for direction_str in aux: direction = self.a_str2idx[direction_str] @@ -193,10 +196,10 @@ def _get_neighbors(self, row, col): def get_transition_support(self, state): row, col = self.index2coord[state] - neighbors = [(row, col - 1), (row, col + 1), - (row - 1, col), (row + 1, col)] - return [self.coord2index[coord] for coord in neighbors - if self._is_valid(*coord)] + neighbors = [(row, col - 1), (row, col + 1), (row - 1, col), (row + 1, col)] + return [ + self.coord2index[coord] for coord in neighbors if self._is_valid(*coord) + ] def _is_valid(self, row, col): if (row, col) in self.walls: @@ -208,38 +211,38 @@ def _is_valid(self, row, col): return True def _build_ascii(self): - grid = [[''] * self.ncols for rr in range(self.nrows)] - grid_idx = [[''] * self.ncols for rr in range(self.nrows)] + grid = [[""] * self.ncols for rr in range(self.nrows)] + grid_idx = [[""] * self.ncols for rr in range(self.nrows)] for rr in range(self.nrows): for cc in range(self.ncols): if (rr, cc) in self.walls: - grid[rr][cc] = 'x ' + grid[rr][cc] = "x " else: - grid[rr][cc] = 'o ' + grid[rr][cc] = "o " grid_idx[rr][cc] = str(self.coord2index[(rr, cc)]).zfill(3) for (rr, cc) in self.reward_at: rwd = self.reward_at[(rr, cc)] if rwd > 0: - grid[rr][cc] = '+ ' + grid[rr][cc] = "+ " if rwd < 0: - grid[rr][cc] = '-' + grid[rr][cc] = "-" - grid[self.start_coord[0]][self.start_coord[1]] = 'I ' + grid[self.start_coord[0]][self.start_coord[1]] = "I " # current position of the agent x, y = self.index2coord[self.state] - grid[x][y] = 'A ' + grid[x][y] = "A " # - grid_ascii = '' + grid_ascii = "" for rr in range(self.nrows + 1): if rr < self.nrows: - grid_ascii += str(rr).zfill(2) + 2 * ' ' \ - + ' '.join(grid[rr]) + '\n' + grid_ascii += str(rr).zfill(2) + 2 * " " + " ".join(grid[rr]) + "\n" else: - grid_ascii += 3 * ' ' + ' '.join([str(jj).zfill(2) for jj - in range(self.ncols)]) + 
grid_ascii += 3 * " " + " ".join( + [str(jj).zfill(2) for jj in range(self.ncols)] + ) self.grid_ascii = grid_ascii self.grid_idx = grid_idx @@ -247,21 +250,22 @@ def _build_ascii(self): def display_values(self, values): assert len(values) == self.Ns - grid_values = [['X'.ljust(9)] * self.ncols for ii in range(self.nrows)] + grid_values = [["X".ljust(9)] * self.ncols for ii in range(self.nrows)] for s_idx in range(self.Ns): v = values[s_idx] row, col = self.index2coord[s_idx] grid_values[row][col] = ("%0.2f" % v).ljust(9) - grid_values_ascii = '' + grid_values_ascii = "" for rr in range(self.nrows + 1): if rr < self.nrows: - grid_values_ascii += str(rr).zfill(2) + 2 * ' ' \ - + ' '.join(grid_values[rr]) + '\n' + grid_values_ascii += ( + str(rr).zfill(2) + 2 * " " + " ".join(grid_values[rr]) + "\n" + ) else: - grid_values_ascii += 4 * ' ' \ - + ' '.join([str(jj).zfill(2).ljust(9) for jj - in range(self.ncols)]) + grid_values_ascii += 4 * " " + " ".join( + [str(jj).zfill(2).ljust(9) for jj in range(self.ncols)] + ) logger.info(grid_values_ascii) def print_transition_at(self, row, col, action): @@ -272,8 +276,10 @@ def print_transition_at(self, row, col, action): a_idx = self.a_str2idx[action] for next_s_idx, prob in enumerate(self.P[s_idx, a_idx]): if prob > 0: - logger.info("to (%d, %d) with prob %f" % - (self.index2coord[next_s_idx] + (prob,))) + logger.info( + "to (%d, %d) with prob %f" + % (self.index2coord[next_s_idx] + (prob,)) + ) def render_ascii(self): logger.info(self._build_ascii()) @@ -330,10 +336,8 @@ def get_layout_array(self, state_data=None, fill_walls_with=np.nan): return layout def get_layout_img( - self, - state_data=None, - colormap_name='cool', - wall_color=(0.0, 0.0, 0.0)): + self, state_data=None, colormap_name="cool", wall_color=(0.0, 0.0, 0.0) + ): """ Returns an image array representing the value of `state_data` on the gridworld layout. @@ -367,7 +371,9 @@ def get_layout_img( if np.isnan(layout[rr, cc]): img[self.nrows - 1 - rr, cc, :] = wall_color else: - img[self.nrows - 1 - rr, cc, :3] = scalar_map.to_rgba(layout[rr, cc])[:3] + img[self.nrows - 1 - rr, cc, :3] = scalar_map.to_rgba( + layout[rr, cc] + )[:3] return img def get_background(self): @@ -423,6 +429,7 @@ def get_scene(self, state): scene.add_shape(agent) return scene + # if __name__ == '__main__': # env = GridWorld(nrows=5, ncols=5, # reward_at={(4, 4): 1, (4, 3): -1}) diff --git a/rlberry/envs/gym_make.py b/rlberry/envs/gym_make.py index 93766e049..a75deab04 100644 --- a/rlberry/envs/gym_make.py +++ b/rlberry/envs/gym_make.py @@ -30,9 +30,11 @@ def gym_make(id, wrap_spaces=False, **kwargs): def atari_make(id, scalarize=True, **kwargs): from stable_baselines3.common.env_util import make_atari_env from stable_baselines3.common.vec_env import VecFrameStack + env = make_atari_env(env_id=id, **kwargs) env = VecFrameStack(env, n_stack=4) if scalarize: from rlberry.wrappers.scalarize import ScalarizeEnvWrapper + env = ScalarizeEnvWrapper(env) return env diff --git a/rlberry/envs/interface/model.py b/rlberry/envs/interface/model.py index d5fd2852a..fb29cc927 100644 --- a/rlberry/envs/interface/model.py +++ b/rlberry/envs/interface/model.py @@ -90,8 +90,10 @@ def sample(self, state, action): raise NotImplementedError("sample() method not implemented.") def is_online(self): - logger.warning("Checking if Model is\ -online calls reset() and step() methods.") + logger.warning( + "Checking if Model is\ +online calls reset() and step() methods." 
+ ) try: self.reset() self.step(self.action_space.sample()) @@ -103,11 +105,12 @@ def is_online(self): raise def is_generative(self): - logger.warning("Checking if Model is \ -generative calls sample() method.") + logger.warning( + "Checking if Model is \ +generative calls sample() method." + ) try: - self.sample(self.observation_space.sample(), - self.action_space.sample()) + self.sample(self.observation_space.sample(), self.action_space.sample()) return True except Exception as ex: if isinstance(ex, NotImplementedError): @@ -121,5 +124,5 @@ def unwrapped(self): @property def rng(self): - """ Random number generator. """ + """Random number generator.""" return self.seeder.rng diff --git a/rlberry/envs/tests/test_env_seeding.py b/rlberry/envs/tests/test_env_seeding.py index 6b8ca2f2c..44477528a 100644 --- a/rlberry/envs/tests/test_env_seeding.py +++ b/rlberry/envs/tests/test_env_seeding.py @@ -21,7 +21,7 @@ Pendulum, FourRoom, SixRoom, - AppleGold + AppleGold, ] @@ -63,7 +63,9 @@ def test_env_seeding(ModelClass): env4.reseed(seeder4) env5 = ModelClass() - env5.reseed(seeder1) # same seeder as env1, but different trajectories. This is expected. + env5.reseed( + seeder1 + ) # same seeder as env1, but different trajectories. This is expected. seeding.safe_reseed(env4, seeder4) diff --git a/rlberry/envs/tests/test_gym_env_seeding.py b/rlberry/envs/tests/test_gym_env_seeding.py index 2db8b8653..b5e1872d5 100644 --- a/rlberry/envs/tests/test_gym_env_seeding.py +++ b/rlberry/envs/tests/test_gym_env_seeding.py @@ -8,9 +8,9 @@ from copy import deepcopy gym_envs = [ - 'Acrobot-v1', - 'CartPole-v1', - 'MountainCar-v0', + "Acrobot-v1", + "CartPole-v1", + "MountainCar-v0", ] diff --git a/rlberry/envs/tests/test_instantiation.py b/rlberry/envs/tests/test_instantiation.py index 5a12fd9b9..632a66a32 100644 --- a/rlberry/envs/tests/test_instantiation.py +++ b/rlberry/envs/tests/test_instantiation.py @@ -23,7 +23,7 @@ FourRoom, SixRoom, AppleGold, - NRoom + NRoom, ] @@ -57,13 +57,14 @@ def test_rendering_calls(ModelClass): def test_gridworld_aux_functions(): - env = GridWorld(nrows=5, ncols=8, walls=((1, 1),), - reward_at={(4, 4): 1, (4, 3): -1}) + env = GridWorld( + nrows=5, ncols=8, walls=((1, 1),), reward_at={(4, 4): 1, (4, 3): -1} + ) env.log() # from FiniteMDP env.render_ascii() # from GridWorld vals = np.arange(env.observation_space.n) env.display_values(vals) - env.print_transition_at(0, 0, 'up') + env.print_transition_at(0, 0, "up") layout = env.get_layout_array(vals, fill_walls_with=np.inf) for rr in range(env.nrows): @@ -89,20 +90,24 @@ def test_pball_env(p): env.get_transitions_lipschitz_constant() -@pytest.mark.parametrize("reward_free, difficulty, array_observation", - [ - (True, 0, False), - (False, 0, False), - (False, 0, True), - (False, 1, False), - (False, 1, True), - (False, 2, False), - (False, 2, True), - ]) +@pytest.mark.parametrize( + "reward_free, difficulty, array_observation", + [ + (True, 0, False), + (False, 0, False), + (False, 0, True), + (False, 1, False), + (False, 1, True), + (False, 2, False), + (False, 2, True), + ], +) def test_four_room(reward_free, difficulty, array_observation): - env = FourRoom(reward_free=reward_free, - difficulty=difficulty, - array_observation=array_observation) + env = FourRoom( + reward_free=reward_free, + difficulty=difficulty, + array_observation=array_observation, + ) initial_state = env.reset() next_state, reward, _, _ = env.step(1) @@ -121,13 +126,15 @@ def test_four_room(reward_free, difficulty, array_observation): assert 
isinstance(next_state, np.ndarray) -@pytest.mark.parametrize("reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ]) +@pytest.mark.parametrize( + "reward_free, array_observation", + [ + (False, False), + (False, True), + (True, False), + (True, True), + ], +) def test_six_room(reward_free, array_observation): env = SixRoom(reward_free=reward_free, array_observation=array_observation) @@ -145,13 +152,15 @@ def test_six_room(reward_free, array_observation): assert isinstance(next_state, np.ndarray) -@pytest.mark.parametrize("reward_free, array_observation", - [ - (False, False), - (False, True), - (True, False), - (True, True), - ]) +@pytest.mark.parametrize( + "reward_free, array_observation", + [ + (False, False), + (False, True), + (True, False), + (True, True), + ], +) def test_apple_gold(reward_free, array_observation): env = AppleGold(reward_free=reward_free, array_observation=array_observation) @@ -168,23 +177,27 @@ def test_apple_gold(reward_free, array_observation): assert isinstance(next_state, np.ndarray) -@pytest.mark.parametrize("reward_free, array_observation, initial_state_distribution", - [ - (False, False, 'center'), - (False, True, 'center'), - (True, False, 'center'), - (True, True, 'center'), - (True, False, 'uniform'), - ]) +@pytest.mark.parametrize( + "reward_free, array_observation, initial_state_distribution", + [ + (False, False, "center"), + (False, True, "center"), + (True, False, "center"), + (True, True, "center"), + (True, False, "uniform"), + ], +) def test_n_room(reward_free, array_observation, initial_state_distribution): - env = NRoom(reward_free=reward_free, - array_observation=array_observation, - initial_state_distribution=initial_state_distribution) + env = NRoom( + reward_free=reward_free, + array_observation=array_observation, + initial_state_distribution=initial_state_distribution, + ) initial_state = env.reset() next_state, reward, _, _ = env.step(1) - if initial_state_distribution == 'uniform': + if initial_state_distribution == "uniform": assert env.initial_state_distribution[0] == 1.0 / env.observation_space.n assert env.observation_space.contains(initial_state) diff --git a/rlberry/experiment/generator.py b/rlberry/experiment/generator.py index 816016a7e..982b4bf0a 100644 --- a/rlberry/experiment/generator.py +++ b/rlberry/experiment/generator.py @@ -27,14 +27,17 @@ def experiment_generator(): """ args = docopt(__doc__) for (_, agent_manager_kwargs) in parse_experiment_config( - Path(args[""]), - n_fit=int(args["--n_fit"]), - output_base_dir=args["--output_dir"], - parallelization=args["--parallelization"]): + Path(args[""]), + n_fit=int(args["--n_fit"]), + output_base_dir=args["--output_dir"], + parallelization=args["--parallelization"], + ): if args["--enable_tensorboard"]: if check_packages.TENSORBOARD_INSTALLED: agent_manager_kwargs.update(dict(enable_tensorboard=True)) else: - logger.warning('Option --enable_tensorboard is not available: tensorboard is not installed.') + logger.warning( + "Option --enable_tensorboard is not available: tensorboard is not installed." 
+ ) yield AgentManager(**agent_manager_kwargs) diff --git a/rlberry/experiment/load_results.py b/rlberry/experiment/load_results.py index 8bf02a1f1..434642344 100644 --- a/rlberry/experiment/load_results.py +++ b/rlberry/experiment/load_results.py @@ -45,10 +45,10 @@ def load_experiment_results(output_dir, experiment_name): output_data['data_dir'][agent_name] = directory from which the results were loaded """ output_data = {} - output_data['agent_list'] = [] - output_data['manager'] = {} - output_data['dataframes'] = {} - output_data['data_dir'] = {} + output_data["agent_list"] = [] + output_data["manager"] = {} + output_data["dataframes"] = {} + output_data["data_dir"] = {} # preprocess input if not isinstance(output_dir, list): @@ -58,14 +58,16 @@ def load_experiment_results(output_dir, experiment_name): ndirs = len(output_dir) if ndirs > 1: - assert len(experiment_name) == ndirs, "Number of experiment names must match the number of output_dirs " + assert ( + len(experiment_name) == ndirs + ), "Number of experiment names must match the number of output_dirs " else: output_dir = len(experiment_name) * output_dir results_dirs = [] for dd, exper in zip(output_dir, experiment_name): results_dirs.append(Path(dd) / Path(exper).stem) - output_data['experiment_dirs'] = results_dirs + output_data["experiment_dirs"] = results_dirs # Subdirectories with data for each agent subdirs = [] @@ -75,31 +77,33 @@ def load_experiment_results(output_dir, experiment_name): # Create dictionary dict[agent_name] = most recent result dir data_dirs = {} for dd in subdirs: - data_dirs[dd.name] = _get_most_recent_path([f for f in dd.iterdir() if f.is_dir()]) - data_dirs[dd.name] = data_dirs[dd.name] / 'manager_data' + data_dirs[dd.name] = _get_most_recent_path( + [f for f in dd.iterdir() if f.is_dir()] + ) + data_dirs[dd.name] = data_dirs[dd.name] / "manager_data" # Load data from each subdir for agent_name in data_dirs: - output_data['agent_list'].append(agent_name) + output_data["agent_list"].append(agent_name) # store data_dir - output_data['data_dir'][agent_name] = data_dirs[agent_name] + output_data["data_dir"][agent_name] = data_dirs[agent_name] # store AgentManager - output_data['manager'][agent_name] = None - fname = data_dirs[agent_name] / 'manager_obj.pickle' + output_data["manager"][agent_name] = None + fname = data_dirs[agent_name] / "manager_obj.pickle" try: - output_data['manager'][agent_name] = AgentManager.load(fname) + output_data["manager"][agent_name] = AgentManager.load(fname) except Exception: - logger.warning(f'Could not load AgentManager instance for {agent_name}.') + logger.warning(f"Could not load AgentManager instance for {agent_name}.") logger.info("... loaded " + str(fname)) # store data frames dataframes = {} - csv_files = [f for f in data_dirs[agent_name].iterdir() if f.suffix == '.csv'] + csv_files = [f for f in data_dirs[agent_name].iterdir() if f.suffix == ".csv"] for ff in csv_files: dataframes[ff.stem] = pd.read_csv(ff) logger.info("... 
loaded " + str(ff)) - output_data['dataframes'][agent_name] = dataframes + output_data["dataframes"][agent_name] = dataframes return output_data diff --git a/rlberry/experiment/tests/old_test_experiment_generator.py b/rlberry/experiment/tests/old_test_experiment_generator.py index 0d6e6c8ab..44834c336 100644 --- a/rlberry/experiment/tests/old_test_experiment_generator.py +++ b/rlberry/experiment/tests/old_test_experiment_generator.py @@ -6,8 +6,7 @@ def test_mock_args(monkeypatch): monkeypatch.setattr( - "sys.argv", - ['', 'rlberry/experiment/tests/params_experiment.yaml'] + "sys.argv", ["", "rlberry/experiment/tests/params_experiment.yaml"] ) random_numbers = [] @@ -16,25 +15,25 @@ def test_mock_args(monkeypatch): random_numbers.append(rng.uniform(size=10)) assert agent_manager.agent_class is RSUCBVIAgent - assert agent_manager._base_init_kwargs['horizon'] == 51 + assert agent_manager._base_init_kwargs["horizon"] == 51 assert agent_manager.fit_budget == 10 - assert agent_manager.eval_kwargs['eval_horizon'] == 51 + assert agent_manager.eval_kwargs["eval_horizon"] == 51 - assert agent_manager._base_init_kwargs['lp_metric'] == 2 - assert agent_manager._base_init_kwargs['min_dist'] == 0.0 - assert agent_manager._base_init_kwargs['max_repr'] == 800 - assert agent_manager._base_init_kwargs['bonus_scale_factor'] == 1.0 - assert agent_manager._base_init_kwargs['reward_free'] is True + assert agent_manager._base_init_kwargs["lp_metric"] == 2 + assert agent_manager._base_init_kwargs["min_dist"] == 0.0 + assert agent_manager._base_init_kwargs["max_repr"] == 800 + assert agent_manager._base_init_kwargs["bonus_scale_factor"] == 1.0 + assert agent_manager._base_init_kwargs["reward_free"] is True train_env = agent_manager.train_env[0](**agent_manager.train_env[1]) assert train_env.reward_free is False assert train_env.array_observation is True - if agent_manager.agent_name == 'rsucbvi': - assert agent_manager._base_init_kwargs['gamma'] == 1.0 + if agent_manager.agent_name == "rsucbvi": + assert agent_manager._base_init_kwargs["gamma"] == 1.0 - elif agent_manager.agent_name == 'rsucbvi_alternative': - assert agent_manager._base_init_kwargs['gamma'] == 0.9 + elif agent_manager.agent_name == "rsucbvi_alternative": + assert agent_manager._base_init_kwargs["gamma"] == 0.9 else: raise ValueError() diff --git a/rlberry/experiment/yaml_utils.py b/rlberry/experiment/yaml_utils.py index e9852512e..2471c960f 100644 --- a/rlberry/experiment/yaml_utils.py +++ b/rlberry/experiment/yaml_utils.py @@ -3,7 +3,7 @@ import yaml from rlberry.utils.factory import load -_AGENT_KEYS = ('init_kwargs', 'eval_kwargs', 'fit_kwargs') +_AGENT_KEYS = ("init_kwargs", "eval_kwargs", "fit_kwargs") def read_yaml(path): @@ -97,10 +97,12 @@ def read_env_config(config_path): return load(env_config["constructor"]), env_config["params"] -def parse_experiment_config(path: Path, - n_fit: int = 4, - output_base_dir: str = 'results', - parallelization: str = 'process') -> Generator[Tuple[int, dict], None, None]: +def parse_experiment_config( + path: Path, + n_fit: int = 4, + output_base_dir: str = "results", + parallelization: str = "process", +) -> Generator[Tuple[int, dict], None, None]: """ Read .yaml files. set global seed and convert to AgentManager instances. 
@@ -165,20 +167,20 @@ def parse_experiment_config(path: Path, last = idx # kwargs - init_kwargs = agent_config['init_kwargs'] - eval_kwargs = agent_config['eval_kwargs'] - fit_kwargs = agent_config['fit_kwargs'] + init_kwargs = agent_config["init_kwargs"] + eval_kwargs = agent_config["eval_kwargs"] + fit_kwargs = agent_config["fit_kwargs"] # check if there are global kwargs - if 'global_init_kwargs' in config: - init_kwargs.update(config['global_init_kwargs']) - if 'global_eval_kwargs' in config: - eval_kwargs.update(config['global_eval_kwargs']) - if 'global_fit_kwargs' in config: - fit_kwargs.update(config['global_fit_kwargs']) + if "global_init_kwargs" in config: + init_kwargs.update(config["global_init_kwargs"]) + if "global_eval_kwargs" in config: + eval_kwargs.update(config["global_eval_kwargs"]) + if "global_fit_kwargs" in config: + fit_kwargs.update(config["global_fit_kwargs"]) # pop fit_budget from fit_kwargs - fit_budget = fit_kwargs.pop('fit_budget') + fit_budget = fit_kwargs.pop("fit_budget") # append run index to dir output_dir = output_dir / str(last + 1) @@ -196,10 +198,11 @@ def parse_experiment_config(path: Path, output_dir=output_dir, parallelization=parallelization, seed=seed, - create_unique_out_dir=False) # output_dir is already made unique above + create_unique_out_dir=False, + ) # output_dir is already made unique above -if __name__ == '__main__': - filename = 'examples/demo_experiment/params_experiment.yaml' +if __name__ == "__main__": + filename = "examples/demo_experiment/params_experiment.yaml" for (seed, agent_manager) in parse_experiment_config(Path(filename)): print(seed) diff --git a/rlberry/exploration_tools/discrete_counter.py b/rlberry/exploration_tools/discrete_counter.py index a14dceae2..549a39955 100644 --- a/rlberry/exploration_tools/discrete_counter.py +++ b/rlberry/exploration_tools/discrete_counter.py @@ -19,13 +19,15 @@ class DiscreteCounter(UncertaintyEstimator): Returns bonuses in 1/n ** rate_power. 
""" - def __init__(self, - observation_space, - action_space, - n_bins_obs=10, - n_bins_actions=10, - rate_power=0.5, - **kwargs): + def __init__( + self, + observation_space, + action_space, + n_bins_obs=10, + n_bins_actions=10, + rate_power=0.5, + **kwargs + ): UncertaintyEstimator.__init__(self, observation_space, action_space) self.rate_power = rate_power @@ -37,16 +39,14 @@ def __init__(self, self.n_states = observation_space.n else: self.continuous_state = True - self.state_discretizer = Discretizer(self.observation_space, - n_bins_obs) + self.state_discretizer = Discretizer(self.observation_space, n_bins_obs) self.n_states = self.state_discretizer.discrete_space.n if isinstance(action_space, Discrete): self.n_actions = action_space.n else: self.continuous_action = True - self.action_discretizer = Discretizer(self.action_space, - n_bins_actions) + self.action_discretizer = Discretizer(self.action_space, n_bins_actions) self.n_actions = self.action_discretizer.discrete_space.n self.N_sa = np.zeros((self.n_states, self.n_actions)) @@ -61,12 +61,12 @@ def _preprocess(self, state, action): def reset(self): self.N_sa = np.zeros((self.n_states, self.n_actions)) - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def update(self, state, action, next_state=None, reward=None, **kwargs): state, action = self._preprocess(state, action) self.N_sa[state, action] += 1 - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def measure(self, state, action, **kwargs): state, action = self._preprocess(state, action) n = np.maximum(1.0, self.N_sa[state, action]) diff --git a/rlberry/exploration_tools/online_discretization_counter.py b/rlberry/exploration_tools/online_discretization_counter.py index e0ba65f3d..98fc11fac 100644 --- a/rlberry/exploration_tools/online_discretization_counter.py +++ b/rlberry/exploration_tools/online_discretization_counter.py @@ -10,29 +10,32 @@ @numba_jit -def map_to_representative(state, - lp_metric, - representative_states, - n_representatives, - min_dist, - scaling, - accept_new_repr): +def map_to_representative( + state, + lp_metric, + representative_states, + n_representatives, + min_dist, + scaling, + accept_new_repr, +): """ Map state to representative state. """ dist_to_closest = np.inf argmin = -1 for ii in range(n_representatives): - dist = metric_lp(state, representative_states[ii, :], - lp_metric, scaling) + dist = metric_lp(state, representative_states[ii, :], lp_metric, scaling) if dist < dist_to_closest: dist_to_closest = dist argmin = ii max_representatives = representative_states.shape[0] - if dist_to_closest > min_dist \ - and n_representatives < max_representatives \ - and accept_new_repr: + if ( + dist_to_closest > min_dist + and n_representatives < max_representatives + and accept_new_repr + ): new_index = n_representatives representative_states[new_index, :] = state return new_index, 0.0 @@ -69,15 +72,17 @@ class OnlineDiscretizationCounter(UncertaintyEstimator): returns bonuses in n^power. 
""" - def __init__(self, - observation_space, - action_space, - lp_metric=2, - min_dist=0.1, - max_repr=1000, - scaling=None, - rate_power=1, - **kwargs): + def __init__( + self, + observation_space, + action_space, + lp_metric=2, + min_dist=0.1, + max_repr=1000, + scaling=None, + rate_power=1, + **kwargs + ): UncertaintyEstimator.__init__(self, observation_space, action_space) assert isinstance(action_space, Discrete) @@ -94,8 +99,7 @@ def __init__(self, if scaling is None: # if high and low are bounded if self.observation_space.is_bounded(): - scaling = self.observation_space.high \ - - self.observation_space.low + scaling = self.observation_space.high - self.observation_space.low # if high or low are unbounded else: scaling = np.ones(self.state_dim) @@ -118,40 +122,42 @@ def reset(self): self._overflow_warning = False def _get_representative_state(self, state, accept_new_repr=True): - state_idx, dist_to_closest \ - = map_to_representative(state, - self.lp_metric, - self.representative_states, - self.n_representatives, - self.min_dist, - self.scaling, - accept_new_repr) + state_idx, dist_to_closest = map_to_representative( + state, + self.lp_metric, + self.representative_states, + self.n_representatives, + self.min_dist, + self.scaling, + accept_new_repr, + ) # check if new representative state if state_idx == self.n_representatives: self.n_representatives += 1 - if self.n_representatives >= self.max_repr \ - and (not self._overflow_warning): - logger.warning("OnlineDiscretizationCounter reached \ -the maximum number of representative states.") + if self.n_representatives >= self.max_repr and (not self._overflow_warning): + logger.warning( + "OnlineDiscretizationCounter reached \ +the maximum number of representative states." + ) self._overflow_warning = True return state_idx, dist_to_closest - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def update(self, state, action, next_state=None, reward=None, **kwargs): state_idx, _ = self._get_representative_state(state) self.N_sa[state_idx, action] += 1 - @preprocess_args(expected_type='numpy') + @preprocess_args(expected_type="numpy") def measure(self, state, action, **kwargs): n = np.maximum(1.0, self.count(state, action)) return np.power(1 / n, self.rate_power) def count(self, state, action): state_idx, dist_to_closest = self._get_representative_state( - state, - accept_new_repr=False) + state, accept_new_repr=False + ) # if state is too far from the closest representative, # its count is zero. 
if dist_to_closest > self.min_dist: diff --git a/rlberry/exploration_tools/tests/test_discrete_counter.py b/rlberry/exploration_tools/tests/test_discrete_counter.py index d80dc56ad..25a8c36d0 100644 --- a/rlberry/exploration_tools/tests/test_discrete_counter.py +++ b/rlberry/exploration_tools/tests/test_discrete_counter.py @@ -4,13 +4,17 @@ from rlberry.envs import MountainCar from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom from rlberry.exploration_tools.discrete_counter import DiscreteCounter -from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter +from rlberry.exploration_tools.online_discretization_counter import ( + OnlineDiscretizationCounter, +) @pytest.mark.parametrize("rate_power", [0.5, 1]) def test_discrete_env(rate_power): env = GridWorld() - counter = DiscreteCounter(env.observation_space, env.action_space, rate_power=rate_power) + counter = DiscreteCounter( + env.observation_space, env.action_space, rate_power=rate_power + ) for N in range(10, 20): assert counter.get_n_visited_states() == 0 @@ -37,7 +41,9 @@ def test_discrete_env(rate_power): @pytest.mark.parametrize("rate_power", [0.5, 1]) def test_continuous_state_env(rate_power): env = MountainCar() - counter = DiscreteCounter(env.observation_space, env.action_space, rate_power=rate_power) + counter = DiscreteCounter( + env.observation_space, env.action_space, rate_power=rate_power + ) for N in [10, 20]: for _ in range(50): @@ -60,9 +66,9 @@ def test_continuous_state_env(rate_power): @pytest.mark.parametrize("rate_power", [True, False]) def test_continuous_state_env_2(rate_power): env = MountainCar() - counter = OnlineDiscretizationCounter(env.observation_space, - env.action_space, - rate_power=rate_power) + counter = OnlineDiscretizationCounter( + env.observation_space, env.action_space, rate_power=rate_power + ) for N in [10, 20]: for _ in range(50): @@ -81,10 +87,9 @@ def test_continuous_state_env_2(rate_power): def test_continuous_state_env_3(): env = NRoom(nrooms=3, array_observation=True) - counter = OnlineDiscretizationCounter(env.observation_space, - env.action_space, - rate_power=0.5, - min_dist=0.0) + counter = OnlineDiscretizationCounter( + env.observation_space, env.action_space, rate_power=0.5, min_dist=0.0 + ) for N in range(10, 20): assert counter.get_n_visited_states() == 0 @@ -101,6 +106,8 @@ def test_continuous_state_env_3(): assert np.allclose(counter.measure(continuous_ss, aa), np.sqrt(1.0 / N)) assert counter.get_n_visited_states() == env.discrete_observation_space.n - assert np.allclose(counter.get_entropy(), np.log2(env.discrete_observation_space.n)) + assert np.allclose( + counter.get_entropy(), np.log2(env.discrete_observation_space.n) + ) counter.reset() diff --git a/rlberry/exploration_tools/torch/rnd.py b/rlberry/exploration_tools/torch/rnd.py index 8e137c91c..1acf88de7 100644 --- a/rlberry/exploration_tools/torch/rnd.py +++ b/rlberry/exploration_tools/torch/rnd.py @@ -24,29 +24,34 @@ def get_network(shape, embedding_dim): else: raise ValueError("Unknown image convention") - return ConvolutionalNetwork(in_channels=C, - in_width=W, - in_height=H, - out_size=embedding_dim, - activation="ELU", - transpose_obs=transpose_obs, - is_policy=False) + return ConvolutionalNetwork( + in_channels=C, + in_width=W, + in_height=H, + out_size=embedding_dim, + activation="ELU", + transpose_obs=transpose_obs, + is_policy=False, + ) elif len(shape) == 2: H, W = shape - return ConvolutionalNetwork(in_channels=1, - in_width=W, - in_height=H, - activation="ELU", 
- out_size=embedding_dim) + return ConvolutionalNetwork( + in_channels=1, + in_width=W, + in_height=H, + activation="ELU", + out_size=embedding_dim, + ) elif len(shape) == 1: - return MultiLayerPerceptron(in_size=shape[0], - activation="RELU", - layer_sizes=[64, 64], - out_size=embedding_dim) + return MultiLayerPerceptron( + in_size=shape[0], + activation="RELU", + layer_sizes=[64, 64], + out_size=embedding_dim, + ) else: - raise ValueError("Incompatible observation shape: {}" - .format(shape)) + raise ValueError("Incompatible observation shape: {}".format(shape)) class RandomNetworkDistillation(UncertaintyEstimator): @@ -58,20 +63,22 @@ class RandomNetworkDistillation(UncertaintyEstimator): In International Conference on Learning Representations. """ - def __init__(self, - observation_space, - action_space, - learning_rate=0.001, - update_period=100, - embedding_dim=10, - net_fn=None, - net_kwargs=None, - device="cuda:best", - rate_power=0.5, - batch_size=10, - memory_size=10000, - with_action=False, - **kwargs): + def __init__( + self, + observation_space, + action_space, + learning_rate=0.001, + update_period=100, + embedding_dim=10, + net_fn=None, + net_kwargs=None, + device="cuda:best", + rate_power=0.5, + batch_size=10, + memory_size=10000, + with_action=False, + **kwargs + ): assert isinstance(observation_space, spaces.Box) UncertaintyEstimator.__init__(self, observation_space, action_space) self.learning_rate = learning_rate @@ -79,8 +86,14 @@ def __init__(self, self.update_period = update_period self.embedding_dim = embedding_dim out_size = embedding_dim * action_space.n if with_action else embedding_dim - self.net_fn = load(net_fn) if isinstance(net_fn, str) else \ - net_fn or partial(get_network, shape=observation_space.shape, embedding_dim=out_size) + self.net_fn = ( + load(net_fn) + if isinstance(net_fn, str) + else net_fn + or partial( + get_network, shape=observation_space.shape, embedding_dim=out_size + ) + ) self.net_kwargs = net_kwargs or {} if "out_size" in self.net_kwargs: self.net_kwargs["out_size"] = out_size @@ -97,7 +110,8 @@ def reset(self, **kwargs): self.rnd_optimizer = torch.optim.Adam( self.predictor_network.parameters(), lr=self.learning_rate, - betas=(0.9, 0.999)) + betas=(0.9, 0.999), + ) self.count = 0 self.loss = torch.tensor(0.0).to(self.device) @@ -111,24 +125,27 @@ def _get_embeddings(self, state, action=None, batch=False, all_actions=False): predicted_embedding = self.predictor_network(state) if self.with_action: - random_embedding = random_embedding.view((state.shape[0], self.action_space.n, -1)) - predicted_embedding = predicted_embedding.view((state.shape[0], self.action_space.n, -1)) + random_embedding = random_embedding.view( + (state.shape[0], self.action_space.n, -1) + ) + predicted_embedding = predicted_embedding.view( + (state.shape[0], self.action_space.n, -1) + ) if not all_actions: action = action.long().to(self.device) if not batch: action = action.unsqueeze(0) - action = action.unsqueeze(1).repeat(1, random_embedding.shape[-1]).unsqueeze(1) + action = ( + action.unsqueeze(1) + .repeat(1, random_embedding.shape[-1]) + .unsqueeze(1) + ) random_embedding = random_embedding.gather(1, action).squeeze(1) predicted_embedding = predicted_embedding.gather(1, action).squeeze(1) return random_embedding, predicted_embedding - @preprocess_args(expected_type='torch') - def update(self, - state, - action=None, - next_state=None, - reward=None, - **kwargs): + @preprocess_args(expected_type="torch") + def update(self, state, action=None, next_state=None, 
reward=None, **kwargs): batch = [(state, action)] if self.batch_size > 0 and not self.memory.is_empty(): @@ -139,10 +156,11 @@ def update(self, if self.with_action: actions = torch.stack(actions) - random_embedding, predicted_embedding = self._get_embeddings(states, actions, batch=True) + random_embedding, predicted_embedding = self._get_embeddings( + states, actions, batch=True + ) - self.loss += self.loss_fn(random_embedding.detach(), - predicted_embedding) + self.loss += self.loss_fn(random_embedding.detach(), predicted_embedding) self.count += 1 if self.count % self.update_period == 0: @@ -152,19 +170,27 @@ def update(self, self.rnd_optimizer.step() self.loss = torch.tensor(0.0).to(self.device) - @preprocess_args(expected_type='torch') + @preprocess_args(expected_type="torch") def measure(self, state, action=None, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings(state, action, batch=False) - error = torch.norm(predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1) + random_embedding, predicted_embedding = self._get_embeddings( + state, action, batch=False + ) + error = torch.norm( + predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 + ) return error.pow(2 * self.rate_power).item() - @preprocess_args(expected_type='torch') + @preprocess_args(expected_type="torch") def measure_batch(self, states, actions, **kwargs): - random_embedding, predicted_embedding = self._get_embeddings(states, actions, batch=True) - error = torch.norm(predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1) + random_embedding, predicted_embedding = self._get_embeddings( + states, actions, batch=True + ) + error = torch.norm( + predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 + ) return error.pow(2 * self.rate_power) - @preprocess_args(expected_type='torch') + @preprocess_args(expected_type="torch") def measure_batch_all_actions(self, states, **kwargs): """ Measure N(s,a) for all a in A. 
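
The bonus returned by measure() and measure_batch() above is the L2 prediction error raised to twice rate_power; in our notation (f is the frozen random target network, \hat f_\theta the trained predictor, \rho = rate_power):

    b(s, a) = \lVert \hat f_\theta(s, a) - f(s, a) \rVert_2^{\,2\rho}

The predictor side of this error is what update() accumulates into self.loss and optimizes once every update_period calls.
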
@@ -178,6 +204,10 @@ def measure_batch_all_actions(self, states, **kwargs): N(s,a): an array of shape B x A """ assert self.with_action - random_embedding, predicted_embedding = self._get_embeddings(states, None, batch=True, all_actions=True) - error = torch.norm(predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1) + random_embedding, predicted_embedding = self._get_embeddings( + states, None, batch=True, all_actions=True + ) + error = torch.norm( + predicted_embedding.detach() - random_embedding.detach(), p=2, dim=-1 + ) return error.pow(2 * self.rate_power) diff --git a/rlberry/exploration_tools/torch/tests/test_rnd.py b/rlberry/exploration_tools/torch/tests/test_rnd.py index 60d13fa50..60649b096 100644 --- a/rlberry/exploration_tools/torch/tests/test_rnd.py +++ b/rlberry/exploration_tools/torch/tests/test_rnd.py @@ -12,7 +12,8 @@ def test_rnd(): env.action_space, learning_rate=0.1, update_period=100, - embedding_dim=2) + embedding_dim=2, + ) # Test state = env.reset() diff --git a/rlberry/exploration_tools/typing.py b/rlberry/exploration_tools/typing.py index 0b41c70a2..b51aabf64 100644 --- a/rlberry/exploration_tools/typing.py +++ b/rlberry/exploration_tools/typing.py @@ -9,9 +9,9 @@ def _get_type(arg): if _TORCH_INSTALLED and isinstance(arg, torch.Tensor): - return 'torch' + return "torch" elif isinstance(arg, np.ndarray): - return 'numpy' + return "numpy" else: return type(arg) @@ -32,7 +32,7 @@ def process_type(arg, expected_type): if arg is None: return None - if expected_type == 'torch': + if expected_type == "torch": assert _TORCH_INSTALLED, "expected_type is 'torch', but torch is not installed!" if isinstance(arg, torch.Tensor): return arg @@ -42,7 +42,7 @@ def process_type(arg, expected_type): return torch.tensor(arg) else: return arg - elif expected_type == 'numpy': + elif expected_type == "numpy": if isinstance(arg, np.ndarray): return arg elif _TORCH_INSTALLED and isinstance(arg, torch.Tensor): diff --git a/rlberry/exploration_tools/uncertainty_estimator.py b/rlberry/exploration_tools/uncertainty_estimator.py index 923b9f9d3..868b4c90e 100644 --- a/rlberry/exploration_tools/uncertainty_estimator.py +++ b/rlberry/exploration_tools/uncertainty_estimator.py @@ -22,10 +22,13 @@ def measure(self, state, action, **kwargs): def measure_batch(self, states, actions, **kwargs): batch = [self.measure(s, a, **kwargs) for s, a in zip(states, actions)] - if _get_type(batch[0]) == 'torch': + if _get_type(batch[0]) == "torch": import torch + return torch.FloatTensor(batch) return np.array(batch) def measure_batch_all_actions(self, states): - return np.array([[self.measure(s, a) for a in range(self.action_space.n)] for s in states]) + return np.array( + [[self.measure(s, a) for a in range(self.action_space.n)] for s in states] + ) diff --git a/rlberry/manager/agent_manager.py b/rlberry/manager/agent_manager.py index 9c4ec685c..0215501fd 100644 --- a/rlberry/manager/agent_manager.py +++ b/rlberry/manager/agent_manager.py @@ -36,6 +36,7 @@ # Aux # + class AgentHandler: """ Wraps an Agent so that it can be either loaded in memory @@ -58,13 +59,9 @@ class AgentHandler: Arguments required by __init__ method of agent_class. 
""" - def __init__(self, - id, - filename, - seeder, - agent_class, - agent_instance=None, - agent_kwargs=None) -> None: + def __init__( + self, id, filename, seeder, agent_class, agent_instance=None, agent_kwargs=None + ) -> None: self._id = id self._fname = Path(filename) self._seeder = seeder @@ -92,12 +89,16 @@ def is_loaded(self): def load(self) -> bool: try: - self._agent_instance = self._agent_class.load(self._fname, **self._agent_kwargs) + self._agent_instance = self._agent_class.load( + self._fname, **self._agent_kwargs + ) safe_reseed(self._agent_instance.env, self._seeder) return True except Exception as ex: self._agent_instance = None - logger.error(f'Failed call to AgentHandler.load() for {self._agent_class}: {ex}') + logger.error( + f"Failed call to AgentHandler.load() for {self._agent_class}: {ex}" + ) return False def dump(self): @@ -107,7 +108,9 @@ def dump(self): # saved_filename might have appended the correct extension, for instance, # so self._fname must be updated. if not saved_filename: - logger.warning(f'Instance of {self._agent_class} cannot be saved and will be kept in memory.') + logger.warning( + f"Instance of {self._agent_class} cannot be saved and will be kept in memory." + ) return self._fname = Path(saved_filename) del self._agent_instance @@ -117,16 +120,18 @@ def __getattr__(self, attr): """ Allows AgentHandler to behave like the handled Agent. """ - if attr[:2] == '__': + if attr[:2] == "__": raise AttributeError(attr) if attr in self.__dict__: return getattr(self, attr) - assert not self.is_empty(), 'Calling AgentHandler with no agent instance stored.' + assert ( + not self.is_empty() + ), "Calling AgentHandler with no agent instance stored." if not self.is_loaded(): loaded = self.load() if not loaded: - raise RuntimeError(f'Could not load Agent from {self._fname}.') + raise RuntimeError(f"Could not load Agent from {self._fname}.") return getattr(self._agent_instance, attr) @@ -195,24 +200,26 @@ class AgentManager: init_kwargs_per_instance will be used. """ - def __init__(self, - agent_class, - train_env, - fit_budget=None, - eval_env=None, - init_kwargs=None, - fit_kwargs=None, - eval_kwargs=None, - agent_name=None, - n_fit=4, - output_dir=None, - parallelization='thread', - worker_logging_level='INFO', - seed=None, - enable_tensorboard=False, - create_unique_out_dir=True, - default_writer_kwargs=None, - init_kwargs_per_instance=None): + def __init__( + self, + agent_class, + train_env, + fit_budget=None, + eval_env=None, + init_kwargs=None, + fit_kwargs=None, + eval_kwargs=None, + agent_name=None, + n_fit=4, + output_dir=None, + parallelization="thread", + worker_logging_level="INFO", + seed=None, + enable_tensorboard=False, + create_unique_out_dir=True, + default_writer_kwargs=None, + init_kwargs_per_instance=None, + ): # agent_class should only be None when the constructor is called # by the class method AgentManager.load(), since the agent class # will be loaded. 
@@ -229,10 +236,12 @@ def __init__(self, # Check train_env and eval_env assert isinstance( - train_env, Tuple), "[AgentManager]train_env must be Tuple (constructor, kwargs)" + train_env, Tuple + ), "[AgentManager]train_env must be Tuple (constructor, kwargs)" if eval_env is not None: assert isinstance( - eval_env, Tuple), "[AgentManager]train_env must be Tuple (constructor, kwargs)" + eval_env, Tuple + ), "[AgentManager]train_env must be Tuple (constructor, kwargs)" # create oject identifier self.unique_id = metadata_utils.get_unique_id(self) @@ -265,25 +274,27 @@ def __init__(self, self.fit_budget = fit_budget else: try: - self.fit_budget = self.fit_kwargs.pop('fit_budget') + self.fit_budget = self.fit_kwargs.pop("fit_budget") except KeyError: - raise ValueError('[AgentManager] fit_budget missing in __init__().') + raise ValueError("[AgentManager] fit_budget missing in __init__().") # extra params per instance if init_kwargs_per_instance is not None: assert len(init_kwargs_per_instance) == n_fit init_kwargs_per_instance = deepcopy(init_kwargs_per_instance) - self.init_kwargs_per_instance = init_kwargs_per_instance or [dict() for _ in range(n_fit)] + self.init_kwargs_per_instance = init_kwargs_per_instance or [ + dict() for _ in range(n_fit) + ] # output dir if output_dir is None: output_dir = metadata_utils.RLBERRY_TEMP_DATA_DIR - self.output_dir = Path(output_dir) / 'manager_data' + self.output_dir = Path(output_dir) / "manager_data" if create_unique_out_dir: - self.output_dir = self.output_dir / (self.agent_name + '_' + self.unique_id) + self.output_dir = self.output_dir / (self.agent_name + "_" + self.unique_id) # Create list of writers for each agent that will be trained # 'default' will keep Agent's use of DefaultWriter. - self.writers = [('default', None) for _ in range(n_fit)] + self.writers = [("default", None) for _ in range(n_fit)] # Parameters to setup Agent's DefaultWriter self.agent_default_writer_kwargs = [ @@ -291,22 +302,24 @@ def __init__(self, name=self.agent_name, log_interval=3, tensorboard_kwargs=None, - execution_metadata=metadata_utils.ExecutionMetadata(obj_worker_id=idx) + execution_metadata=metadata_utils.ExecutionMetadata(obj_worker_id=idx), ) for idx in range(n_fit) ] self.tensorboard_dir = None if enable_tensorboard: - self.tensorboard_dir = self.output_dir / 'tensorboard' + self.tensorboard_dir = self.output_dir / "tensorboard" for idx, params in enumerate(self.agent_default_writer_kwargs): - params['tensorboard_kwargs'] = dict( + params["tensorboard_kwargs"] = dict( log_dir=self.tensorboard_dir / str(idx) ) # Update DefaultWriter according to user's settings. default_writer_kwargs = default_writer_kwargs or {} if default_writer_kwargs: - logger.warning('(Re)defining the following DefaultWriter' - f' parameters in AgentManager: {list(default_writer_kwargs.keys())}') + logger.warning( + "(Re)defining the following DefaultWriter" + f" parameters in AgentManager: {list(default_writer_kwargs.keys())}" + ) for ii in range(n_fit): self.agent_default_writer_kwargs[ii].update(default_writer_kwargs) @@ -324,13 +337,15 @@ def __init__(self, def _init_optuna_storage_url(self): self.output_dir.mkdir(parents=True, exist_ok=True) - self.db_filename = self.output_dir / 'optuna_data.db' + self.db_filename = self.output_dir / "optuna_data.db" if create_database(self.db_filename): self.optuna_storage_url = f"sqlite:///{self.db_filename}" else: self.db_filename = None self.optuna_storage_url = "sqlite:///:memory:" - logger.warning(f'Unable to create databate {self.db_filename}. 
Using sqlite:///:memory:') + logger.warning( + f"Unable to create databate {self.db_filename}. Using sqlite:///:memory:" + ) def _set_init_kwargs(self): init_seeders = self.seeder.spawn(self.n_fit, squeeze=False) @@ -344,7 +359,9 @@ def _set_init_kwargs(self): copy_env=False, seeder=init_seeders[ii], output_dir=Path(self.output_dir) / f"output_{ii}", - _execution_metadata=self.agent_default_writer_kwargs[ii]['execution_metadata'], + _execution_metadata=self.agent_default_writer_kwargs[ii][ + "execution_metadata" + ], _default_writer_kwargs=self.agent_default_writer_kwargs[ii], ) ) @@ -357,7 +374,7 @@ def _reset_agent_handlers(self): self.agent_handlers = [ AgentHandler( id=ii, - filename=self.output_dir / Path(f'agent_handlers/idx_{ii}'), + filename=self.output_dir / Path(f"agent_handlers/idx_{ii}"), seeder=handlers_seeders[ii], agent_class=self.agent_class, agent_instance=None, @@ -379,7 +396,9 @@ def get_writer_data(self): def get_agent_instances(self): if self.agent_handlers: - return [agent_handler.get_instance() for agent_handler in self.agent_handlers] + return [ + agent_handler.get_instance() for agent_handler in self.agent_handlers + ] return [] def eval_agents(self, n_simulations: Optional[int] = None) -> list: @@ -403,11 +422,13 @@ def eval_agents(self, n_simulations: Optional[int] = None) -> list: agent_idx = self.eval_seeder.rng.choice(len(self.agent_handlers)) agent = self.agent_handlers[agent_idx] if agent.is_empty(): - logger.error('Calling eval() in an AgentManager instance contaning an empty AgentHandler.' - ' Returning [].') + logger.error( + "Calling eval() in an AgentManager instance contaning an empty AgentHandler." + " Returning []." + ) return [] values.append(agent.eval(**self.eval_kwargs)) - logger.info(f'[eval]... simulation {ii + 1}/{n_simulations}') + logger.info(f"[eval]... simulation {ii + 1}/{n_simulations}") return values def clear_output_dir(self): @@ -415,7 +436,7 @@ def clear_output_dir(self): try: shutil.rmtree(self.output_dir) except FileNotFoundError: - logger.warning(f'No directory {self.output_dir} found to be deleted.') + logger.warning(f"No directory {self.output_dir} found to be deleted.") def clear_handlers(self): """Delete files from output_dir/agent_handlers that are managed by this class.""" @@ -443,8 +464,9 @@ def set_writer(self, idx, writer_fn, writer_kwargs=None): AgentManager fits `n_fit` agents, the writer of each one of them needs to be set separetely. 
""" - assert idx >= 0 and idx < self.n_fit, \ - "Invalid index sent to AgentManager.set_writer()" + assert ( + idx >= 0 and idx < self.n_fit + ), "Invalid index sent to AgentManager.set_writer()" writer_kwargs = writer_kwargs or {} self.writers[idx] = (writer_fn, writer_kwargs) @@ -464,29 +486,36 @@ def fit(self, budget=None, **kwargs): for handler in self.agent_handlers: handler.dump() - if self.parallelization == 'thread': + if self.parallelization == "thread": executor_class = concurrent.futures.ThreadPoolExecutor lock = threading.Lock() - elif self.parallelization == 'process': + elif self.parallelization == "process": executor_class = functools.partial( concurrent.futures.ProcessPoolExecutor, - mp_context=multiprocessing.get_context('spawn')) + mp_context=multiprocessing.get_context("spawn"), + ) lock = multiprocessing.Manager().Lock() else: - raise ValueError(f'Invalid backend for parallelization: {self.parallelization}') - - args = [( - lock, - handler, - self.agent_class, - budget, - init_kwargs, - deepcopy(self.fit_kwargs), - writer, - self.worker_logging_level, - seeder) - for init_kwargs, handler, seeder, writer - in zip(self.init_kwargs, self.agent_handlers, seeders, self.writers)] + raise ValueError( + f"Invalid backend for parallelization: {self.parallelization}" + ) + + args = [ + ( + lock, + handler, + self.agent_class, + budget, + init_kwargs, + deepcopy(self.fit_kwargs), + writer, + self.worker_logging_level, + seeder, + ) + for init_kwargs, handler, seeder, writer in zip( + self.init_kwargs, self.agent_handlers, seeders, self.writers + ) + ] if len(args) == 1: workers_output = [_fit_worker(args[0])] @@ -499,9 +528,7 @@ def fit(self, budget=None, **kwargs): workers_output = [] for future in concurrent.futures.as_completed(futures): - workers_output.append( - future.result() - ) + workers_output.append(future.result()) executor.shutdown() workers_output.sort(key=lambda x: x.id) @@ -536,7 +563,7 @@ def save(self): output_dir.mkdir(parents=True, exist_ok=True) # save optimized hyperparameters if self.best_hyperparams is not None: - fname = Path(output_dir) / 'best_hyperparams.json' + fname = Path(output_dir) / "best_hyperparams.json" _safe_serialize_json(self.best_hyperparams, fname) # save default_writer_data that can be aggregated in a pandas DataFrame if self.default_writer_data is not None: @@ -549,7 +576,7 @@ def save(self): try: output = pd.DataFrame(all_writer_data) # save - fname = Path(output_dir) / 'data.csv' + fname = Path(output_dir) / "data.csv" output.to_csv(fname, index=None) except Exception: logger.warning("Could not save default_writer_data.") @@ -563,7 +590,7 @@ def save(self): handler.dump() # save - filename = Path('manager_obj').with_suffix('.pickle') + filename = Path("manager_obj").with_suffix(".pickle") filename = output_dir / filename filename.parent.mkdir(parents=True, exist_ok=True) try: @@ -574,7 +601,9 @@ def save(self): try: with filename.open("wb") as ff: dill.dump(self.__dict__, ff) - logger.info("Saved AgentManager({}) using dill.".format(self.agent_name)) + logger.info( + "Saved AgentManager({}) using dill.".format(self.agent_name) + ) except Exception as ex: logger.warning("[AgentManager] Instance cannot be pickled: " + str(ex)) @@ -582,15 +611,15 @@ def save(self): @classmethod def load(cls, filename): - filename = Path(filename).with_suffix('.pickle') + filename = Path(filename).with_suffix(".pickle") obj = cls(None, None, None) try: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = pickle.load(ff) 
logger.info("Loaded AgentManager using pickle.") except Exception: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = dill.load(ff) logger.info("Loaded AgentManager using dill.") @@ -598,18 +627,20 @@ def load(cls, filename): obj.__dict__.update(tmp_dict) return obj - def optimize_hyperparams(self, - n_trials=256, - timeout=60, - n_fit=2, - n_optuna_workers=2, - optuna_parallelization='thread', - sampler_method='optuna_default', - pruner_method='halving', - continue_previous=False, - fit_fraction=1.0, - sampler_kwargs=None, - disable_evaluation_writers=True): + def optimize_hyperparams( + self, + n_trials=256, + timeout=60, + n_fit=2, + n_optuna_workers=2, + optuna_parallelization="thread", + sampler_method="optuna_default", + pruner_method="halving", + continue_previous=False, + fit_fraction=1.0, + sampler_kwargs=None, + disable_evaluation_writers=True, + ): """ Run hyperparameter optimization and updates init_kwargs with the best hyperparameters found. @@ -670,7 +701,7 @@ def optimize_hyperparams(self, # # setup # - TEMP_DIR = self.output_dir / 'optim' + TEMP_DIR = self.output_dir / "optim" global _OPTUNA_INSTALLED if not _OPTUNA_INSTALLED: logging.error("Optuna not installed.") @@ -689,42 +720,43 @@ def optimize_hyperparams(self, if sampler_kwargs is None: sampler_kwargs = {} # get sampler - if sampler_method == 'random': + if sampler_method == "random": sampler = optuna.samplers.RandomSampler() - elif sampler_method == 'grid': - assert sampler_kwargs is not None, \ - "To use GridSampler, " + \ - "a search_space dictionary must be provided." + elif sampler_method == "grid": + assert sampler_kwargs is not None, ( + "To use GridSampler, " + + "a search_space dictionary must be provided." + ) sampler = optuna.samplers.GridSampler(**sampler_kwargs) - elif sampler_method == 'cmaes': + elif sampler_method == "cmaes": sampler = optuna.samplers.CmaEsSampler(**sampler_kwargs) - elif sampler_method == 'optuna_default': + elif sampler_method == "optuna_default": sampler = optuna.samplers.TPESampler(**sampler_kwargs) else: raise NotImplementedError( - "Sampler method %s is not implemented." % sampler_method) + "Sampler method %s is not implemented." % sampler_method + ) # get pruner - if pruner_method == 'halving': + if pruner_method == "halving": pruner = optuna.pruners.SuccessiveHalvingPruner( - min_resource=1, - reduction_factor=4, - min_early_stopping_rate=0) - elif pruner_method == 'none': + min_resource=1, reduction_factor=4, min_early_stopping_rate=0 + ) + elif pruner_method == "none": pruner = None else: raise NotImplementedError( - "Pruner method %s is not implemented." % pruner_method) + "Pruner method %s is not implemented." 
% pruner_method + ) # storage self._init_optuna_storage_url() storage = optuna.storages.RDBStorage(self.optuna_storage_url) # optuna study - study = optuna.create_study(sampler=sampler, - pruner=pruner, - storage=storage, - direction='maximize') + study = optuna.create_study( + sampler=sampler, pruner=pruner, storage=storage, direction="maximize" + ) self.optuna_study = study # save, to that optimization can be resumed later @@ -744,11 +776,11 @@ def optimize_hyperparams(self, n_fit=n_fit, temp_dir=TEMP_DIR, # TEMP_DIR disable_evaluation_writers=disable_evaluation_writers, - fit_fraction=fit_fraction + fit_fraction=fit_fraction, ) try: - if optuna_parallelization == 'thread': + if optuna_parallelization == "thread": with concurrent.futures.ThreadPoolExecutor() as executor: for _ in range(n_optuna_workers): executor.submit( @@ -756,21 +788,26 @@ def optimize_hyperparams(self, objective, n_trials=n_trials, timeout=timeout, - gc_after_trial=True) + gc_after_trial=True, + ) executor.shutdown() - elif optuna_parallelization == 'process': + elif optuna_parallelization == "process": with concurrent.futures.ProcessPoolExecutor( - mp_context=multiprocessing.get_context('spawn')) as executor: + mp_context=multiprocessing.get_context("spawn") + ) as executor: for _ in range(n_optuna_workers): executor.submit( study.optimize, objective, n_trials=n_trials // n_optuna_workers, timeout=timeout, - gc_after_trial=True) + gc_after_trial=True, + ) executor.shutdown() else: - raise ValueError(f'Invalid value for optuna_parallelization: {optuna_parallelization}.') + raise ValueError( + f"Invalid value for optuna_parallelization: {optuna_parallelization}." + ) except KeyboardInterrupt: logger.warning("Evaluation stopped.") @@ -779,21 +816,21 @@ def optimize_hyperparams(self, try: shutil.rmtree(TEMP_DIR) except FileNotFoundError as ex: - logger.warning(f'Could not delete {TEMP_DIR}: {ex}') + logger.warning(f"Could not delete {TEMP_DIR}: {ex}") # continue try: best_trial = study.best_trial except ValueError as ex: - logger.error(f'Hyperparam optimization failed due to the error: {ex}') + logger.error(f"Hyperparam optimization failed due to the error: {ex}") return dict() - logger.info(f'Number of finished trials: {len(study.trials)}') - logger.info('Best trial:') - logger.info(f'Value: {best_trial.value}') - logger.info('Params:') + logger.info(f"Number of finished trials: {len(study.trials)}") + logger.info("Best trial:") + logger.info(f"Value: {best_trial.value}") + logger.info("Params:") for key, value in best_trial.params.items(): - logger.info(f' {key}: {value}') + logger.info(f" {key}: {value}") # store best parameters self.best_hyperparams = best_trial.params @@ -817,8 +854,17 @@ def _fit_worker(args): """ Create and fit an agent instance """ - (lock, agent_handler, agent_class, fit_budget, init_kwargs, - fit_kwargs, writer, worker_logging_level, seeder) = args + ( + lock, + agent_handler, + agent_class, + fit_budget, + init_kwargs, + fit_kwargs, + writer, + worker_logging_level, + seeder, + ) = args # reseed external libraries set_external_seed(seeder) @@ -833,13 +879,15 @@ def _fit_worker(args): # create agent agent = agent_class(**init_kwargs) # seed agent - agent.reseed(seeder) # TODO: check if extra reseeding here is necessary + agent.reseed(seeder) # TODO: check if extra reseeding here is necessary agent_handler.set_instance(agent) # set writer if writer[0] is None: agent_handler.set_writer(None) - elif writer[0] != 'default': # 'default' corresponds to DefaultWriter created by Agent.__init__() + 
elif ( + writer[0] != "default" + ): # 'default' corresponds to DefaultWriter created by Agent.__init__() writer_fn = writer[0] writer_kwargs = writer[1] agent_handler.set_writer(writer_fn(**writer_kwargs)) @@ -868,22 +916,22 @@ def _safe_serialize_json(obj, filename): def default(obj): return f"<>" - with open(filename, 'w') as fp: + with open(filename, "w") as fp: json.dump(obj, fp, sort_keys=True, indent=4, default=default) def _optuna_objective( - trial, - base_init_kwargs, # self._base_init_kwargs - agent_class, # self.agent_class - train_env, # self.train_env - eval_env, - fit_budget, # self.fit_budget - eval_kwargs, # self.eval_kwargs - n_fit, - temp_dir, # TEMP_DIR - disable_evaluation_writers, - fit_fraction + trial, + base_init_kwargs, # self._base_init_kwargs + agent_class, # self.agent_class + train_env, # self.train_env + eval_env, + fit_budget, # self.fit_budget + eval_kwargs, # self.eval_kwargs + n_fit, + temp_dir, # TEMP_DIR + disable_evaluation_writers, + fit_fraction, ): kwargs = deepcopy(base_init_kwargs) @@ -902,13 +950,14 @@ def _optuna_objective( eval_env=eval_env, init_kwargs=kwargs, # kwargs are being optimized eval_kwargs=deepcopy(eval_kwargs), - agent_name='optim', + agent_name="optim", n_fit=n_fit, - worker_logging_level='INFO', - parallelization='thread', + worker_logging_level="INFO", + parallelization="thread", output_dir=temp_dir, enable_tensorboard=False, - create_unique_out_dir=True) + create_unique_out_dir=True, + ) if disable_evaluation_writers: for ii in range(params_stats.n_fit): diff --git a/rlberry/manager/evaluation.py b/rlberry/manager/evaluation.py index dd3a64b47..8d56a9ae0 100644 --- a/rlberry/manager/evaluation.py +++ b/rlberry/manager/evaluation.py @@ -6,12 +6,14 @@ logger = logging.getLogger(__name__) -def evaluate_agents(agent_manager_list, - n_simulations=5, - fignum=None, - show=True, - plot=True, - sns_kwargs=None): +def evaluate_agents( + agent_manager_list, + n_simulations=5, + fignum=None, + show=True, + plot=True, + sns_kwargs=None, +): """ Evaluate and compare each of the agents in agent_manager_list. @@ -41,13 +43,15 @@ def evaluate_agents(agent_manager_list, eval_outputs = [] for agent_manager in agent_manager_list: - logger.info(f'Evaluating {agent_manager.agent_name}...') + logger.info(f"Evaluating {agent_manager.agent_name}...") outputs = agent_manager.eval_agents(n_simulations) if len(outputs) > 0: eval_outputs.append(outputs) if len(eval_outputs) == 0: - logger.error('[evaluate_agents]: No evaluation data. Make sure AgentManager.fit() has been called.') + logger.error( + "[evaluate_agents]: No evaluation data. Make sure AgentManager.fit() has been called." + ) return # @@ -85,14 +89,16 @@ def evaluate_agents(agent_manager_list, return output -def plot_writer_data(agent_manager, - tag, - xtag=None, - fignum=None, - show=True, - preprocess_func=None, - title=None, - sns_kwargs=None): +def plot_writer_data( + agent_manager, + tag, + xtag=None, + fignum=None, + show=True, + preprocess_func=None, + title=None, + sns_kwargs=None, +): """ Given a list of AgentManager, plot data (corresponding to info) obtained in each episode. The dictionary returned by agents' .fit() method must contain a key equal to `info`. @@ -120,11 +126,11 @@ def plot_writer_data(agent_manager, ------- Pandas DataFrame with processed data used by seaborn's lineplot. 
""" - sns_kwargs = sns_kwargs or {'ci': 'sd'} + sns_kwargs = sns_kwargs or {"ci": "sd"} title = title or tag if preprocess_func is not None: - ylabel = 'value' + ylabel = "value" else: ylabel = tag preprocess_func = preprocess_func or (lambda x: x) @@ -143,34 +149,36 @@ def plot_writer_data(agent_manager, if writer_data is not None: for idx in writer_data: df = writer_data[idx] - processed_df = pd.DataFrame(df[df['tag'] == tag]) - processed_df['value'] = preprocess_func(processed_df['value'].values) + processed_df = pd.DataFrame(df[df["tag"] == tag]) + processed_df["value"] = preprocess_func(processed_df["value"].values) # update name according to AgentManager name - processed_df['name'] = agent_name + processed_df["name"] = agent_name # add column with xtag, if given if xtag is not None: - df_xtag = pd.DataFrame(df[df['tag'] == xtag]) - processed_df[xtag] = df_xtag['value'].values + df_xtag = pd.DataFrame(df[df["tag"] == xtag]) + processed_df[xtag] = df_xtag["value"].values data_list.append(processed_df) if len(data_list) == 0: - logger.error('[plot_writer_data]: No data to be plotted.') + logger.error("[plot_writer_data]: No data to be plotted.") return all_writer_data = pd.concat(data_list, ignore_index=True) - data = all_writer_data[all_writer_data['tag'] == tag] + data = all_writer_data[all_writer_data["tag"] == tag] if xtag is None: - xtag = 'global_step' + xtag = "global_step" if data[xtag].notnull().sum() > 0: xx = xtag - if data['global_step'].isna().sum() > 0: - logger.warning(f'Plotting {tag} vs {xtag}, but {xtag} might be missing for some agents.') + if data["global_step"].isna().sum() > 0: + logger.warning( + f"Plotting {tag} vs {xtag}, but {xtag} might be missing for some agents." + ) else: xx = data.index plt.figure(fignum) - lineplot_kwargs = dict(x=xx, y='value', hue='name', style='name', data=data) + lineplot_kwargs = dict(x=xx, y="value", hue="name", style="name", data=data) lineplot_kwargs.update(sns_kwargs) sns.lineplot(**lineplot_kwargs) plt.title(title) diff --git a/rlberry/manager/multiple_managers.py b/rlberry/manager/multiple_managers.py index 9ca22429d..498b3f42f 100644 --- a/rlberry/manager/multiple_managers.py +++ b/rlberry/manager/multiple_managers.py @@ -40,15 +40,11 @@ def run(self, save=False): with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] for inst in self.instances: - futures.append( - executor.submit(fit_stats, inst, save=save) - ) + futures.append(executor.submit(fit_stats, inst, save=save)) fitted_instances = [] for future in concurrent.futures.as_completed(futures): - fitted_instances.append( - future.result() - ) + fitted_instances.append(future.result()) self.instances = fitted_instances diff --git a/rlberry/manager/remote_agent_manager.py b/rlberry/manager/remote_agent_manager.py index 5d296bf34..397252c80 100644 --- a/rlberry/manager/remote_agent_manager.py +++ b/rlberry/manager/remote_agent_manager.py @@ -26,6 +26,7 @@ class RemoteAgentManager: Parameters for AgentManager instance. Some parameters (as agent_class, train_env, eval_env) can be defined using a ResourceRequest. 
""" + def __init__( self, client: BerryClient, @@ -46,13 +47,11 @@ def __init__( if msg.command == interface.Command.RAISE_EXCEPTION: raise Exception(msg.message) - self._remote_agent_manager_filename = pathlib.Path( - msg.info['filename'] - ) + self._remote_agent_manager_filename = pathlib.Path(msg.info["filename"]) # get useful attributes - self.agent_name = msg.info['agent_name'] - self.output_dir = pathlib.Path(msg.info['output_dir']) # to save locally + self.agent_name = msg.info["agent_name"] + self.output_dir = pathlib.Path(msg.info["output_dir"]) # to save locally def set_client(self, client: BerryClient): self._client = client @@ -75,21 +74,25 @@ def get_writer_data(self): ) if msg.command == interface.Command.RAISE_EXCEPTION: raise Exception(msg.message) - raw_data = msg.data['writer_data'] + raw_data = msg.data["writer_data"] writer_data = dict() for idx in raw_data: csv_content = raw_data[idx] - writer_data[idx] = pd.read_csv(io.StringIO(csv_content), sep=',') + writer_data[idx] = pd.read_csv(io.StringIO(csv_content), sep=",") # check if tensorboard data was received # If so, read file and unzip it. - tensorboard_bin_data = msg.data['tensorboard_bin_data'] + tensorboard_bin_data = msg.data["tensorboard_bin_data"] if tensorboard_bin_data is not None: - tensorboard_bin_data = base64.b64decode(tensorboard_bin_data.encode('ascii')) - zip_file = open(self.output_dir / 'tensorboard_data.zip', "wb") + tensorboard_bin_data = base64.b64decode( + tensorboard_bin_data.encode("ascii") + ) + zip_file = open(self.output_dir / "tensorboard_data.zip", "wb") zip_file.write(tensorboard_bin_data) zip_file.close() - with zipfile.ZipFile(self.output_dir / 'tensorboard_data.zip', 'r') as zip_ref: + with zipfile.ZipFile( + self.output_dir / "tensorboard_data.zip", "r" + ) as zip_ref: zip_ref.extractall(self.output_dir) return writer_data @@ -98,9 +101,8 @@ def fit(self, budget=None, **kwargs): interface.Message.create( command=interface.Command.AGENT_MANAGER_FIT, params=dict( - filename=self.remote_file, - budget=budget, - extra_params=kwargs), + filename=self.remote_file, budget=budget, extra_params=kwargs + ), data=None, ) ) @@ -111,15 +113,13 @@ def eval_agents(self, n_simulations: Optional[int] = None): msg = self._client.send( interface.Message.create( command=interface.Command.AGENT_MANAGER_EVAL, - params=dict( - filename=self.remote_file, - n_simulations=n_simulations), + params=dict(filename=self.remote_file, n_simulations=n_simulations), data=None, ) ) if msg.command == interface.Command.RAISE_EXCEPTION: raise Exception(msg.message) - out = msg.data['output'] + out = msg.data["output"] return out def clear_output_dir(self): @@ -146,11 +146,7 @@ def clear_handlers(self): def set_writer(self, idx, writer_fn, writer_kwargs=None): """Note: Use ResourceRequest for writer_fn.""" - params = dict( - idx=idx, - writer_fn=writer_fn, - writer_kwargs=writer_kwargs - ) + params = dict(idx=idx, writer_fn=writer_fn, writer_kwargs=writer_kwargs) msg = self._client.send( interface.Message.create( command=interface.Command.AGENT_MANAGER_SET_WRITER, @@ -189,34 +185,40 @@ def save(self): output_dir.mkdir(parents=True, exist_ok=True) # save - filename = pathlib.Path('remote_manager_obj').with_suffix('.pickle') + filename = pathlib.Path("remote_manager_obj").with_suffix(".pickle") filename = output_dir / filename filename.parent.mkdir(parents=True, exist_ok=True) try: with filename.open("wb") as ff: pickle.dump(self.__dict__, ff) - logger.info("Saved RemoteAgentManager({}) using pickle.".format(self.agent_name)) 
+ logger.info( + "Saved RemoteAgentManager({}) using pickle.".format(self.agent_name) + ) except Exception: try: with filename.open("wb") as ff: dill.dump(self.__dict__, ff) - logger.info("Saved RemoteAgentManager({}) using dill.".format(self.agent_name)) + logger.info( + "Saved RemoteAgentManager({}) using dill.".format(self.agent_name) + ) except Exception as ex: - logger.warning("[RemoteAgentManager] Instance cannot be pickled: " + str(ex)) + logger.warning( + "[RemoteAgentManager] Instance cannot be pickled: " + str(ex) + ) return filename @classmethod def load(cls, filename): - filename = pathlib.Path(filename).with_suffix('.pickle') + filename = pathlib.Path(filename).with_suffix(".pickle") obj = cls(None) try: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = pickle.load(ff) logger.info("Loaded RemoteAgentManager using pickle.") except Exception: - with filename.open('rb') as ff: + with filename.open("rb") as ff: tmp_dict = dill.load(ff) logger.info("Loaded RemoteAgentManager using dill.") diff --git a/rlberry/manager/tests/test_agent_manager.py b/rlberry/manager/tests/test_agent_manager.py index 22500a92c..4de7f56b8 100644 --- a/rlberry/manager/tests/test_agent_manager.py +++ b/rlberry/manager/tests/test_agent_manager.py @@ -5,11 +5,7 @@ class DummyAgent(AgentWithSimplePolicy): - def __init__(self, - env, - hyperparameter1=0, - hyperparameter2=0, - **kwargs): + def __init__(self, env, hyperparameter1=0, hyperparameter2=0, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.name = "DummyAgent" self.fitted = False @@ -24,8 +20,8 @@ def fit(self, budget, **kwargs): self.total_budget += budget for ii in range(budget): if self.writer is not None: - self.writer.add_scalar('a', 5) - self.writer.add_scalar('b', 6, ii) + self.writer.add_scalar("a", 5) + self.writer.add_scalar("b", 6, ii) return None def policy(self, observation): @@ -33,12 +29,9 @@ def policy(self, observation): @classmethod def sample_parameters(cls, trial): - hyperparameter1 \ - = trial.suggest_categorical('hyperparameter1', [1, 2, 3]) - hyperparameter2 \ - = trial.suggest_uniform('hyperparameter2', -10, 10) - return {'hyperparameter1': hyperparameter1, - 'hyperparameter2': hyperparameter2} + hyperparameter1 = trial.suggest_categorical("hyperparameter1", [1, 2, 3]) + hyperparameter2 = trial.suggest_uniform("hyperparameter2", -10, 10) + return {"hyperparameter1": hyperparameter1, "hyperparameter2": hyperparameter2} def test_agent_manager_1(): @@ -57,11 +50,24 @@ def test_agent_manager_1(): # Run AgentManager params_per_instance = [dict(hyperparameter2=ii) for ii in range(4)] stats_agent1 = AgentManager( - DummyAgent, train_env, fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, seed=123, init_kwargs_per_instance=params_per_instance) + DummyAgent, + train_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + init_kwargs_per_instance=params_per_instance, + ) stats_agent2 = AgentManager( - DummyAgent, train_env, fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, seed=123) + DummyAgent, + train_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + ) agent_manager_list = [stats_agent1, stats_agent2] for st in agent_manager_list: st.fit() @@ -75,7 +81,7 @@ def test_agent_manager_1(): assert instance.hyperparameter2 == 100 # learning curves - plot_writer_data(agent_manager_list, tag='episode_rewards', show=False) + plot_writer_data(agent_manager_list, 
tag="episode_rewards", show=False) # compare final policies evaluate_agents(agent_manager_list, show=False) @@ -110,15 +116,25 @@ def test_agent_manager_2(): # Run AgentManager stats_agent1 = AgentManager( - DummyAgent, train_env, eval_env=eval_env, - fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, - seed=123) + DummyAgent, + train_env, + eval_env=eval_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + ) stats_agent2 = AgentManager( - DummyAgent, train_env, eval_env=eval_env, - fit_budget=5, eval_kwargs=eval_kwargs, - init_kwargs=params, n_fit=4, - seed=123) + DummyAgent, + train_env, + eval_env=eval_env, + fit_budget=5, + eval_kwargs=eval_kwargs, + init_kwargs=params, + n_fit=4, + seed=123, + ) agent_manager_list = [stats_agent1, stats_agent2] for st in agent_manager_list: st.fit() @@ -128,7 +144,7 @@ def test_agent_manager_2(): evaluate_agents(agent_manager_list, show=False) # learning curves - plot_writer_data(agent_manager_list, tag='episode_rewards', show=False) + plot_writer_data(agent_manager_list, tag="episode_rewards", show=False) # check if fitted for agent_manager in agent_manager_list: @@ -154,7 +170,10 @@ def test_agent_manager_2(): def test_agent_manager_partial_fit_and_tuple_env(): # Define train and evaluation envs - train_env = (GridWorld, None) # tuple (constructor, kwargs) must also work in AgentManager + train_env = ( + GridWorld, + None, + ) # tuple (constructor, kwargs) must also work in AgentManager # Parameters params = {} @@ -162,15 +181,23 @@ def test_agent_manager_partial_fit_and_tuple_env(): # Run AgentManager stats = AgentManager( - DummyAgent, train_env, - init_kwargs=params, n_fit=4, - fit_budget=5, eval_kwargs=eval_kwargs, - seed=123) + DummyAgent, + train_env, + init_kwargs=params, + n_fit=4, + fit_budget=5, + eval_kwargs=eval_kwargs, + seed=123, + ) stats2 = AgentManager( - DummyAgent, train_env, - init_kwargs=params, n_fit=4, - fit_budget=5, eval_kwargs=eval_kwargs, - seed=123) + DummyAgent, + train_env, + init_kwargs=params, + n_fit=4, + fit_budget=5, + eval_kwargs=eval_kwargs, + seed=123, + ) # Run partial fit stats.fit(10) @@ -182,7 +209,9 @@ def test_agent_manager_partial_fit_and_tuple_env(): stats2.fit() # learning curves - plot_writer_data([stats], tag='episode_rewards', show=False, preprocess_func=np.cumsum) + plot_writer_data( + [stats], tag="episode_rewards", show=False, preprocess_func=np.cumsum + ) # compare final policies evaluate_agents([stats], show=False) diff --git a/rlberry/manager/tests/test_agent_manager_seeding.py b/rlberry/manager/tests/test_agent_manager_seeding.py index aea00ace0..8f7cc1a08 100644 --- a/rlberry/manager/tests/test_agent_manager_seeding.py +++ b/rlberry/manager/tests/test_agent_manager_seeding.py @@ -8,30 +8,24 @@ import pytest -@pytest.mark.parametrize("env, agent_class", - [ - ((MountainCar, {}), RSUCBVIAgent), - ((gym_make, {'id': 'MountainCar-v0'}), RSUCBVIAgent), - ((gym.make, {'id': 'MountainCar-v0'}), RSUCBVIAgent), - ((MountainCar, {}), A2CAgent), - ((gym_make, {'id': 'MountainCar-v0'}), A2CAgent), - ((gym.make, {'id': 'MountainCar-v0'}), A2CAgent) - ]) +@pytest.mark.parametrize( + "env, agent_class", + [ + ((MountainCar, {}), RSUCBVIAgent), + ((gym_make, {"id": "MountainCar-v0"}), RSUCBVIAgent), + ((gym.make, {"id": "MountainCar-v0"}), RSUCBVIAgent), + ((MountainCar, {}), A2CAgent), + ((gym_make, {"id": "MountainCar-v0"}), A2CAgent), + ((gym.make, {"id": "MountainCar-v0"}), A2CAgent), + ], +) def 
test_agent_manager_and_multiple_managers_seeding(env, agent_class): agent_manager = AgentManager( - agent_class, - env, - fit_budget=2, - init_kwargs={'horizon': 10}, - n_fit=6, - seed=3456) + agent_class, env, fit_budget=2, init_kwargs={"horizon": 10}, n_fit=6, seed=3456 + ) agent_manager_test = AgentManager( - agent_class, - env, - fit_budget=2, - init_kwargs={'horizon': 10}, - n_fit=6, - seed=3456) + agent_class, env, fit_budget=2, init_kwargs={"horizon": 10}, n_fit=6, seed=3456 + ) multimanagers = MultipleManagers() multimanagers.append(agent_manager) diff --git a/rlberry/manager/tests/test_hyperparam_optim.py b/rlberry/manager/tests/test_hyperparam_optim.py index 1fa00d514..47d3134d5 100644 --- a/rlberry/manager/tests/test_hyperparam_optim.py +++ b/rlberry/manager/tests/test_hyperparam_optim.py @@ -6,11 +6,7 @@ class DummyAgent(AgentWithSimplePolicy): - def __init__(self, - env, - hyperparameter1=0, - hyperparameter2=0, - **kwargs): + def __init__(self, env, hyperparameter1=0, hyperparameter2=0, **kwargs): AgentWithSimplePolicy.__init__(self, env, **kwargs) self.name = "DummyAgent" self.fitted = False @@ -29,12 +25,9 @@ def policy(self, observation): @classmethod def sample_parameters(cls, trial): - hyperparameter1 \ - = trial.suggest_categorical('hyperparameter1', [1, 2, 3]) - hyperparameter2 \ - = trial.suggest_uniform('hyperparameter2', -10, 10) - return {'hyperparameter1': hyperparameter1, - 'hyperparameter2': hyperparameter2} + hyperparameter1 = trial.suggest_categorical("hyperparameter1", [1, 2, 3]) + hyperparameter2 = trial.suggest_uniform("hyperparameter2", -10, 10) + return {"hyperparameter1": hyperparameter1, "hyperparameter2": hyperparameter2} def test_hyperparam_optim_tpe(): @@ -42,12 +35,14 @@ def test_hyperparam_optim_tpe(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - fit_budget=1, - init_kwargs={}, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + fit_budget=1, + init_kwargs={}, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with TPE sampler # using hyperopt default values @@ -61,12 +56,14 @@ def test_hyperparam_optim_random(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - init_kwargs={}, - fit_budget=1, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + init_kwargs={}, + fit_budget=1, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with random sampler stats_agent.optimize_hyperparams(sampler_method="random", n_trials=5) @@ -78,20 +75,21 @@ def test_hyperparam_optim_grid(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - init_kwargs={}, - fit_budget=1, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + init_kwargs={}, + fit_budget=1, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with grid sampler - search_space = {"hyperparameter1": [1, 2, 3], - "hyperparameter2": [-5, 0, 5]} + search_space = {"hyperparameter1": [1, 2, 3], "hyperparameter2": [-5, 0, 5]} sampler_kwargs = {"search_space": search_space} - stats_agent.optimize_hyperparams(n_trials=3 * 3, - sampler_method="grid", - sampler_kwargs=sampler_kwargs) + stats_agent.optimize_hyperparams( + n_trials=3 * 3, sampler_method="grid", sampler_kwargs=sampler_kwargs + ) 
stats_agent.clear_output_dir() @@ -100,12 +98,14 @@ def test_hyperparam_optim_cmaes(): train_env = (GridWorld, {}) # Run AgentManager - stats_agent = AgentManager(DummyAgent, - train_env, - init_kwargs={}, - fit_budget=1, - eval_kwargs={'eval_horizon': 5}, - n_fit=4) + stats_agent = AgentManager( + DummyAgent, + train_env, + init_kwargs={}, + fit_budget=1, + eval_kwargs={"eval_horizon": 5}, + n_fit=4, + ) # test hyperparameter optimization with CMA-ES sampler stats_agent.optimize_hyperparams(sampler_method="cmaes", n_trials=5) @@ -119,27 +119,35 @@ def sample_parameters(cls, trial): """ Sample hyperparameters for hyperparam optimization using Optuna (https://optuna.org/) """ - gamma = trial.suggest_categorical('gamma', [0.1, 0.99]) - return {'gamma': gamma} - - env = (GridWorld, dict( - nrows=3, ncols=10, - reward_at={(1, 1): 0.1, (2, 9): 1.0}, - walls=((1, 4), (2, 4), (1, 5)), - success_probability=0.9)) - - vi_params = {'gamma': 0.1, 'epsilon': 1e-3} - - vi_stats = AgentManager(ValueIterationAgentToOptimize, - env, - fit_budget=0, - eval_kwargs=dict(eval_horizon=20), - init_kwargs=vi_params, - n_fit=4, - seed=123) - - vi_stats.optimize_hyperparams(n_trials=5, n_fit=1, - sampler_method='random', pruner_method='none') + gamma = trial.suggest_categorical("gamma", [0.1, 0.99]) + return {"gamma": gamma} + + env = ( + GridWorld, + dict( + nrows=3, + ncols=10, + reward_at={(1, 1): 0.1, (2, 9): 1.0}, + walls=((1, 4), (2, 4), (1, 5)), + success_probability=0.9, + ), + ) + + vi_params = {"gamma": 0.1, "epsilon": 1e-3} + + vi_stats = AgentManager( + ValueIterationAgentToOptimize, + env, + fit_budget=0, + eval_kwargs=dict(eval_horizon=20), + init_kwargs=vi_params, + n_fit=4, + seed=123, + ) + + vi_stats.optimize_hyperparams( + n_trials=5, n_fit=1, sampler_method="random", pruner_method="none" + ) assert vi_stats.optuna_study vi_stats.clear_output_dir() diff --git a/rlberry/manager/utils.py b/rlberry/manager/utils.py index a3c4fc409..347ed0ed2 100644 --- a/rlberry/manager/utils.py +++ b/rlberry/manager/utils.py @@ -6,7 +6,7 @@ def create_database(db_file): connection = None try: connection = sqlite3.connect(db_file) - print(f'Connected to {db_file} (sqlite3 version = {sqlite3.version})') + print(f"Connected to {db_file} (sqlite3 version = {sqlite3.version})") except sqlite3.Error as err: print(err) diff --git a/rlberry/metadata_utils.py b/rlberry/metadata_utils.py index 7a4302451..a443b6420 100644 --- a/rlberry/metadata_utils.py +++ b/rlberry/metadata_utils.py @@ -5,10 +5,10 @@ # Default output directory used by the library. -RLBERRY_DEFAULT_DATA_DIR = 'rlberry_data/' +RLBERRY_DEFAULT_DATA_DIR = "rlberry_data/" # Temporary directory used by the library -RLBERRY_TEMP_DATA_DIR = 'rlberry_data/temp/' +RLBERRY_TEMP_DATA_DIR = "rlberry_data/temp/" def get_unique_id(obj): @@ -19,7 +19,7 @@ def get_unique_id(obj): # uuid4() is an universal id, but there might be issues if called simultaneously in different processes. # This function combines id(), uuid4(), and a timestamp in a single ID, and hashes it. timestamp = datetime.timestamp(datetime.now()) - timestamp = str(timestamp).replace('.', '') + timestamp = str(timestamp).replace(".", "") str_id = timestamp + str(id(obj)) + uuid.uuid4().hex str_id = hashlib.md5(str_id.encode()).hexdigest() return str_id @@ -38,5 +38,6 @@ class ExecutionMetadata(NamedTuple): obj_info : dict, default: None Extra info about the object. 
""" + obj_worker_id: int = -1 obj_info: Optional[dict] = None diff --git a/rlberry/network/client.py b/rlberry/network/client.py index ea8cc3036..51e5ffba3 100644 --- a/rlberry/network/client.py +++ b/rlberry/network/client.py @@ -6,7 +6,7 @@ from rlberry.network.utils import serialize_message -class BerryClient(): +class BerryClient: """ rlberry client @@ -17,9 +17,10 @@ class BerryClient(): port : int Integer from 1-65535 """ + def __init__( self, - host='127.0.0.1', + host="127.0.0.1", port: int = 65432, ) -> None: assert port >= 1 and port <= 65535 @@ -27,9 +28,9 @@ def __init__( self._port = port def send( - self, - *messages: interface.Message, - print_response: bool = False, + self, + *messages: interface.Message, + print_response: bool = False, ) -> Union[List[interface.Message], interface.Message]: returned_messages = [] pp = pprint.PrettyPrinter(indent=4) diff --git a/rlberry/network/interface.py b/rlberry/network/interface.py index 7ee4b5cc8..929a3f366 100644 --- a/rlberry/network/interface.py +++ b/rlberry/network/interface.py @@ -2,22 +2,22 @@ from typing import Any, Dict, Mapping, NamedTuple, Optional -REQUEST_PREFIX = 'ResourceRequest_' +REQUEST_PREFIX = "ResourceRequest_" class Command: - NONE = 'NONE' - RAISE_EXCEPTION = 'RAISE_EXCEPTION' - ECHO = 'ECHO' - LIST_RESOURCES = 'LIST_RESOURCES' - AGENT_MANAGER_CREATE_INSTANCE = 'AGENT_MANAGER_CREATE_INSTANCE' - AGENT_MANAGER_FIT = 'AGENT_MANAGER_FIT' - AGENT_MANAGER_EVAL = 'AGENT_MANAGER_EVAL' - AGENT_MANAGER_CLEAR_OUTPUT_DIR = 'AGENT_MANAGER_CLEAR_OUTPUT_DIR' - AGENT_MANAGER_CLEAR_HANDLERS = 'AGENT_MANAGER_CLEAR_HANDLERS' - AGENT_MANAGER_SET_WRITER = 'AGENT_MANAGER_SET_WRITER' - AGENT_MANAGER_OPTIMIZE_HYPERPARAMS = 'AGENT_MANAGER_OPTIMIZE_HYPERPARAMS' - AGENT_MANAGER_GET_WRITER_DATA = 'AGENT_MANAGER_GET_WRITER_DATA' + NONE = "NONE" + RAISE_EXCEPTION = "RAISE_EXCEPTION" + ECHO = "ECHO" + LIST_RESOURCES = "LIST_RESOURCES" + AGENT_MANAGER_CREATE_INSTANCE = "AGENT_MANAGER_CREATE_INSTANCE" + AGENT_MANAGER_FIT = "AGENT_MANAGER_FIT" + AGENT_MANAGER_EVAL = "AGENT_MANAGER_EVAL" + AGENT_MANAGER_CLEAR_OUTPUT_DIR = "AGENT_MANAGER_CLEAR_OUTPUT_DIR" + AGENT_MANAGER_CLEAR_HANDLERS = "AGENT_MANAGER_CLEAR_HANDLERS" + AGENT_MANAGER_SET_WRITER = "AGENT_MANAGER_SET_WRITER" + AGENT_MANAGER_OPTIMIZE_HYPERPARAMS = "AGENT_MANAGER_OPTIMIZE_HYPERPARAMS" + AGENT_MANAGER_GET_WRITER_DATA = "AGENT_MANAGER_GET_WRITER_DATA" class BerryServerInfo: @@ -26,7 +26,7 @@ class BerryServerInfo: class Message(NamedTuple): - message: Optional[str] = '' + message: Optional[str] = "" command: Optional[Command] = None params: Optional[Mapping[str, Any]] = None data: Optional[Mapping[str, Any]] = None @@ -37,13 +37,14 @@ def to_dict(self): @classmethod def create( - cls, - message: Optional[str] = '', - command: Optional[Command] = None, - params: Optional[Mapping[str, Any]] = None, - data: Optional[Mapping[str, Any]] = None, - info: Optional[Mapping[str, Any]] = None): - command = command or '' + cls, + message: Optional[str] = "", + command: Optional[Command] = None, + params: Optional[Mapping[str, Any]] = None, + data: Optional[Mapping[str, Any]] = None, + info: Optional[Mapping[str, Any]] = None, + ): + command = command or "" params = params or dict() data = data or dict() info = info or dict() @@ -81,8 +82,8 @@ def send_data(socket, data): """ adapted from: https://stackoverflow.com/a/63532988 """ - print(f'[rlberry.network] sending {len(data)} bytes...') - socket.sendall(struct.pack('>I', len(data)) + data) + print(f"[rlberry.network] sending {len(data)} bytes...") 
+ socket.sendall(struct.pack(">I", len(data)) + data) def receive_data(socket): @@ -92,11 +93,11 @@ def receive_data(socket): data_size_packed = socket.recv(4) if not data_size_packed: return data_size_packed - data_size = struct.unpack('>I', data_size_packed)[0] + data_size = struct.unpack(">I", data_size_packed)[0] received_data = b"" remaining_size = min(next_power_of_two(data_size), 4096) while remaining_size > 0: received_data += socket.recv(remaining_size) remaining_size = data_size - len(received_data) - print(f'[rlberry.network] ... received {len(received_data)}/{data_size} bytes.') + print(f"[rlberry.network] ... received {len(received_data)}/{data_size} bytes.") return received_data diff --git a/rlberry/network/server.py b/rlberry/network/server.py index 5c737fc1b..ab0926416 100644 --- a/rlberry/network/server.py +++ b/rlberry/network/server.py @@ -5,7 +5,11 @@ import json import rlberry.network.server_utils as server_utils from rlberry.network import interface -from rlberry.network.utils import apply_fn_to_tree, map_request_to_obj, serialize_message +from rlberry.network.utils import ( + apply_fn_to_tree, + map_request_to_obj, + serialize_message, +) from rlberry.envs import gym_make from typing import Optional @@ -18,14 +22,16 @@ def __init__(self, client_socket, client_address, resources, timeout): self._socket = client_socket self._address = client_address self._resources = resources - self._logger = logging.getLogger('ClientHandler') + self._logger = logging.getLogger("ClientHandler") self._timeout = timeout def _process_message(self, message: interface.Message): """Replace resource requests in 'message' by available resources.""" message = message.to_dict() message = apply_fn_to_tree( - lambda key, val: map_request_to_obj(key, val, self._resources), message, apply_to_nodes=True + lambda key, val: map_request_to_obj(key, val, self._resources), + message, + apply_to_nodes=True, ) return interface.Message.from_dict(message) @@ -39,8 +45,8 @@ def _execute_message(self, message: interface.Message): interface.send_data(self._socket, serialize_message(response)) except Exception as ex: response = interface.Message.create( - command=interface.Command.RAISE_EXCEPTION, - message=str(ex)) + command=interface.Command.RAISE_EXCEPTION, message=str(ex) + ) interface.send_data(self._socket, serialize_message(response)) return 1 return 0 @@ -49,7 +55,9 @@ def run(self): with self._socket: try: while True: - print(f'\n Handling client @ {self._address}') + print( + f"\n Handling client @ {self._address}" + ) self._socket.settimeout(self._timeout) message_bytes = interface.receive_data(self._socket) if not message_bytes: @@ -57,17 +65,17 @@ def run(self): # process bytes message = interface.Message.from_dict(json.loads(message_bytes)) message = self._process_message(message) - print(f' Received message: \n{message}') + print(f" Received message: \n{message}") # execute message commands and send back a response self._execute_message(message) except Exception as ex: - print(f' [ERROR]: {ex}') + print(f" [ERROR]: {ex}") self._logger.exception(ex) finally: - print(f' Finished client @ {self._address}') + print(f" Finished client @ {self._address}") -class BerryServer(): +class BerryServer: """ rlberry server @@ -87,9 +95,10 @@ class BerryServer(): Number of received client sockets after which to terminate the server. If None, does not terminate. 
""" + def __init__( self, - host='127.0.0.1', + host="127.0.0.1", port: int = 65432, backlog: int = 5, resources: Optional[interface.Resources] = None, @@ -109,42 +118,55 @@ def __init__( # Define basic resources if resources is None: self._resources = dict( - gym_make=interface.ResourceItem( - obj=gym_make, - description='gym_make'), + gym_make=interface.ResourceItem(obj=gym_make, description="gym_make"), ) else: for _, val in resources.items(): - if set(val.keys()) != set(['obj', 'description']): + if set(val.keys()) != set(["obj", "description"]): raise ValueError( "resources items must be a dictionary with keys ['obj', 'description']." - f" Received: {list(val.keys())}") + f" Received: {list(val.keys())}" + ) def start(self): - print(f'\n\nStarting BerryServer @ (host, port) = ({self._host}, {self._port}).\n\n') + print( + f"\n\nStarting BerryServer @ (host, port) = ({self._host}, {self._port}).\n\n" + ) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((self._host, self._port)) s.listen(self._backlog) - with concurrent.futures.ProcessPoolExecutor(mp_context=multiprocessing.get_context('spawn')) as executor: + with concurrent.futures.ProcessPoolExecutor( + mp_context=multiprocessing.get_context("spawn") + ) as executor: futures = [] while True: - print(f' BerryServer({self._host}, {self._port}): waiting for connection...') - client_socket, client_address = s.accept() # wait for connection + print( + f" BerryServer({self._host}, {self._port}): waiting for connection..." + ) + client_socket, client_address = s.accept() # wait for connection self._client_socket_counter += 1 client_handler = ClientHandler( client_socket, client_address, self._resources, - self._client_socket_timeout) - print(f' BerryServer({self._host}, {self._port}): ' - f'new client @ {client_address}') + self._client_socket_timeout, + ) + print( + f" BerryServer({self._host}, {self._port}): " + f"new client @ {client_address}" + ) futures.append(executor.submit(client_handler.run)) - if self._terminate_after and self._client_socket_counter >= self._terminate_after: - print(' Terminating server (main process): ' - 'reached max number of client sockets.') + if ( + self._terminate_after + and self._client_socket_counter >= self._terminate_after + ): + print( + " Terminating server (main process): " + "reached max number of client sockets." 
+ ) break -if __name__ == '__main__': +if __name__ == "__main__": server = BerryServer() server.start() diff --git a/rlberry/network/server_utils.py b/rlberry/network/server_utils.py index e5beed83c..4a63ee0fd 100644 --- a/rlberry/network/server_utils.py +++ b/rlberry/network/server_utils.py @@ -7,38 +7,40 @@ def execute_message( - message: interface.Message, - resources: interface.Resources) -> interface.Message: + message: interface.Message, resources: interface.Resources +) -> interface.Message: response = interface.Message.create(command=interface.Command.ECHO) # LIST_RESOURCES if message.command == interface.Command.LIST_RESOURCES: info = {} for rr in resources: - info[rr] = resources[rr]['description'] + info[rr] = resources[rr]["description"] response = interface.Message.create(info=info) # AGENT_MANAGER_CREATE_INSTANCE elif message.command == interface.Command.AGENT_MANAGER_CREATE_INSTANCE: params = message.params base_dir = pathlib.Path(metadata_utils.RLBERRY_DEFAULT_DATA_DIR) - if 'output_dir' in params: - params['output_dir'] = base_dir / 'server_data' / params['output_dir'] + if "output_dir" in params: + params["output_dir"] = base_dir / "server_data" / params["output_dir"] else: - params['output_dir'] = base_dir / 'server_data/' + params["output_dir"] = base_dir / "server_data/" agent_manager = AgentManager(**params) filename = str(agent_manager.save()) response = interface.Message.create( info=dict( filename=filename, agent_name=agent_manager.agent_name, - output_dir=str(agent_manager.output_dir).replace('server_data/', 'client_data/') + output_dir=str(agent_manager.output_dir).replace( + "server_data/", "client_data/" + ), ) ) del agent_manager # AGENT_MANAGER_FIT elif message.command == interface.Command.AGENT_MANAGER_FIT: - filename = message.params['filename'] - budget = message.params['budget'] - extra_params = message.params['extra_params'] + filename = message.params["filename"] + budget = message.params["budget"] + extra_params = message.params["extra_params"] agent_manager = AgentManager.load(filename) agent_manager.fit(budget, **extra_params) agent_manager.save() @@ -46,45 +48,49 @@ def execute_message( del agent_manager # AGENT_MANAGER_EVAL elif message.command == interface.Command.AGENT_MANAGER_EVAL: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) - eval_output = agent_manager.eval_agents(message.params['n_simulations']) + eval_output = agent_manager.eval_agents(message.params["n_simulations"]) response = interface.Message.create(data=dict(output=eval_output)) del agent_manager # AGENT_MANAGER_CLEAR_OUTPUT_DIR elif message.command == interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) agent_manager.clear_output_dir() - response = interface.Message.create(message=f'Cleared output dir: {agent_manager.output_dir}') + response = interface.Message.create( + message=f"Cleared output dir: {agent_manager.output_dir}" + ) del agent_manager # AGENT_MANAGER_CLEAR_HANDLERS elif message.command == interface.Command.AGENT_MANAGER_CLEAR_HANDLERS: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) agent_manager.clear_handlers() agent_manager.save() - response = interface.Message.create(message=f'Cleared handlers: {filename}') + response = interface.Message.create(message=f"Cleared handlers: {filename}") del 
agent_manager # AGENT_MANAGER_SET_WRITER elif message.command == interface.Command.AGENT_MANAGER_SET_WRITER: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) - agent_manager.set_writer(**message.params['kwargs']) + agent_manager.set_writer(**message.params["kwargs"]) agent_manager.save() del agent_manager # AGENT_MANAGER_OPTIMIZE_HYPERPARAMS elif message.command == interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS: - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) - best_params_dict = agent_manager.optimize_hyperparams(**message.params['kwargs']) + best_params_dict = agent_manager.optimize_hyperparams( + **message.params["kwargs"] + ) agent_manager.save() del agent_manager response = interface.Message.create(data=best_params_dict) # AGENT_MANAGER_GET_WRITER_DATA elif message.command == interface.Command.AGENT_MANAGER_GET_WRITER_DATA: # writer scalar data - filename = message.params['filename'] + filename = message.params["filename"] agent_manager = AgentManager.load(filename) writer_data = agent_manager.get_writer_data() writer_data = writer_data or dict() @@ -95,14 +101,17 @@ def execute_message( if agent_manager.tensorboard_dir is not None: tensorboard_zip_file = rlberry.utils.io.zipdir( agent_manager.tensorboard_dir, - agent_manager.output_dir / 'tensorboard_data.zip') + agent_manager.output_dir / "tensorboard_data.zip", + ) if tensorboard_zip_file is not None: tensorboard_bin_data = open(tensorboard_zip_file, "rb").read() - tensorboard_bin_data = base64.b64encode(tensorboard_bin_data).decode('ascii') + tensorboard_bin_data = base64.b64encode(tensorboard_bin_data).decode( + "ascii" + ) response = interface.Message.create( data=dict( - writer_data=writer_data, - tensorboard_bin_data=tensorboard_bin_data) + writer_data=writer_data, tensorboard_bin_data=tensorboard_bin_data + ) ) del agent_manager # end diff --git a/rlberry/network/utils.py b/rlberry/network/utils.py index a2feb7fb1..67e2ae1f7 100644 --- a/rlberry/network/utils.py +++ b/rlberry/network/utils.py @@ -4,18 +4,21 @@ from typing import Any, Callable, Mapping, Optional, Tuple, Union -Tree = Union[Any, Tuple, Mapping[Any, 'Tree']] +Tree = Union[Any, Tuple, Mapping[Any, "Tree"]] def apply_fn_to_tree( - fn: Callable[[Any, Any], Tuple[Any, Any]], - tree: Tree, - is_leaf: Optional[Callable[[Any], Any]] = None, - apply_to_nodes: Optional[bool] = False): + fn: Callable[[Any, Any], Tuple[Any, Any]], + tree: Tree, + is_leaf: Optional[Callable[[Any], Any]] = None, + apply_to_nodes: Optional[bool] = False, +): """ new_key, new_val = fn(key, my_dict[key]) """ - is_leaf = is_leaf or (lambda x: not isinstance(x, Mapping) and not isinstance(x, Tuple)) + is_leaf = is_leaf or ( + lambda x: not isinstance(x, Mapping) and not isinstance(x, Tuple) + ) if is_leaf(tree): return deepcopy(tree) if isinstance(tree, Mapping): @@ -27,12 +30,16 @@ def apply_fn_to_tree( new_key, new_val = fn(key, tree[key]) new_tree.pop(key) new_tree[new_key] = new_val - return {key: apply_fn_to_tree( - fn, val, is_leaf, apply_to_nodes) for (key, val) in new_tree.items()} + return { + key: apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) + for (key, val) in new_tree.items() + } elif isinstance(tree, Tuple): - return tuple([apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) for val in tree]) + return tuple( + [apply_fn_to_tree(fn, val, is_leaf, apply_to_nodes) for val in tree] + ) else: - raise RuntimeError('Tree is not a 
Mapping or Tuple.') + raise RuntimeError("Tree is not a Mapping or Tuple.") def _map_resource_request_to_dict(key, val): @@ -46,29 +53,31 @@ def _map_resource_request_to_dict(key, val): def map_request_to_obj(key, val, resources: interface.Resources): if key.startswith(interface.REQUEST_PREFIX): - new_key = key[len(interface.REQUEST_PREFIX):] - resource_name = val['name'] + new_key = key[len(interface.REQUEST_PREFIX) :] + resource_name = val["name"] try: - resource_kwargs = val['kwargs'] + resource_kwargs = val["kwargs"] except KeyError: resource_kwargs = None if resource_name in resources: if resource_kwargs: - new_val = (resources[resource_name]['obj'], resource_kwargs) + new_val = (resources[resource_name]["obj"], resource_kwargs) else: - new_val = resources[resource_name]['obj'] + new_val = resources[resource_name]["obj"] return new_key, new_val else: - raise RuntimeError(f'Unavailable requested resource: {resource_name}') + raise RuntimeError(f"Unavailable requested resource: {resource_name}") else: return key, val def serialize_message(message: interface.Message) -> bytes: message = message.to_dict() - message = apply_fn_to_tree(_map_resource_request_to_dict, message, apply_to_nodes=True) + message = apply_fn_to_tree( + _map_resource_request_to_dict, message, apply_to_nodes=True + ) def default(obj): return f"<>" - return str.encode(json.dumps(message, default=default)) \ No newline at end of file + return str.encode(json.dumps(message, default=default)) diff --git a/rlberry/rendering/opengl_render2d.py b/rlberry/rendering/opengl_render2d.py index 269427cd2..562b35aea 100644 --- a/rlberry/rendering/opengl_render2d.py +++ b/rlberry/rendering/opengl_render2d.py @@ -8,10 +8,10 @@ from rlberry.rendering import Scene logger = logging.getLogger(__name__) -environ['PYGAME_HIDE_SUPPORT_PROMPT'] = '1' +environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" _IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = '' +_IMPORT_ERROR_MSG = "" try: import pygame as pg from pygame.locals import DOUBLEBUF, OPENGL @@ -93,16 +93,24 @@ def initGL(self): """ glMatrixMode(GL_PROJECTION) glLoadIdentity() - gluOrtho2D(self.clipping_area[0], self.clipping_area[1], - self.clipping_area[2], self.clipping_area[3]) + gluOrtho2D( + self.clipping_area[0], + self.clipping_area[1], + self.clipping_area[2], + self.clipping_area[3], + ) def display(self): """ Callback function, handler for window re-paint """ # Set background color (clear background) - glClearColor(self.background_color[0], self.background_color[1], - self.background_color[2], 1.0) + glClearColor( + self.background_color[0], + self.background_color[1], + self.background_color[2], + 1.0, + ) glClear(GL_COLOR_BUFFER_BIT) # Display background @@ -193,7 +201,9 @@ def run_graphics(self, loop=True): def get_gl_image_str(self): # see https://gist.github.com/Jerdak/7364746 glReadBuffer(GL_FRONT) - pixels = glReadPixels(0, 0, self.window_width, self.window_height, GL_RGB, GL_UNSIGNED_BYTE) + pixels = glReadPixels( + 0, 0, self.window_width, self.window_height, GL_RGB, GL_UNSIGNED_BYTE + ) return pixels def get_video_data(self): @@ -223,9 +233,9 @@ def get_video_data(self): # See https://stackoverflow.com/a/42754578/5691288 # string_image = self.get_gl_image_str() - temp_surf = pg.image.fromstring(string_image, - (self.window_width, - self.window_height), 'RGB') + temp_surf = pg.image.fromstring( + string_image, (self.window_width, self.window_height), "RGB" + ) tmp_arr = pg.surfarray.array3d(temp_surf) imgdata = np.moveaxis(tmp_arr, 0, 1) imgdata = np.flipud(imgdata) diff --git 
a/rlberry/rendering/pygame_render2d.py b/rlberry/rendering/pygame_render2d.py index 4b1bf8889..a2a43a842 100644 --- a/rlberry/rendering/pygame_render2d.py +++ b/rlberry/rendering/pygame_render2d.py @@ -9,10 +9,10 @@ logger = logging.getLogger(__name__) -environ['PYGAME_HIDE_SUPPORT_PROMPT'] = '1' +environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "1" _IMPORT_SUCESSFUL = True -_IMPORT_ERROR_MSG = '' +_IMPORT_ERROR_MSG = "" try: import pygame as pg @@ -97,7 +97,7 @@ def draw_geometric2d(self, shape): """ Draw a 2D shape, of type GeometricPrimitive """ - if shape.type in ['POLYGON']: + if shape.type in ["POLYGON"]: area = self.clipping_area width_range = area[1] - area[0] height_range = area[3] - area[2] @@ -113,15 +113,13 @@ def draw_geometric2d(self, shape): pg_vertex = (xx, yy) vertices.append(pg_vertex) - color = (255 * shape.color[0], - 255 * shape.color[1], - 255 * shape.color[2]) + color = (255 * shape.color[0], 255 * shape.color[1], 255 * shape.color[2]) pg.draw.polygon(self.screen, color, vertices) else: raise NotImplementedError( - "Shape type %s not implemented in pygame renderer." - % shape.type) + "Shape type %s not implemented in pygame renderer." % shape.type + ) def run_graphics(self, loop=True): """ @@ -180,10 +178,10 @@ def get_video_data(self): # # See https://stackoverflow.com/a/42754578/5691288 # - string_image = pg.image.tostring(self.screen, 'RGB') - temp_surf = pg.image.fromstring(string_image, - (self.window_width, - self.window_height), 'RGB') + string_image = pg.image.tostring(self.screen, "RGB") + temp_surf = pg.image.fromstring( + string_image, (self.window_width, self.window_height), "RGB" + ) tmp_arr = pg.surfarray.array3d(temp_surf) imgdata = np.moveaxis(tmp_arr, 0, 1) video_data.append(imgdata) diff --git a/rlberry/rendering/render_interface.py b/rlberry/rendering/render_interface.py index 563df3b8f..13c8d415c 100644 --- a/rlberry/rendering/render_interface.py +++ b/rlberry/rendering/render_interface.py @@ -34,6 +34,7 @@ def save_video(self, filename, **kwargs): Save video file. """ pass + def get_video(self, **kwargs): """ Get video data. 
@@ -62,12 +63,12 @@ def __init__(self): self._clipping_area = (-1.0, 1.0, -1.0, 1.0) # (left,right,bottom,top) # rendering type, either 'pygame' or 'opengl' - self.renderer_type = 'opengl' + self.renderer_type = "opengl" def get_renderer(self): - if self.renderer_type == 'opengl': + if self.renderer_type == "opengl": return OpenGLRender2D() - elif self.renderer_type == 'pygame': + elif self.renderer_type == "pygame": return PyGameRender2D() else: raise NotImplementedError("Unknown renderer type.") @@ -155,8 +156,6 @@ def get_video(self, framerate=25, **kwargs): return renderer.get_video_data() - - def save_video(self, filename, framerate=25, **kwargs): video_data = self.get_video(framerate=framerate, **kwargs) video_write(filename, video_data, framerate=framerate) diff --git a/rlberry/rendering/tests/test_rendering_interface.py b/rlberry/rendering/tests/test_rendering_interface.py index 678f93fc9..e17c2761f 100644 --- a/rlberry/rendering/tests/test_rendering_interface.py +++ b/rlberry/rendering/tests/test_rendering_interface.py @@ -31,7 +31,7 @@ SimplePBallND, FourRoom, SixRoom, - AppleGold + AppleGold, ] @@ -62,10 +62,10 @@ def test_render2d_interface(ModelClass): next_s, _, _, _ = env.step(action) state = next_s env.render(loop=False) - env.save_video('test_video.mp4') + env.save_video("test_video.mp4") env.clear_render_buffer() try: - os.remove('test_video.mp4') + os.remove("test_video.mp4") except Exception: pass @@ -85,9 +85,9 @@ def test_render2d_interface_wrapped(ModelClass): next_s, _, _, _ = env.step(action) state = next_s env.render(loop=False) - env.save_video('test_video.mp4') + env.save_video("test_video.mp4") env.clear_render_buffer() try: - os.remove('test_video.mp4') + os.remove("test_video.mp4") except Exception: pass diff --git a/rlberry/rendering/utils.py b/rlberry/rendering/utils.py index bd23e4f53..f02ce89bf 100644 --- a/rlberry/rendering/utils.py +++ b/rlberry/rendering/utils.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -def video_write(fn, images, framerate=60, vcodec='libx264'): +def video_write(fn, images, framerate=60, vcodec="libx264"): """ Save list of images to a video file. 
@@ -38,29 +38,34 @@ def video_write(fn, images, framerate=60, vcodec='libx264'): if not _FFMPEG_INSTALLED: logger.error( "video_write(): Unable to save video, ffmpeg-python \ - package required (https://github.com/kkroening/ffmpeg-python)") + package required (https://github.com/kkroening/ffmpeg-python)" + ) return if not isinstance(images, np.ndarray): images = np.asarray(images) _, height, width, channels = images.shape process = ( - ffmpeg - .input('pipe:', format='rawvideo', pix_fmt='rgb24', - s='{}x{}'.format(width, height), r=framerate) - .output(fn, pix_fmt='yuv420p', vcodec=vcodec) - .overwrite_output() - .run_async(pipe_stdin=True) + ffmpeg.input( + "pipe:", + format="rawvideo", + pix_fmt="rgb24", + s="{}x{}".format(width, height), + r=framerate, + ) + .output(fn, pix_fmt="yuv420p", vcodec=vcodec) + .overwrite_output() + .run_async(pipe_stdin=True) ) for frame in images: - process.stdin.write( - frame - .astype(np.uint8) - .tobytes() - ) + process.stdin.write(frame.astype(np.uint8).tobytes()) process.stdin.close() process.wait() except Exception as ex: - logger.warning("Not possible to save \ -video, due to exception: {}".format(str(ex))) + logger.warning( + "Not possible to save \ +video, due to exception: {}".format( + str(ex) + ) + ) diff --git a/rlberry/seeding/tests/test_seeding.py b/rlberry/seeding/tests/test_seeding.py index 279a10fc3..8e87bf529 100644 --- a/rlberry/seeding/tests/test_seeding.py +++ b/rlberry/seeding/tests/test_seeding.py @@ -13,8 +13,12 @@ def test_seeder_basic(): assert (data1 != data2).sum() > 5 assert (data2 != data3).sum() == 0 - assert seeder2.spawn(1).generate_state(1)[0] == seeder3.spawn(1).generate_state(1)[0] - assert seeder1.spawn(1).generate_state(1)[0] != seeder3.spawn(1).generate_state(1)[0] + assert ( + seeder2.spawn(1).generate_state(1)[0] == seeder3.spawn(1).generate_state(1)[0] + ) + assert ( + seeder1.spawn(1).generate_state(1)[0] != seeder3.spawn(1).generate_state(1)[0] + ) def test_seeder_initialized_from_seeder(): diff --git a/rlberry/seeding/tests/test_threads.py b/rlberry/seeding/tests/test_threads.py index 83f211e9a..010655c0e 100644 --- a/rlberry/seeding/tests/test_threads.py +++ b/rlberry/seeding/tests/test_threads.py @@ -23,7 +23,5 @@ def test_multithread_seeding(): results = [] for future in concurrent.futures.as_completed(futures): - results.append( - future.result() - ) + results.append(future.result()) assert results[0] != results[1], f"error in simulation {(ii, jj)}" diff --git a/rlberry/seeding/tests/test_threads_torch.py b/rlberry/seeding/tests/test_threads_torch.py index 08dfa1a3c..585e7713a 100644 --- a/rlberry/seeding/tests/test_threads_torch.py +++ b/rlberry/seeding/tests/test_threads_torch.py @@ -30,7 +30,5 @@ def test_torch_multithread_seeding(): results = [] for future in concurrent.futures.as_completed(futures): - results.append( - future.result() - ) + results.append(future.result()) assert results[0] != results[1], f"error in simulation {(ii, jj)}" diff --git a/rlberry/spaces/box.py b/rlberry/spaces/box.py index 2ff60f81e..2409eaf97 100644 --- a/rlberry/spaces/box.py +++ b/rlberry/spaces/box.py @@ -64,8 +64,7 @@ def sample(self): * (-oo, b] : shifted negative exponential distribution * (-oo, oo) : normal distribution """ - high = self.high if self.dtype.kind == 'f' \ - else self.high.astype('int64') + 1 + high = self.high if self.dtype.kind == "f" else self.high.astype("int64") + 1 sample = np.empty(self.shape) # Masking arrays which classify the coordinates according to interval @@ -76,19 +75,22 @@ def 
sample(self): bounded = self.bounded_below & self.bounded_above # Vectorized sampling by interval type - sample[unbounded] = self.rng.normal( - size=unbounded[unbounded].shape) - - sample[low_bounded] = self.rng.exponential( - size=low_bounded[low_bounded].shape) + self.low[low_bounded] - - sample[upp_bounded] = -self.rng.exponential( - size=upp_bounded[upp_bounded].shape) + self.high[upp_bounded] - - sample[bounded] = self.rng.uniform(low=self.low[bounded], - high=high[bounded], - size=bounded[bounded].shape) - if self.dtype.kind == 'i': + sample[unbounded] = self.rng.normal(size=unbounded[unbounded].shape) + + sample[low_bounded] = ( + self.rng.exponential(size=low_bounded[low_bounded].shape) + + self.low[low_bounded] + ) + + sample[upp_bounded] = ( + -self.rng.exponential(size=upp_bounded[upp_bounded].shape) + + self.high[upp_bounded] + ) + + sample[bounded] = self.rng.uniform( + low=self.low[bounded], high=high[bounded], size=bounded[bounded].shape + ) + if self.dtype.kind == "i": sample = np.floor(sample) return sample.astype(self.dtype) diff --git a/rlberry/spaces/from_gym.py b/rlberry/spaces/from_gym.py index 939a3740c..6bd87ce06 100644 --- a/rlberry/spaces/from_gym.py +++ b/rlberry/spaces/from_gym.py @@ -3,31 +3,36 @@ def convert_space_from_gym(space): - if isinstance(space, gym.spaces.Box) and (not isinstance(space, rlberry.spaces.Box)): + if isinstance(space, gym.spaces.Box) and ( + not isinstance(space, rlberry.spaces.Box) + ): return rlberry.spaces.Box( - space.low, - space.high, - shape=space.shape, - dtype=space.dtype + space.low, space.high, shape=space.shape, dtype=space.dtype ) - if isinstance(space, gym.spaces.Discrete) and (not isinstance(space, rlberry.spaces.Discrete)): - return rlberry.spaces.Discrete( - n=space.n - ) - if isinstance(space, gym.spaces.MultiBinary) and (not isinstance(space, rlberry.spaces.MultiBinary)): - return rlberry.spaces.MultiBinary( - n=space.n - ) - if isinstance(space, gym.spaces.MultiDiscrete) and (not isinstance(space, rlberry.spaces.MultiDiscrete)): + if isinstance(space, gym.spaces.Discrete) and ( + not isinstance(space, rlberry.spaces.Discrete) + ): + return rlberry.spaces.Discrete(n=space.n) + if isinstance(space, gym.spaces.MultiBinary) and ( + not isinstance(space, rlberry.spaces.MultiBinary) + ): + return rlberry.spaces.MultiBinary(n=space.n) + if isinstance(space, gym.spaces.MultiDiscrete) and ( + not isinstance(space, rlberry.spaces.MultiDiscrete) + ): return rlberry.spaces.MultiDiscrete( nvec=space.nvec, dtype=space.dtype, ) - if isinstance(space, gym.spaces.Tuple) and (not isinstance(space, rlberry.spaces.Tuple)): + if isinstance(space, gym.spaces.Tuple) and ( + not isinstance(space, rlberry.spaces.Tuple) + ): return rlberry.spaces.Tuple( spaces=[convert_space_from_gym(sp) for sp in space.spaces] ) - if isinstance(space, gym.spaces.Dict) and (not isinstance(space, rlberry.spaces.Dict)): + if isinstance(space, gym.spaces.Dict) and ( + not isinstance(space, rlberry.spaces.Dict) + ): converted_spaces = dict() for key in space.spaces: converted_spaces[key] = convert_space_from_gym(space.spaces[key]) diff --git a/rlberry/spaces/multi_binary.py b/rlberry/spaces/multi_binary.py index d3703526d..84bb27b4f 100644 --- a/rlberry/spaces/multi_binary.py +++ b/rlberry/spaces/multi_binary.py @@ -44,5 +44,4 @@ def reseed(self, seed_seq=None): self.seeder.reseed(seed_seq) def sample(self): - return self.rng.integers(low=0, high=2, - size=self.n, dtype=self.dtype) + return self.rng.integers(low=0, high=2, size=self.n, dtype=self.dtype) diff --git 
a/rlberry/spaces/tests/test_from_gym.py b/rlberry/spaces/tests/test_from_gym.py index 779c55597..945553fe8 100644 --- a/rlberry/spaces/tests/test_from_gym.py +++ b/rlberry/spaces/tests/test_from_gym.py @@ -18,24 +18,26 @@ def test_discrete_space(n): assert sp.contains(sp.sample()) -@pytest.mark.parametrize("low, high, dim", - [ - (1.0, 10.0, 1), - (1.0, 10.0, 2), - (1.0, 10.0, 4), - (-10.0, 1.0, 1), - (-10.0, 1.0, 2), - (-10.0, 1.0, 4), - (-np.inf, 1.0, 1), - (-np.inf, 1.0, 2), - (-np.inf, 1.0, 4), - (1.0, np.inf, 1), - (1.0, np.inf, 2), - (1.0, np.inf, 4), - (-np.inf, np.inf, 1), - (-np.inf, np.inf, 2), - (-np.inf, np.inf, 4), - ]) +@pytest.mark.parametrize( + "low, high, dim", + [ + (1.0, 10.0, 1), + (1.0, 10.0, 2), + (1.0, 10.0, 4), + (-10.0, 1.0, 1), + (-10.0, 1.0, 2), + (-10.0, 1.0, 4), + (-np.inf, 1.0, 1), + (-np.inf, 1.0, 2), + (-np.inf, 1.0, 4), + (1.0, np.inf, 1), + (1.0, np.inf, 2), + (1.0, np.inf, 4), + (-np.inf, np.inf, 1), + (-np.inf, np.inf, 2), + (-np.inf, np.inf, 4), + ], +) def test_box_space_case_1(low, high, dim): shape = (dim, 1) gym_sp = gym.spaces.Box(low, high, shape=shape) @@ -43,7 +45,7 @@ def test_box_space_case_1(low, high, dim): assert isinstance(sp, rlberry.spaces.Box) sp.reseed(123) for _ in range(2 ** dim): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) @pytest.mark.parametrize( @@ -53,8 +55,9 @@ def test_box_space_case_1(low, high, dim): (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, 10.0])), (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), (np.array([-np.inf, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), - (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])) - ]) + (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])), + ], +) def test_box_space_case_2(low, high): gym_sp = gym.spaces.Box(low, high, dtype=np.float64) sp = convert_space_from_gym(gym_sp) @@ -65,7 +68,7 @@ def test_box_space_case_2(low, high): else: assert sp.is_bounded() for ii in range(2 ** sp.shape[0]): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) def test_tuple(): @@ -101,26 +104,36 @@ def test_multibinary(): def test_dict(): - nested_observation_space = gym.spaces.Dict({ - 'sensors': gym.spaces.Dict({ - 'position': gym.spaces.Box(low=-100, high=100, shape=(3,)), - 'velocity': gym.spaces.Box(low=-1, high=1, shape=(3,)), - 'front_cam': gym.spaces.Tuple(( - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)) - )), - 'rear_cam': gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - }), - 'ext_controller': gym.spaces.MultiDiscrete((5, 2, 2)), - 'inner_state': gym.spaces.Dict({ - 'charge': gym.spaces.Discrete(100), - 'system_checks': gym.spaces.MultiBinary(10), - 'job_status': gym.spaces.Dict({ - 'task': gym.spaces.Discrete(5), - 'progress': gym.spaces.Box(low=0, high=100, shape=()), - }) - }) - }) + nested_observation_space = gym.spaces.Dict( + { + "sensors": gym.spaces.Dict( + { + "position": gym.spaces.Box(low=-100, high=100, shape=(3,)), + "velocity": gym.spaces.Box(low=-1, high=1, shape=(3,)), + "front_cam": gym.spaces.Tuple( + ( + gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + ) + ), + "rear_cam": gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + } + ), + "ext_controller": gym.spaces.MultiDiscrete((5, 2, 2)), + "inner_state": gym.spaces.Dict( + { + "charge": gym.spaces.Discrete(100), + "system_checks": gym.spaces.MultiBinary(10), + "job_status": gym.spaces.Dict( + { + "task": 
gym.spaces.Discrete(5), + "progress": gym.spaces.Box(low=0, high=100, shape=()), + } + ), + } + ), + } + ) gym_sp = nested_observation_space sp = convert_space_from_gym(gym_sp) assert isinstance(sp, rlberry.spaces.Dict) diff --git a/rlberry/spaces/tests/test_spaces.py b/rlberry/spaces/tests/test_spaces.py index 53837c6f7..ddb735814 100644 --- a/rlberry/spaces/tests/test_spaces.py +++ b/rlberry/spaces/tests/test_spaces.py @@ -19,29 +19,31 @@ def test_discrete_space(n): assert sp.contains(sp.sample()) -@pytest.mark.parametrize("low, high, dim", - [ - (1.0, 10.0, 1), - (1.0, 10.0, 2), - (1.0, 10.0, 4), - (-10.0, 1.0, 1), - (-10.0, 1.0, 2), - (-10.0, 1.0, 4), - (-np.inf, 1.0, 1), - (-np.inf, 1.0, 2), - (-np.inf, 1.0, 4), - (1.0, np.inf, 1), - (1.0, np.inf, 2), - (1.0, np.inf, 4), - (-np.inf, np.inf, 1), - (-np.inf, np.inf, 2), - (-np.inf, np.inf, 4), - ]) +@pytest.mark.parametrize( + "low, high, dim", + [ + (1.0, 10.0, 1), + (1.0, 10.0, 2), + (1.0, 10.0, 4), + (-10.0, 1.0, 1), + (-10.0, 1.0, 2), + (-10.0, 1.0, 4), + (-np.inf, 1.0, 1), + (-np.inf, 1.0, 2), + (-np.inf, 1.0, 4), + (1.0, np.inf, 1), + (1.0, np.inf, 2), + (1.0, np.inf, 4), + (-np.inf, np.inf, 1), + (-np.inf, np.inf, 2), + (-np.inf, np.inf, 4), + ], +) def test_box_space_case_1(low, high, dim): shape = (dim, 1) sp = Box(low, high, shape=shape) for ii in range(2 ** dim): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) @pytest.mark.parametrize( @@ -51,8 +53,9 @@ def test_box_space_case_1(low, high, dim): (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, 10.0])), (np.array([-10.0, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), (np.array([-np.inf, -10.0, -10.0]), np.array([10.0, 10.0, np.inf])), - (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])) - ]) + (np.array([-np.inf, -10.0, -10.0]), np.array([np.inf, 10.0, np.inf])), + ], +) def test_box_space_case_2(low, high): sp = Box(low, high) if (-np.inf in low) or (np.inf in high): @@ -60,7 +63,7 @@ def test_box_space_case_2(low, high): else: assert sp.is_bounded() for ii in range(2 ** sp.shape[0]): - assert (sp.contains(sp.sample())) + assert sp.contains(sp.sample()) def test_tuple(): @@ -88,26 +91,36 @@ def test_multibinary(): def test_dict(): - nested_observation_space = Dict({ - 'sensors': Dict({ - 'position': Box(low=-100, high=100, shape=(3,)), - 'velocity': Box(low=-1, high=1, shape=(3,)), - 'front_cam': Tuple(( - Box(low=0, high=1, shape=(10, 10, 3)), - Box(low=0, high=1, shape=(10, 10, 3)) - )), - 'rear_cam': Box(low=0, high=1, shape=(10, 10, 3)), - }), - 'ext_controller': MultiDiscrete((5, 2, 2)), - 'inner_state': Dict({ - 'charge': Discrete(100), - 'system_checks': MultiBinary(10), - 'job_status': Dict({ - 'task': Discrete(5), - 'progress': Box(low=0, high=100, shape=()), - }) - }) - }) + nested_observation_space = Dict( + { + "sensors": Dict( + { + "position": Box(low=-100, high=100, shape=(3,)), + "velocity": Box(low=-1, high=1, shape=(3,)), + "front_cam": Tuple( + ( + Box(low=0, high=1, shape=(10, 10, 3)), + Box(low=0, high=1, shape=(10, 10, 3)), + ) + ), + "rear_cam": Box(low=0, high=1, shape=(10, 10, 3)), + } + ), + "ext_controller": MultiDiscrete((5, 2, 2)), + "inner_state": Dict( + { + "charge": Discrete(100), + "system_checks": MultiBinary(10), + "job_status": Dict( + { + "task": Discrete(5), + "progress": Box(low=0, high=100, shape=()), + } + ), + } + ), + } + ) sp = nested_observation_space for _ in range(10): assert sp.contains(sp.sample()) diff --git a/rlberry/utils/binsearch.py b/rlberry/utils/binsearch.py index 
e23ed327a..e0a8d2a4c 100644 --- a/rlberry/utils/binsearch.py +++ b/rlberry/utils/binsearch.py @@ -22,10 +22,10 @@ def binary_search_nd(x_vec, bins): aux = 1 assert dim == len(x_vec), "dimension mismatch in binary_search_nd()" for dd in range(dim): - index_dd = np.searchsorted(bins[dd], x_vec[dd], side='right') - 1 + index_dd = np.searchsorted(bins[dd], x_vec[dd], side="right") - 1 assert index_dd != -1, "error in binary_search_nd()" flat_index += aux * index_dd - aux *= (len(bins[dd]) - 1) + aux *= len(bins[dd]) - 1 return flat_index @@ -39,8 +39,7 @@ def unravel_index_uniform_bin(flat_index, dim, n_per_dim): if __name__ == "__main__": - bins = [(0, 1, 2, 3, 4), - (0, 1, 2, 3, 4)] + bins = [(0, 1, 2, 3, 4), (0, 1, 2, 3, 4)] x = [3.9, 3.5] index = binary_search_nd(x, bins) print(index) diff --git a/rlberry/utils/io.py b/rlberry/utils/io.py index 94f7092ce..cb269f29a 100644 --- a/rlberry/utils/io.py +++ b/rlberry/utils/io.py @@ -1,4 +1,3 @@ - import os import zipfile import pathlib @@ -22,12 +21,13 @@ def zipdir(dir_path, ouput_fname): dir_path = pathlib.Path(dir_path) if not dir_path.exists(): return None - ouput_fname = pathlib.Path(ouput_fname).with_suffix('.zip') - zipf = zipfile.ZipFile(ouput_fname, 'w', zipfile.ZIP_DEFLATED) + ouput_fname = pathlib.Path(ouput_fname).with_suffix(".zip") + zipf = zipfile.ZipFile(ouput_fname, "w", zipfile.ZIP_DEFLATED) for root, _, files in os.walk(dir_path): for file in files: - zipf.write(os.path.join(root, file), - os.path.relpath(os.path.join(root, file), - os.path.join(dir_path, '..'))) + zipf.write( + os.path.join(root, file), + os.path.relpath(os.path.join(root, file), os.path.join(dir_path, "..")), + ) zipf.close() return ouput_fname diff --git a/rlberry/utils/jit_setup.py b/rlberry/utils/jit_setup.py index 20a566ac8..c5f2f55a1 100644 --- a/rlberry/utils/jit_setup.py +++ b/rlberry/utils/jit_setup.py @@ -10,6 +10,7 @@ numba_jit = jit(nopython=True) else: + def numba_jit(func, **options): """This decorator does not modify the decorated function.""" return func diff --git a/rlberry/utils/logging.py b/rlberry/utils/logging.py index 05748c213..db9d0898d 100644 --- a/rlberry/utils/logging.py +++ b/rlberry/utils/logging.py @@ -3,10 +3,12 @@ import gym -def configure_logging(level: str = "INFO", - file_path: Path = None, - file_level: str = "DEBUG", - default_msg: str = "") -> None: +def configure_logging( + level: str = "INFO", + file_path: Path = None, + file_level: str = "DEBUG", + default_msg: str = "", +) -> None: """ Set the logging configuration @@ -28,29 +30,19 @@ def configure_logging(level: str = "INFO", "version": 1, "disable_existing_loggers": False, "formatters": { - "standard": { - "format": default_msg + "[%(levelname)s] %(message)s " - }, + "standard": {"format": default_msg + "[%(levelname)s] %(message)s "}, "detailed": { "format": default_msg + "[%(name)s:%(levelname)s] %(message)s " - } + }, }, "handlers": { "default": { "level": level, "formatter": "standard", - "class": "logging.StreamHandler" + "class": "logging.StreamHandler", } }, - "loggers": { - "": { - "handlers": [ - "default" - ], - "level": "DEBUG", - "propagate": True - } - } + "loggers": {"": {"handlers": ["default"], "level": "DEBUG", "propagate": True}}, } if file_path: config["handlers"][file_path.name] = { @@ -58,10 +50,10 @@ def configure_logging(level: str = "INFO", "filename": file_path, "level": file_level, "formatter": "detailed", - "mode": 'w' + "mode": "w", } config["loggers"][""]["handlers"].append(file_path.name) logging.config.dictConfig(config) 
gym.logger.set_level(logging.getLevelName(level)) - numba_logger = logging.getLogger('numba') + numba_logger = logging.getLogger("numba") numba_logger.setLevel(logging.WARNING) diff --git a/rlberry/utils/math.py b/rlberry/utils/math.py index a6ce0e886..5fcb09841 100644 --- a/rlberry/utils/math.py +++ b/rlberry/utils/math.py @@ -1,11 +1,7 @@ import numpy as np from typing import Union, Tuple -Interval = Union[ - np.ndarray, - Tuple[float, float], - Tuple[np.ndarray, np.ndarray] -] +Interval = Union[np.ndarray, Tuple[float, float], Tuple[np.ndarray, np.ndarray]] def lmap(v: np.ndarray, x: Interval, y: Interval) -> np.ndarray: diff --git a/rlberry/utils/space_discretizer.py b/rlberry/utils/space_discretizer.py index bd8ab3df1..217a7a525 100644 --- a/rlberry/utils/space_discretizer.py +++ b/rlberry/utils/space_discretizer.py @@ -6,8 +6,9 @@ class Discretizer: def __init__(self, space, n_bins): - assert isinstance(space, Box), \ - "Discretization is only implemented for Box spaces." + assert isinstance( + space, Box + ), "Discretization is only implemented for Box spaces." assert space.is_bounded() self.space = space self.n_bins = n_bins diff --git a/rlberry/utils/tests/test_binsearch.py b/rlberry/utils/tests/test_binsearch.py index 00172d8fa..fd94adde4 100644 --- a/rlberry/utils/tests/test_binsearch.py +++ b/rlberry/utils/tests/test_binsearch.py @@ -23,13 +23,9 @@ def test_binary_search_nd(): assert binary_search_nd(vec3, bins) == 1 + 3 * 1 + 3 * 3 * 0 -@pytest.mark.parametrize("i, j, k, N", - [ - (0, 0, 0, 5), - (0, 1, 2, 5), - (4, 3, 2, 5), - (4, 4, 4, 5) - ]) +@pytest.mark.parametrize( + "i, j, k, N", [(0, 0, 0, 5), (0, 1, 2, 5), (4, 3, 2, 5), (4, 4, 4, 5)] +) def test_unravel_index_uniform_bin(i, j, k, N): # index = i + N * j + N * N * k dim = 3 diff --git a/rlberry/utils/tests/test_metrics.py b/rlberry/utils/tests/test_metrics.py index d18eac384..487dcbb61 100644 --- a/rlberry/utils/tests/test_metrics.py +++ b/rlberry/utils/tests/test_metrics.py @@ -11,7 +11,7 @@ def test_metrics(dim): scaling_2 = 0.5 * np.ones(dim) for p in range(1, 10): - assert np.abs(metric_lp(x, y, p, scaling_1) - - np.power(dim, 1.0 / p)) < 1e-15 - assert np.abs(metric_lp(x, y, p, scaling_2) - - 2 * np.power(dim, 1.0 / p)) < 1e-15 + assert np.abs(metric_lp(x, y, p, scaling_1) - np.power(dim, 1.0 / p)) < 1e-15 + assert ( + np.abs(metric_lp(x, y, p, scaling_2) - 2 * np.power(dim, 1.0 / p)) < 1e-15 + ) diff --git a/rlberry/utils/torch.py b/rlberry/utils/torch.py index 5370fa901..f72d78d5d 100644 --- a/rlberry/utils/torch.py +++ b/rlberry/utils/torch.py @@ -10,25 +10,28 @@ def get_gpu_memory_map(): - result = check_output(['nvidia-smi', - '--query-gpu=memory.used', - '--format=csv,nounits,noheader']) + result = check_output( + ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"] + ) return [int(x) for x in result.split()] def least_used_device(): - """ Get the GPU device with most available memory. """ + """Get the GPU device with most available memory.""" if not torch.cuda.is_available(): raise RuntimeError("cuda unavailable") - if shutil.which('nvidia-smi') is None: - raise RuntimeError("nvidia-smi unavailable: \ -cannot select device with most least memory used.") + if shutil.which("nvidia-smi") is None: + raise RuntimeError( + "nvidia-smi unavailable: \ +cannot select device with most least memory used." 
+ ) memory_map = get_gpu_memory_map() device_id = np.argmin(memory_map) - logger.info(f"Choosing GPU device: {device_id}, " - f"memory used: {memory_map[device_id]}") + logger.info( + f"Choosing GPU device: {device_id}, " f"memory used: {memory_map[device_id]}" + ) return torch.device("cuda:{}".format(device_id)) @@ -37,7 +40,9 @@ def choose_device(preferred_device, default_device="cpu"): try: preferred_device = least_used_device() except RuntimeError: - logger.info(f"Could not find least used device (nvidia-smi might be missing), use cuda:0 instead") + logger.info( + f"Could not find least used device (nvidia-smi might be missing), use cuda:0 instead" + ) if torch.cuda.is_available(): return choose_device("cuda:0") else: @@ -45,8 +50,10 @@ def choose_device(preferred_device, default_device="cpu"): try: torch.zeros((1,), device=preferred_device) # Test availability except (RuntimeError, AssertionError) as e: - logger.info(f"Preferred device {preferred_device} unavailable ({e})." - f"Switching to default {default_device}") + logger.info( + f"Preferred device {preferred_device} unavailable ({e})." + f"Switching to default {default_device}" + ) return default_device return preferred_device @@ -55,9 +62,12 @@ def get_memory(pid=None): if not pid: pid = os.getpid() command = "nvidia-smi" - result = run(command, stdout=PIPE, stderr=PIPE, - universal_newlines=True, shell=True).stdout - m = re.findall("\| *[0-9] *" - + str(pid) - + " *C *.*python.*? +([0-9]+).*\|", result, re.MULTILINE) + result = run( + command, stdout=PIPE, stderr=PIPE, universal_newlines=True, shell=True + ).stdout + m = re.findall( + "\| *[0-9] *" + str(pid) + " *C *.*python.*? +([0-9]+).*\|", + result, + re.MULTILINE, + ) return [int(mem) for mem in m] diff --git a/rlberry/utils/writers.py b/rlberry/utils/writers.py index 7e714117b..56a31e6de 100644 --- a/rlberry/utils/writers.py +++ b/rlberry/utils/writers.py @@ -38,11 +38,13 @@ class DefaultWriter: """ def __init__( - self, name: str, - log_interval: int = 3, - tensorboard_kwargs: Optional[dict] = None, - execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, - maxlen: Optional[int] = None): + self, + name: str, + log_interval: int = 3, + tensorboard_kwargs: Optional[dict] = None, + execution_metadata: Optional[metadata_utils.ExecutionMetadata] = None, + maxlen: Optional[int] = None, + ): self._name = name self._log_interval = log_interval self._execution_metadata = execution_metadata @@ -53,8 +55,12 @@ def __init__( self.reset() # initialize tensorboard - if (tensorboard_kwargs is not None) and (not check_packages.TENSORBOARD_INSTALLED): - logger.warning('[DefaultWriter]: received tensorboard_kwargs, but tensorboard is not installed.') + if (tensorboard_kwargs is not None) and ( + not check_packages.TENSORBOARD_INSTALLED + ): + logger.warning( + "[DefaultWriter]: received tensorboard_kwargs, but tensorboard is not installed." 
+ ) self._tensorboard_kwargs = tensorboard_kwargs self._tensorboard_logdir = None self._summary_writer = None @@ -74,13 +80,19 @@ def summary_writer(self): @property def data(self): - df = pd.DataFrame(columns=('name', 'tag', 'value', 'global_step')) + df = pd.DataFrame(columns=("name", "tag", "value", "global_step")) for tag in self._data: df = df.append(pd.DataFrame(self._data[tag]), ignore_index=True) return df def add_scalar( - self, tag: str, scalar_value: float, global_step: Optional[int] = None, walltime=None, new_style=False): + self, + tag: str, + scalar_value: float, + global_step: Optional[int] = None, + walltime=None, + new_style=False, + ): """ Behaves as SummaryWriter.add_scalar(). @@ -102,34 +114,46 @@ def add_scalar( style (simple_value field). New style could lead to faster data loading. """ if self._summary_writer: - self._summary_writer.add_scalar(tag, scalar_value, global_step, walltime, new_style) + self._summary_writer.add_scalar( + tag, scalar_value, global_step, walltime, new_style + ) self._add_scalar(tag, scalar_value, global_step) - def _add_scalar(self, tag: str, scalar_value: float, global_step: Optional[int] = None): + def _add_scalar( + self, tag: str, scalar_value: float, global_step: Optional[int] = None + ): """ Store scalar value in self._data. """ # Update data structures if tag not in self._data: self._data[tag] = dict() - self._data[tag]['name'] = deque(maxlen=self._maxlen) - self._data[tag]['tag'] = deque(maxlen=self._maxlen) - self._data[tag]['value'] = deque(maxlen=self._maxlen) - self._data[tag]['global_step'] = deque(maxlen=self._maxlen) - - self._data[tag]['name'].append(self._name) # used in plots, when aggregating several writers - self._data[tag]['tag'].append(tag) # useful to convert all data to a single DataFrame - self._data[tag]['value'].append(scalar_value) + self._data[tag]["name"] = deque(maxlen=self._maxlen) + self._data[tag]["tag"] = deque(maxlen=self._maxlen) + self._data[tag]["value"] = deque(maxlen=self._maxlen) + self._data[tag]["global_step"] = deque(maxlen=self._maxlen) + + self._data[tag]["name"].append( + self._name + ) # used in plots, when aggregating several writers + self._data[tag]["tag"].append( + tag + ) # useful to convert all data to a single DataFrame + self._data[tag]["value"].append(scalar_value) if global_step is None: - self._data[tag]['global_step'].append(np.nan) + self._data[tag]["global_step"].append(np.nan) else: - self._data[tag]['global_step'].append(global_step) + self._data[tag]["global_step"].append(global_step) # Append time interval corresponding to global_step if global_step is not None and self._log_time: - assert tag != 'dw_time_elapsed', 'The tag dw_time_elapsed is reserved.' + assert tag != "dw_time_elapsed", "The tag dw_time_elapsed is reserved." 
self._log_time = False - self._add_scalar(tag='dw_time_elapsed', scalar_value=timer() - self._initial_time, global_step=global_step) + self._add_scalar( + tag="dw_time_elapsed", + scalar_value=timer() - self._initial_time, + global_step=global_step, + ) self._log_time = True # Log @@ -144,18 +168,18 @@ def _log(self): max_global_step = 0 if time_elapsed > self._log_interval: self._time_last_log = t_now - message = '' + message = "" for tag in self._data: - val = self._data[tag]['value'][-1] - gstep = self._data[tag]['global_step'][-1] - message += f'{tag} = {val} | ' + val = self._data[tag]["value"][-1] + gstep = self._data[tag]["global_step"][-1] + message += f"{tag} = {val} | " if not np.isnan(gstep): max_global_step = max(max_global_step, gstep) header = self._name if self._execution_metadata: - header += f'[worker: {self._execution_metadata.obj_worker_id}]' - message = f'[{header}] | max_global_step = {max_global_step} | ' + message + header += f"[worker: {self._execution_metadata.obj_worker_id}]" + message = f"[{header}] | max_global_step = {max_global_step} | " + message logger.info(message) def __getattr__(self, attr): @@ -163,7 +187,7 @@ def __getattr__(self, attr): Calls SummaryWriter methods, if self._summary_writer is not None. Otherwise, does nothing. """ - if attr[:2] == '__': + if attr[:2] == "__": raise AttributeError(attr) if attr in self.__dict__: return getattr(self, attr) @@ -172,6 +196,7 @@ def __getattr__(self, attr): def method(*args, **kwargs): pass + return method # @@ -185,7 +210,11 @@ def __getstate__(self): def __setstate__(self, newstate): # Re-create summary writer with the same logdir - if newstate['_summary_writer']: - newstate['_tensorboard_kwargs'].update(dict(log_dir=newstate['_tensorboard_logdir'])) - newstate['_summary_writer'] = SummaryWriter(**newstate['_tensorboard_kwargs']) + if newstate["_summary_writer"]: + newstate["_tensorboard_kwargs"].update( + dict(log_dir=newstate["_tensorboard_logdir"]) + ) + newstate["_summary_writer"] = SummaryWriter( + **newstate["_tensorboard_kwargs"] + ) self.__dict__.update(newstate) diff --git a/rlberry/wrappers/discretize_state.py b/rlberry/wrappers/discretize_state.py index 168b4e1e7..2aa4c6bab 100644 --- a/rlberry/wrappers/discretize_state.py +++ b/rlberry/wrappers/discretize_state.py @@ -24,8 +24,9 @@ def __init__(self, _env, n_bins): self._bins = [] self._open_bins = [] for dd in range(self.dim): - range_dd = self.env.observation_space.high[dd] \ - - self.env.observation_space.low[dd] + range_dd = ( + self.env.observation_space.high[dd] - self.env.observation_space.low[dd] + ) epsilon = range_dd / n_bins bins_dd = [] for bb in range(n_bins + 1): @@ -41,8 +42,7 @@ def __init__(self, _env, n_bins): # List of discretized states self.discretized_states = np.zeros((self.dim, n_states)) for ii in range(n_states): - self.discretized_states[:, ii] = \ - self.get_continuous_state(ii, False) + self.discretized_states[:, ii] = self.get_continuous_state(ii, False) def reset(self): return self.get_discrete_state(self.env.reset()) @@ -55,11 +55,9 @@ def step(self, action): def sample(self, discrete_state, action): # map disctete state to continuous one assert self.observation_space.contains(discrete_state) - continuous_state = self.get_continuous_state(discrete_state, - randomize=True) + continuous_state = self.get_continuous_state(discrete_state, randomize=True) # sample in the true environment - next_state, reward, done, info = \ - self.env.sample(continuous_state, action) + next_state, reward, done, info = 
self.env.sample(continuous_state, action) # discretize next state next_state = binary_search_nd(next_state, self._bins) @@ -69,20 +67,21 @@ def get_discrete_state(self, continuous_state): return binary_search_nd(continuous_state, self._bins) def get_continuous_state(self, discrete_state, randomize=False): - assert discrete_state >= 0 \ - and discrete_state < self.observation_space.n, \ - "invalid discrete_state" + assert ( + discrete_state >= 0 and discrete_state < self.observation_space.n + ), "invalid discrete_state" # get multi-index - index \ - = unravel_index_uniform_bin(discrete_state, self.dim, self.n_bins) + index = unravel_index_uniform_bin(discrete_state, self.dim, self.n_bins) # get state continuous_state = np.zeros(self.dim) for dd in range(self.dim): continuous_state[dd] = self._bins[dd][index[dd]] if randomize: - range_dd = self.env.observation_space.high[dd] \ - - self.env.observation_space.low[dd] + range_dd = ( + self.env.observation_space.high[dd] + - self.env.observation_space.low[dd] + ) epsilon = range_dd / self.n_bins continuous_state[dd] += epsilon * self.rng.uniform() return continuous_state diff --git a/rlberry/wrappers/gym_utils.py b/rlberry/wrappers/gym_utils.py index f765db85b..736c8eb04 100644 --- a/rlberry/wrappers/gym_utils.py +++ b/rlberry/wrappers/gym_utils.py @@ -13,10 +13,7 @@ def convert_space_from_gym(gym_space): # # elif isinstance(gym_space, gym.spaces.Box): - return Box(gym_space.low, - gym_space.high, - gym_space.shape, - gym_space.dtype) + return Box(gym_space.low, gym_space.high, gym_space.shape, gym_space.dtype) # # elif isinstance(gym_space, gym.spaces.Tuple): diff --git a/rlberry/wrappers/tests/test_basewrapper.py b/rlberry/wrappers/tests/test_basewrapper.py index edbc33ab4..2545ffa59 100644 --- a/rlberry/wrappers/tests/test_basewrapper.py +++ b/rlberry/wrappers/tests/test_basewrapper.py @@ -14,12 +14,11 @@ def test_wrapper(): # calling some functions wrapped.reset() wrapped.step(wrapped.action_space.sample()) - wrapped.sample(wrapped.observation_space.sample(), - wrapped.action_space.sample()) + wrapped.sample(wrapped.observation_space.sample(), wrapped.action_space.sample()) def test_gym_wrapper(): - gym_env = gym.make('Acrobot-v1') + gym_env = gym.make("Acrobot-v1") wrapped = Wrapper(gym_env) assert isinstance(wrapped, Model) assert wrapped.is_online() diff --git a/rlberry/wrappers/tests/test_common_wrappers.py b/rlberry/wrappers/tests/test_common_wrappers.py index 1f45b093a..4b842e5c9 100644 --- a/rlberry/wrappers/tests/test_common_wrappers.py +++ b/rlberry/wrappers/tests/test_common_wrappers.py @@ -10,8 +10,7 @@ from rlberry.wrappers.rescale_reward import RescaleRewardWrapper from rlberry.agents import RSUCBVIAgent from rlberry.wrappers.autoreset import AutoResetWrapper -from rlberry.wrappers.uncertainty_estimator_wrapper import \ - UncertaintyEstimatorWrapper +from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper from rlberry.wrappers.vis2d import Vis2dWrapper @@ -58,8 +57,8 @@ def test_rescale_reward(): _ = wrapped.reset() for _ in range(100): _, reward, _, _ = wrapped.sample( - wrapped.observation_space.sample(), - wrapped.action_space.sample()) + wrapped.observation_space.sample(), wrapped.action_space.sample() + ) assert reward <= 10 + tol and reward >= -10 - tol _ = wrapped.reset() @@ -140,28 +139,30 @@ def test_uncertainty_est_wrapper(): env = GridWorld() def uncertainty_est_fn(observation_space, action_space): - return DiscreteCounter(observation_space, - action_space) + return 
DiscreteCounter(observation_space, action_space) - w_env = UncertaintyEstimatorWrapper( - env, - uncertainty_est_fn, - bonus_scale_factor=1.0) + w_env = UncertaintyEstimatorWrapper(env, uncertainty_est_fn, bonus_scale_factor=1.0) for ii in range(10): w_env.reset() _, _, _, info = w_env.step(0) nn = w_env.uncertainty_estimator.count(0, 0) assert nn == ii + 1 - assert info['exploration_bonus'] == pytest.approx(1 / np.sqrt(nn)) + assert info["exploration_bonus"] == pytest.approx(1 / np.sqrt(nn)) def test_vis2dwrapper(): env = MountainCar() env = Vis2dWrapper(env, n_bins_obs=20, memory_size=200) - agent = RSUCBVIAgent(env, gamma=0.99, horizon=200, - bonus_scale_factor=0.1, copy_env=False, min_dist=0.1) + agent = RSUCBVIAgent( + env, + gamma=0.99, + horizon=200, + bonus_scale_factor=0.1, + copy_env=False, + min_dist=0.1, + ) agent.fit(budget=15) env.plot_trajectories(show=False) diff --git a/rlberry/wrappers/tests/test_gym_space_conversion.py b/rlberry/wrappers/tests/test_gym_space_conversion.py index 5b335034e..4a6eb0463 100644 --- a/rlberry/wrappers/tests/test_gym_space_conversion.py +++ b/rlberry/wrappers/tests/test_gym_space_conversion.py @@ -7,11 +7,11 @@ def convert_and_compare(sp, rlberry_space): sp_conv = convert_space_from_gym(sp) - assert (isinstance(sp_conv, rlberry_space)) + assert isinstance(sp_conv, rlberry_space) sp_conv.reseed() for _ in range(100): - assert (sp.contains(sp_conv.sample())) - assert (sp_conv.contains(sp.sample())) + assert sp.contains(sp_conv.sample()) + assert sp_conv.contains(sp.sample()) @pytest.mark.parametrize("n", list(range(1, 10))) @@ -20,24 +20,26 @@ def test_discrete_space(n): convert_and_compare(sp, rlberry.spaces.Discrete) -@pytest.mark.parametrize("low, high, dim", - [ - (1.0, 10.0, 1), - (1.0, 10.0, 2), - (1.0, 10.0, 4), - (-10.0, 1.0, 1), - (-10.0, 1.0, 2), - (-10.0, 1.0, 4), - (-np.inf, 1.0, 1), - (-np.inf, 1.0, 2), - (-np.inf, 1.0, 4), - (1.0, np.inf, 1), - (1.0, np.inf, 2), - (1.0, np.inf, 4), - (-np.inf, np.inf, 1), - (-np.inf, np.inf, 2), - (-np.inf, np.inf, 4), - ]) +@pytest.mark.parametrize( + "low, high, dim", + [ + (1.0, 10.0, 1), + (1.0, 10.0, 2), + (1.0, 10.0, 4), + (-10.0, 1.0, 1), + (-10.0, 1.0, 2), + (-10.0, 1.0, 4), + (-np.inf, 1.0, 1), + (-np.inf, 1.0, 2), + (-np.inf, 1.0, 4), + (1.0, np.inf, 1), + (1.0, np.inf, 2), + (1.0, np.inf, 4), + (-np.inf, np.inf, 1), + (-np.inf, np.inf, 2), + (-np.inf, np.inf, 4), + ], +) def test_box_space_case(low, high, dim): shape = (dim, 1) sp = gym.spaces.Box(low, high, shape=shape) @@ -63,25 +65,35 @@ def test_multibinary(): def test_dict(): - nested_observation_space = gym.spaces.Dict({ - 'sensors': gym.spaces.Dict({ - 'position': gym.spaces.Box(low=-100, high=100, shape=(3,)), - 'velocity': gym.spaces.Box(low=-1, high=1, shape=(3,)), - 'front_cam': gym.spaces.Tuple(( - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)) - )), - 'rear_cam': gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), - }), - 'ext_controller': gym.spaces.MultiDiscrete((5, 2, 2)), - 'inner_state': gym.spaces.Dict({ - 'charge': gym.spaces.Discrete(100), - 'system_checks': gym.spaces.MultiBinary(10), - 'job_status': gym.spaces.Dict({ - 'task': gym.spaces.Discrete(5), - 'progress': gym.spaces.Box(low=0, high=100, shape=()), - }) - }) - }) + nested_observation_space = gym.spaces.Dict( + { + "sensors": gym.spaces.Dict( + { + "position": gym.spaces.Box(low=-100, high=100, shape=(3,)), + "velocity": gym.spaces.Box(low=-1, high=1, shape=(3,)), + "front_cam": gym.spaces.Tuple( + ( + 
gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + ) + ), + "rear_cam": gym.spaces.Box(low=0, high=1, shape=(10, 10, 3)), + } + ), + "ext_controller": gym.spaces.MultiDiscrete((5, 2, 2)), + "inner_state": gym.spaces.Dict( + { + "charge": gym.spaces.Discrete(100), + "system_checks": gym.spaces.MultiBinary(10), + "job_status": gym.spaces.Dict( + { + "task": gym.spaces.Discrete(5), + "progress": gym.spaces.Box(low=0, high=100, shape=()), + } + ), + } + ), + } + ) sp = nested_observation_space convert_and_compare(sp, rlberry.spaces.Dict) diff --git a/rlberry/wrappers/tests/test_wrapper_seeding.py b/rlberry/wrappers/tests/test_wrapper_seeding.py index c1584b99b..3422117ff 100644 --- a/rlberry/wrappers/tests/test_wrapper_seeding.py +++ b/rlberry/wrappers/tests/test_wrapper_seeding.py @@ -16,14 +16,7 @@ except Exception: _GYM_INSTALLED = False -classes = [ - MountainCar, - GridWorld, - Chain, - PBall2D, - SimplePBallND, - Acrobot -] +classes = [MountainCar, GridWorld, Chain, PBall2D, SimplePBallND, Acrobot] def get_env_trajectory(env, horizon): @@ -121,7 +114,7 @@ def test_double_wrapper_copy_reseeding(ModelClass): def test_gym_copy_reseeding(): seeder = Seeder(123) if _GYM_INSTALLED: - gym_env = gym.make('Acrobot-v1') + gym_env = gym.make("Acrobot-v1") env = Wrapper(gym_env) env.reseed(seeder) @@ -137,7 +130,7 @@ def test_gym_copy_reseeding(): def test_gym_copy_reseeding_2(): seeder = Seeder(123) if _GYM_INSTALLED: - gym_env = gym.make('Acrobot-v1') + gym_env = gym.make("Acrobot-v1") # nested wrapping env = RescaleRewardWrapper(Wrapper(Wrapper(gym_env)), (0, 1)) env.reseed(seeder) diff --git a/rlberry/wrappers/uncertainty_estimator_wrapper.py b/rlberry/wrappers/uncertainty_estimator_wrapper.py index b3b35d419..4648edece 100644 --- a/rlberry/wrappers/uncertainty_estimator_wrapper.py +++ b/rlberry/wrappers/uncertainty_estimator_wrapper.py @@ -33,24 +33,28 @@ class UncertaintyEstimatorWrapper(Wrapper): Scale factor for the bonus. 
""" - def __init__(self, - env, - uncertainty_estimator_fn, - uncertainty_estimator_kwargs=None, - bonus_scale_factor=1.0, - bonus_max=np.inf): + def __init__( + self, + env, + uncertainty_estimator_fn, + uncertainty_estimator_kwargs=None, + bonus_scale_factor=1.0, + bonus_max=np.inf, + ): Wrapper.__init__(self, env) self.bonus_scale_factor = bonus_scale_factor self.bonus_max = bonus_max uncertainty_estimator_kwargs = uncertainty_estimator_kwargs or {} - uncertainty_estimator_fn = load(uncertainty_estimator_fn) if isinstance(uncertainty_estimator_fn, str) else \ - uncertainty_estimator_fn + uncertainty_estimator_fn = ( + load(uncertainty_estimator_fn) + if isinstance(uncertainty_estimator_fn, str) + else uncertainty_estimator_fn + ) self.uncertainty_estimator = uncertainty_estimator_fn( - env.observation_space, - env.action_space, - **uncertainty_estimator_kwargs) + env.observation_space, env.action_space, **uncertainty_estimator_kwargs + ) self.previous_obs = None def reset(self): @@ -61,20 +65,16 @@ def _update_and_get_bonus(self, state, action, next_state, reward): if self.previous_obs is None: return 0.0 # - self.uncertainty_estimator.update(state, - action, - next_state, - reward) + self.uncertainty_estimator.update(state, action, next_state, reward) return self.bonus(state, action) def step(self, action): observation, reward, done, info = self.env.step(action) # update uncertainty and compute bonus - bonus = self._update_and_get_bonus(self.previous_obs, - action, - observation, - reward) + bonus = self._update_and_get_bonus( + self.previous_obs, action, observation, reward + ) # self.previous_obs = observation @@ -82,25 +82,35 @@ def step(self, action): if info is None: info = {} else: - if 'exploration_bonus' in info: - logger.error("UncertaintyEstimatorWrapper Error: info has" + - " already a key named exploration_bonus!") + if "exploration_bonus" in info: + logger.error( + "UncertaintyEstimatorWrapper Error: info has" + + " already a key named exploration_bonus!" + ) - info['exploration_bonus'] = bonus + info["exploration_bonus"] = bonus return observation, reward, done, info def sample(self, state, action): logger.warning( - '[UncertaintyEstimatorWrapper]: sample()' - + ' method does not consider nor update bonuses.') + "[UncertaintyEstimatorWrapper]: sample()" + + " method does not consider nor update bonuses." 
+ ) return self.env.sample(state, action) def bonus(self, state, action=None): - bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure(state, action) + bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure( + state, action + ) return np.clip(bonus, 0, self.bonus_max) def bonus_batch(self, states, actions=None): - bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure_batch(states, actions) - return np.clip(bonus, 0, self.bonus_max) if isinstance(bonus, np.ndarray) else torch.clamp(bonus, 0, - self.bonus_max) + bonus = self.bonus_scale_factor * self.uncertainty_estimator.measure_batch( + states, actions + ) + return ( + np.clip(bonus, 0, self.bonus_max) + if isinstance(bonus, np.ndarray) + else torch.clamp(bonus, 0, self.bonus_max) + ) diff --git a/rlberry/wrappers/vis2d.py b/rlberry/wrappers/vis2d.py index d410d7a4f..70991cb2e 100644 --- a/rlberry/wrappers/vis2d.py +++ b/rlberry/wrappers/vis2d.py @@ -11,7 +11,9 @@ class Transition: - def __init__(self, raw_state, state, action, reward, n_total_visits, n_episode_visits): + def __init__( + self, raw_state, state, action, reward, n_total_visits, n_episode_visits + ): self.raw_state = raw_state self.state = state self.action = action @@ -79,12 +81,14 @@ class Vis2dWrapper(Wrapper): kwargs for state_preprocess_fn """ - def __init__(self, - env, - n_bins_obs=10, - memory_size=100, - state_preprocess_fn=None, - state_preprocess_kwargs=None): + def __init__( + self, + env, + n_bins_obs=10, + memory_size=100, + state_preprocess_fn=None, + state_preprocess_kwargs=None, + ): Wrapper.__init__(self, env) if state_preprocess_fn is None: @@ -95,12 +99,12 @@ def __init__(self, self.state_preprocess_kwargs = state_preprocess_kwargs or {} self.memory = TrajectoryMemory(memory_size) - self.total_visit_counter = DiscreteCounter(self.env.observation_space, - self.env.action_space, - n_bins_obs=n_bins_obs) - self.episode_visit_counter = DiscreteCounter(self.env.observation_space, - self.env.action_space, - n_bins_obs=n_bins_obs) + self.total_visit_counter = DiscreteCounter( + self.env.observation_space, self.env.action_space, n_bins_obs=n_bins_obs + ) + self.episode_visit_counter = DiscreteCounter( + self.env.observation_space, self.env.action_space, n_bins_obs=n_bins_obs + ) self.current_state = None self.curret_step = 0 @@ -122,31 +126,35 @@ def step(self, action): self.total_visit_counter.update(ss, aa, ns, reward) self.episode_visit_counter.update(ss, aa, ns, reward) # store transition - transition = Transition(ss, - self.state_preprocess_fn(ss, self.env, **self.state_preprocess_kwargs), - aa, - reward, - self.total_visit_counter.count(ss, aa), - self.episode_visit_counter.count(ss, aa)) + transition = Transition( + ss, + self.state_preprocess_fn(ss, self.env, **self.state_preprocess_kwargs), + aa, + reward, + self.total_visit_counter.count(ss, aa), + self.episode_visit_counter.count(ss, aa), + ) self.memory.append(transition) # update current state self.current_state = observation return observation, reward, done, info - def plot_trajectories(self, - fignum=None, - figsize=(6, 6), - hide_axis=True, - show=True, - video_filename=None, - colormap_name='cool', - framerate=15, - n_skip=1, - dot_scale_factor=2.5, - alpha=0.25, - xlim=None, - ylim=None, - dot_size_means='episode_visits'): + def plot_trajectories( + self, + fignum=None, + figsize=(6, 6), + hide_axis=True, + show=True, + video_filename=None, + colormap_name="cool", + framerate=15, + n_skip=1, + dot_scale_factor=2.5, + alpha=0.25, + xlim=None, + 
ylim=None, + dot_size_means="episode_visits", + ): """ Plot history of trajectories in a scatter plot. Colors distinguish recent and old trajectories, the size of the dots represent @@ -194,8 +202,10 @@ def plot_trajectories(self, # discretizer try: discretizer = self.episode_visit_counter.state_discretizer - epsilon = min(discretizer._bins[0][1] - discretizer._bins[0][0], - discretizer._bins[1][1] - discretizer._bins[1][0]) + epsilon = min( + discretizer._bins[0][1] - discretizer._bins[0][0], + discretizer._bins[1][1] - discretizer._bins[1][0], + ) except Exception: epsilon = 0.01 @@ -225,15 +235,18 @@ def plot_trajectories(self, states = np.array([traj[ii].state for ii in range(len(traj))]) - if dot_size_means == 'episode_visits': - sizes = np.array( - [traj[ii].n_episode_visits for ii in range(len(traj))] - ) - elif dot_size_means == 'total_visits': + if dot_size_means == "episode_visits": + sizes = np.array([traj[ii].n_episode_visits for ii in range(len(traj))]) + elif dot_size_means == "total_visits": raw_states = [traj[ii].raw_state for ii in range(len(traj))] sizes = np.array( [ - np.sum([self.total_visit_counter.count(ss, aa) for aa in range(self.env.action_space.n)]) + np.sum( + [ + self.total_visit_counter.count(ss, aa) + for aa in range(self.env.action_space.n) + ] + ) for ss in raw_states ] ) @@ -243,13 +256,19 @@ def plot_trajectories(self, sizes = 1 + sizes sizes = (dot_scale_factor ** 2) * 100 * epsilon * sizes / sizes.max() - ax.scatter(x=states[:, 0], y=states[:, 1], color=color, s=sizes, alpha=alpha) + ax.scatter( + x=states[:, 0], y=states[:, 1], color=color, s=sizes, alpha=alpha + ) plt.tight_layout() if video_filename is not None: canvas.draw() - image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + image_from_plot = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8 + ) + image_from_plot = image_from_plot.reshape( + fig.canvas.get_width_height()[::-1] + (3,) + ) images.append(image_from_plot) if video_filename is not None: @@ -261,21 +280,23 @@ def plot_trajectories(self, if show: plt.show() - def plot_trajectory_actions(self, - fignum=None, - figsize=(8, 6), - n_traj_to_show=10, - hide_axis=True, - show=True, - video_filename=None, - colormap_name='Paired', - framerate=15, - n_skip=1, - dot_scale_factor=2.5, - alpha=1.0, - action_description=None, - xlim=None, - ylim=None): + def plot_trajectory_actions( + self, + fignum=None, + figsize=(8, 6), + n_traj_to_show=10, + hide_axis=True, + show=True, + video_filename=None, + colormap_name="Paired", + framerate=15, + n_skip=1, + dot_scale_factor=2.5, + alpha=1.0, + action_description=None, + xlim=None, + ylim=None, + ): """ Plot actions (one action = one color) chosen in recent trajectories. 
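For context, the plotting helpers reformatted in this diff can be exercised as in the wrapper test earlier in this patch. A minimal sketch (the MountainCar import path and the small fitting budget are assumptions for illustration, not part of the patch)::

    from rlberry.agents import RSUCBVIAgent
    from rlberry.envs.classic_control import MountainCar
    from rlberry.wrappers.vis2d import Vis2dWrapper

    # Wrap the environment so that discretized visits and transitions are recorded.
    env = Vis2dWrapper(MountainCar(), n_bins_obs=20, memory_size=200)
    agent = RSUCBVIAgent(
        env,
        gamma=0.99,
        horizon=200,
        bonus_scale_factor=0.1,
        copy_env=False,
        min_dist=0.1,
    )
    agent.fit(budget=15)

    # Dot sizes can reflect per-episode or total visit counts.
    env.plot_trajectories(show=False, dot_size_means="episode_visits")
    env.plot_trajectory_actions(show=False)
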
@@ -317,15 +338,17 @@ def plot_trajectory_actions(self, """ logger.info("Plotting...") - fignum = fignum or (str(self) + '-actions') + fignum = fignum or (str(self) + "-actions") colormap_fn = plt.get_cmap(colormap_name) action_description = action_description or list(range(self.env.action_space.n)) # discretizer try: discretizer = self.episode_visit_counter.state_discretizer - epsilon = min(discretizer._bins[0][1] - discretizer._bins[0][0], - discretizer._bins[1][1] - discretizer._bins[1][0]) + epsilon = min( + discretizer._bins[0][1] - discretizer._bins[0][0], + discretizer._bins[1][1] - discretizer._bins[1][0], + ) except Exception: epsilon = 0.01 @@ -368,19 +391,35 @@ def plot_trajectory_actions(self, for aa in range(self.env.action_space.n): states_aa = states[actions == aa] color = colormap_fn(aa / self.env.action_space.n) - ax.scatter(x=states_aa[:, 0], y=states_aa[:, 1], color=color, - s=sizes, alpha=alpha, - label=f'action = {action_description[aa]}') + ax.scatter( + x=states_aa[:, 0], + y=states_aa[:, 1], + color=color, + s=sizes, + alpha=alpha, + label=f"action = {action_description[aa]}", + ) # for unique legend entries, source: https://stackoverflow.com/a/57600060 - plt.legend(*[*zip(*{l: h for h, l in zip(*ax.get_legend_handles_labels())}.items())][::-1], - loc='upper left', bbox_to_anchor=(1.00, 1.00)) + plt.legend( + *[ + *zip( + *{l: h for h, l in zip(*ax.get_legend_handles_labels())}.items() + ) + ][::-1], + loc="upper left", + bbox_to_anchor=(1.00, 1.00), + ) plt.tight_layout() if video_filename is not None: canvas.draw() - image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + image_from_plot = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8 + ) + image_from_plot = image_from_plot.reshape( + fig.canvas.get_width_height()[::-1] + (3,) + ) images.append(image_from_plot) if video_filename is not None: diff --git a/setup.py b/setup.py index a1ad82a9d..0d14f7f4f 100644 --- a/setup.py +++ b/setup.py @@ -1,20 +1,20 @@ from setuptools import setup, find_packages -packages = find_packages(exclude=['docs', 'notebooks', 'assets']) +packages = find_packages(exclude=["docs", "notebooks", "assets"]) # # Base installation (interface only) # install_requires = [ - 'numpy>=1.17', - 'pygame', - 'matplotlib', - 'seaborn', - 'pandas', - 'gym', - 'dill', - 'docopt', - 'pyyaml', + "numpy>=1.17", + "pygame", + "matplotlib", + "seaborn", + "pandas", + "gym", + "dill", + "docopt", + "pyyaml", ] # @@ -23,50 +23,50 @@ # default installation default_requires = [ - 'numba', - 'optuna', - 'ffmpeg-python', - 'PyOpenGL', - 'PyOpenGL_accelerate', - 'pyvirtualdisplay', + "numba", + "optuna", + "ffmpeg-python", + "PyOpenGL", + "PyOpenGL_accelerate", + "pyvirtualdisplay", ] # tensorboard must be installed manually, due to conflicts with # dm-reverb-nightly[tensorflow] in jax_agents_requires torch_agents_requires = default_requires + [ - 'torch>=1.6.0', + "torch>=1.6.0", # 'tensorboard' ] jax_agents_requires = default_requires + [ - 'jax[cpu]', - 'chex', - 'dm-haiku', - 'optax', - 'dm-reverb[tensorflow]==0.5.0', - 'dm-tree', - 'rlax' + "jax[cpu]", + "chex", + "dm-haiku", + "optax", + "dm-reverb[tensorflow]==0.5.0", + "dm-tree", + "rlax", ] extras_require = { - 'default': default_requires, - 'jax_agents': jax_agents_requires, - 'torch_agents': torch_agents_requires, - 'deploy': ['sphinx', 'sphinx_rtd_theme'], + "default": default_requires, + "jax_agents": jax_agents_requires, + 
"torch_agents": torch_agents_requires, + "deploy": ["sphinx", "sphinx_rtd_theme"], } with open("README.md", "r") as fh: long_description = fh.read() setup( - name='rlberry', - version='0.2.1', - description='An easy-to-use reinforcement learning library for research and education', + name="rlberry", + version="0.2.1", + description="An easy-to-use reinforcement learning library for research and education", long_description=long_description, long_description_content_type="text/markdown", - author='Omar Darwiche Domingues, Yannis Flet-Berliac, Edouard Leurent, Pierre Menard, Xuedong Shang', - url='https://github.com/rlberry-py', - license='MIT', + author="Omar Darwiche Domingues, Yannis Flet-Berliac, Edouard Leurent, Pierre Menard, Xuedong Shang", + url="https://github.com/rlberry-py", + license="MIT", packages=packages, classifiers=[ "Development Status :: 4 - Beta", From 0a17c0041004047beb6e511938ecbed81b278bf2 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Wed, 4 May 2022 14:59:02 +0200 Subject: [PATCH 03/13] More details in DQN docstring about Qnet and test on changing default Qnet --- rlberry/agents/torch/dqn/dqn.py | 19 +++++++++++++++++++ rlberry/agents/torch/tests/test_dqn.py | 17 +++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 70ea82e63..b340b3eb3 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -83,6 +83,25 @@ class DQNAgent(AgentWithSimplePolicy): * Ouput shape = (batch_dim, chunk_size, number_of_actions) + Example: use `rlberry.agents.torch.utils.training.model_factory`, + `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` + parameter to modify the neural network:: + + model_configs = { + "type": "MultiLayerPerceptron", + "layer_sizes": (5, 5), + "reshape": False, + } + + def mlp(env, **kwargs): + model_config = size_model_config(env, **model_config) + return model_factory(**kwargs) + + agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) + + If not specified then it is set to MultiLayerPerceptron with 2 hidden layers + of size 64 + q_net_kwargs : optional, dict Parameters for q_net_constructor. use_double_dqn : bool, default = False diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py index 6e932b983..8dae4d51b 100644 --- a/rlberry/agents/torch/tests/test_dqn.py +++ b/rlberry/agents/torch/tests/test_dqn.py @@ -1,6 +1,7 @@ import pytest from rlberry.envs import gym_make from rlberry.agents.torch.dqn import DQNAgent +from rlberry.agents.torch.utils.training import model_factory @pytest.mark.parametrize( @@ -18,3 +19,19 @@ def test_dqn_agent(use_double_dqn, use_prioritized_replay): use_prioritized_replay=use_prioritized_replay, ) agent.fit(budget=500) + + model_configs = { + "type": "MultiLayerPerceptron", + "layer_sizes": (5, 5), + "reshape": False, + } + + def mlp(env, **kwargs): + """ + Returns a default Q value network. 
+ """ + kwargs["in_size"] = env.observation_space.shape[0] + return model_factory(**kwargs) + + new_agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) + new_agent.fit(budget=2000) \ No newline at end of file From cdfe2c07745df5808670e33cf558ef950a3f9b00 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Wed, 4 May 2022 15:02:28 +0200 Subject: [PATCH 04/13] blacked last commit --- rlberry/agents/torch/dqn/dqn.py | 4 ++-- rlberry/agents/torch/tests/test_dqn.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index b340b3eb3..2508ccb4a 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -84,7 +84,7 @@ class DQNAgent(AgentWithSimplePolicy): * Ouput shape = (batch_dim, chunk_size, number_of_actions) Example: use `rlberry.agents.torch.utils.training.model_factory`, - `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` + `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` parameter to modify the neural network:: model_configs = { @@ -98,7 +98,7 @@ def mlp(env, **kwargs): return model_factory(**kwargs) agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) - + If not specified then it is set to MultiLayerPerceptron with 2 hidden layers of size 64 diff --git a/rlberry/agents/torch/tests/test_dqn.py b/rlberry/agents/torch/tests/test_dqn.py index 8dae4d51b..9bfbb1304 100644 --- a/rlberry/agents/torch/tests/test_dqn.py +++ b/rlberry/agents/torch/tests/test_dqn.py @@ -34,4 +34,4 @@ def mlp(env, **kwargs): return model_factory(**kwargs) new_agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) - new_agent.fit(budget=2000) \ No newline at end of file + new_agent.fit(budget=2000) From 4a25e6fc39e1ec09e3340f428db848eb65fac806 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Wed, 4 May 2022 15:08:40 +0200 Subject: [PATCH 05/13] None is more clearly stated in DQN docstring in q_net_constructor --- rlberry/agents/torch/dqn/dqn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 2508ccb4a..7c122a15a 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -73,7 +73,7 @@ class DQNAgent(AgentWithSimplePolicy): After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. optimizer_type : {"ADAM", "RMS_PROP"} Optimization algorithm. - q_net_constructor : Callable + q_net_constructor : Callable or None Function/constructor that returns a torch module for the Q-network: :code:`qnet = q_net_constructor(env, **kwargs)`. 
@@ -99,7 +99,7 @@ def mlp(env, **kwargs): agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) - If not specified then it is set to MultiLayerPerceptron with 2 hidden layers + If None then it is set to MultiLayerPerceptron with 2 hidden layers of size 64 q_net_kwargs : optional, dict From 1984059e9a58adb5cc7275098d4287c5f9a5e0d4 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Thu, 5 May 2022 18:00:14 +0200 Subject: [PATCH 06/13] Better docstring for DQN: explains what is str for q_net_constructor and gives shorter example when it is function --- rlberry/agents/torch/dqn/dqn.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 7c122a15a..92cf1a2ad 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -73,7 +73,7 @@ class DQNAgent(AgentWithSimplePolicy): After :code:`epsilon_decay` timesteps, epsilon approaches :code:`epsilon_final`. optimizer_type : {"ADAM", "RMS_PROP"} Optimization algorithm. - q_net_constructor : Callable or None + q_net_constructor : Callable, str or None Function/constructor that returns a torch module for the Q-network: :code:`qnet = q_net_constructor(env, **kwargs)`. @@ -83,8 +83,8 @@ class DQNAgent(AgentWithSimplePolicy): * Ouput shape = (batch_dim, chunk_size, number_of_actions) - Example: use `rlberry.agents.torch.utils.training.model_factory`, - `rlberry.agents.torch.utils.training.size_model_config` and `q_net_kwargs` + Example: use `rlberry.agents.torch.utils.training.model_factory_from_env`, + and `q_net_kwargs` parameter to modify the neural network:: model_configs = { @@ -93,11 +93,16 @@ class DQNAgent(AgentWithSimplePolicy): "reshape": False, } - def mlp(env, **kwargs): - model_config = size_model_config(env, **model_config) - return model_factory(**kwargs) - - agent = DQNAgent(env, q_net_constructor=mlp, q_net_kwargs=model_configs) + agent = DQNAgent(env, + q_net_constructor=model_factory_from_env, + q_net_kwargs=model_configs + ) + If str then it should correspond to the full path to the constructor function, + e.g.:: + agent = DQNAgent(env, + q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', + q_net_kwargs=model_configs + ) If None then it is set to MultiLayerPerceptron with 2 hidden layers of size 64 From 57ce593f5c353e6b425903f4d46224e4cbdecc5c Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Thu, 5 May 2022 18:04:08 +0200 Subject: [PATCH 07/13] blacked last commit --- rlberry/agents/torch/dqn/dqn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rlberry/agents/torch/dqn/dqn.py b/rlberry/agents/torch/dqn/dqn.py index 92cf1a2ad..5462ead77 100644 --- a/rlberry/agents/torch/dqn/dqn.py +++ b/rlberry/agents/torch/dqn/dqn.py @@ -93,14 +93,14 @@ class DQNAgent(AgentWithSimplePolicy): "reshape": False, } - agent = DQNAgent(env, - q_net_constructor=model_factory_from_env, + agent = DQNAgent(env, + q_net_constructor=model_factory_from_env, q_net_kwargs=model_configs ) If str then it should correspond to the full path to the constructor function, e.g.:: - agent = DQNAgent(env, - q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', + agent = DQNAgent(env, + q_net_constructor='rlberry.agents.torch.utils.training.model_factory_from_env', q_net_kwargs=model_configs ) From 4d442c25e6253ca22be44709071ffb5d1f273fc1 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 15:47:48 +0200 Subject: [PATCH 08/13] 
model factory can take externally defined nn and load it from file + checks that externally defined nn is suitable for environment --- .../agents/torch/tests/test_torch_training.py | 61 ++++++- rlberry/agents/torch/utils/training.py | 157 ++++++++++++++++-- 2 files changed, 200 insertions(+), 18 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index fe5fb722c..663cd5ed8 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -1,7 +1,11 @@ import torch -from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory + +import os +from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory, model_factory, model_factory_from_env from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.agents.torch.utils.models import default_policy_net_fn +from rlberry.agents.torch.utils.models import default_policy_net_fn, Net, MultiLayerPerceptron +from rlberry.agents.torch.dqn import DQNAgent + # loss_function_factory assert isinstance(loss_function_factory("l2"), torch.nn.MSELoss) @@ -30,3 +34,56 @@ ] == 0.99 ) + + +#test model_factory + +obs_shape = env.observation_space.shape +n_act = env.action_space.n + +test_net = Net(obs_size=obs_shape[0],hidden_size=10, n_actions=n_act) + +test_net2 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=1) + + +test_net3 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=n_act, is_policy=True) + + +model_factory(net = test_net) +model_factory_from_env(env, net=test_net) +model_factory_from_env(env, net=test_net2, out_size = 1) +model_factory_from_env(env, net=test_net3, is_policy = True) + + + +# test loading pretrained nn +dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net)) + +dqn_agent.fit(50) + +torch.save(dqn_agent._qnet_online, "test_dqn.pickle") + + +parameters_to_save = dqn_agent._qnet_online.state_dict() +torch.save(parameters_to_save, "test_dqn.pt") + + + +model_factory(filename="test_dqn.pickle") +model_factory(net = test_net, filename="test_dqn.pt") + + + +dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(filename = "test_dqn.pickle")) + +dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net, filename = "test_dqn.pt")) + +assert dqn_agent._qnet_online.state_dict().keys() == parameters_to_save.keys() + +for k in parameters_to_save.keys(): + assert (dqn_agent._qnet_online.state_dict()[k] == parameters_to_save[k]).all() + +os.remove("test_dqn.pickle") +os.remove("test_dqn.pt") + +print("done") \ No newline at end of file diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index ed338b3bb..bd9883602 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -27,21 +27,67 @@ def optimizer_factory(params, optimizer_type="ADAM", **kwargs): raise ValueError("Unknown optimizer type: {}".format(optimizer_type)) -def model_factory_from_env(env, **kwargs): +def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filename = None, **net_kwargs): """Returns a torch module after setting up input/output dimensions according to an env. 
Parameters ---------- env: gym.Env Environment + type: {"MultiLayerPerceptron", + "ConvolutionalNetwork", + "DuelingNetwork", + "Table"}, default = "MultiLayerPerceptron" + Type of neural network. + net: torch.nn.Module or None + If not None, return this neural network. It can be used to pass user-defined neural network. + filename: str or None + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. **kwargs: Dict Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. """ - kwargs = size_model_config(env, **kwargs) - return model_factory(**kwargs) + + if filename is not None: + load_dict = load_from_file(filename) + if load_dict["model"] is not None: + net = load_dict["model"] + checkpoint = load_dict["checkpoint"] + else: + checkpoint = None + + + kwargs = size_model_config(env, type, **net_kwargs) + + if net is not None: + check_network(env, net, **kwargs) + + + return model_factory(type, net, checkpoint=checkpoint, **kwargs) + -def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: +def load_from_file(filename): + """Load a module or a checkpoint. + + Parameters + ---------- + filename: str + The path to a saved module or its 'state_dict'. It will load a net or a checkpoint. + """ + output_dict = dict(model = None, checkpoint = None) + + loaded = torch.load(filename) + if isinstance(loaded, torch.nn.Module): + output_dict["model"] = loaded + elif isinstance(loaded, dict): + output_dict["checkpoint"] = loaded + else: + raise ValueError("Invalid 'load_from_file'. File is expected to store either an entire model or its 'state_dict'.") + return output_dict + + + +def model_factory(type="MultiLayerPerceptron", net = None, filename = None, checkpoint = None, **net_kwargs) -> nn.Module: """Build a neural net of a given type. Parameters @@ -51,7 +97,13 @@ def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: "DuelingNetwork", "Table"}, default = "MultiLayerPerceptron" Type of neural network. - **kwargs: dict + net: torch.nn.Module or None + If not None, return this neural network. It can be used to pass user-defined neural network. + filename: str or None + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. + checkpoint: dict or None + If not None, then it is treated as a 'state_dict' that is assigned to a neural network model. 
+ **net_kwargs: dict Parameters that vary according to each neural net type, see * :class:`~rlberry.agents.torch.utils.models.MultiLayerPerceptron` @@ -69,19 +121,89 @@ def model_factory(type="MultiLayerPerceptron", **kwargs) -> nn.Module: Table, ) - if type == "MultiLayerPerceptron": - return MultiLayerPerceptron(**kwargs) - elif type == "DuelingNetwork": - return DuelingNetwork(**kwargs) - elif type == "ConvolutionalNetwork": - return ConvolutionalNetwork(**kwargs) - elif type == "Table": - return Table(**kwargs) + if filename is not None: + load_dict = load_from_file(filename) + if load_dict["model"] is not None: + return load_dict["model"] + else: + checkpoint = load_dict["checkpoint"] + + + if net is not None: + model = net else: - raise ValueError("Unknown model type") + if type == "MultiLayerPerceptron": + model = MultiLayerPerceptron(**net_kwargs) + elif type == "DuelingNetwork": + model = DuelingNetwork(**net_kwargs) + elif type == "ConvolutionalNetwork": + model = ConvolutionalNetwork(**net_kwargs) + elif type == "Table": + model = Table(**net_kwargs) + else: + raise ValueError("Unknown model type") + + if checkpoint is not None: + model.load_state_dict(checkpoint) + + return model + + +def check_network(env, net, **model_config): + """ + Check the neural network that it satisfies the environment and predefined model_config. If the network is not good, it should raise an error. + + Parameters + ---------- + env : gym.Env + An environment. + net: torch.nn.Module + A neural network. + model_config : dict + Desired parameters. + """ + + if isinstance(env.observation_space, spaces.Box): + obs_shape = env.observation_space.shape + elif isinstance(env.observation_space, spaces.Tuple): + obs_shape = env.observation_space.spaces[0].shape + elif isinstance(env.observation_space, spaces.Discrete): + return model_config + + + if net is not None: + #check that it is compliant with environment + #input check + fake_input = torch.zeros(1, *obs_shape) + try: + output = net(fake_input) + except Exception as err: + print(f"NN input is not compatible with the environment. Got an error {err=}, {type(err)=}") + raise + #output check + if "is_policy" in model_config: + is_policy = model_config["is_policy"] + if is_policy: + assert isinstance(output, torch.distributions.distribution.Distribution), "Policy should return distribution over actions" + else: + if "out_size" in model_config: + out_size = [model_config["out_size"]] + else: + if isinstance(env.action_space, spaces.Discrete): + out_size = [env.action_space.n] + elif isinstance(env.action_space, spaces.Tuple): + out_size = [env.action_space.spaces[0].n] + elif isinstance(env.action_space, spaces.Box): + out_size = env.action_space.shape + assert output.shape == (1, *out_size), f"Output should be of size {out_size}" + + + + + -def size_model_config(env, **model_config): +def size_model_config(env, type = None, **model_config): """ Setup input/output dimensions for the configuration of a model depending on the environment observation/action spaces. @@ -90,6 +212,8 @@ def size_model_config(env, **model_config): ---------- env : gym.Env An environment. + type: str or None + Make configs corresponding to the chosen type of neural network. model_config : dict Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. 
If "out_size" is not given in model_config, assumes @@ -103,9 +227,10 @@ def size_model_config(env, **model_config): obs_shape = env.observation_space.spaces[0].shape elif isinstance(env.observation_space, spaces.Discrete): return model_config + # Assume CHW observation space - if "type" in model_config and model_config["type"] == "ConvolutionalNetwork": + if type == "ConvolutionalNetwork": if "transpose_obs" in model_config and not model_config["transpose_obs"]: # Assume CHW observation space if "in_channels" not in model_config: From c278e6c5e63e92be3cf243d220a7e8b9e4892409 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 15:48:44 +0200 Subject: [PATCH 09/13] blacked --- .../agents/torch/tests/test_torch_training.py | 52 ++++++++++----- rlberry/agents/torch/utils/training.py | 63 ++++++++++--------- 2 files changed, 67 insertions(+), 48 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 663cd5ed8..3e795eb29 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -1,9 +1,18 @@ import torch import os -from rlberry.agents.torch.utils.training import loss_function_factory, optimizer_factory, model_factory, model_factory_from_env +from rlberry.agents.torch.utils.training import ( + loss_function_factory, + optimizer_factory, + model_factory, + model_factory_from_env, +) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env -from rlberry.agents.torch.utils.models import default_policy_net_fn, Net, MultiLayerPerceptron +from rlberry.agents.torch.utils.models import ( + default_policy_net_fn, + Net, + MultiLayerPerceptron, +) from rlberry.agents.torch.dqn import DQNAgent @@ -36,28 +45,31 @@ ) -#test model_factory +# test model_factory obs_shape = env.observation_space.shape n_act = env.action_space.n -test_net = Net(obs_size=obs_shape[0],hidden_size=10, n_actions=n_act) +test_net = Net(obs_size=obs_shape[0], hidden_size=10, n_actions=n_act) -test_net2 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=1) +test_net2 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10], out_size=1) -test_net3 = MultiLayerPerceptron(in_size=obs_shape[0], layer_sizes=[10],out_size=n_act, is_policy=True) +test_net3 = MultiLayerPerceptron( + in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True +) -model_factory(net = test_net) +model_factory(net=test_net) model_factory_from_env(env, net=test_net) -model_factory_from_env(env, net=test_net2, out_size = 1) -model_factory_from_env(env, net=test_net3, is_policy = True) - +model_factory_from_env(env, net=test_net2, out_size=1) +model_factory_from_env(env, net=test_net3, is_policy=True) # test loading pretrained nn -dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net)) +dqn_agent = DQNAgent( + env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net=test_net) +) dqn_agent.fit(50) @@ -68,15 +80,21 @@ torch.save(parameters_to_save, "test_dqn.pt") - model_factory(filename="test_dqn.pickle") -model_factory(net = test_net, filename="test_dqn.pt") - +model_factory(net=test_net, filename="test_dqn.pt") -dqn_agent = DQNAgent(env, q_net_constructor=model_factory_from_env, q_net_kwargs=dict(filename = "test_dqn.pickle")) +dqn_agent = DQNAgent( + env, + q_net_constructor=model_factory_from_env, + q_net_kwargs=dict(filename="test_dqn.pickle"), +) -dqn_agent = DQNAgent(env, 
q_net_constructor=model_factory_from_env, q_net_kwargs=dict(net = test_net, filename = "test_dqn.pt")) +dqn_agent = DQNAgent( + env, + q_net_constructor=model_factory_from_env, + q_net_kwargs=dict(net=test_net, filename="test_dqn.pt"), +) assert dqn_agent._qnet_online.state_dict().keys() == parameters_to_save.keys() @@ -86,4 +104,4 @@ os.remove("test_dqn.pickle") os.remove("test_dqn.pt") -print("done") \ No newline at end of file +print("done") diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index bd9883602..ed9d9cf92 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -27,7 +27,9 @@ def optimizer_factory(params, optimizer_type="ADAM", **kwargs): raise ValueError("Unknown optimizer type: {}".format(optimizer_type)) -def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filename = None, **net_kwargs): +def model_factory_from_env( + env, type="MultiLayerPerceptron", net=None, filename=None, **net_kwargs +): """Returns a torch module after setting up input/output dimensions according to an env. Parameters @@ -42,7 +44,7 @@ def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filen net: torch.nn.Module or None If not None, return this neural network. It can be used to pass user-defined neural network. filename: str or None - The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. **kwargs: Dict Parameters to be updated, used to call :func:`~rlberry.agents.torch.utils.training.model_factory`. """ @@ -55,26 +57,23 @@ def model_factory_from_env(env, type = "MultiLayerPerceptron", net = None, filen else: checkpoint = None - kwargs = size_model_config(env, type, **net_kwargs) if net is not None: check_network(env, net, **kwargs) - return model_factory(type, net, checkpoint=checkpoint, **kwargs) - def load_from_file(filename): """Load a module or a checkpoint. - + Parameters ---------- filename: str The path to a saved module or its 'state_dict'. It will load a net or a checkpoint. """ - output_dict = dict(model = None, checkpoint = None) + output_dict = dict(model=None, checkpoint=None) loaded = torch.load(filename) if isinstance(loaded, torch.nn.Module): @@ -82,12 +81,15 @@ def load_from_file(filename): elif isinstance(loaded, dict): output_dict["checkpoint"] = loaded else: - raise ValueError("Invalid 'load_from_file'. File is expected to store either an entire model or its 'state_dict'.") + raise ValueError( + "Invalid 'load_from_file'. File is expected to store either an entire model or its 'state_dict'." + ) return output_dict - -def model_factory(type="MultiLayerPerceptron", net = None, filename = None, checkpoint = None, **net_kwargs) -> nn.Module: +def model_factory( + type="MultiLayerPerceptron", net=None, filename=None, checkpoint=None, **net_kwargs +) -> nn.Module: """Build a neural net of a given type. Parameters @@ -100,9 +102,9 @@ def model_factory(type="MultiLayerPerceptron", net = None, filename = None, chec net: torch.nn.Module or None If not None, return this neural network. It can be used to pass user-defined neural network. filename: str or None - The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. + The path to a saved module or its 'state_dict'. If not None, it will load a net or a checkpoint. 
checkpoint: dict or None - If not None, then it is treated as a 'state_dict' that is assigned to a neural network model. + If not None, then it is treated as a 'state_dict' that is assigned to a neural network model. **net_kwargs: dict Parameters that vary according to each neural net type, see @@ -127,22 +129,21 @@ def model_factory(type="MultiLayerPerceptron", net = None, filename = None, chec return load_dict["model"] else: checkpoint = load_dict["checkpoint"] - if net is not None: model = net else: if type == "MultiLayerPerceptron": - model = MultiLayerPerceptron(**net_kwargs) + model = MultiLayerPerceptron(**net_kwargs) elif type == "DuelingNetwork": - model = DuelingNetwork(**net_kwargs) + model = DuelingNetwork(**net_kwargs) elif type == "ConvolutionalNetwork": model = ConvolutionalNetwork(**net_kwargs) elif type == "Table": - model = Table(**net_kwargs) + model = Table(**net_kwargs) else: raise ValueError("Unknown model type") - + if checkpoint is not None: model.load_state_dict(checkpoint) @@ -169,22 +170,25 @@ def check_network(env, net, **model_config): obs_shape = env.observation_space.spaces[0].shape elif isinstance(env.observation_space, spaces.Discrete): return model_config - if net is not None: - #check that it is compliant with environment - #input check + # check that it is compliant with environment + # input check fake_input = torch.zeros(1, *obs_shape) try: output = net(fake_input) except Exception as err: - print(f"NN input is not compatible with the environment. Got an error {err=}, {type(err)=}") + print( + f"NN input is not compatible with the environment. Got an error {err=}, {type(err)=}" + ) raise - #output check + # output check if "is_policy" in model_config: is_policy = model_config["is_policy"] if is_policy: - assert isinstance(output, torch.distributions.distribution.Distribution), "Policy should return distribution over actions" + assert isinstance( + output, torch.distributions.distribution.Distribution + ), "Policy should return distribution over actions" else: if "out_size" in model_config: out_size = [model_config["out_size"]] @@ -195,15 +199,13 @@ def check_network(env, net, **model_config): out_size = [env.action_space.spaces[0].n] elif isinstance(env.action_space, spaces.Box): out_size = env.action_space.shape - assert output.shape == (1, *out_size), f"Output should be of size {out_size}" - - - - - + assert output.shape == ( + 1, + *out_size, + ), f"Output should be of size {out_size}" -def size_model_config(env, type = None, **model_config): +def size_model_config(env, type=None, **model_config): """ Setup input/output dimensions for the configuration of a model depending on the environment observation/action spaces. 
@@ -227,7 +229,6 @@ def size_model_config(env, type = None, **model_config): obs_shape = env.observation_space.spaces[0].shape elif isinstance(env.observation_space, spaces.Discrete): return model_config - # Assume CHW observation space if type == "ConvolutionalNetwork": From 19b9bfd720f393535feae3871afba288c179f7b7 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 17:34:45 +0200 Subject: [PATCH 10/13] more coverage --- .../agents/torch/tests/test_torch_training.py | 33 +++++++++++++++++++ rlberry/agents/torch/utils/training.py | 10 +++--- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 3e795eb29..3f4d81282 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,8 +6,11 @@ optimizer_factory, model_factory, model_factory_from_env, + check_network ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env +from rlberry.envs.finite import Chain +from rlberry.envs import gym_make from rlberry.agents.torch.utils.models import ( default_policy_net_fn, Net, @@ -24,6 +27,10 @@ # optimizer_factory env = get_benchmark_env(level=1) + +finite_env = Chain() + +cont_act_env = gym_make("Pendulum-v1") assert ( optimizer_factory(default_policy_net_fn(env).parameters(), "ADAM").defaults["lr"] == 0.001 @@ -59,11 +66,18 @@ in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True ) +test_net4 = MultiLayerPerceptron( + in_size=100, layer_sizes=[10], out_size=n_act +) + +test_net5 = MultiLayerPerceptron(in_size = cont_act_env.observation_space.shape[0], layer_sizes=[10], out_size=cont_act_env.action_space.shape[0]) + model_factory(net=test_net) model_factory_from_env(env, net=test_net) model_factory_from_env(env, net=test_net2, out_size=1) model_factory_from_env(env, net=test_net3, is_policy=True) +model_factory_from_env(cont_act_env, net=test_net5) # test loading pretrained nn @@ -78,6 +92,25 @@ parameters_to_save = dqn_agent._qnet_online.state_dict() torch.save(parameters_to_save, "test_dqn.pt") +torch.save((parameters_to_save, parameters_to_save), "test_dqn2.pt") + +try: + model_factory(filename="test_dqn2.pt") +except Exception as err: + os.remove("test_dqn2.pt") + print(err, "Bad file was removed.") + +try: + model_factory(type = "dummy") +except Exception as err: + print(err) + + +# This test should fail as +# try: +# check_network(cont_act_env, test_net) +# except Exception as err: +# print(err) model_factory(filename="test_dqn.pickle") diff --git a/rlberry/agents/torch/utils/training.py b/rlberry/agents/torch/utils/training.py index ed9d9cf92..41475e1ba 100644 --- a/rlberry/agents/torch/utils/training.py +++ b/rlberry/agents/torch/utils/training.py @@ -166,10 +166,12 @@ def check_network(env, net, **model_config): if isinstance(env.observation_space, spaces.Box): obs_shape = env.observation_space.shape - elif isinstance(env.observation_space, spaces.Tuple): - obs_shape = env.observation_space.spaces[0].shape - elif isinstance(env.observation_space, spaces.Discrete): - return model_config + else: + raise NotImplementedError + # elif isinstance(env.observation_space, spaces.Tuple): + # obs_shape = env.observation_space.spaces[0].shape + # elif isinstance(env.observation_space, spaces.Discrete): + # return model_config if net is not None: # check that it is compliant with environment From b99188d32685f5efcf99c35cbc384e9b628998c2 Mon Sep 17 00:00:00 2001 From: 
"pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:37:03 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../agents/torch/tests/test_torch_training.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 3f4d81282..c5a91c325 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,7 +6,7 @@ optimizer_factory, model_factory, model_factory_from_env, - check_network + check_network, ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.envs.finite import Chain @@ -66,11 +66,13 @@ in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True ) -test_net4 = MultiLayerPerceptron( - in_size=100, layer_sizes=[10], out_size=n_act -) +test_net4 = MultiLayerPerceptron(in_size=100, layer_sizes=[10], out_size=n_act) -test_net5 = MultiLayerPerceptron(in_size = cont_act_env.observation_space.shape[0], layer_sizes=[10], out_size=cont_act_env.action_space.shape[0]) +test_net5 = MultiLayerPerceptron( + in_size=cont_act_env.observation_space.shape[0], + layer_sizes=[10], + out_size=cont_act_env.action_space.shape[0], +) model_factory(net=test_net) @@ -101,12 +103,12 @@ print(err, "Bad file was removed.") try: - model_factory(type = "dummy") + model_factory(type="dummy") except Exception as err: print(err) -# This test should fail as +# This test should fail as # try: # check_network(cont_act_env, test_net) # except Exception as err: From 215bec51cc0355f8750e43ee8984a2fd20dc1745 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 18:40:24 +0200 Subject: [PATCH 12/13] blacked --- .../agents/torch/tests/test_torch_training.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index 3f4d81282..c5a91c325 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,7 +6,7 @@ optimizer_factory, model_factory, model_factory_from_env, - check_network + check_network, ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.envs.finite import Chain @@ -66,11 +66,13 @@ in_size=obs_shape[0], layer_sizes=[10], out_size=n_act, is_policy=True ) -test_net4 = MultiLayerPerceptron( - in_size=100, layer_sizes=[10], out_size=n_act -) +test_net4 = MultiLayerPerceptron(in_size=100, layer_sizes=[10], out_size=n_act) -test_net5 = MultiLayerPerceptron(in_size = cont_act_env.observation_space.shape[0], layer_sizes=[10], out_size=cont_act_env.action_space.shape[0]) +test_net5 = MultiLayerPerceptron( + in_size=cont_act_env.observation_space.shape[0], + layer_sizes=[10], + out_size=cont_act_env.action_space.shape[0], +) model_factory(net=test_net) @@ -101,12 +103,12 @@ print(err, "Bad file was removed.") try: - model_factory(type = "dummy") + model_factory(type="dummy") except Exception as err: print(err) -# This test should fail as +# This test should fail as # try: # check_network(cont_act_env, test_net) # except Exception as err: From 411d3ca1caa40f8b26416e4def403eb415046a71 Mon Sep 17 00:00:00 2001 From: SHILOVA Alena Date: Mon, 24 Jul 2023 18:43:52 +0200 Subject: [PATCH 13/13] flake 8 should be fine --- 
rlberry/agents/torch/tests/test_torch_training.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rlberry/agents/torch/tests/test_torch_training.py b/rlberry/agents/torch/tests/test_torch_training.py index c5a91c325..478fa4564 100644 --- a/rlberry/agents/torch/tests/test_torch_training.py +++ b/rlberry/agents/torch/tests/test_torch_training.py @@ -6,7 +6,6 @@ optimizer_factory, model_factory, model_factory_from_env, - check_network, ) from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env from rlberry.envs.finite import Chain
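A minimal usage sketch of the `net` and `filename` arguments introduced for `model_factory` / `model_factory_from_env` in PATCH 08, distilled from the new `rlberry/agents/torch/tests/test_torch_training.py`; the environment choice and file names below are arbitrary, and the snippet assumes the patched signatures shown above::

    import torch

    from rlberry.agents.torch.dqn import DQNAgent
    from rlberry.agents.torch.utils.models import MultiLayerPerceptron
    from rlberry.agents.torch.utils.training import (
        model_factory,
        model_factory_from_env,
    )
    from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env

    env = get_benchmark_env(level=1)
    obs_shape = env.observation_space.shape
    n_act = env.action_space.n

    # Externally defined Q-network; model_factory_from_env runs check_network
    # on it, so its input/output sizes must match the environment.
    custom_qnet = MultiLayerPerceptron(
        in_size=obs_shape[0], layer_sizes=[10], out_size=n_act
    )

    # Pass the user-defined network through q_net_kwargs.
    agent = DQNAgent(
        env,
        q_net_constructor=model_factory_from_env,
        q_net_kwargs=dict(net=custom_qnet),
    )
    agent.fit(50)

    # Save either the full module or only its state_dict
    # (file names are arbitrary).
    torch.save(agent._qnet_online, "qnet_full.pickle")
    torch.save(agent._qnet_online.state_dict(), "qnet_params.pt")

    # Reload later: a saved module needs only `filename`,
    # a saved state_dict also needs the matching `net`.
    qnet_from_module = model_factory(filename="qnet_full.pickle")
    qnet_from_params = model_factory(net=custom_qnet, filename="qnet_params.pt")

    # The same keyword arguments work when building a new agent
    # from a previously saved network.
    warm_started_agent = DQNAgent(
        env,
        q_net_constructor=model_factory_from_env,
        q_net_kwargs=dict(filename="qnet_full.pickle"),
    )

If the supplied network does not match the environment (wrong input shape, wrong output size, or a non-distribution output when `is_policy=True`), `check_network` raises an error instead of silently accepting it; after PATCH 10 it also raises `NotImplementedError` for non-Box observation spaces.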