Merge branch 'main' into pre-commit-ci-update-config

belerico authored May 9, 2024
2 parents f94c8b3 + 75d752f commit 671b8e6
Showing 32 changed files with 300 additions and 138 deletions.
13 changes: 13 additions & 0 deletions howto/add_environment.md
@@ -14,6 +14,19 @@ The main properties/methods that the environment has to provide are the following
>
> All the observations returned by the `step` and `reset` functions must be Python dictionaries of NumPy arrays.
## About observation and action spaces

> [!NOTE]
>
> Please remember that every environment is considered independent of the others and is expected to interact with a single agent, since Multi-Agent Reinforcement Learning (MARL) is not currently supported.

The observation shapes currently supported are:

* 1D vectors: any observation that is a 1D vector is processed by the agent with an MLP.
* 2D/3D images: any observation that is not a 1D vector is processed by the agent with a CNN. A 2D image, or a 3D image of shape `[H,W,1]` or `[1,H,W]`, is treated as a grayscale image; any other image is treated as multi-channel.

An action of type `gymnasium.spaces.Box` must be of shape `(n,)`, where `n` is the number of (possibly continuous) actions the environment supports.
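
As a minimal sketch of a compatible environment (the class name and observation keys below are purely illustrative, not taken from the repository), the spaces and the dictionary observations could look like this:

```python
import gymnasium as gym
import numpy as np


class MyCustomEnv(gym.Env):
    """Illustrative sketch: a single-agent environment returning dictionary
    observations of NumPy arrays, with a 1D vector and an image observation."""

    def __init__(self):
        self.observation_space = gym.spaces.Dict(
            {
                # 1D vector: will be processed by an MLP
                "state": gym.spaces.Box(low=-np.inf, high=np.inf, shape=(8,), dtype=np.float32),
                # 3D image of shape [C,H,W]: will be processed by a CNN
                "rgb": gym.spaces.Box(low=0, high=255, shape=(3, 64, 64), dtype=np.uint8),
            }
        )
        # Box action of shape (n,): here n = 2 continuous actions
        self.action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)

    def _get_obs(self):
        # Every observation is a dictionary of NumPy arrays
        return {k: np.zeros(space.shape, dtype=space.dtype) for k, space in self.observation_space.spaces.items()}

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        return self._get_obs(), {}

    def step(self, action):
        # Dummy dynamics: zero observation, zero reward, never terminates
        return self._get_obs(), 0.0, False, False, {}
```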

# Add a new Environment
There are two ways to add a new environment:
1. Create a custom environment from scratch by inheriting from the [`gymnasium.Env`](https://gymnasium.farama.org/api/env/#gymnasium-env) class.
121 changes: 105 additions & 16 deletions howto/configs.md
@@ -14,19 +14,26 @@ This document explains how the configuration files and folders are structured. I
```tree
sheeprl/configs
├── algo
│ ├── a2c.yaml
│ ├── default.yaml
│ ├── dreamer_v1.yaml
│ ├── dreamer_v2.yaml
│ ├── dreamer_v3_L.yaml
│ ├── dreamer_v3_M.yaml
│ ├── dreamer_v3_S.yaml
│ ├── dreamer_v3_XL.yaml
│ ├── dreamer_v3_XS.yaml
│ ├── dreamer_v3.yaml
│ ├── droq.yaml
│ ├── p2e_dv1.yaml
│ ├── p2e_dv2.yaml
│ ├── ppo.yaml
│ ├── p2e_dv3.yaml
│ ├── ppo_decoupled.yaml
│ ├── ppo_recurrent.yaml
│ ├── sac.yaml
│ ├── ppo.yaml
│ ├── sac_ae.yaml
│ └── sac_decoupled.yaml
│ ├── sac_decoupled.yaml
│ └── sac.yaml
├── buffer
│ └── default.yaml
├── checkpoint
@@ -44,40 +44,86 @@ sheeprl/configs
│ ├── gym.yaml
│ ├── minecraft.yaml
│ ├── minedojo.yaml
│ └── minerl.yaml
│ ├── minerl_obtain_diamond.yaml
│ ├── minerl_obtain_iron_pickaxe.yaml
│ ├── minerl.yaml
│ ├── mujoco.yaml
│ └── super_mario_bros.yaml
├── env_config.yaml
├── eval_config.yaml
├── exp
│ ├── a2c_benchmarks.yaml
│ ├── a2c.yaml
│ ├── default.yaml
│ ├── dreamer_v1_benchmarks.yaml
│ ├── dreamer_v1.yaml
│ ├── dreamer_v2.yaml
│ ├── dreamer_v2_benchmarks.yaml
│ ├── dreamer_v2_crafter.yaml
│ ├── dreamer_v2_ms_pacman.yaml
│ ├── dreamer_v3.yaml
│ ├── dreamer_v2.yaml
│ ├── dreamer_v3_100k_boxing.yaml
│ ├── dreamer_v3_100k_ms_pacman.yaml
│ ├── dreamer_v3_L_doapp.yaml
│ ├── dreamer_v3_benchmarks.yaml
│ ├── dreamer_v3_dmc_cartpole_swingup_sparse.yaml
│ ├── dreamer_v3_dmc_walker_walk.yaml
│ ├── dreamer_v3_L_doapp_128px_gray_combo_discrete.yaml
│ ├── dreamer_v3_L_doapp.yaml
│ ├── dreamer_v3_L_navigate.yaml
│ ├── dreamer_v3_super_mario_bros.yaml
│ ├── dreamer_v3_XL_crafter.yaml
│ ├── dreamer_v3_dmc_walker_walk.yaml
│ ├── dreamer_v3.yaml
│ ├── droq.yaml
│ ├── p2e_dv1.yaml
│ ├── p2e_dv2.yaml
│ ├── ppo.yaml
│ ├── p2e_dv1_exploration.yaml
│ ├── p2e_dv1_finetuning.yaml
│ ├── p2e_dv2_exploration.yaml
│ ├── p2e_dv2_finetuning.yaml
│ ├── p2e_dv3_expl_L_doapp_128px_gray_combo_discrete_15Mexpl_20Mstps.yaml
│ ├── p2e_dv3_exploration.yaml
│ ├── p2e_dv3_finetuning.yaml
│ ├── p2e_dv3_fntn_L_doapp_64px_gray_combo_discrete_5Mstps.yaml
│ ├── ppo_benchmarks.yaml
│ ├── ppo_decoupled.yaml
│ ├── ppo_recurrent.yaml
│ ├── sac.yaml
│ ├── ppo_super_mario_bros.yaml
│ ├── ppo.yaml
│ ├── sac_ae.yaml
│ └── sac_decoupled.yaml
│ ├── sac_benchmarks.yaml
│ ├── sac_decoupled.yaml
│ └── sac.yaml
├── fabric
│ ├── ddp-cpu.yaml
│ ├── ddp-cuda.yaml
│ └── default.yaml
├── hydra
│ └── default.yaml
├── __init__.py
├── logger
│ ├── mlflow.yaml
│ └── tensorboard.yaml
├── metric
│ └── default.yaml
├── model_manager
│ ├── a2c.yaml
│ ├── default.yaml
│ ├── dreamer_v1.yaml
│ ├── dreamer_v2.yaml
│ ├── dreamer_v3.yaml
│ ├── droq.yaml
│ ├── p2e_dv1_exploration.yaml
│ ├── p2e_dv1_finetuning.yaml
│ ├── p2e_dv2_exploration.yaml
│ ├── p2e_dv2_finetuning.yaml
│ ├── p2e_dv3_exploration.yaml
│ ├── p2e_dv3_finetuning.yaml
│ ├── ppo_recurrent.yaml
│ ├── ppo.yaml
│ ├── sac_ae.yaml
│ └── sac.yaml
├── model_manager_config.yaml
└── optim
├── adam.yaml
├── rmsprop_tf.yaml
├── rmsprop.yaml
└── sgd.yaml
```

@@ -102,24 +102,56 @@ defaults:
- env: default.yaml
- fabric: default.yaml
- metric: default.yaml
- model_manager: default.yaml
- hydra: default.yaml
- exp: ???

num_threads: 1
float32_matmul_precision: "high"

# Set it to True to run a single optimization step
dry_run: False

# Reproducibility
seed: 42
torch_deterministic: False

# For more information about reproducibility in PyTorch, see https://pytorch.org/docs/stable/notes/randomness.html

# torch.use_deterministic_algorithms() lets you configure PyTorch to use deterministic algorithms
# instead of nondeterministic ones where available,
# and to throw an error if an operation is known to be nondeterministic (and without a deterministic alternative).
torch_use_deterministic_algorithms: False

# Disabling the benchmarking feature with torch.backends.cudnn.benchmark = False
# causes cuDNN to deterministically select an algorithm, possibly at the cost of reduced performance.
# However, if you do not need reproducibility across multiple executions of your application,
# then performance might improve if the benchmarking feature is enabled with torch.backends.cudnn.benchmark = True.
torch_backends_cudnn_benchmark: True

# While disabling CUDA convolution benchmarking (discussed above) ensures that CUDA selects the same algorithm each time an application is run,
# that algorithm itself may be nondeterministic, unless either torch.use_deterministic_algorithms(True)
# or torch.backends.cudnn.deterministic = True is set.
# The latter setting controls only this behavior,
# unlike torch.use_deterministic_algorithms() which will make other PyTorch operations behave deterministically, too.
torch_backends_cudnn_deterministic: False

# From: https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
# By design, all cuBLAS API routines from a given toolkit version, generate the same bit-wise results at every run
# when executed on GPUs with the same architecture and the same number of SMs.
# However, bit-wise reproducibility is not guaranteed across toolkit versions
# because the implementation might differ due to some implementation changes.
# This guarantee holds when a single CUDA stream is active only.
# If multiple concurrent streams are active, the library may optimize total performance by picking different internal implementations.
cublas_workspace_config: null # Possible values are: ":4096:8" or ":16:8"

# Output folders
exp_name: "default"
exp_name: ${algo.name}_${env.id}
run_name: ${now:%Y-%m-%d_%H-%M-%S}_${exp_name}_${seed}
root_dir: ${algo.name}/${env.id}
```
By default we want the user to specify the experiment config, represented by `- exp: ???` in the above example. The three question marks tell Hydra to expect an `exp` config to be specified at runtime by the user (e.g. `sheeprl.py exp=dreamer_v3`); one can look at all the available experiment configs in the `sheeprl/configs/exp/` folder.
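
For reference, here is a rough sketch of how the reproducibility options above could be applied with plain PyTorch calls; the values mirror the defaults shown in the config, and the exact wiring inside sheeprl may differ:

```python
import os

import torch

# Illustrative mapping only: each assignment mirrors one of the config fields above.
cublas_workspace_config = None  # cublas_workspace_config, e.g. ":4096:8" or ":16:8"
if cublas_workspace_config is not None:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = cublas_workspace_config

torch.set_num_threads(1)                     # num_threads
torch.set_float32_matmul_precision("high")   # float32_matmul_precision
torch.manual_seed(42)                        # seed
torch.use_deterministic_algorithms(False)    # torch_use_deterministic_algorithms
torch.backends.cudnn.benchmark = True        # torch_backends_cudnn_benchmark
torch.backends.cudnn.deterministic = False   # torch_backends_cudnn_deterministic
```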

### Algorithms

In the `algo` folder one can find all the configurations for every algorithm implemented in sheeprl. Those configs contain all the hyperparameters specific to a particular algorithm. Let us have a look at the `dreamer_v3.yaml` config for example:
@@ -427,9 +512,13 @@ Given this config, one can easily run an experiment to test the Dreamer-V3 algorithm
python sheeprl.py exp=dreamer_v3_100k_ms_pacman
```

> [!WARNING]
>
> The default hyperparameters specified in the configs gathered by the experiment config (in this example the hyperparameters specified by the `sheeprl/configs/exp/dreamer_v3.yaml`, `sheeprl/configs/env/atari.yaml` and all the configs coming with them) will be overwritten by the values in the current config whenever a naming collision happens, for example when the same field is defined in both configurations. Those naming collisions will be resolved by keeping the value defined in the current config. This behaviour is specified by letting the `_self_` keyword be the last one in the `defaults` list.

### Fabric

These configurations control the parameters to be passed to the [Fabric object](https://lightning.ai/docs/fabric/stable/api/generated/lightning.fabric.fabric.Fabric.html#lightning.fabric.fabric.Fabric). With those one can control whether to run the experiments on multiple devices, on which accelerator and with thich precision. For more information please have a look at the [Lightning documentation page](https://lightning.ai/docs/fabric/stable/api/fabric_args.html#).
These configurations control the parameters to be passed to the [Fabric object](https://lightning.ai/docs/fabric/stable/api/generated/lightning.fabric.fabric.Fabric.html#lightning.fabric.fabric.Fabric). With those one can control whether to run the experiments on multiple devices, on which accelerator and with which precision. For more information please have a look at the [Lightning documentation page](https://lightning.ai/docs/fabric/stable/api/fabric_args.html#).
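
As a rough illustration (the argument values below are assumptions for the sketch, not defaults taken from the sheeprl configs), these options are ultimately forwarded to the `Fabric` constructor along these lines:

```python
from lightning.fabric import Fabric

# Illustrative only: shows which Fabric arguments the `fabric` config group controls.
fabric = Fabric(
    accelerator="cuda",    # which accelerator to use (cpu, cuda, ...)
    devices=2,             # how many devices to run on
    strategy="ddp",        # distributed strategy, e.g. DDP
    precision="32-true",   # numerical precision
)
fabric.launch()
```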

### Hydra

4 changes: 4 additions & 0 deletions howto/learn_in_minedojo.md
@@ -25,6 +25,10 @@ Now, you can install the MineDojo environment:
pip install -e .[minedojo]
```

> [!WARNING]
>
> If you run into any problems during the installation due to missing files that fail to download, please have a look at [this issue](https://github.com/MineDojo/MineDojo/issues/113).
## MineDojo environments
> [!NOTE]
>
4 changes: 4 additions & 0 deletions howto/learn_in_minerl.md
@@ -17,6 +17,10 @@ Now, you can install the MineRL environment:
pip install -e .[minerl]
```

> [!WARNING]
>
> If you run into any problems during the installation due to missing files that fail to download, please have a look at [this issue](https://github.com/MineDojo/MineDojo/issues/113).
## MineRL environments
We have modified the MineRL environments to have a custom action and observation space. We provide three different tasks:
1. Navigate: you need to set the `env.id` argument to `custom_navigate`.
24 changes: 23 additions & 1 deletion howto/logs_and_checkpoints.md
@@ -165,6 +165,8 @@ Then, the metrics that will be logged are the `key0` and the `key2`. The `key5`
By default the checkpointing is enabled with the following settings:

```yaml
# sheeprl/configs/checkpoint/default.yaml
every: 100
resume_from: null
save_last: True
@@ -180,4 +182,24 @@ meaning that:

> [!NOTE]
>
> When restarting an experiment from a specific checkpoint (`resume_from=/path/to/checkpoint.ckpt`), it is **mandatory** to pass as arguments the same configurations of the experiment you want to restart. This is due to the way Hydra creates the folder in which it saves the configs: if you do not pass the same configurations, you may end up with an unexpected log directory (i.e., the folder is created in the wrong place).

### Buffer checkpoint

For off-policy algorithms like SAC or Dreamer, the replay buffer can be saved in the checkpoint by setting `buffer.checkpoint=True` from the CLI or by setting the corresponding parameter in the buffer YAML config:

```yaml
# sheeprl/configs/buffer/default.yaml
size: ???
memmap: True
validate_args: False
from_numpy: False
checkpoint: True # Used only for off-policy algorithms
```

There are a few scenarios to pay attention to:

* If the buffer is memory-mapped (i.e. `buffer.memmap=True`) and the buffer is saved in the checkpoint, then you **must not delete the buffer folder** of the stopped experiment: when the buffer is memory-mapped, a file is created on disk for every key stored in the replay buffer (for example `observations.memmap` and `rewards.memmap`), and when the experiment is resumed those files are read back from the exact same location.
* If the buffer is memory-mapped (i.e. `buffer.memmap=True`), the buffer is saved in the checkpoint, and the buffer was completely filled during the previous experiment (meaning that the oldest trajectories have been overwritten by newer ones), then the agent may end up being trained on "future" trajectories coming from a "future" policy. More precisely, the buffer is simply a pre-allocated NumPy array with an attribute `pos` that points to the first free slot to be written; if we are using a `sheeprl.data.buffers.SequentialReplayBuffer`, we sample sequences starting in `[0, pos - sequence_length) ∪ [pos, buffer_size)` or simply in `[0, pos - sequence_length)`, depending on whether the buffer has already been filled or not (a minimal sketch of this sampling logic is given after this list). When we save the buffer into the checkpoint we save all the relevant information about it (the `pos` attribute and the path to the memory-mapped files, which represent the buffer content to be retrieved upon resuming). Suppose that we saved a checkpoint at step `N` and the experiment ran for another `K < N` steps before stopping, with the buffer having already been filled at least once. When we resume, the buffer is loaded from the checkpoint, meaning that the `pos` attribute points to the same position it was pointing to at step `N`; because the buffer is memory-mapped, the region `[pos, pos + K]` contains trajectories that come from a "future" policy: the one we were training in the previous experiment before it stopped! Currently we do not know whether this can cause problems for the agent, nor have we found a nice solution to mitigate it. We have considered a few ways to solve this problem: one is to memory-map the buffer metadata, such as the current `pos`, so that when the buffer is loaded from the checkpoint all the unwanted trajectories in `[old_pos, current_pos]` can be removed; this could potentially erase a lot of the buffer content if, for example, one has a checkpoint at step `N` and the experiment stopped at step `2N - 1`. Another solution could be to employ an online queue in which trajectories are stored temporarily and which is flushed to the replay buffer only upon checkpointing; the problem with this solution is that a lot of information has to be kept in memory and the RAM could easily explode when working with images (this can be avoided by also memory-mapping the online queue). Practically, another possible solution is to set `algo.learning_starts=K` from the CLI or in the algorithm section of the experiment config: in this way all the "future" trajectories will be overwritten by trajectories conditioned on the resumed agent.
* In any case, when the checkpoint is resumed the buffer **could potentially be pre-filled for `algo.learning_starts` steps** with trajectories conditioned on the resumed agent. If you do not want to pre-fill the buffer, set `algo.learning_starts=0`.
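
To make the sampling ranges mentioned above concrete, here is a minimal, self-contained sketch (plain NumPy, not the actual `SequentialReplayBuffer` implementation) of which start indices are considered valid before and after the buffer has been filled:

```python
import numpy as np


def valid_sequence_starts(pos: int, buffer_size: int, sequence_length: int, full: bool) -> np.ndarray:
    """Illustrative only: start indices from which a sequence of `sequence_length`
    consecutive steps can be sampled without crossing the write position `pos`."""
    # Before the first wrap-around only [0, pos - sequence_length) is valid.
    starts = np.arange(0, max(pos - sequence_length, 0))
    if full:
        # Once the buffer has been filled, [pos, buffer_size) holds the oldest
        # data and is valid as well.
        starts = np.concatenate([starts, np.arange(pos, buffer_size)])
    return starts


# Buffer of 1000 slots, write position at 300, sequences of length 50:
print(valid_sequence_starts(pos=300, buffer_size=1000, sequence_length=50, full=True))
```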
7 changes: 4 additions & 3 deletions howto/register_external_algorithm.md
@@ -387,6 +387,7 @@ def build_agent(
for agent_p, player_p in zip(agent.critic.parameters(), player.critic.parameters()):
player_p.data = agent_p.data
return agent, player
```

## Loss functions

@@ -591,7 +592,7 @@ def ext_sota_main(fabric: Fabric, cfg: Dict[str, Any]):

for update in range(start_step, num_updates + 1):
for _ in range(0, cfg.algo.rollout_steps):
policy_step += cfg.env.num_envs * world_size
policy_step += policy_steps_per_update

# Measure environment interaction time: this considers both the model forward
# to get the action given the observation and the time taken into the environment
@@ -662,13 +663,13 @@ def ext_sota_main(fabric: Fabric, cfg: Dict[str, Any]):
# Sync distributed timers
if not timer.disabled:
timer_metrics = timer.compute()
if "Time/train_time" in timer_metrics:
if "Time/train_time" in timer_metrics and timer_metrics["Time/train_time"] > 0:
fabric.log(
"Time/sps_train",
(train_step - last_train) / timer_metrics["Time/train_time"],
policy_step,
)
if "Time/env_interaction_time" in timer_metrics:
if "Time/env_interaction_time" in timer_metrics and timer_metrics["Time/env_interaction_time"] > 0:
fabric.log(
"Time/sps_env_interaction",
((policy_step - last_log) / world_size * cfg.env.action_repeat)
6 changes: 3 additions & 3 deletions howto/register_new_algorithm.md
@@ -590,7 +590,7 @@ def sota_main(fabric: Fabric, cfg: Dict[str, Any]):

for update in range(start_step, num_updates + 1):
for _ in range(0, cfg.algo.rollout_steps):
policy_step += cfg.env.num_envs * world_size
policy_step += policy_steps_per_update

# Measure environment interaction time: this considers both the model forward
# to get the action given the observation and the time taken into the environment
@@ -661,13 +661,13 @@ def sota_main(fabric: Fabric, cfg: Dict[str, Any]):
# Sync distributed timers
if not timer.disabled:
timer_metrics = timer.compute()
if "Time/train_time" in timer_metrics:
if "Time/train_time" in timer_metrics and timer_metrics["Time/train_time"] > 0:
fabric.log(
"Time/sps_train",
(train_step - last_train) / timer_metrics["Time/train_time"],
policy_step,
)
if "Time/env_interaction_time" in timer_metrics:
if "Time/env_interaction_time" in timer_metrics and timer_metrics["Time/env_interaction_time"] > 0:
fabric.log(
"Time/sps_env_interaction",
((policy_step - last_log) / world_size * cfg.env.action_repeat)