Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Dreamer-V3 algo #71

Merged
merged 2 commits into from
Aug 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sheeprl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from sheeprl.algos.dreamer_v1 import dreamer_v1
from sheeprl.algos.dreamer_v2 import dreamer_v2
from sheeprl.algos.dreamer_v3 import dreamer_v3
from sheeprl.algos.droq import droq
from sheeprl.algos.p2e_dv1 import p2e_dv1
from sheeprl.algos.p2e_dv2 import p2e_dv2
Expand Down
6 changes: 5 additions & 1 deletion sheeprl/algos/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@ class StandardArgs:
screen_size: int = Arg(default=64, help="the size of the pixel-from observations (if any)")
frame_stack: int = Arg(default=-1, help="how many frame to stack (only for pixel-like observations)")
frame_stack_dilation: int = Arg(default=1, help="the dilation between the stacked frames, 1 no dilation")
max_episode_steps: int = Arg(default=-1)
max_episode_steps: int = Arg(
default=-1,
help="the maximum duration in terms of number of steps of an episode, -1 to disable. "
"This value will be divided by the `action_repeat` value during the environment creation.",
)

def __setattr__(self, __name: str, __value: Any) -> None:
super().__setattr__(__name, __value)
Expand Down
2 changes: 1 addition & 1 deletion sheeprl/algos/dreamer_v1/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def __init__(
self.continue_model = continue_model


class Player(nn.Module):
class PlayerDV1(nn.Module):
"""The model of the DreamerV1 player.

Args:
Expand Down
4 changes: 3 additions & 1 deletion sheeprl/algos/dreamer_v1/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ class DreamerV1Args(StandardArgs):
)
action_repeat: int = Arg(default=2, help="the number of times an action is repeated")
max_episode_steps: int = Arg(
default=1000, help="the maximum duration in terms of number of steps of an episode, -1 to disable"
default=1000,
help="the maximum duration in terms of number of steps of an episode, -1 to disable. "
"This value will be divided by the `action_repeat` value during the environment creation.",
)
atari_noop_max: int = Arg(
default=30,
Expand Down
12 changes: 6 additions & 6 deletions sheeprl/algos/dreamer_v1/dreamer_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from torch.utils.data import BatchSampler
from torchmetrics import MeanMetric

from sheeprl.algos.dreamer_v1.agent import Player, WorldModel, build_models
from sheeprl.algos.dreamer_v1.agent import PlayerDV1, WorldModel, build_models
from sheeprl.algos.dreamer_v1.args import DreamerV1Args
from sheeprl.algos.dreamer_v1.loss import actor_loss, critic_loss, reconstruction_loss
from sheeprl.algos.dreamer_v2.utils import test
Expand Down Expand Up @@ -209,8 +209,8 @@ def train(
aggregator.update("Loss/state_loss", state_loss.detach())
aggregator.update("Loss/continue_loss", continue_loss.detach())
aggregator.update("State/kl", kl.detach())
aggregator.update("State/p_entropy", p.entropy().mean().detach())
aggregator.update("State/q_entropy", q.entropy().mean().detach())
aggregator.update("State/post_entropy", p.entropy().mean().detach())
aggregator.update("State/prior_entropy", q.entropy().mean().detach())

# Behaviour Learning
# unflatten first 2 dimensions of recurrent and posterior states in order to have all the states on the first dimension.
Expand Down Expand Up @@ -443,7 +443,7 @@ def main():
state["actor"] if args.checkpoint_path else None,
state["critic"] if args.checkpoint_path else None,
)
player = Player(
player = PlayerDV1(
world_model.encoder.module,
world_model.rssm.recurrent_model.module,
world_model.rssm.representation_model.module,
Expand Down Expand Up @@ -482,8 +482,8 @@ def main():
"Loss/reward_loss": MeanMetric(sync_on_compute=False),
"Loss/state_loss": MeanMetric(sync_on_compute=False),
"Loss/continue_loss": MeanMetric(sync_on_compute=False),
"State/p_entropy": MeanMetric(sync_on_compute=False),
"State/q_entropy": MeanMetric(sync_on_compute=False),
"State/post_entropy": MeanMetric(sync_on_compute=False),
"State/prior_entropy": MeanMetric(sync_on_compute=False),
"State/kl": MeanMetric(sync_on_compute=False),
"Params/exploration_amout": MeanMetric(sync_on_compute=False),
"Grads/world_model": MeanMetric(sync_on_compute=False),
Expand Down
88 changes: 81 additions & 7 deletions sheeprl/algos/dreamer_v2/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,24 @@


class CNNEncoder(nn.Module):
"""The Dreamer-V2 image encoder. This is composed of 4 `nn.Conv2d` with
kernel_size=3, stride=2 and padding=1. No bias is used if a `nn.LayerNorm`
is used after the convolution. This 4-stage model assumes that the image
is 64x64. If more than one image is to be encoded, then those will
be concatenated on the channel dimension and fed to the encoder.

Args:
keys (Sequence[str]): the keys representing the image observations to encode.
input_channels (Sequence[int]): the input channels, one for each image observation to encode.
        image_size (Tuple[int, int]): the image size as (Height, Width).
channels_multiplier (int): the multiplier for the output channels. Given the 4 stages, the 4 output channels
will be [1, 2, 4, 8] * `channels_multiplier`.
layer_norm (bool, optional): whether to apply the layer normalization.
Defaults to True.
activation (ModuleType, optional): the activation function.
Defaults to nn.ELU.
"""

def __init__(
self,
keys: Sequence[str],
Expand Down Expand Up @@ -59,6 +77,24 @@ def forward(self, obs: Dict[str, Tensor]) -> Tensor:


class MLPEncoder(nn.Module):
"""The Dreamer-V3 vector encoder. This is composed of N `nn.Linear` layers, where
N is specified by `mlp_layers`. No bias is used if a `nn.LayerNorm` is used after the linear layer.
    If more than one vector is to be encoded, then those will be concatenated on the last
dimension before being fed to the encoder.

Args:
keys (Sequence[str]): the keys representing the vector observations to encode.
input_dims (Sequence[int]): the dimensions of every vector to encode.
mlp_layers (int, optional): how many mlp layers.
Defaults to 4.
dense_units (int, optional): the dimension of every mlp.
Defaults to 512.
layer_norm (bool, optional): whether to apply the layer normalization.
Defaults to True.
activation (ModuleType, optional): the activation function after every layer.
Defaults to nn.ELU.
"""

def __init__(
self,
keys: Sequence[str],
Expand Down Expand Up @@ -87,6 +123,25 @@ def forward(self, obs: Dict[str, Tensor]) -> Tensor:


class CNNDecoder(nn.Module):
"""The almost-exact inverse of the `CNNEncoder` class, where in 4 stages it reconstructs
the observation image to 64x64. If multiple images are to be reconstructed,
then it will create a dictionary with an entry for every reconstructed image.
    No bias is used if a `nn.LayerNorm` is used after the `nn.ConvTranspose2d` layer.

Args:
keys (Sequence[str]): the keys of the image observation to be reconstructed.
output_channels (Sequence[int]): the output channels, one for every image observation.
channels_multiplier (int): the channels multiplier, same for the encoder network.
latent_state_size (int): the size of the latent state. Before applying the decoder,
a `nn.Linear` layer is used to project the latent state to a feature vector.
        cnn_encoder_output_dim (int): the output dimension of the image encoder.
image_size (Tuple[int, int]): the final image size.
activation (nn.Module, optional): the activation function.
Defaults to nn.ELU.
layer_norm (bool, optional): whether to apply the layer normalization.
Defaults to True.
"""

def __init__(
self,
keys: Sequence[str],
Expand Down Expand Up @@ -137,6 +192,25 @@ def forward(self, latent_states: Tensor) -> Dict[str, Tensor]:


class MLPDecoder(nn.Module):
"""The exact inverse of the MLPEncoder. This is composed of N `nn.Linear` layers, where
N is specified by `mlp_layers`. No bias is used if a `nn.LayerNorm` is used after the linear layer.
If more than one vector is to be decoded, then it will create a dictionary with an entry
for every reconstructed vector.

Args:
keys (Sequence[str]): the keys representing the vector observations to decode.
output_dims (Sequence[int]): the dimensions of every vector to decode.
latent_state_size (int): the dimension of the latent state.
mlp_layers (int, optional): how many mlp layers.
Defaults to 4.
dense_units (int, optional): the dimension of every mlp.
Defaults to 512.
layer_norm (bool, optional): whether to apply the layer normalization.
Defaults to True.
activation (ModuleType, optional): the activation function after every layer.
Defaults to nn.ELU.
"""

def __init__(
self,
keys: Sequence[str],
Expand Down Expand Up @@ -168,8 +242,10 @@ def forward(self, latent_states: Tensor) -> Dict[str, Tensor]:


class RecurrentModel(nn.Module):
"""
Recurrent model for the model-base Dreamer agent.
"""Recurrent model for the model-base Dreamer-V3 agent.
This implementation uses the `sheeprl.models.models.LayerNormGRUCell`, which combines
the standard GRUCell from PyTorch with the `nn.LayerNorm`, where the normalization is applied
right after having computed the projection from the input to the weight space.

Args:
input_size (int): the input size of the model.
Expand Down Expand Up @@ -559,7 +635,7 @@ def __init__(
self.continue_model = continue_model


class Player(nn.Module):
class PlayerDV2(nn.Module):
"""
    The model of the Dreamer-V2 player.

Expand Down Expand Up @@ -605,7 +681,6 @@ def __init__(
self.discrete_size = discrete_size
self.recurrent_state_size = recurrent_state_size
self.num_envs = num_envs
self.init_states()

def init_states(self, reset_envs: Optional[Sequence[int]] = None) -> None:
"""Initialize the states and the actions for the ended environments.
Expand Down Expand Up @@ -751,7 +826,6 @@ def build_models(
# Sizes
stochastic_size = args.stochastic_size * args.discrete_size
latent_state_size = stochastic_size + args.recurrent_state_size
mlp_dims = [obs_space[k].shape[0] for k in mlp_keys]

# Define models
cnn_encoder = (
Expand All @@ -769,7 +843,7 @@ def build_models(
mlp_encoder = (
MLPEncoder(
keys=mlp_keys,
input_dims=mlp_dims,
input_dims=[obs_space[k].shape[0] for k in mlp_keys],
mlp_layers=args.mlp_layers,
dense_units=args.dense_units,
activation=dense_act,
Expand Down Expand Up @@ -826,7 +900,7 @@ def build_models(
mlp_decoder = (
MLPDecoder(
keys=mlp_keys,
output_dims=mlp_dims,
output_dims=[obs_space[k].shape[0] for k in mlp_keys],
latent_state_size=latent_state_size,
mlp_layers=args.mlp_layers,
dense_units=args.dense_units,
Expand Down
4 changes: 3 additions & 1 deletion sheeprl/algos/dreamer_v2/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ class DreamerV2Args(StandardArgs):
max_step_expl_decay: int = Arg(default=0, help="the maximum number of decay steps")
action_repeat: int = Arg(default=2, help="the number of times an action is repeated")
max_episode_steps: int = Arg(
default=1000, help="the maximum duration in terms of number of steps of an episode, -1 to disable"
default=1000,
help="the maximum duration in terms of number of steps of an episode, -1 to disable. "
"This value will be divided by the `action_repeat` value during the environment creation.",
)
atari_noop_max: int = Arg(
default=30,
Expand Down
Loading