From 232b8590e4e853c907b488a4d642770581bd68a6 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Sat, 23 Mar 2024 14:02:34 +0100 Subject: [PATCH 01/11] feat: added actions as obs wrapper --- sheeprl/configs/env/default.yaml | 2 ++ sheeprl/envs/wrappers.py | 57 ++++++++++++++++++++++++++++++++ sheeprl/utils/env.py | 9 +++++ 3 files changed, 68 insertions(+) diff --git a/sheeprl/configs/env/default.yaml b/sheeprl/configs/env/default.yaml index d80a6333..e616d303 100644 --- a/sheeprl/configs/env/default.yaml +++ b/sheeprl/configs/env/default.yaml @@ -8,6 +8,8 @@ grayscale: False clip_rewards: False capture_video: True frame_stack_dilation: 1 +action_stack: -1 +action_stack_dilation: 1 max_episode_steps: null reward_as_observation: False wrapper: ??? diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index a5fa5904..bf53a57e 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -251,3 +251,60 @@ def render(self) -> Optional[Union[RenderFrame, List[RenderFrame]]]: if len(frame.shape) == 3 and frame.shape[-1] == 1: frame = frame.repeat(3, axis=-1) return frame + + +class ActionsAsObservationWrapper(gym.Wrapper): + def __init__(self, env: Env, num_stack: int, dilation: int = 1): + super().__init__(env) + self._num_stack = num_stack + self._dilation = dilation + self._actions = deque(maxlen=num_stack * dilation) + self._is_continuous = isinstance(self.env.action_space, gym.spaces.Box) + self._is_multidiscrete = isinstance(self.env.action_space, gym.spaces.MultiDiscrete) + self.observation_space = copy.deepcopy(self.env.observation_space) + if self._is_continuous: + self._action_shape = self.action_space.shape[0] + low = np.resize(self.action_space.low, self._action_shape * (num_stack // dilation)) + high = np.resize(self.action_space.high, self._action_shape * (num_stack // dilation)) + elif self._is_multidiscrete: + low = 0 + high = max(self.action_space.nvec) + self._action_shape = len(self.env.nvec.shape) + else: + low = 0 + high = 1 + self._action_shape = self.action_space.n + self.observation_space["actions"] = gym.spaces.Box( + low=low, high=high, shape=(self._action_shape * (num_stack // dilation),), dtype=np.float32 + ) + + def step(self, action: Any) -> Tuple[Any | SupportsFloat | bool | Dict[str, Any]]: + self._actions.append(action) + obs, reward, done, truncated, info = super().step(action) + obs["actions"] = self._get_actions_stack() + return obs, reward, done, truncated, info + + def reset(self, *, seed: int | None = None, options: Dict[str, Any] | None = None) -> Tuple[Any | Dict[str, Any]]: + obs, info = super().reset(seed=seed, options=options) + self._actions.clear() + if self._is_multidiscrete or self._is_continuous: + [self._actions.append(np.zeros((self._action_shape,))) for _ in range(self._num_stack * self._dilation)] + else: + [self._actions.append(0) for _ in range(self._num_stack * self._dilation)] + obs["actions"] = self._get_actions_stack() + return obs, info + + def _get_actions_stack(self) -> np.ndarray: + actions_stack = list(self._actions)[self._dilation - 1 :: self._dilation] + if self._is_continuous: + actions = np.concatenate(actions_stack, axis=0) + elif self._is_multidiscrete: + actions = np.concatenate(actions_stack, axis=0) + else: + action_list = [] + for action in actions_stack: + one_hot_action = np.zeros(self.action_space.n) + one_hot_action[action] = 1 + action_list.append(one_hot_action) + actions = np.concatenate(action_list, axis=0) + return actions.astype(np.float32) diff --git a/sheeprl/utils/env.py 
b/sheeprl/utils/env.py index 14d57103..45143229 100644 --- a/sheeprl/utils/env.py +++ b/sheeprl/utils/env.py @@ -9,6 +9,7 @@ from sheeprl.envs.wrappers import ( ActionRepeat, + ActionsAsObservationWrapper, FrameStack, GrayscaleRenderWrapper, MaskVelocityWrapper, @@ -207,6 +208,14 @@ def transform_obs(obs: Dict[str, Any]): ) env = FrameStack(env, cfg.env.frame_stack, cnn_keys, cfg.env.frame_stack_dilation) + if cfg.env.action_stack > 0: + if cfg.env.action_stack_dilation <= 0: + raise ValueError( + "The actions stack dilation argument must be greater than zero, " + f"got: {cfg.env.action_stack_dilation}" + ) + env = ActionsAsObservationWrapper(env, cfg.env.action_stack, cfg.env.action_stack_dilation) + if cfg.env.reward_as_observation: env = RewardAsObservationWrapper(env) From 3871fd73d694701c88f35bac1d1c542466a4d82b Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Sat, 23 Mar 2024 14:08:30 +0100 Subject: [PATCH 02/11] fix: actions shape --- sheeprl/envs/wrappers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index bf53a57e..7d6f7cd6 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -264,8 +264,8 @@ def __init__(self, env: Env, num_stack: int, dilation: int = 1): self.observation_space = copy.deepcopy(self.env.observation_space) if self._is_continuous: self._action_shape = self.action_space.shape[0] - low = np.resize(self.action_space.low, self._action_shape * (num_stack // dilation)) - high = np.resize(self.action_space.high, self._action_shape * (num_stack // dilation)) + low = np.resize(self.action_space.low, self._action_shape * num_stack) + high = np.resize(self.action_space.high, self._action_shape * num_stack) elif self._is_multidiscrete: low = 0 high = max(self.action_space.nvec) @@ -275,7 +275,7 @@ def __init__(self, env: Env, num_stack: int, dilation: int = 1): high = 1 self._action_shape = self.action_space.n self.observation_space["actions"] = gym.spaces.Box( - low=low, high=high, shape=(self._action_shape * (num_stack // dilation),), dtype=np.float32 + low=low, high=high, shape=(self._action_shape * num_stack,), dtype=np.float32 ) def step(self, action: Any) -> Tuple[Any | SupportsFloat | bool | Dict[str, Any]]: From 8b6cc462e7657b66d47e49e4bcdccce583040f13 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Sat, 23 Mar 2024 14:21:43 +0100 Subject: [PATCH 03/11] fix: action_stack key --- sheeprl/envs/wrappers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index 7d6f7cd6..70387ea8 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -274,14 +274,14 @@ def __init__(self, env: Env, num_stack: int, dilation: int = 1): low = 0 high = 1 self._action_shape = self.action_space.n - self.observation_space["actions"] = gym.spaces.Box( + self.observation_space["action_stack"] = gym.spaces.Box( low=low, high=high, shape=(self._action_shape * num_stack,), dtype=np.float32 ) def step(self, action: Any) -> Tuple[Any | SupportsFloat | bool | Dict[str, Any]]: self._actions.append(action) obs, reward, done, truncated, info = super().step(action) - obs["actions"] = self._get_actions_stack() + obs["action_stack"] = self._get_actions_stack() return obs, reward, done, truncated, info def reset(self, *, seed: int | None = None, options: Dict[str, Any] | None = None) -> Tuple[Any | Dict[str, Any]]: @@ -291,7 +291,7 @@ def reset(self, *, seed: int | None = None, options: Dict[str, Any] | None = Non 
[self._actions.append(np.zeros((self._action_shape,))) for _ in range(self._num_stack * self._dilation)] else: [self._actions.append(0) for _ in range(self._num_stack * self._dilation)] - obs["actions"] = self._get_actions_stack() + obs["action_stack"] = self._get_actions_stack() return obs, info def _get_actions_stack(self) -> np.ndarray: From b931db9d5a9925d725f3e0c4ef168cf296594299 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Sun, 14 Apr 2024 12:33:29 +0200 Subject: [PATCH 04/11] feat: added controls --- sheeprl/envs/wrappers.py | 7 +++++++ sheeprl/utils/env.py | 5 ----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index 70387ea8..1c785ceb 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -256,6 +256,13 @@ def render(self) -> Optional[Union[RenderFrame, List[RenderFrame]]]: class ActionsAsObservationWrapper(gym.Wrapper): def __init__(self, env: Env, num_stack: int, dilation: int = 1): super().__init__(env) + if num_stack < 1: + raise ValueError( + "The number of actions to the `action_stack` observation " + f"must be greater or equal than 1, got: {num_stack}" + ) + if dilation < 1: + raise ValueError(f"The actions stack dilation argument must be greater than zero, got: {dilation}") self._num_stack = num_stack self._dilation = dilation self._actions = deque(maxlen=num_stack * dilation) diff --git a/sheeprl/utils/env.py b/sheeprl/utils/env.py index 45143229..25b11c8a 100644 --- a/sheeprl/utils/env.py +++ b/sheeprl/utils/env.py @@ -209,11 +209,6 @@ def transform_obs(obs: Dict[str, Any]): env = FrameStack(env, cfg.env.frame_stack, cnn_keys, cfg.env.frame_stack_dilation) if cfg.env.action_stack > 0: - if cfg.env.action_stack_dilation <= 0: - raise ValueError( - "The actions stack dilation argument must be greater than zero, " - f"got: {cfg.env.action_stack_dilation}" - ) env = ActionsAsObservationWrapper(env, cfg.env.action_stack, cfg.env.action_stack_dilation) if cfg.env.reward_as_observation: From 4838829a9d42ff8db50cae5a70a66327fd28efe1 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Wed, 1 May 2024 22:07:07 +0200 Subject: [PATCH 05/11] fix: multi-discrete action stack --- sheeprl/envs/wrappers.py | 20 +++++++++----------- tests/run_tests.py | 2 +- tests/test_algos/test_algos.py | 2 ++ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index 1c785ceb..616dbb8e 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -270,17 +270,17 @@ def __init__(self, env: Env, num_stack: int, dilation: int = 1): self._is_multidiscrete = isinstance(self.env.action_space, gym.spaces.MultiDiscrete) self.observation_space = copy.deepcopy(self.env.observation_space) if self._is_continuous: - self._action_shape = self.action_space.shape[0] - low = np.resize(self.action_space.low, self._action_shape * num_stack) - high = np.resize(self.action_space.high, self._action_shape * num_stack) + self._action_shape = self.env.action_space.shape[0] + low = np.resize(self.env.action_space.low, self._action_shape * num_stack) + high = np.resize(self.env.action_space.high, self._action_shape * num_stack) elif self._is_multidiscrete: low = 0 - high = max(self.action_space.nvec) - self._action_shape = len(self.env.nvec.shape) + high = max(self.env.action_space.nvec) - 1 + self._action_shape = self.env.action_space.nvec.shape[0] else: low = 0 - high = 1 - self._action_shape = self.action_space.n + high = 1 # one-hot encoding + self._action_shape 
= self.env.action_space.n self.observation_space["action_stack"] = gym.spaces.Box( low=low, high=high, shape=(self._action_shape * num_stack,), dtype=np.float32 ) @@ -303,14 +303,12 @@ def reset(self, *, seed: int | None = None, options: Dict[str, Any] | None = Non def _get_actions_stack(self) -> np.ndarray: actions_stack = list(self._actions)[self._dilation - 1 :: self._dilation] - if self._is_continuous: - actions = np.concatenate(actions_stack, axis=0) - elif self._is_multidiscrete: + if self._is_continuous or self._is_multidiscrete: actions = np.concatenate(actions_stack, axis=0) else: action_list = [] for action in actions_stack: - one_hot_action = np.zeros(self.action_space.n) + one_hot_action = np.zeros(self.env.action_space.n) one_hot_action[action] = 1 action_list.append(one_hot_action) actions = np.concatenate(action_list, axis=0) diff --git a/tests/run_tests.py b/tests/run_tests.py index 3daed75a..8b65bc1b 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -3,4 +3,4 @@ import pytest if __name__ == "__main__": - sys.exit(pytest.main(["-s", "--cov=sheeprl", "-vv"])) + sys.exit(pytest.main(["-s", "--cov=sheeprl", "-vv", "tests/test_algos/test_algos.py::test_dreamer_v3"])) diff --git a/tests/test_algos/test_algos.py b/tests/test_algos/test_algos.py index 511a4691..9d53b06f 100644 --- a/tests/test_algos/test_algos.py +++ b/tests/test_algos/test_algos.py @@ -476,6 +476,8 @@ def test_dreamer_v3(standard_args, env_id, start_time): "algo.mlp_keys.decoder=[state]", "algo.mlp_layer_norm.cls=sheeprl.models.models.LayerNorm", "algo.cnn_layer_norm.cls=sheeprl.models.models.LayerNormChannelLast", + "env.action_stack=5", + "env.action_stack_dilation=2", ] with mock.patch.object(sys, "argv", args): From 47de66ba14fff2cce0f1f0d704c0080f603ab3a8 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Wed, 1 May 2024 22:09:22 +0200 Subject: [PATCH 06/11] test: update --- tests/run_tests.py | 2 +- tests/test_algos/test_algos.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/run_tests.py b/tests/run_tests.py index 8b65bc1b..3daed75a 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -3,4 +3,4 @@ import pytest if __name__ == "__main__": - sys.exit(pytest.main(["-s", "--cov=sheeprl", "-vv", "tests/test_algos/test_algos.py::test_dreamer_v3"])) + sys.exit(pytest.main(["-s", "--cov=sheeprl", "-vv"])) diff --git a/tests/test_algos/test_algos.py b/tests/test_algos/test_algos.py index 9d53b06f..511a4691 100644 --- a/tests/test_algos/test_algos.py +++ b/tests/test_algos/test_algos.py @@ -476,8 +476,6 @@ def test_dreamer_v3(standard_args, env_id, start_time): "algo.mlp_keys.decoder=[state]", "algo.mlp_layer_norm.cls=sheeprl.models.models.LayerNorm", "algo.cnn_layer_norm.cls=sheeprl.models.models.LayerNormChannelLast", - "env.action_stack=5", - "env.action_stack_dilation=2", ] with mock.patch.object(sys, "argv", args): From 3587e358bc252d8c836b2cf8a32520812eb81266 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Mon, 20 May 2024 13:50:28 +0200 Subject: [PATCH 07/11] feat: added mlp_keys to prepare obs of sac, droq and a2c --- sheeprl/algos/a2c/a2c.py | 6 ++++-- sheeprl/algos/a2c/utils.py | 10 ++++++---- sheeprl/algos/droq/droq.py | 2 +- sheeprl/algos/sac/sac.py | 2 +- sheeprl/algos/sac/sac_decoupled.py | 2 +- sheeprl/algos/sac/utils.py | 8 +++++--- sheeprl/utils/env.py | 2 +- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/sheeprl/algos/a2c/a2c.py b/sheeprl/algos/a2c/a2c.py index e3b23ee1..98ba65c0 100644 --- a/sheeprl/algos/a2c/a2c.py +++ 
b/sheeprl/algos/a2c/a2c.py @@ -236,7 +236,9 @@ def main(fabric: Fabric, cfg: Dict[str, Any]): # Sample an action given the observation received by the environment # This calls the `forward` method of the PyTorch module, escaping from Fabric # because we don't want this to be a synchronization point - torch_obs = prepare_obs(fabric, next_obs, num_envs=cfg.env.num_envs) + torch_obs = prepare_obs( + fabric, next_obs, mlp_keys=cfg.algo.mlp_keys.encoder, num_envs=cfg.env.num_envs + ) actions, _, values = player(torch_obs) if is_continuous: real_actions = torch.stack(actions, -1).cpu().numpy() @@ -304,7 +306,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]): # Estimate returns with GAE (https://arxiv.org/abs/1506.02438) with torch.inference_mode(): - torch_obs = prepare_obs(fabric, next_obs, num_envs=cfg.env.num_envs) + torch_obs = prepare_obs(fabric, next_obs, mlp_keys=cfg.algo.mlp_keys.encoder, num_envs=cfg.env.num_envs) next_values = player.get_values(torch_obs) returns, advantages = gae( local_data["rewards"].to(torch.float64), diff --git a/sheeprl/algos/a2c/utils.py b/sheeprl/algos/a2c/utils.py index c26fbbaf..88fb0099 100644 --- a/sheeprl/algos/a2c/utils.py +++ b/sheeprl/algos/a2c/utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict +from typing import Any, Dict, Sequence import numpy as np import torch @@ -13,8 +13,10 @@ AGGREGATOR_KEYS = {"Rewards/rew_avg", "Game/ep_len_avg", "Loss/value_loss", "Loss/policy_loss"} -def prepare_obs(fabric: Fabric, obs: Dict[str, np.ndarray], *, num_envs: int = 1, **kwargs) -> Dict[str, Tensor]: - torch_obs = {k: torch.from_numpy(v.copy()).to(fabric.device).float().reshape(num_envs, -1) for k, v in obs.items()} +def prepare_obs( + fabric: Fabric, obs: Dict[str, np.ndarray], *, mlp_keys: Sequence[str] = [], num_envs: int = 1, **kwargs +) -> Dict[str, Tensor]: + torch_obs = {k: torch.from_numpy(obs[k].copy()).to(fabric.device).float().reshape(num_envs, -1) for k in mlp_keys} return torch_obs @@ -28,7 +30,7 @@ def test(agent: PPOPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str): while not done: # Convert observations to tensors - torch_obs = prepare_obs(fabric, obs) + torch_obs = prepare_obs(fabric, obs, mlp_keys=cfg.algo.mlp_keys.encoder) # Act greedly through the environment actions = agent.get_actions(torch_obs, greedy=True) diff --git a/sheeprl/algos/droq/droq.py b/sheeprl/algos/droq/droq.py index 9c9a500e..b5cf8c35 100644 --- a/sheeprl/algos/droq/droq.py +++ b/sheeprl/algos/droq/droq.py @@ -305,7 +305,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]): else: with torch.inference_mode(): # Sample an action given the observation received by the environment - torch_obs = prepare_obs(fabric, obs, num_envs=cfg.env.num_envs) + torch_obs = prepare_obs(fabric, obs, mlp_keys=cfg.algo.mlp_keys.encoder, num_envs=cfg.env.num_envs) actions = player(torch_obs) actions = actions.cpu().numpy() next_obs, rewards, terminated, truncated, infos = envs.step(actions.reshape(envs.action_space.shape)) diff --git a/sheeprl/algos/sac/sac.py b/sheeprl/algos/sac/sac.py index e8cd9e75..4abe9506 100644 --- a/sheeprl/algos/sac/sac.py +++ b/sheeprl/algos/sac/sac.py @@ -256,7 +256,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]): else: # Sample an action given the observation received by the environment with torch.inference_mode(): - torch_obs = prepare_obs(fabric, obs, num_envs=cfg.env.num_envs) + torch_obs = prepare_obs(fabric, obs, mlp_keys=cfg.algo.mlp_keys.encoder, num_envs=cfg.env.num_envs) actions = player(torch_obs) actions = 
actions.cpu().numpy() next_obs, rewards, terminated, truncated, infos = envs.step(actions.reshape(envs.action_space.shape)) diff --git a/sheeprl/algos/sac/sac_decoupled.py b/sheeprl/algos/sac/sac_decoupled.py index ceb49b78..705e1dd6 100644 --- a/sheeprl/algos/sac/sac_decoupled.py +++ b/sheeprl/algos/sac/sac_decoupled.py @@ -187,7 +187,7 @@ def player( actions = envs.action_space.sample() else: # Sample an action given the observation received by the environment - torch_obs = prepare_obs(fabric, obs, num_envs=cfg.env.num_envs) + torch_obs = prepare_obs(fabric, obs, mlp_keys=cfg.algo.mlp_keys.encoder, num_envs=cfg.env.num_envs) actions = actor(torch_obs) actions = actions.cpu().numpy() next_obs, rewards, terminated, truncated, infos = envs.step(actions.reshape(envs.action_space.shape)) diff --git a/sheeprl/algos/sac/utils.py b/sheeprl/algos/sac/utils.py index 6912f278..9432db3f 100644 --- a/sheeprl/algos/sac/utils.py +++ b/sheeprl/algos/sac/utils.py @@ -28,9 +28,11 @@ MODELS_TO_REGISTER = {"agent"} -def prepare_obs(fabric: Fabric, obs: Dict[str, np.ndarray], *, num_envs: int = 1, **kwargs) -> Tensor: +def prepare_obs( + fabric: Fabric, obs: Dict[str, np.ndarray], *, mlp_keys: Sequence[str] = [], num_envs: int = 1, **kwargs +) -> Tensor: with fabric.device: - torch_obs = torch.cat([torch.as_tensor(obs[k].copy(), dtype=torch.float32) for k in obs.keys()], dim=-1) + torch_obs = torch.cat([torch.as_tensor(obs[k].copy(), dtype=torch.float32) for k in mlp_keys], dim=-1) return torch_obs.reshape(num_envs, -1) @@ -43,7 +45,7 @@ def test(actor: SACPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str): obs = env.reset(seed=cfg.seed)[0] while not done: # Act greedly through the environment - torch_obs = prepare_obs(fabric, obs) + torch_obs = prepare_obs(fabric, obs, mlp_keys=cfg.algo.mlp_keys.encoder) action = actor.get_actions(torch_obs, greedy=True) # Single environment step diff --git a/sheeprl/utils/env.py b/sheeprl/utils/env.py index 25b11c8a..2eaf5f2d 100644 --- a/sheeprl/utils/env.py +++ b/sheeprl/utils/env.py @@ -208,7 +208,7 @@ def transform_obs(obs: Dict[str, Any]): ) env = FrameStack(env, cfg.env.frame_stack, cnn_keys, cfg.env.frame_stack_dilation) - if cfg.env.action_stack > 0: + if cfg.env.action_stack > 0 and "diambra" not in cfg.env.wrapper._target_: env = ActionsAsObservationWrapper(env, cfg.env.action_stack, cfg.env.action_stack_dilation) if cfg.env.reward_as_observation: From 9e6d4a98a6dcb86574b9d98f109a8842c5e47f3d Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Mon, 20 May 2024 15:14:58 +0200 Subject: [PATCH 08/11] feat: added from __future__ import annotations --- sheeprl/envs/wrappers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index 616dbb8e..57ecf218 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import copy import time from collections import deque From 1fcdd6e6e1631fa58529034d6f109df9ae7fa15f Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Sun, 26 May 2024 22:00:17 +0200 Subject: [PATCH 09/11] feat: added noop + test --- sheeprl/envs/wrappers.py | 52 +++++++++++++++++++++----------- tests/test_envs/test_wrappers.py | 32 +++++++++++++++++++- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index 57ecf218..539627ad 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -256,7 +256,7 @@ def render(self) -> Optional[Union[RenderFrame, 
List[RenderFrame]]]: class ActionsAsObservationWrapper(gym.Wrapper): - def __init__(self, env: Env, num_stack: int, dilation: int = 1): + def __init__(self, env: Env, num_stack: int, noop: float | int | List[int], dilation: int = 1): super().__init__(env) if num_stack < 1: raise ValueError( @@ -277,8 +277,9 @@ def __init__(self, env: Env, num_stack: int, dilation: int = 1): high = np.resize(self.env.action_space.high, self._action_shape * num_stack) elif self._is_multidiscrete: low = 0 - high = max(self.env.action_space.nvec) - 1 - self._action_shape = self.env.action_space.nvec.shape[0] + high = 1 # one-hot encoding + # one one-hot for each action + self._action_shape = sum(self.env.action_space.nvec) else: low = 0 high = 1 # one-hot encoding @@ -286,9 +287,37 @@ def __init__(self, env: Env, num_stack: int, dilation: int = 1): self.observation_space["action_stack"] = gym.spaces.Box( low=low, high=high, shape=(self._action_shape * num_stack,), dtype=np.float32 ) + if self._is_continuous: + if isinstance(noop, list): + raise ValueError(f"The noop actions must be a float for continuous action spaces, got: {noop}") + self.noop = np.full((self._action_shape,), noop, dtype=np.float32) + elif self._is_multidiscrete: + if not isinstance(noop, list): + raise ValueError(f"The noop actions must be a list for multi-discrete action spaces, got: {noop}") + noops = [] + for act, n in zip(noop, self.env.action_space.nvec): + noops.append(np.zeros((n,), dtype=np.float32)) + noops[-1][noop[act]] = 1.0 + self.noop = np.concatenate(noops, axis=-1) + else: + if isinstance(noop, (list, float)): + raise ValueError(f"The noop actions must be an integer for discrete action spaces, got: {noop}") + self.noop = np.zeros((self._action_shape,), dtype=np.float32) + self.noop[noop] = 1.0 def step(self, action: Any) -> Tuple[Any | SupportsFloat | bool | Dict[str, Any]]: - self._actions.append(action) + if self._is_continuous: + self._actions.append(action) + elif self._is_multidiscrete: + one_hot_actions = [] + for act, n in zip(action, self.env.action_space.nvec): + one_hot_actions.append(np.zeros((n,), dtype=np.float32)) + one_hot_actions[-1][act] = 1.0 + self._actions.append(np.concatenate(one_hot_actions, axis=-1)) + else: + one_hot_action = np.zeros((self._action_shape,), dtype=np.float32) + one_hot_action[action] = 1.0 + self._actions.append(one_hot_action) obs, reward, done, truncated, info = super().step(action) obs["action_stack"] = self._get_actions_stack() return obs, reward, done, truncated, info @@ -296,22 +325,11 @@ def step(self, action: Any) -> Tuple[Any | SupportsFloat | bool | Dict[str, Any] def reset(self, *, seed: int | None = None, options: Dict[str, Any] | None = None) -> Tuple[Any | Dict[str, Any]]: obs, info = super().reset(seed=seed, options=options) self._actions.clear() - if self._is_multidiscrete or self._is_continuous: - [self._actions.append(np.zeros((self._action_shape,))) for _ in range(self._num_stack * self._dilation)] - else: - [self._actions.append(0) for _ in range(self._num_stack * self._dilation)] + [self._actions.append(self.noop) for _ in range(self._num_stack * self._dilation)] obs["action_stack"] = self._get_actions_stack() return obs, info def _get_actions_stack(self) -> np.ndarray: actions_stack = list(self._actions)[self._dilation - 1 :: self._dilation] - if self._is_continuous or self._is_multidiscrete: - actions = np.concatenate(actions_stack, axis=0) - else: - action_list = [] - for action in actions_stack: - one_hot_action = np.zeros(self.env.action_space.n) - 
one_hot_action[action] = 1 - action_list.append(one_hot_action) - actions = np.concatenate(action_list, axis=0) + actions = np.concatenate(actions_stack, axis=-1) return actions.astype(np.float32) diff --git a/tests/test_envs/test_wrappers.py b/tests/test_envs/test_wrappers.py index 2c0b8dbf..66562492 100644 --- a/tests/test_envs/test_wrappers.py +++ b/tests/test_envs/test_wrappers.py @@ -1,10 +1,40 @@ import gymnasium as gym import pytest -from sheeprl.envs.wrappers import MaskVelocityWrapper +from sheeprl.envs.dummy import ContinuousDummyEnv, DiscreteDummyEnv, MultiDiscreteDummyEnv +from sheeprl.envs.wrappers import ActionsAsObservationWrapper, MaskVelocityWrapper + +ENVIRONMENTS = { + "discrete_dummy": DiscreteDummyEnv, + "multidiscrete_dummy": MultiDiscreteDummyEnv, + "continuous_dummy": ContinuousDummyEnv, +} def test_mask_velocities_fail(): with pytest.raises(NotImplementedError): env = gym.make("CarRacing-v2") env = MaskVelocityWrapper(env) + + +@pytest.mark.parametrize("num_stack", [1, 4, 8]) +@pytest.mark.parametrize("dilation", [1, 2, 4]) +@pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) +def test_actions_as_observations_wrapper(env_id: str, num_stack, dilation): + env = ENVIRONMENTS[env_id]() + if isinstance(env.action_space, gym.spaces.MultiDiscrete): + noop = [0, 0] + else: + noop = 0 + env = ActionsAsObservationWrapper(env, num_stack=num_stack, noop=noop, dilation=dilation) + + o = env.reset()[0] + assert len(o["action_stack"].shape) == len(env.observation_space["action_stack"].shape) + for d1, d2 in zip(o["action_stack"].shape, env.observation_space["action_stack"].shape): + assert d1 == d2 + + for _ in range(64): + o = env.step(env.action_space.sample())[0] + assert len(o["action_stack"].shape) == len(env.observation_space["action_stack"].shape) + for d1, d2 in zip(o["action_stack"].shape, env.observation_space["action_stack"].shape): + assert d1 == d2 From 104f32164363c517f1920db172776f8bcd81f793 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Mon, 27 May 2024 10:18:40 +0200 Subject: [PATCH 10/11] feat: update tests + added controls in wrapper + update docs --- howto/actions_as_observations.md | 27 ++++++++++++++ howto/configs.md | 9 +++++ sheeprl/configs/env/default.yaml | 6 ++-- sheeprl/envs/wrappers.py | 7 ++++ sheeprl/utils/env.py | 4 +-- tests/test_envs/test_wrappers.py | 62 ++++++++++++++++++++++++++++++++ 6 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 howto/actions_as_observations.md diff --git a/howto/actions_as_observations.md b/howto/actions_as_observations.md new file mode 100644 index 00000000..d7f318ae --- /dev/null +++ b/howto/actions_as_observations.md @@ -0,0 +1,27 @@ +# Actions as Observations Wrapper +In this how-to, some indications are given on how to use the Actions as Observations Wrapper. + +When you want to add the last `n` actions to the observations, you must specify three parameters in the [`./configs/env/default.yaml`](../sheeprl/configs/env/default.yaml) file: +- `actions_as_observation.num_stack` (integer greater than 0): The number of actions to add to the observations. +- `actions_as_observation.dilation` (integer greater than 0): The dilation (number of steps) between one action and the next one. +- `actions_as_observation.noop` (integer or float or list of integer): The noop action to use when resetting the environment, the buffer is filled with this action. 
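For example, a minimal sketch of these three fields in the env config could look like the following (the values are purely illustrative, and `noop: 0` assumes a discrete action space):

```yaml
actions_as_observation:
  num_stack: 3   # add the last 3 actions to the observations
  dilation: 1    # no gap between the stacked actions
  noop: 0        # filler action used to initialize the buffer at reset
```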
Every environment has its own NOOP action; it is strongly recommended to use it so that the algorithm can learn correctly. + +## NOOP Parameter +The NOOP parameter must be: +- An integer for discrete action spaces +- A float for continuous action spaces +- A list of integers for multi-discrete action spaces: the length of the list must be equal to the number of actions in the environment. + +Each environment has its own NOOP action, which is usually specified in its documentation. Below is the list of NOOP actions for the environments supported in SheepRL: +- MuJoCo (both gymnasium and DMC) environments: `0.0`. +- Atari environments: `0`. +- Crafter: `0`. +- MineRL: `0`. +- MineDojo: `[0, 0, 0]`. +- Super Mario Bros: `0`. +- Diambra: + - Discrete: `0`. + - Multi-discrete: `[0, 0]`. +- Box2D (gymnasium): + - Discrete: `0`. + - Continuous: `0.0`. \ No newline at end of file diff --git a/howto/configs.md b/howto/configs.md index 3c91fc5d..75f8da5b 100644 --- a/howto/configs.md +++ b/howto/configs.md @@ -422,10 +422,19 @@ grayscale: False clip_rewards: False capture_video: True frame_stack_dilation: 1 +actions_as_observation: + num_stack: -1 + noop: "You MUST define the NOOP" + dilation: 1 max_episode_steps: null reward_as_observation: False +wrapper: ??? ``` > [!NOTE] > +> The actions as observations wrapper is used for adding the last `n` actions to the observations. For more information, check the corresponding [howto file](./actions_as_observations.md). + Every custom environment must then "inherit" from this default config, override the particular parameters, and define the `wrapper` field, which is the one that will be directly instantiated at runtime. The `wrapper` field must define all the specific parameters to be passed to the `_target_` function when the wrapper will be instantiated. Take for example the `atari.yaml` config: ```yaml diff --git a/sheeprl/configs/env/default.yaml b/sheeprl/configs/env/default.yaml index e616d303..459d0cab 100644 --- a/sheeprl/configs/env/default.yaml +++ b/sheeprl/configs/env/default.yaml @@ -8,8 +8,10 @@ grayscale: False clip_rewards: False capture_video: True frame_stack_dilation: 1 -action_stack: -1 -action_stack_dilation: 1 +actions_as_observation: + num_stack: -1 + noop: "You MUST define the NOOP" + dilation: 1 max_episode_steps: null reward_as_observation: False wrapper: ??? diff --git a/sheeprl/envs/wrappers.py b/sheeprl/envs/wrappers.py index 539627ad..cc285b11 100644 --- a/sheeprl/envs/wrappers.py +++ b/sheeprl/envs/wrappers.py @@ -265,6 +265,8 @@ def __init__(self, env: Env, num_stack: int, noop: float | int | List[int], dila ) if dilation < 1: raise ValueError(f"The actions stack dilation argument must be greater than zero, got: {dilation}") + if not isinstance(noop, (int, float, list)): + raise ValueError(f"The noop action must be an integer or float or list, got: {noop} ({type(noop)})") self._num_stack = num_stack self._dilation = dilation self._actions = deque(maxlen=num_stack * dilation) @@ -294,6 +296,11 @@ def __init__(self, env: Env, num_stack: int, noop: float | int | List[int], dila elif self._is_multidiscrete: if not isinstance(noop, list): raise ValueError(f"The noop actions must be a list for multi-discrete action spaces, got: {noop}") + if len(self.env.action_space.nvec) != len(noop): + raise RuntimeError( + "The number of noop actions must be equal to the number of actions of the environment.
" + f"Got env_action_space = {self.env.action_space.nvec} and {noop =}" + ) noops = [] for act, n in zip(noop, self.env.action_space.nvec): noops.append(np.zeros((n,), dtype=np.float32)) diff --git a/sheeprl/utils/env.py b/sheeprl/utils/env.py index 2eaf5f2d..518dc294 100644 --- a/sheeprl/utils/env.py +++ b/sheeprl/utils/env.py @@ -208,8 +208,8 @@ def transform_obs(obs: Dict[str, Any]): ) env = FrameStack(env, cfg.env.frame_stack, cnn_keys, cfg.env.frame_stack_dilation) - if cfg.env.action_stack > 0 and "diambra" not in cfg.env.wrapper._target_: - env = ActionsAsObservationWrapper(env, cfg.env.action_stack, cfg.env.action_stack_dilation) + if cfg.env.actions_as_observations.num_stack > 0 and "diambra" not in cfg.env.wrapper._target_: + env = ActionsAsObservationWrapper(env, **cfg.env.actions_as_observations) if cfg.env.reward_as_observation: env = RewardAsObservationWrapper(env) diff --git a/tests/test_envs/test_wrappers.py b/tests/test_envs/test_wrappers.py index 66562492..c1d7afd5 100644 --- a/tests/test_envs/test_wrappers.py +++ b/tests/test_envs/test_wrappers.py @@ -1,4 +1,5 @@ import gymnasium as gym +import numpy as np import pytest from sheeprl.envs.dummy import ContinuousDummyEnv, DiscreteDummyEnv, MultiDiscreteDummyEnv @@ -38,3 +39,64 @@ def test_actions_as_observations_wrapper(env_id: str, num_stack, dilation): assert len(o["action_stack"].shape) == len(env.observation_space["action_stack"].shape) for d1, d2 in zip(o["action_stack"].shape, env.observation_space["action_stack"].shape): assert d1 == d2 + + +@pytest.mark.parametrize("num_stack", [-1, 0]) +@pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) +def test_actions_as_observations_wrapper_invalid_num_stack(env_id, num_stack): + env = ENVIRONMENTS[env_id]() + if isinstance(env.action_space, gym.spaces.MultiDiscrete): + noop = [0, 0] + else: + noop = 0 + with pytest.raises(ValueError, match="The number of actions to the"): + env = ActionsAsObservationWrapper(env, num_stack=num_stack, noop=noop, dilation=3) + + +@pytest.mark.parametrize("dilation", [-1, 0]) +@pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) +def test_actions_as_observations_wrapper_invalid_dilation(env_id, dilation): + env = ENVIRONMENTS[env_id]() + if isinstance(env.action_space, gym.spaces.MultiDiscrete): + noop = [0, 0] + else: + noop = 0 + with pytest.raises(ValueError, match="The actions stack dilation argument must be greater than zero"): + env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=dilation) + + +@pytest.mark.parametrize("noop", [set([0, 0, 0]), "this is an invalid type", np.array([0, 0, 0])]) +@pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) +def test_actions_as_observations_wrapper_invalid_noop_type(env_id, noop): + env = ENVIRONMENTS[env_id]() + with pytest.raises(ValueError, match="The noop action must be an integer or float or list"): + env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=2) + + +def test_actions_as_observations_wrapper_invalid_noop_continuous_type(): + env = ContinuousDummyEnv() + with pytest.raises(ValueError, match="The noop actions must be a float for continuous action spaces"): + env = ActionsAsObservationWrapper(env, num_stack=3, noop=[0, 0, 0], dilation=2) + + +@pytest.mark.parametrize("noop", [[0, 0, 0], 0.0]) +def test_actions_as_observations_wrapper_invalid_noop_discrete_type(noop): + env = DiscreteDummyEnv() + with 
pytest.raises(ValueError, match="The noop actions must be an integer for discrete action spaces"): + env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=2) + + +@pytest.mark.parametrize("noop", [0, 0.0]) +def test_actions_as_observations_wrapper_invalid_noop_multidiscrete_type(noop): + env = MultiDiscreteDummyEnv() + with pytest.raises(ValueError, match="The noop actions must be a list for multi-discrete action spaces"): + env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=2) + + +@pytest.mark.parametrize("noop", [[0], [0, 0, 0]]) +def test_actions_as_observations_wrapper_invalid_noop_multidiscrete_n_actions(noop): + env = MultiDiscreteDummyEnv() + with pytest.raises( + RuntimeError, match="The number of noop actions must be equal to the number of actions of the environment" + ): + env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=2) From eda86b2bfa22349ed4e2cdbac06c44a5680e7543 Mon Sep 17 00:00:00 2001 From: Michele Milesi Date: Mon, 27 May 2024 10:29:30 +0200 Subject: [PATCH 11/11] fix: typo --- ...observations.md => actions_as_observation.md} | 0 howto/configs.md | 2 +- sheeprl/utils/env.py | 4 ++-- tests/test_envs/test_wrappers.py | 16 ++++++++-------- 4 files changed, 11 insertions(+), 11 deletions(-) rename howto/{actions_as_observations.md => actions_as_observation.md} (100%) diff --git a/howto/actions_as_observations.md b/howto/actions_as_observation.md similarity index 100% rename from howto/actions_as_observations.md rename to howto/actions_as_observation.md diff --git a/howto/configs.md b/howto/configs.md index 75f8da5b..1db471b7 100644 --- a/howto/configs.md +++ b/howto/configs.md @@ -433,7 +433,7 @@ wrapper: ??? > [!NOTE] > -> The actions as observations wrapper is used for adding the last `n` actions to the observations. For more information, check the corresponding [howto file](./actions_as_observations.md). +> The actions as observations wrapper is used for adding the last `n` actions to the observations. For more information, check the corresponding [howto file](./actions_as_observation.md). Every custom environment must then "inherit" from this default config, override the particular parameters, and define the `wrapper` field, which is the one that will be directly instantiated at runtime. The `wrapper` field must define all the specific parameters to be passed to the `_target_` function when the wrapper will be instantiated. 
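As a purely illustrative sketch (the `_target_` path, the `id` interpolation, and the override values below are placeholders, not a real SheepRL wrapper), such a custom config could look like this:

```yaml
defaults:
  - default
  - _self_

# illustrative overrides of the inherited default values
actions_as_observation:
  num_stack: 4
  dilation: 1
  noop: 0

wrapper:
  _target_: my_package.envs.MyEnvWrapper  # placeholder wrapper class
  id: ${env.id}                           # assumes the wrapper accepts an `id` argument
```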
Take for example the `atari.yaml` config: diff --git a/sheeprl/utils/env.py b/sheeprl/utils/env.py index 518dc294..750d85ee 100644 --- a/sheeprl/utils/env.py +++ b/sheeprl/utils/env.py @@ -208,8 +208,8 @@ def transform_obs(obs: Dict[str, Any]): ) env = FrameStack(env, cfg.env.frame_stack, cnn_keys, cfg.env.frame_stack_dilation) - if cfg.env.actions_as_observations.num_stack > 0 and "diambra" not in cfg.env.wrapper._target_: - env = ActionsAsObservationWrapper(env, **cfg.env.actions_as_observations) + if cfg.env.actions_as_observation.num_stack > 0 and "diambra" not in cfg.env.wrapper._target_: + env = ActionsAsObservationWrapper(env, **cfg.env.actions_as_observation) if cfg.env.reward_as_observation: env = RewardAsObservationWrapper(env) diff --git a/tests/test_envs/test_wrappers.py b/tests/test_envs/test_wrappers.py index c1d7afd5..651679db 100644 --- a/tests/test_envs/test_wrappers.py +++ b/tests/test_envs/test_wrappers.py @@ -21,7 +21,7 @@ def test_mask_velocities_fail(): @pytest.mark.parametrize("num_stack", [1, 4, 8]) @pytest.mark.parametrize("dilation", [1, 2, 4]) @pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) -def test_actions_as_observations_wrapper(env_id: str, num_stack, dilation): +def test_actions_as_observation_wrapper(env_id: str, num_stack, dilation): env = ENVIRONMENTS[env_id]() if isinstance(env.action_space, gym.spaces.MultiDiscrete): noop = [0, 0] @@ -43,7 +43,7 @@ def test_actions_as_observations_wrapper(env_id: str, num_stack, dilation): @pytest.mark.parametrize("num_stack", [-1, 0]) @pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) -def test_actions_as_observations_wrapper_invalid_num_stack(env_id, num_stack): +def test_actions_as_observation_wrapper_invalid_num_stack(env_id, num_stack): env = ENVIRONMENTS[env_id]() if isinstance(env.action_space, gym.spaces.MultiDiscrete): noop = [0, 0] @@ -55,7 +55,7 @@ def test_actions_as_observations_wrapper_invalid_num_stack(env_id, num_stack): @pytest.mark.parametrize("dilation", [-1, 0]) @pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) -def test_actions_as_observations_wrapper_invalid_dilation(env_id, dilation): +def test_actions_as_observation_wrapper_invalid_dilation(env_id, dilation): env = ENVIRONMENTS[env_id]() if isinstance(env.action_space, gym.spaces.MultiDiscrete): noop = [0, 0] @@ -67,34 +67,34 @@ def test_actions_as_observations_wrapper_invalid_dilation(env_id, dilation): @pytest.mark.parametrize("noop", [set([0, 0, 0]), "this is an invalid type", np.array([0, 0, 0])]) @pytest.mark.parametrize("env_id", ["discrete_dummy", "multidiscrete_dummy", "continuous_dummy"]) -def test_actions_as_observations_wrapper_invalid_noop_type(env_id, noop): +def test_actions_as_observation_wrapper_invalid_noop_type(env_id, noop): env = ENVIRONMENTS[env_id]() with pytest.raises(ValueError, match="The noop action must be an integer or float or list"): env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=2) -def test_actions_as_observations_wrapper_invalid_noop_continuous_type(): +def test_actions_as_observation_wrapper_invalid_noop_continuous_type(): env = ContinuousDummyEnv() with pytest.raises(ValueError, match="The noop actions must be a float for continuous action spaces"): env = ActionsAsObservationWrapper(env, num_stack=3, noop=[0, 0, 0], dilation=2) @pytest.mark.parametrize("noop", [[0, 0, 0], 0.0]) -def 
test_actions_as_observations_wrapper_invalid_noop_discrete_type(noop): +def test_actions_as_observation_wrapper_invalid_noop_discrete_type(noop): env = DiscreteDummyEnv() with pytest.raises(ValueError, match="The noop actions must be an integer for discrete action spaces"): env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=2) @pytest.mark.parametrize("noop", [0, 0.0]) -def test_actions_as_observations_wrapper_invalid_noop_multidiscrete_type(noop): +def test_actions_as_observation_wrapper_invalid_noop_multidiscrete_type(noop): env = MultiDiscreteDummyEnv() with pytest.raises(ValueError, match="The noop actions must be a list for multi-discrete action spaces"): env = ActionsAsObservationWrapper(env, num_stack=3, noop=noop, dilation=2) @pytest.mark.parametrize("noop", [[0], [0, 0, 0]]) -def test_actions_as_observations_wrapper_invalid_noop_multidiscrete_n_actions(noop): +def test_actions_as_observation_wrapper_invalid_noop_multidiscrete_n_actions(noop): env = MultiDiscreteDummyEnv() with pytest.raises( RuntimeError, match="The number of noop actions must be equal to the number of actions of the environment"