From 730dd452976242f18bd01fa84349894fbf50e9a1 Mon Sep 17 00:00:00 2001 From: Vincent Moens Date: Wed, 17 Apr 2024 15:05:26 +0200 Subject: [PATCH] [CI] Fix CI issues (#2084) --- .../linux_libs/scripts_habitat/setup_env.sh | 8 +++++--- .github/workflows/test-linux-habitat.yml | 6 +++--- .github/workflows/test-linux-libs.yml | 16 +++++++++++++--- test/test_env.py | 10 +++++++--- test/test_modules.py | 4 +++- test/test_rb.py | 2 -- test/test_transforms.py | 15 +++++++++++---- torchrl/data/replay_buffers/samplers.py | 9 ++------- 8 files changed, 44 insertions(+), 26 deletions(-) diff --git a/.github/unittest/linux_libs/scripts_habitat/setup_env.sh b/.github/unittest/linux_libs/scripts_habitat/setup_env.sh index 56064fce082..fc182a669ea 100755 --- a/.github/unittest/linux_libs/scripts_habitat/setup_env.sh +++ b/.github/unittest/linux_libs/scripts_habitat/setup_env.sh @@ -39,9 +39,11 @@ if [ ! -d "${env_dir}" ]; then conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" fi conda activate "${env_dir}" -#pip3 uninstall cython -y -#pip uninstall cython -y -#conda uninstall cython -y + +# set debug variables +conda env config vars set MAGNUM_LOG=debug HABITAT_SIM_LOG=debug +conda deactivate && conda activate "${env_dir}" + pip3 install "cython<3" conda install -c anaconda cython="<3.0.0" -y diff --git a/.github/workflows/test-linux-habitat.yml b/.github/workflows/test-linux-habitat.yml index 54b582a1fab..3f6e89a70f9 100644 --- a/.github/workflows/test-linux-habitat.yml +++ b/.github/workflows/test-linux-habitat.yml @@ -19,14 +19,14 @@ jobs: tests: strategy: matrix: - python_version: ["3.9"] # "3.8", "3.9", "3.10", "3.11" - cuda_arch_version: ["11.6"] # "11.6", "11.7" + python_version: ["3.9"] + cuda_arch_version: ["12.1"] fail-fast: false uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu repository: pytorch/rl - docker-image: "nvidia/cuda:12.2.0-devel-ubuntu22.04" + docker-image: "nvidia/cuda:12.1.1-devel-ubuntu22.04" gpu-arch-type: cuda gpu-arch-version: ${{ matrix.cuda_arch_version }} timeout: 90 diff --git a/.github/workflows/test-linux-libs.yml b/.github/workflows/test-linux-libs.yml index cc0e4a4f54e..91b7dc8c742 100644 --- a/.github/workflows/test-linux-libs.yml +++ b/.github/workflows/test-linux-libs.yml @@ -53,14 +53,16 @@ jobs: unittests-brax: strategy: matrix: - python_version: ["3.9"] + python_version: ["3.11"] cuda_arch_version: ["12.1"] + if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }} uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/rl runner: "linux.g5.4xlarge.nvidia.gpu" gpu-arch-type: cuda gpu-arch-version: "11.7" + docker-image: "pytorch/manylinux-cuda124" timeout: 120 script: | if [[ "${{ github.ref }}" =~ release/* ]]; then @@ -73,7 +75,7 @@ jobs: set -euo pipefail - export PYTHON_VERSION="3.9" + export PYTHON_VERSION="3.11" export CU_VERSION="12.1" export TAR_OPTIONS="--no-same-owner" export UPLOAD_CHANNEL="nightly" @@ -123,7 +125,7 @@ jobs: matrix: python_version: ["3.9"] cuda_arch_version: ["12.1"] - if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Data') }} + if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }} uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/rl @@ -224,12 +226,14 @@ jobs: matrix: python_version: ["3.9"] cuda_arch_version: ["12.1"] + if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }} uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/rl runner: "linux.g5.4xlarge.nvidia.gpu" gpu-arch-type: cuda gpu-arch-version: "11.7" + docker-image: "pytorch/manylinux-cuda124" timeout: 120 script: | if [[ "${{ github.ref }}" =~ release/* ]]; then @@ -324,12 +328,14 @@ jobs: bash .github/unittest/linux_libs/scripts_openx/post_process.sh unittests-pettingzoo: + if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }} uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/rl runner: "linux.g5.4xlarge.nvidia.gpu" gpu-arch-type: cuda gpu-arch-version: "11.7" + docker-image: "pytorch/manylinux-cuda124" timeout: 120 script: | if [[ "${{ github.ref }}" =~ release/* ]]; then @@ -360,6 +366,7 @@ jobs: matrix: python_version: ["3.9"] cuda_arch_version: ["12.1"] + if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }} uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/rl @@ -468,6 +475,7 @@ jobs: runner: "linux.g5.4xlarge.nvidia.gpu" gpu-arch-type: cuda gpu-arch-version: "11.7" + docker-image: "pytorch/manylinux-cuda124" timeout: 120 script: | if [[ "${{ github.ref }}" =~ release/* ]]; then @@ -532,12 +540,14 @@ jobs: matrix: python_version: ["3.9"] cuda_arch_version: ["12.1"] + if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'Environments') }} uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: repository: pytorch/rl runner: "linux.g5.4xlarge.nvidia.gpu" gpu-arch-type: cuda gpu-arch-version: "11.7" + docker-image: "pytorch/manylinux-cuda124" timeout: 120 script: | if [[ "${{ github.ref }}" =~ release/* ]]; then diff --git a/test/test_env.py b/test/test_env.py index 51af99a667e..f5f1f37ed8e 100644 --- a/test/test_env.py +++ b/test/test_env.py @@ -115,6 +115,10 @@ IS_OSX = platform == "darwin" IS_WIN = platform == "win32" +if IS_WIN: + mp_ctx = "spawn" +else: + mp_ctx = "fork" ## TO BE FIXED: DiscreteActionProjection queries a randint on each worker, which leads to divergent results between ## the serial and parallel batched envs @@ -463,7 +467,7 @@ def test_parallel_devices( env.shared_tensordict_parent.device.type == torch.device(edevice).type ) - @pytest.mark.parametrize("start_method", [None, "fork"]) + @pytest.mark.parametrize("start_method", [None, mp_ctx]) def test_serial_for_single(self, maybe_fork_ParallelEnv, start_method): env = ParallelEnv( 1, @@ -2959,7 +2963,7 @@ def test_auto_reset_parallel(self): env = ParallelEnv( 2, functools.partial(AutoResettingCountingEnv, 4, auto_reset=True), - mp_start_method="fork", + mp_start_method=mp_ctx, ) r = env.rollout(20, policy, break_when_any_done=False) assert r.shape == torch.Size([2, 20]) @@ -2982,7 +2986,7 @@ def test_auto_reset_parallel_hetero(self): functools.partial(AutoResettingCountingEnv, 4, auto_reset=True), functools.partial(AutoResettingCountingEnv, 5, auto_reset=True), ], - mp_start_method="fork", + mp_start_method=mp_ctx, ) r = env.rollout(20, policy, break_when_any_done=False) assert r.shape == torch.Size([2, 20]) diff --git a/test/test_modules.py b/test/test_modules.py index c2fd0cd35a9..de4333a3254 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -9,7 +9,8 @@ import numpy as np import pytest import torch -from _utils_internal import get_default_devices + +from _utils_internal import get_default_devices, retry from mocking_classes import MockBatchedUnLockedEnv from packaging import version from tensordict import TensorDict @@ -890,6 +891,7 @@ def _get_mock_input_td( ) return td + @retry(AssertionError, 3) @pytest.mark.parametrize("n_agents", [1, 3]) @pytest.mark.parametrize("share_params", [True, False]) @pytest.mark.parametrize("centralised", [True, False]) diff --git a/test/test_rb.py b/test/test_rb.py index 9cd3dd48399..10d71d87f89 100644 --- a/test/test_rb.py +++ b/test/test_rb.py @@ -2794,8 +2794,6 @@ def test_rb_multidim_collector( if transform is not None: assert s.ndim == 2 except Exception: - print(f"Failing at iter {i}") # noqa: T201 - print(f"rb {rb}") # noqa: T201 raise @pytest.mark.parametrize("strict_length", [True, False]) diff --git a/test/test_transforms.py b/test/test_transforms.py index 2c959ff18ab..6e978d0ab5b 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -13,6 +13,7 @@ import sys from copy import copy from functools import partial +from sys import platform import numpy as np import pytest @@ -125,6 +126,12 @@ from torchrl.envs.utils import check_env_specs, step_mdp from torchrl.modules import GRUModule, LSTMModule, MLP, ProbabilisticActor, TanhNormal +IS_WIN = platform == "win32" +if IS_WIN: + mp_ctx = "spawn" +else: + mp_ctx = "fork" + TIMEOUT = 100.0 _has_gymnasium = importlib.util.find_spec("gymnasium") is not None @@ -9404,7 +9411,7 @@ def make_env(): env = ParallelEnv( 2, make_env, - mp_start_method="fork" if not torch.cuda.is_available() else "spawn", + mp_start_method=mp_ctx if not torch.cuda.is_available() else "spawn", ) assert env.device is None try: @@ -9447,7 +9454,7 @@ def make_env(): ParallelEnv( 2, make_env, - mp_start_method="fork" if not torch.cuda.is_available() else "spawn", + mp_start_method=mp_ctx if not torch.cuda.is_available() else "spawn", ), DeviceCastTransform( "cpu:1", @@ -10696,7 +10703,7 @@ def make_env(stateless=stateless, reshape_fn=reshape_fn): assert env.batch_size == expected_batch_size return env - env = ParallelEnv(2, make_env, mp_start_method="fork") + env = ParallelEnv(2, make_env, mp_start_method=mp_ctx) assert env.batch_size == (2, *make_env().batch_size) check_env_specs(env) @@ -10751,7 +10758,7 @@ def make_env(stateless=stateless, reshape_fn=reshape_fn): assert transform.batch_size is None env = TransformedEnv( - ParallelEnv(2, make_env, mp_start_method="fork"), transform + ParallelEnv(2, make_env, mp_start_method=mp_ctx), transform ) assert env.batch_size == expected_batch_size check_env_specs(env) diff --git a/torchrl/data/replay_buffers/samplers.py b/torchrl/data/replay_buffers/samplers.py index c005188fad2..944be3afcd6 100644 --- a/torchrl/data/replay_buffers/samplers.py +++ b/torchrl/data/replay_buffers/samplers.py @@ -1572,10 +1572,6 @@ def __init__( ) def __repr__(self): - if self._sample_list is not None: - perc = len(self._sample_list) / self.len_storage * 100 - else: - perc = 0.0 return ( f"{self.__class__.__name__}(" f"num_slices={self.num_slices}, " @@ -1586,8 +1582,7 @@ def __repr__(self): f"strict_length={self.strict_length}," f"alpha={self._alpha}, " f"beta={self._beta}, " - f"eps={self._eps}," - f"{perc: 4.4f}% filled)" + f"eps={self._eps}" ) def __getstate__(self): @@ -1726,7 +1721,7 @@ def sample(self, storage: Storage, batch_size: int) -> Tuple[torch.Tensor, dict] terminated = torch.zeros_like(truncated) if traj_terminated.any(): if isinstance(seq_length, int): - terminated.view(num_slices, -1)[:, traj_terminated] = 1 + terminated.view(num_slices, -1)[traj_terminated, -1] = 1 else: terminated[(seq_length.cumsum(0) - 1)[traj_terminated]] = 1 truncated = truncated & ~terminated