Add option to specify a subset of CUDA devices for the job to run on
Summary:
Add a new parameter, auto_set_cuda_visible_devices_ids, which contains a list of GPU indices. If set, auto_set_cuda_visible_devices will only use indices from this list when distributing devices across role replicas.

This allows masking out some GPUs. It is useful on hosts shared between multiple users, where the first GPUs are often already in use by default processes, and on hosts with several types of GPUs where only a subset of them should be used.
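A minimal sketch of the intended usage (assuming the LocalScheduler and torchx.specs APIs exercised by the tests in this commit; the image directory is hypothetical):

from torchx.schedulers.local_scheduler import (
    LocalDirectoryImageProvider,
    LocalScheduler,
)
from torchx.specs import AppDef, Resource, Role

scheduler = LocalScheduler(
    session_name="demo",
    image_provider_class=LocalDirectoryImageProvider,
)
app = AppDef(
    name="trainer",
    roles=[
        Role(
            name="trainer",
            image="/tmp/app",  # hypothetical local image directory
            entrypoint="train",
            resource=Resource(cpu=0, gpu=1, memMB=0),
            num_replicas=2,
        )
    ],
)
# GPUs 0-4 are assumed busy with other workloads; hand out only devices 5 and 7.
info = scheduler.submit_dryrun(
    app,
    cfg={
        "auto_set_cuda_visible_devices": True,
        "auto_set_cuda_visible_devices_ids": ["5", "7"],
    },
)
# replica 0 -> CUDA_VISIBLE_DEVICES=5, replica 1 -> CUDA_VISIBLE_DEVICES=7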

Differential Revision: D47208267

fbshipit-source-id: 872255d6e2e9c4e053cf2a3a2ef8b582ab9a2bf9
Miquel Jubert Hermoso authored and facebook-github-bot committed Jul 5, 2023
1 parent d452a8c commit f6eb2fd
Showing 2 changed files with 255 additions and 5 deletions.
67 changes: 64 additions & 3 deletions torchx/schedulers/local_scheduler.py
@@ -168,6 +168,7 @@ class LocalOpts(TypedDict, total=False):
log_dir: str
prepend_cwd: Optional[bool]
auto_set_cuda_visible_devices: Optional[bool]
auto_set_cuda_visible_devices_ids: List[str]


class LocalDirectoryImageProvider(ImageProvider):
@@ -595,6 +596,14 @@ def _run_opts(self) -> runopts:
help="sets the `CUDA_AVAILABLE_DEVICES` for roles that request GPU resources."
" Each role replica will be assigned one GPU. Does nothing if the device count is less than replicas.",
)
opts.add(
"auto_set_cuda_visible_devices_ids",
type_=List[str],
default=[],
help="when auto_set_cuda_visible_devices is set, the ; separated list of GPUs to use "
"during auto-setting CUDA_VISIBLE_DEVICES. If not set, defaults to all devices "
"available on the machine.",
)
return opts
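# A hedged usage sketch (assumption: the torchx CLI accepts scheduler runcfg
# via `-cfg`, and list-typed values use the `;` separator described in the
# help string above; `utils.echo` is just a stand-in component):
#
#   $ torchx run -s local_cwd \
#       -cfg 'auto_set_cuda_visible_devices=True,auto_set_cuda_visible_devices_ids=4;5;7' \
#       utils.echo
#
# The quotes keep the shell from splitting the command on `;`.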

def _validate(self, app: AppDef, scheduler: str) -> None:
Expand Down Expand Up @@ -795,6 +804,10 @@ def auto_set_CUDA_VISIBLE_DEVICES(
To manually set ``CUDA_VISIBLE_DEVICES``, run with ``auto_set_cuda_visible_devices = False``
in the scheduler runcfg.
If ``auto_set_cuda_visible_devices_ids`` is set to a list of indices, such as
``auto_set_cuda_visible_devices_ids=[5,7]``, devices are distributed exactly as when
the option is unset, except that only the listed device indices are used to set
``CUDA_VISIBLE_DEVICES``.

.. note::
If the host's device count is less than the total number of requested GPUs,
then ``CUDA_VISIBLE_DEVICES`` is NOT set (even if ``auto_set_cuda_visible_devices=True``).
@@ -818,6 +831,17 @@ def auto_set_CUDA_VISIBLE_DEVICES(
#. role_1, replica_1's ``CUDA_VISIBLE_DEVICES=3``
#. role_1, replica_2's ``CUDA_VISIBLE_DEVICES=4``
With ``auto_set_cuda_visible_devices_ids=["4", "5", "7"]``
#. ``Role(num_replicas=3, resource=Resource(gpu=1))``
#. replica_0's ``CUDA_VISIBLE_DEVICES=4``
#. replica_1's ``CUDA_VISIBLE_DEVICES=5``
#. replica_2's ``CUDA_VISIBLE_DEVICES=7``
#. ``[Role(num_replicas=1, resource=Resource(gpu=2)), Role(num_replicas=1, resource=Resource(gpu=1))]``
#. role_0, replica_0's ``CUDA_VISIBLE_DEVICES=4,5``
#. role_1, replica_0's ``CUDA_VISIBLE_DEVICES=7``
"""

total_requested_gpus = 0 # total number of gpus for the app
@@ -832,7 +856,7 @@ def auto_set_CUDA_VISIBLE_DEVICES(
"""\n
======================================================================
Running multiple role replicas that require GPUs without
setting `CUDA_VISIBLE_DEVICES` may result in multiple
processes using the same GPU device with undesired consequences
such as CUDA OutOfMemory errors.
@@ -844,7 +868,44 @@ def auto_set_CUDA_VISIBLE_DEVICES(
)
return

cuda_device_count: int = self._cuda_device_count()
if auto_set_cuda_visible_devices_ids := cfg.get(
"auto_set_cuda_visible_devices_ids"
):
available_devices: List[int] = sorted(
set(int(idx) for idx in auto_set_cuda_visible_devices_ids)
)
if len(auto_set_cuda_visible_devices_ids) != len(available_devices):
log.warning(
f"""\n
======================================================================
Cannot auto-set `CUDA_VISIBLE_DEVICES`
Defined subset of GPUs: {auto_set_cuda_visible_devices_ids} contains
repeated index values.
Remove the repeated GPU indices from the list.
======================================================================
"""
)
return
if any(
not (0 <= index < cuda_device_count)
for index in available_devices
):
log.warning(
f"""\n
======================================================================
Cannot auto-set `CUDA_VISIBLE_DEVICES`
Defined subset of GPUs: {auto_set_cuda_visible_devices_ids} contains a GPU
index that is outside the valid device index range.
======================================================================
"""
)
return
else:
available_devices = list(range(cuda_device_count))
device_count: int = len(available_devices)

if total_requested_gpus > device_count:
log.warning(
f"""\n
@@ -869,7 +930,7 @@ def auto_set_CUDA_VISIBLE_DEVICES(
for replica_id, replica in enumerate(role_replicas):
end_idx = start_idx + role.resource.gpu
replica.env[ENV_CUDA_VISIBLE_DEVICES] = ",".join(
str(available_devices[idx]) for idx in range(start_idx, end_idx)
)
start_idx = end_idx

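To make the assignment logic concrete, here is a standalone sketch of what auto_set_CUDA_VISIBLE_DEVICES does (a hypothetical helper, not part of torchx): devices are handed out in contiguous blocks, drawn from the sorted subset when one is given, otherwise from all devices on the host.

from typing import Dict, List, Optional, Tuple

def assign_devices(
    roles: List[Tuple[str, int, int]],  # (role_name, num_replicas, gpus_per_replica)
    device_count: int,
    subset: Optional[List[int]] = None,
) -> Dict[str, List[str]]:
    # Use the sorted, de-duplicated subset if given, else all device indices.
    available = sorted(set(subset)) if subset else list(range(device_count))
    total = sum(replicas * gpus for _, replicas, gpus in roles)
    if total > len(available):
        return {}  # mirrors the scheduler: CUDA_VISIBLE_DEVICES is left unset
    assignments: Dict[str, List[str]] = {}
    start = 0
    for name, replicas, gpus in roles:
        per_replica = []
        for _ in range(replicas):
            end = start + gpus
            # Each replica gets a contiguous block of the available devices.
            per_replica.append(",".join(str(available[i]) for i in range(start, end)))
            start = end
        assignments[name] = per_replica
    return assignments

# Reproduces the docstring example: two roles restricted to devices [4, 5, 7].
assert assign_devices(
    [("role_0", 1, 2), ("role_1", 1, 1)], device_count=8, subset=[4, 5, 7]
) == {"role_0": ["4,5"], "role_1": ["7"]}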
193 changes: 191 additions & 2 deletions torchx/schedulers/test/local_scheduler_test.py
@@ -948,11 +948,18 @@ def role(self, name: str, replicas: int, gpu: int = 0) -> Role:
)

def dryrun(
self,
*roles: Role,
auto_set_cuda_visible_devices: bool = True,
auto_set_cuda_visible_devices_ids: Optional[List[str]] = None,
) -> AppDryRunInfo[PopenRequest]:
auto_set_cuda_visible_devices_ids = auto_set_cuda_visible_devices_ids or []
return self.scheduler.submit_dryrun(
AppDef(name="_ignored_", roles=list(roles)),
cfg={"auto_set_cuda_visible_devices": auto_set_cuda_visible_devices},
cfg={
"auto_set_cuda_visible_devices": auto_set_cuda_visible_devices,
"auto_set_cuda_visible_devices_ids": auto_set_cuda_visible_devices_ids,
},
)

def assert_CUDA_VISIBLE_DEVICES(
@@ -997,6 +1004,54 @@ def test_auto_set_CUDA_VISIBLE_DEVICES_more_gpus_then_available(
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=42)
def test_auto_set_CUDA_VISIBLE_DEVICES_more_gpus_then_available_in_subset(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_cuda_visible_devices_ids_with_repetitions(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["2", "2"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_cuda_visible_devices_ids_with_out_of_range_top(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2", "3", "42"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_cuda_visible_devices_ids_with_out_of_range_below(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2", "3", "-1"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_CUDA_VISIBLE_DEVICES_false(self, _: MagicMock) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
@@ -1038,6 +1093,73 @@ def test_auto_set_CUDA_VISIBLE_DEVICES(self, _: MagicMock) -> None:
{"a": expected[gpu]},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_auto_set_CUDA_VISIBLE_DEVICES_from_subset(self, _: MagicMock) -> None:
for gpu in [1, 2, 3, 4, 5, 6, 7, 8]:
subset = [
[],
["7"],
["7", "6"],
["7", "6", "5"],
["7", "6", "5", "4"],
["7", "6", "5", "4", "3"],
["7", "6", "5", "4", "3", "2"],
["7", "6", "5", "4", "3", "2", "1"],
["7", "6", "5", "4", "3", "2", "1", "0"],
]
expected = [
[],
["7"],
["6,7"],
["5,6,7"],
["4,5,6,7"],
["3,4,5,6,7"],
["2,3,4,5,6,7"],
["1,2,3,4,5,6,7"],
["0,1,2,3,4,5,6,7"],
]

self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role(name="a", replicas=1, gpu=gpu),
auto_set_cuda_visible_devices_ids=subset[gpu],
),
{"a": expected[gpu]},
)

for gpu in [1, 2, 3, 4]:
subset = [
[],
["7", "5"],
["7", "5", "3", "1"],
["7", "5", "3", "1", "2", "4"],
["7", "6", "5", "4", "3", "2", "1", "0"],
]
expected = [
[],
["5", "7"],
["1,3", "5,7"],
["1,2,3", "4,5,7"],
["0,1,2,3", "4,5,6,7"],
]

self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role(
name="a",
replicas=2,
gpu=gpu,
),
auto_set_cuda_visible_devices_ids=subset[gpu],
),
{"a": expected[gpu]},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_auto_set_CUDA_VISIBLE_DEVICES_multi_role(self, _: MagicMock) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
@@ -1052,6 +1174,25 @@ def test_auto_set_CUDA_VISIBLE_DEVICES_multi_role(self, _: MagicMock) -> None:
},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_auto_set_CUDA_VISIBLE_DEVICES_multi_role_with_subset(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role(name="a", replicas=1, gpu=1),
self.role(name="b", replicas=1, gpu=0),
self.role(name="c", replicas=3, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2", "3", "4", "5", "6", "7"],
),
{
"a": [
"1",
],
"c": ["2,3", "4,5", "6,7"],
},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=16)
def test_get_cuda_devices_is_set(self, _: MagicMock) -> None:
appdef = AppDef(
@@ -1085,6 +1226,54 @@ def test_get_cuda_devices_is_set(self, _: MagicMock) -> None:
self.assertEqual("4,5,6", role2_params[0].env[ENV_CUDA_VISIBLE_DEVICES])
self.assertEqual("7,8,9", role2_params[1].env[ENV_CUDA_VISIBLE_DEVICES])

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=16)
def test_get_cuda_devices_is_set_with_subset(self, _: MagicMock) -> None:
appdef = AppDef(
name="role1",
roles=[
Role(
name="role1",
image=self.test_dir,
entrypoint="train",
resource=Resource(gpu=2, cpu=0, memMB=0),
num_replicas=2,
),
Role(
name="role2",
image=self.test_dir,
entrypoint="train",
resource=Resource(gpu=3, cpu=0, memMB=0),
num_replicas=2,
),
],
)
popen_req = self.scheduler._to_popen_request(
appdef,
{
"auto_set_cuda_visible_devices": True,
"auto_set_cuda_visible_devices_ids": [
"0",
"1",
"2",
"3",
"5",
"7",
"9",
"11",
"13",
"15",
],
},
)
role1_params = popen_req.role_params["role1"]
self.assertEqual(2, len(role1_params))
self.assertEqual("0,1", role1_params[0].env[ENV_CUDA_VISIBLE_DEVICES])
self.assertEqual("2,3", role1_params[1].env[ENV_CUDA_VISIBLE_DEVICES])
role2_params = popen_req.role_params["role2"]
self.assertEqual(2, len(role2_params))
self.assertEqual("5,7,9", role2_params[0].env[ENV_CUDA_VISIBLE_DEVICES])
self.assertEqual("11,13,15", role2_params[1].env[ENV_CUDA_VISIBLE_DEVICES])

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_get_cuda_devices_not_set(self, _: MagicMock) -> None:
trainer1 = AppDef(
Expand Down
