Add option to specify a subset of CUDA devices for the job to run on
Summary:
Add a new parameter, auto_set_cuda_visible_devices_ids, which contains a list of GPU indices. If set, auto_set_cuda_visible_devices will only use indices from this list when distributing devices across role replicas.

This allows masking out some GPUs. It is useful on hosts shared between multiple users, where the first GPUs are often already in use by default processes, and on hosts with several types of GPUs where only a subset of them should be used.
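A minimal sketch of the intended usage (assuming the LocalScheduler and torchx.specs APIs exercised by the tests in this commit; the image directory is hypothetical):

from torchx.schedulers.local_scheduler import (
    LocalDirectoryImageProvider,
    LocalScheduler,
)
from torchx.specs import AppDef, Resource, Role

scheduler = LocalScheduler(
    session_name="demo",
    image_provider_class=LocalDirectoryImageProvider,
)
app = AppDef(
    name="trainer",
    roles=[
        Role(
            name="trainer",
            image="/tmp/app",  # hypothetical local image directory
            entrypoint="train",
            resource=Resource(cpu=0, gpu=1, memMB=0),
            num_replicas=2,
        )
    ],
)
# GPUs 0-4 are assumed busy with other workloads; hand out only devices 5 and 7.
info = scheduler.submit_dryrun(
    app,
    cfg={
        "auto_set_cuda_visible_devices": True,
        "auto_set_cuda_visible_devices_ids": ["5", "7"],
    },
)
# replica 0 -> CUDA_VISIBLE_DEVICES=5, replica 1 -> CUDA_VISIBLE_DEVICES=7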

Differential Revision: D47208267

fbshipit-source-id: 872255d6e2e9c4e053cf2a3a2ef8b582ab9a2bf9
Miquel Jubert Hermoso authored and facebook-github-bot committed Jul 5, 2023
1 parent d452a8c commit f6eb2fd
Showing 2 changed files with 255 additions and 5 deletions.
67 changes: 64 additions & 3 deletions torchx/schedulers/local_scheduler.py
@@ -168,6 +168,7 @@ class LocalOpts(TypedDict, total=False):
log_dir: str
prepend_cwd: Optional[bool]
auto_set_cuda_visible_devices: Optional[bool]
auto_set_cuda_visible_devices_ids: List[str]


class LocalDirectoryImageProvider(ImageProvider):
@@ -595,6 +596,14 @@ def _run_opts(self) -> runopts:
help="sets the `CUDA_AVAILABLE_DEVICES` for roles that request GPU resources."
" Each role replica will be assigned one GPU. Does nothing if the device count is less than replicas.",
)
opts.add(
"auto_set_cuda_visible_devices_ids",
type_=List[str],
default=[],
help="when auto_set_cuda_visible_devices is set, the ; separated list of GPUs to use "
"during auto-setting CUDA_VISIBLE_DEVICES. If not set, defaults to all devices "
"available on the machine.",
)
return opts
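# A hedged usage sketch (assumption: the torchx CLI accepts scheduler runcfg
# via `-cfg`, and list-typed values use the `;` separator described in the
# help string above; `utils.echo` is just a stand-in component):
#
#   $ torchx run -s local_cwd \
#       -cfg 'auto_set_cuda_visible_devices=True,auto_set_cuda_visible_devices_ids=4;5;7' \
#       utils.echo
#
# The quotes keep the shell from splitting the command on `;`.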

def _validate(self, app: AppDef, scheduler: str) -> None:
Expand Down Expand Up @@ -795,6 +804,10 @@ def auto_set_CUDA_VISIBLE_DEVICES(
To manually set ``CUDA_VISIBLE_DEVICES``, run with ``auto_set_cuda_visible_devices = False``
in the scheduler runcfg.
If ``auto_set_cuda_visible_devices_ids`` is set to a list of indices, such as
``auto_set_cuda_visible_devices_ids=[5,7]``, devices are distributed exactly as when
the option is unset, except that only the listed device indices are used to set
``CUDA_VISIBLE_DEVICES``.

.. note::
If the host's device count is less than the total number of requested GPUs,
then ``CUDA_VISIBLE_DEVICES`` is NOT set (even if ``auto_set_cuda_visible_devices=True``).
@@ -818,6 +831,17 @@ def auto_set_CUDA_VISIBLE_DEVICES(
#. role_1, replica_1's ``CUDA_VISIBLE_DEVICES=3``
#. role_1, replica_2's ``CUDA_VISIBLE_DEVICES=4``
With ``auto_set_cuda_visible_devices_ids=["4", "5", "7"]``
#. ``Role(num_replicas=3, resource=Resource(gpu=1))``
#. replica_0's ``CUDA_VISIBLE_DEVICES=4``
#. replica_1's ``CUDA_VISIBLE_DEVICES=5``
#. replica_2's ``CUDA_VISIBLE_DEVICES=7``
#. ``[Role(num_replicas=1, resource=Resource(gpu=2)), Role(num_replicas=1, resource=Resource(gpu=1))]``
#. role_0, replica_0's ``CUDA_VISIBLE_DEVICES=4,5``
#. role_1, replica_0's ``CUDA_VISIBLE_DEVICES=7``
"""

total_requested_gpus = 0 # total number of gpus for the app
@@ -832,7 +856,7 @@ def auto_set_CUDA_VISIBLE_DEVICES(
"""\n
======================================================================
Running multiple role replicas that require GPUs without
setting `CUDA_VISIBLE_DEVICES` may result in multiple
processes using the same GPU device with undesired consequences
such as CUDA OutOfMemory errors.
@@ -844,7 +868,44 @@ def auto_set_CUDA_VISIBLE_DEVICES(
)
return

cuda_device_count: int = self._cuda_device_count()
if auto_set_cuda_visible_devices_ids := cfg.get(
"auto_set_cuda_visible_devices_ids"
):
available_devices: List[int] = sorted(
set(int(idx) for idx in auto_set_cuda_visible_devices_ids)
)
if len(auto_set_cuda_visible_devices_ids) != len(available_devices):
log.warning(
f"""\n
======================================================================
Cannot auto-set `CUDA_VISIBLE_DEVICES`
Defined subset of GPUs: {auto_set_cuda_visible_devices_ids} contains
repeated index values.
Remove the repeated GPU indices from the list.
======================================================================
"""
)
return
if any(
not (0 <= index < cuda_device_count)
for index in available_devices
):
log.warning(
f"""\n
======================================================================
Cannot auto-set `CUDA_VISIBLE_DEVICES`
Defined subset of GPUs: {auto_set_cuda_visible_devices_ids} contains a GPU
index that is outside the valid device index range.
======================================================================
"""
)
return
else:
available_devices = list(range(cuda_device_count))
device_count: int = len(available_devices)

if total_requested_gpus > device_count:
log.warning(
f"""\n
@@ -869,7 +930,7 @@ def auto_set_CUDA_VISIBLE_DEVICES(
for replica_id, replica in enumerate(role_replicas):
end_idx = start_idx + role.resource.gpu
replica.env[ENV_CUDA_VISIBLE_DEVICES] = ",".join(
str(available_devices[idx]) for idx in range(start_idx, end_idx)
)
start_idx = end_idx

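To make the assignment logic concrete, here is a standalone sketch of what auto_set_CUDA_VISIBLE_DEVICES does (a hypothetical helper, not part of torchx): devices are handed out in contiguous blocks, drawn from the sorted subset when one is given, otherwise from all devices on the host.

from typing import Dict, List, Optional, Tuple

def assign_devices(
    roles: List[Tuple[str, int, int]],  # (role_name, num_replicas, gpus_per_replica)
    device_count: int,
    subset: Optional[List[int]] = None,
) -> Dict[str, List[str]]:
    # Use the sorted, de-duplicated subset if given, else all device indices.
    available = sorted(set(subset)) if subset else list(range(device_count))
    total = sum(replicas * gpus for _, replicas, gpus in roles)
    if total > len(available):
        return {}  # mirrors the scheduler: CUDA_VISIBLE_DEVICES is left unset
    assignments: Dict[str, List[str]] = {}
    start = 0
    for name, replicas, gpus in roles:
        per_replica = []
        for _ in range(replicas):
            end = start + gpus
            # Each replica gets a contiguous block of the available devices.
            per_replica.append(",".join(str(available[i]) for i in range(start, end)))
            start = end
        assignments[name] = per_replica
    return assignments

# Reproduces the docstring example: two roles restricted to devices [4, 5, 7].
assert assign_devices(
    [("role_0", 1, 2), ("role_1", 1, 1)], device_count=8, subset=[4, 5, 7]
) == {"role_0": ["4,5"], "role_1": ["7"]}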
193 changes: 191 additions & 2 deletions torchx/schedulers/test/local_scheduler_test.py
@@ -948,11 +948,18 @@ def role(self, name: str, replicas: int, gpu: int = 0) -> Role:
)

def dryrun(
self,
*roles: Role,
auto_set_cuda_visible_devices: bool = True,
auto_set_cuda_visible_devices_ids: Optional[List[str]] = None,
) -> AppDryRunInfo[PopenRequest]:
auto_set_cuda_visible_devices_ids = auto_set_cuda_visible_devices_ids or []
return self.scheduler.submit_dryrun(
AppDef(name="_ignored_", roles=list(roles)),
cfg={"auto_set_cuda_visible_devices": auto_set_cuda_visible_devices},
cfg={
"auto_set_cuda_visible_devices": auto_set_cuda_visible_devices,
"auto_set_cuda_visible_devices_ids": auto_set_cuda_visible_devices_ids,
},
)

def assert_CUDA_VISIBLE_DEVICES(
@@ -997,6 +1004,54 @@ def test_auto_set_CUDA_VISIBLE_DEVICES_more_gpus_then_available(
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=42)
def test_auto_set_CUDA_VISIBLE_DEVICES_more_gpus_then_available_in_subset(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_cuda_visible_devices_ids_with_repetitions(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["2", "2"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_cuda_visible_devices_ids_with_out_of_range_top(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2", "3", "42"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_cuda_visible_devices_ids_with_out_of_range_below(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role("a", replicas=2, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2", "3", "-1"],
),
{},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=2)
def test_auto_set_CUDA_VISIBLE_DEVICES_false(self, _: MagicMock) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
@@ -1038,6 +1093,73 @@ def test_auto_set_CUDA_VISIBLE_DEVICES(self, _: MagicMock) -> None:
{"a": expected[gpu]},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_auto_set_CUDA_VISIBLE_DEVICES_from_subset(self, _: MagicMock) -> None:
for gpu in [1, 2, 3, 4, 5, 6, 7, 8]:
subset = [
[],
["7"],
["7", "6"],
["7", "6", "5"],
["7", "6", "5", "4"],
["7", "6", "5", "4", "3"],
["7", "6", "5", "4", "3", "2"],
["7", "6", "5", "4", "3", "2", "1"],
["7", "6", "5", "4", "3", "2", "1", "0"],
]
expected = [
[],
["7"],
["6,7"],
["5,6,7"],
["4,5,6,7"],
["3,4,5,6,7"],
["2,3,4,5,6,7"],
["1,2,3,4,5,6,7"],
["0,1,2,3,4,5,6,7"],
]

self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role(name="a", replicas=1, gpu=gpu),
auto_set_cuda_visible_devices_ids=subset[gpu],
),
{"a": expected[gpu]},
)

for gpu in [1, 2, 3, 4]:
subset = [
[],
["7", "5"],
["7", "5", "3", "1"],
["7", "5", "3", "1", "2", "4"],
["7", "6", "5", "4", "3", "2", "1", "0"],
]
expected = [
[],
["5", "7"],
["1,3", "5,7"],
["1,2,3", "4,5,7"],
["0,1,2,3", "4,5,6,7"],
]

self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role(
name="a",
replicas=2,
gpu=gpu,
),
auto_set_cuda_visible_devices_ids=subset[gpu],
),
{"a": expected[gpu]},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_auto_set_CUDA_VISIBLE_DEVICES_multi_role(self, _: MagicMock) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
@@ -1052,6 +1174,25 @@ def test_auto_set_CUDA_VISIBLE_DEVICES_multi_role(self, _: MagicMock) -> None:
},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_auto_set_CUDA_VISIBLE_DEVICES_multi_role_with_subset(
self, _: MagicMock
) -> None:
self.assert_CUDA_VISIBLE_DEVICES(
self.dryrun(
self.role(name="a", replicas=1, gpu=1),
self.role(name="b", replicas=1, gpu=0),
self.role(name="c", replicas=3, gpu=2),
auto_set_cuda_visible_devices_ids=["1", "2", "3", "4", "5", "6", "7"],
),
{
"a": [
"1",
],
"c": ["2,3", "4,5", "6,7"],
},
)

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=16)
def test_get_cuda_devices_is_set(self, _: MagicMock) -> None:
appdef = AppDef(
@@ -1085,6 +1226,54 @@ def test_get_cuda_devices_is_set(self, _: MagicMock) -> None:
self.assertEqual("4,5,6", role2_params[0].env[ENV_CUDA_VISIBLE_DEVICES])
self.assertEqual("7,8,9", role2_params[1].env[ENV_CUDA_VISIBLE_DEVICES])

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=16)
def test_get_cuda_devices_is_set_with_subset(self, _: MagicMock) -> None:
appdef = AppDef(
name="role1",
roles=[
Role(
name="role1",
image=self.test_dir,
entrypoint="train",
resource=Resource(gpu=2, cpu=0, memMB=0),
num_replicas=2,
),
Role(
name="role2",
image=self.test_dir,
entrypoint="train",
resource=Resource(gpu=3, cpu=0, memMB=0),
num_replicas=2,
),
],
)
popen_req = self.scheduler._to_popen_request(
appdef,
{
"auto_set_cuda_visible_devices": True,
"auto_set_cuda_visible_devices_ids": [
"0",
"1",
"2",
"3",
"5",
"7",
"9",
"11",
"13",
"15",
],
},
)
role1_params = popen_req.role_params["role1"]
self.assertEqual(2, len(role1_params))
self.assertEqual("0,1", role1_params[0].env[ENV_CUDA_VISIBLE_DEVICES])
self.assertEqual("2,3", role1_params[1].env[ENV_CUDA_VISIBLE_DEVICES])
role2_params = popen_req.role_params["role2"]
self.assertEqual(2, len(role2_params))
self.assertEqual("5,7,9", role2_params[0].env[ENV_CUDA_VISIBLE_DEVICES])
self.assertEqual("11,13,15", role2_params[1].env[ENV_CUDA_VISIBLE_DEVICES])

@patch(LOCAL_SCHED_DEVICE_COUNT, return_value=8)
def test_get_cuda_devices_not_set(self, _: MagicMock) -> None:
trainer1 = AppDef(
Expand Down
