Skip to content

Commit

Permalink
Resolve run_opts before passing it to the workspace (pytorch#755) (py…
Browse files Browse the repository at this point in the history
…torch#755)

Summary:
Pull Request resolved: pytorch#755

We were resolving the default values for runopts during the dryrun call on the scheduler. This made the ops passed to the workspace not have the defaults correctly populated for workspace opts. This change also resolves the runopts in the runner dryrun and scheduler submit apis.

Haven't removed the runopts resolution in scheduler dry run here as a lot of other tests broke with it and it seems reasonable to also have runopts resolved for just the scheduler dryrun. The double resolving of runopts for the runner dryrun and scheduler submit cases shouldnt cause any meaningful differences.

There is a separate question on whether workspace should also be built during scheduler dryrun but that can be a follow up change.

Differential Revision: D48395915

fbshipit-source-id: c076e933fe3b8c64ff1d9a6b68c37815f73ab060
  • Loading branch information
manav-a authored and KPostOffice committed Sep 7, 2023
1 parent 9390301 commit e880542
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 8 deletions.
6 changes: 3 additions & 3 deletions torchx/runner/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def dryrun(
cfg = cfg or dict()
with log_event("dryrun", scheduler, runcfg=json.dumps(cfg) if cfg else None):
sched = self._scheduler(scheduler)

resolved_cfg = sched.run_opts().resolve(cfg)
if workspace and isinstance(sched, WorkspaceMixin):
role = app.roles[0]
old_img = role.image
Expand All @@ -366,7 +366,7 @@ def dryrun(
logger.info(
'To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.'
)
sched.build_workspace_and_update_role(role, workspace, cfg)
sched.build_workspace_and_update_role(role, workspace, resolved_cfg)

if old_img != role.image:
logger.info(
Expand All @@ -380,7 +380,7 @@ def dryrun(
)

sched._validate(app, scheduler)
dryrun_info = sched.submit_dryrun(app, cfg)
dryrun_info = sched.submit_dryrun(app, resolved_cfg)
dryrun_info._scheduler = scheduler
return dryrun_info

Expand Down
8 changes: 7 additions & 1 deletion torchx/runner/test/api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ def test_run(self, _) -> None:

def test_dryrun(self, _) -> None:
scheduler_mock = MagicMock()
scheduler_mock.run_opts.return_value.resolve.return_value = {
**self.cfg,
"foo": "bar",
}
with Runner(
name=SESSION_NAME,
scheduler_factories={"local_dir": lambda name: scheduler_mock},
Expand All @@ -127,7 +131,9 @@ def test_dryrun(self, _) -> None:
)
app = AppDef("name", roles=[role])
runner.dryrun(app, "local_dir", cfg=self.cfg)
scheduler_mock.submit_dryrun.assert_called_once_with(app, self.cfg)
scheduler_mock.submit_dryrun.assert_called_once_with(
app, {**self.cfg, "foo": "bar"}
)
scheduler_mock._validate.assert_called_once()

def test_dryrun_env_variables(self, _) -> None:
Expand Down
7 changes: 5 additions & 2 deletions torchx/schedulers/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,15 @@ def submit(
Returns:
The application id that uniquely identifies the submitted app.
"""
# pyre-fixme: Generic cfg type passed to resolve
resolved_cfg = self.run_opts().resolve(cfg)
if workspace:
sched = self
assert isinstance(sched, WorkspaceMixin)
role = app.roles[0]
sched.build_workspace_and_update_role(role, workspace, cfg)
dryrun_info = self.submit_dryrun(app, cfg)
sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
# pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
dryrun_info = self.submit_dryrun(app, resolved_cfg)
return self.schedule(dryrun_info)

@abc.abstractmethod
Expand Down
4 changes: 2 additions & 2 deletions torchx/schedulers/test/api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ def test_submit_workspace(self) -> None:

scheduler_mock = SchedulerTest.MockScheduler("test_session")

bad_type_cfg = {"foo": "asdf"}
scheduler_mock.submit(app, bad_type_cfg, workspace="some_workspace")
cfg = {"foo": "asdf"}
scheduler_mock.submit(app, cfg, workspace="some_workspace")
self.assertEqual(app.roles[0].image, "some_workspace")

def test_invalid_dryrun_cfg(self) -> None:
Expand Down

0 comments on commit e880542

Please sign in to comment.