Resolve run_opts before passing it to the workspace

Summary: We were resolving the default values for runopts during the dryrun call on the scheduler. This made the ops passed to the workspace not have the defaults correctly populated for workspace opts. This change also resolves the runopts in the runner dryrun and scheduler submit apis. Haven't removed the runopts resolution in scheduler dry run here as a lot of other tests broke with it and it seems reasonable to also have runopts resolved for just the scheduler dryrun. The double resolving of runopts for the runner dryrun and scheduler submit cases shouldnt cause any meaningful differences. There is a separate question on whether workspace should also be built during scheduler dryrun but that can be a follow up change. Differential Revision: D48395915 fbshipit-source-id: dad30eb6b31d7c7f475b92b5a7047abac59a1177
pytorch · Aug 16, 2023 · c037dba · c037dba
1 parent 744706b
commit c037dba
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 8 deletions.
diff --git a/torchx/runner/api.py b/torchx/runner/api.py
@@ -357,7 +357,7 @@ def dryrun(
         cfg = cfg or dict()
         with log_event("dryrun", scheduler, runcfg=json.dumps(cfg) if cfg else None):
             sched = self._scheduler(scheduler)
-
+            resolved_cfg = sched.run_opts().resolve(cfg)
             if workspace and isinstance(sched, WorkspaceMixin):
                 role = app.roles[0]
                 old_img = role.image
@@ -366,7 +366,7 @@ def dryrun(
                 logger.info(
                     'To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.'
                 )
-                sched.build_workspace_and_update_role(role, workspace, cfg)
+                sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
 
                 if old_img != role.image:
                     logger.info(
@@ -380,7 +380,7 @@ def dryrun(
                     )
 
             sched._validate(app, scheduler)
-            dryrun_info = sched.submit_dryrun(app, cfg)
+            dryrun_info = sched.submit_dryrun(app, resolved_cfg)
             dryrun_info._scheduler = scheduler
             return dryrun_info
 

diff --git a/torchx/runner/test/api_test.py b/torchx/runner/test/api_test.py
@@ -114,6 +114,10 @@ def test_run(self, _) -> None:
 
     def test_dryrun(self, _) -> None:
         scheduler_mock = MagicMock()
+        scheduler_mock.run_opts.return_value.resolve.return_value = {
+            **self.cfg,
+            "foo": "bar",
+        }
         with Runner(
             name=SESSION_NAME,
             scheduler_factories={"local_dir": lambda name: scheduler_mock},
@@ -127,7 +131,9 @@ def test_dryrun(self, _) -> None:
             )
             app = AppDef("name", roles=[role])
             runner.dryrun(app, "local_dir", cfg=self.cfg)
-            scheduler_mock.submit_dryrun.assert_called_once_with(app, self.cfg)
+            scheduler_mock.submit_dryrun.assert_called_once_with(
+                app, {**self.cfg, "foo": "bar"}
+            )
             scheduler_mock._validate.assert_called_once()
 
     def test_dryrun_env_variables(self, _) -> None:

diff --git a/torchx/schedulers/api.py b/torchx/schedulers/api.py
@@ -136,12 +136,14 @@ def submit(
         Returns:
             The application id that uniquely identifies the submitted app.
         """
+        # pyre-fixme: Generic cfg type passed to resolve
+        resolved_cfg = self.run_opts().resolve(cfg)
         if workspace:
             sched = self
             assert isinstance(sched, WorkspaceMixin)
             role = app.roles[0]
-            sched.build_workspace_and_update_role(role, workspace, cfg)
-        dryrun_info = self.submit_dryrun(app, cfg)
+            sched.build_workspace_and_update_role(role, workspace, resolved_cfg)
+        dryrun_info = self.submit_dryrun(app, resolved_cfg)
         return self.schedule(dryrun_info)
 
     @abc.abstractmethod

diff --git a/torchx/schedulers/test/api_test.py b/torchx/schedulers/test/api_test.py
@@ -108,8 +108,8 @@ def test_submit_workspace(self) -> None:
 
         scheduler_mock = SchedulerTest.MockScheduler("test_session")
 
-        bad_type_cfg = {"foo": "asdf"}
-        scheduler_mock.submit(app, bad_type_cfg, workspace="some_workspace")
+        cfg = {"foo": "asdf"}
+        scheduler_mock.submit(app, cfg, workspace="some_workspace")
         self.assertEqual(app.roles[0].image, "some_workspace")
 
     def test_invalid_dryrun_cfg(self) -> None: