Skip to content

Commit

Permalink
Merge pull request #254 from lexming/deadstart
Browse files Browse the repository at this point in the history
poll job status while waiting for single-user server to be reachable
  • Loading branch information
consideRatio authored Mar 19, 2024
2 parents 5f84c9c + 451cf0a commit 3a65827
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 24 deletions.
7 changes: 7 additions & 0 deletions batchspawner/batchspawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,13 @@ async def start(self):
# don't actually run the single-user server yet.
if hasattr(self, "mock_port"):
self.port = self.mock_port
# Check if job is still running
status = await self.poll()
if status:
raise RuntimeError(
"The Jupyter batch job started"
" but died before launching the single-user server."
)

self.db.commit()
self.log.info(
Expand Down
64 changes: 40 additions & 24 deletions batchspawner/tests/test_spawners.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,20 +349,19 @@ async def test_torque(db, event_loop):
re.compile(r"ppn=5"),
re.compile(r"^#PBS some_option_asdf", re.M),
]
poll_running = (
re.compile(r"sudo.*qstat"),
f"<job_state>R</job_state><exec_host>{testhost}/1</exec_host>",
)
script = [
(re.compile(r"sudo.*qsub"), str(testjob)),
(
re.compile(r"sudo.*qstat"),
"<job_state>Q</job_state><exec_host></exec_host>",
), # pending
(
re.compile(r"sudo.*qstat"),
f"<job_state>R</job_state><exec_host>{testhost}/1</exec_host>",
), # running
(
re.compile(r"sudo.*qstat"),
f"<job_state>R</job_state><exec_host>{testhost}/1</exec_host>",
), # running
poll_running,
poll_running,
poll_running,
(re.compile(r"sudo.*qdel"), "STOP"),
(re.compile(r"sudo.*qstat"), ""),
]
Expand Down Expand Up @@ -394,17 +393,16 @@ async def test_moab(db, event_loop):
re.compile(r"ppn=5"),
re.compile(r"^#PBS some_option_asdf", re.M),
]
poll_running = (
re.compile(r"sudo.*mdiag"),
f'State="Running" AllocNodeList="{testhost}"',
)
script = [
(re.compile(r"sudo.*msub"), str(testjob)),
(re.compile(r"sudo.*mdiag"), 'State="Idle"'), # pending
(
re.compile(r"sudo.*mdiag"),
f'State="Running" AllocNodeList="{testhost}"',
), # running
(
re.compile(r"sudo.*mdiag"),
f'State="Running" AllocNodeList="{testhost}"',
), # running
poll_running,
poll_running,
poll_running,
(re.compile(r"sudo.*mjobctl.*-c"), "STOP"),
(re.compile(r"sudo.*mdiag"), ""),
]
Expand Down Expand Up @@ -436,17 +434,16 @@ async def test_pbs(db, event_loop):
re.compile(r"@some_pbs_admin_node"),
re.compile(r"^#PBS some_option_asdf", re.M),
]
poll_running = (
re.compile(r"sudo.*qstat"),
f"job_state = R\nexec_host = {testhost}/2*1",
)
script = [
(re.compile(r"sudo.*qsub"), str(testjob)),
(re.compile(r"sudo.*qstat"), "job_state = Q"), # pending
(
re.compile(r"sudo.*qstat"),
f"job_state = R\nexec_host = {testhost}/2*1",
), # running
(
re.compile(r"sudo.*qstat"),
f"job_state = R\nexec_host = {testhost}/2*1",
), # running
poll_running,
poll_running,
poll_running,
(re.compile(r"sudo.*qdel"), "STOP"),
(re.compile(r"sudo.*qstat"), ""),
]
Expand Down Expand Up @@ -504,6 +501,7 @@ async def test_slurm(db, event_loop):
), # unknown
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost), # running
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
(re.compile(r"sudo.*scancel"), "STOP"),
(re.compile(r"sudo.*squeue"), ""),
]
Expand Down Expand Up @@ -573,6 +571,7 @@ async def test_condor(db, event_loop):
(re.compile(r"sudo.*condor_q"), "1,"), # pending
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"), # running
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
(re.compile(r"sudo.*condor_rm"), "STOP"),
(re.compile(r"sudo.*condor_q"), ""),
]
Expand Down Expand Up @@ -611,6 +610,7 @@ async def test_lfs(db, event_loop):
(re.compile(r"sudo.*bjobs"), "PEND "), # pending
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"), # running
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
(re.compile(r"sudo.*bkill"), "STOP"),
(re.compile(r"sudo.*bjobs"), ""),
]
Expand Down Expand Up @@ -652,3 +652,19 @@ async def test_keepvars(db, event_loop):
spawner_kwargs=spawner_kwargs,
batch_script_re_list=batch_script_re_list,
)


async def test_early_stop(db, event_loop):
    """Expect a RuntimeError when the batch job vanishes while starting.

    The scripted scheduler replies go from pending, to an unknown status
    (controller unreachable), to an empty queue listing. The spawn is
    expected to fail with an error matching "job has disappeared".
    """
    squeue_cmd = re.compile(r"sudo.*squeue")
    queue_replies = (
        "PENDING ",  # pending
        "slurm_load_jobs error: Unable to contact slurm controller",  # unknown
        "",  # job exits early during start
    )

    # Submission first, then the successive queue polls, then the cancel.
    script = [(re.compile(r"sudo.*sbatch"), str(testjob))]
    script.extend((squeue_cmd, reply) for reply in queue_replies)
    script.append((re.compile(r"sudo.*scancel"), "STOP"))

    with pytest.raises(RuntimeError, match="job has disappeared"):
        await run_spawner_script(db, SlurmSpawner, script)

0 comments on commit 3a65827

Please sign in to comment.