diff --git a/hyakvnc/__main__.py b/hyakvnc/__main__.py index f5178b7..a7aeb6d 100644 --- a/hyakvnc/__main__.py +++ b/hyakvnc/__main__.py @@ -40,7 +40,7 @@ def get_apptainer_vnc_instances(read_apptainer_config: bool = False): for p, name_meta in running_hyakvnc_json_files.items(): with open(p, 'r') as f: d = json.load(f) - assert needed_keys <= d.keys(), f"Missing keys {needed_keys - d.keys()} in {jf}" + assert needed_keys <= d.keys(), f"Missing keys {needed_keys - d.keys()} in {d}" logOutPath = Path(d['logOutPath']).expanduser() if not logOutPath.exists(): @@ -51,11 +51,10 @@ def get_apptainer_vnc_instances(read_apptainer_config: bool = False): else: d['config'] = json.loads(base64.b64decode(d['config']).decode('utf-8')) - d['slurm_compute_node'] = slurm_compute_node = p.relative_to(app_dir).parts[0] d['slurm_job_id'] = name_meta['jobid'] - with open(logOutPath, 'r') as f: - logOutFile_contents = f.read() + with open(logOutPath, 'r') as lf: + logOutFile_contents = lf.read() rfbports = re.findall(r'\s+-rfbport\s+(?P\d+)\b', logOutFile_contents) if not rfbports: continue @@ -63,7 +62,7 @@ def get_apptainer_vnc_instances(read_apptainer_config: bool = False): vnc_port = rfbports[-1] vnc_log_file_paths = re.findall( - rf'(?m)Log file is\s*(?P.*[/]{d["slurm_compute_node"]}.*:{vnc_port}\.log)$', + rf'(?m)Log file is\s*(?P.*/{d["slurm_compute_node"]}.*:{vnc_port}\.log)$', logOutFile_contents) if not vnc_log_file_paths: continue @@ -125,7 +124,7 @@ def cmd_create(container_path): # needs to match rf'(?P{app_config.apptainer_instance_prefix})(?P\d+)-(?P.*)'): apptainer_instance_name = rf"{app_config.apptainer_instance_prefix}-\$SLURM_JOB_ID-{container_name}" - apptainer_cmd = apptainer_env_vars_string + rf"apptainer instance start {container_path} {container_name}" + apptainer_cmd = apptainer_env_vars_string + rf"apptainer instance start {container_path} {apptainer_instance_name}" apptainer_cmd_with_rest = rf"{apptainer_cmd} && while true; do sleep 10; done" cmds += ["--wrap", apptainer_cmd_with_rest] @@ -148,7 +147,7 @@ def cmd_create(container_path): logging.info("Waiting for job to start running") try: - state = wait_for_job_status(job_id, states={"RUNNING"}, timeout=app_config.sbatch_post_timeout, + wait_for_job_status(job_id, states=["RUNNING"], timeout=app_config.sbatch_post_timeout, poll_interval=app_config.sbatch_post_poll_interval) except TimeoutError: raise TimeoutError(f"Job {job_id} did not start running within {app_config.sbatch_post_timeout} seconds") @@ -200,8 +199,7 @@ def create_arg_parser(): parser_create.add_argument('-c', '--cpus', dest='cpus', metavar='', help='Subnode cpu count', default=1, type=int) parser_create.add_argument('-G', '--gpus', dest='gpus', metavar='[type:]', help='Subnode gpu count', - default="0" - type = str) + default="0", type = str) parser_create.add_argument('--mem', dest='mem', metavar='', help='Subnode memory amount with units', type=str) parser_create.add_argument('--container', dest='container', metavar='', @@ -223,7 +221,7 @@ def create_arg_parser(): arg_parser = create_arg_parser() -args = (arg_parser).parse_args() +args = arg_parser.parse_args() if args.debug: os.environ["HYAKVNC_LOG_LEVEL"] = "DEBUG" diff --git a/hyakvnc/config.py b/hyakvnc/config.py index 3a7957e..e1a7da8 100644 --- a/hyakvnc/config.py +++ b/hyakvnc/config.py @@ -1,5 +1,4 @@ import json -import json import logging import os from dataclasses import dataclass, asdict @@ -50,7 +49,6 @@ class HyakVncConfig: ssh_host = "klone.hyak.uw.edu" # intermediate host address between local machine and compute node # slurm attributes - ## sbatch environment variables account: Optional[str] = None # account to use for sbatch jobs | -A, --account, SBATCH_ACCOUNT partition: Optional[str] = None # partition to use for sbatch jobs | -p, --partition, SBATCH_PARTITION cluster: Optional[str] = None # cluster to use for sbatch jobs | --clusters, SBATCH_CLUSTERS @@ -88,7 +86,7 @@ def __post_init__(self) -> None: self.apptainer_env_vars["APPTAINER_WRITABLE_TMPFS"] = "1" if self.apptainer_use_writable_tmpfs else "0" if self.apptainer_cleanenv is not None: - self.apptainer_env_vars[["APPTAINER_CLEANENV"] = "1" if self.apptainer_cleanenv else "0" + self.apptainer_env_vars["APPTAINER_CLEANENV"] = "1" if self.apptainer_cleanenv else "0" if self.apptainer_set_bind_paths is not None: self.apptainer_env_vars["APPTAINER_BINDPATH"] = self.apptainer_set_bind_paths diff --git a/hyakvnc/slurmutil.py b/hyakvnc/slurmutil.py index fa5a803..2c0eb28 100755 --- a/hyakvnc/slurmutil.py +++ b/hyakvnc/slurmutil.py @@ -40,8 +40,9 @@ def get_default_account(user: Optional[str] = None, cluster: Optional[str] = Non raise LookupError(f"Could not find default account for user '{user}' on cluster '{cluster}'") -def get_partitions(user: Optional[str] = None, account: Optional[str] = None, cluster: Optional[str] = None) -> set[ - str]: +def get_partitions(user: Optional[str] = None, + account: Optional[str] = None, + cluster: Optional[str] = None) -> set[str]: """ Gets the SLURM partitions for the specified user and account on the specified cluster. @@ -89,7 +90,7 @@ def node_range_to_list(s: str) -> list[str]: :return: list of SLURM nodes :raises ValueError: if the node range could not be converted to a list of nodes """ - output = subproces.run(f"scontrol show hostnames {s}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + output = subprocess.run(f"scontrol show hostnames {s}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) if output.returncode != 0: raise ValueError(f"Could not convert node range '{s}' to list of nodes:\n{output.stderr}") return output.stdout.rstrip().splitlines() @@ -150,9 +151,11 @@ def from_squeue_line(line: str, field_order=None) -> "SlurmJob": return SlurmJob(**field_dict) -def get_job(user: Optional[str] = os.getlogin(), jobs: Optional[Union[int, list[int]]] = None, - cluster: Optional[str] = None, field_names: Optional[Container[str]] = None) -> Union[ - SlurmJob, list[SlurmJob], None]: +def get_job(jobs: Optional[Union[int, list[int]]] = None, + user: Optional[str] = os.getlogin(), + cluster: Optional[str] = None, + field_names: Optional[Container[str]] = None + ) -> Union[SlurmJob, list[SlurmJob], None]: """ Gets the specified slurm job(s). :param user: User to get jobs for @@ -177,7 +180,8 @@ def get_job(user: Optional[str] = os.getlogin(), jobs: Optional[Union[int, list[ cmds += ['--jobs', jobs] slurm_job_fields = [f for f in fields(SlurmJob) if f.name in field_names] - squeue_format_fields = [f.metadata.get("squeue_field", "") for f in slurm_job_fields].join() + assert len(slurm_job_fields) > 0, "Must specify at least one field to get for slurm jobs" + squeue_format_fields = ",".join([f.metadata.get("squeue_field", "") for f in slurm_job_fields]) cmds += ['--Format', squeue_format_fields] res = subprocess.run(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)