From 61ec21b1fc7f148d091e11df3672118c00c754ea Mon Sep 17 00:00:00 2001 From: DonFreed Date: Tue, 13 Feb 2024 21:49:17 -0800 Subject: [PATCH] Use shlex throughout to improve shell lexing --- .github/workflows/main.yml | 25 ++++++--- sentieon_cli/__init__.py | 17 ++++--- sentieon_cli/command_strings.py | 90 ++++++++++++++++++++++++--------- 3 files changed, 95 insertions(+), 37 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 522120a..4e94343 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -42,18 +42,31 @@ jobs: run: | curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf - - name: Download model - run: curl -LO "https://s3.amazonaws.com/sentieon-release/other/DNAscopePacBio2.0.bundle" + run: | + curl -L "https://s3.amazonaws.com/sentieon-release/other/DNAscopePacBio2.1.bundle" \ + > "DNAscope PacBio2.1.bundle" - name: Smoke test run: | SENTIEON_AUTH_DATA=$(python3 .github/scripts/license_message.py encrypt --key "$ENCRYPTION_KEY" --message "$LICENSE_MESSAGE") export SENTIEON_AUTH_DATA . 
.venv/bin/activate export PATH=$(pwd)/sentieon-genomics-$SENTIEON_VERSION/bin:$PATH - gzip -dc tests/smoke/ref.fa.bgz > tests/smoke/ref.fa - sentieon-cli dnascope-longread -r tests/smoke/ref.fa -i tests/smoke/sample.cram -m DNAscopePacBio2.0.bundle --repeat-model tests/smoke/sample_repeat.model -g output_hifi.vcf.gz - sentieon driver -r tests/smoke/ref.fa --algo GVCFtyper -v output_hifi.g.vcf.gz output_hifi_gvcftyper.vcf.gz - sentieon-cli dnascope-longread --tech ONT -r tests/smoke/ref.fa -i tests/smoke/sample.cram -m DNAscopePacBio2.0.bundle --repeat-model tests/smoke/sample_repeat.model -g output_ont.vcf.gz - sentieon driver -r tests/smoke/ref.fa --algo GVCFtyper -v output_ont.g.vcf.gz output_hifi_gvcftyper.vcf.gz + gzip -dc tests/smoke/ref.fa.bgz > "tests/smoke/r ef.fa" + mv tests/smoke/ref.fa.fai "tests/smoke/r ef.fa.fai" + mv tests/smoke/sample.cram "tests/smoke/sam ple.cram" + mv tests/smoke/sample.cram.crai "tests/smoke/sam ple.cram.crai" + + sentieon-cli dnascope-longread -r "tests/smoke/r ef.fa" \ + -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ + --repeat-model tests/smoke/sample_repeat.model -g "output hifi.vcf.gz" + sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \ + -v "output hifi.g.vcf.gz" output_hifi_gvcftyper.vcf.gz + + sentieon-cli dnascope-longread --tech ONT -r "tests/smoke/r ef.fa" \ + -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ + --repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz" + sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \ + -v "output ont.g.vcf.gz" output_hifi_gvcftyper.vcf.gz env: SENTIEON_LICENSE: ${{ secrets.SENTIEON_LICENSE }} SENTIEON_AUTH_MECH: "GitHub Actions - token" diff --git a/sentieon_cli/__init__.py b/sentieon_cli/__init__.py index c8e954d..7a22e97 100644 --- a/sentieon_cli/__init__.py +++ b/sentieon_cli/__init__.py @@ -4,6 +4,7 @@ import sys import subprocess as sp import pathlib +import shlex import shutil import tempfile from typing import 
Callable, Optional, List @@ -232,7 +233,7 @@ def dnascope_longread( model=model_bundle.joinpath("diploid_model"), ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) diploid_vcf = tmp_dir.joinpath("out_diploid.vcf.gz") driver = cmds.Driver( @@ -246,7 +247,7 @@ def dnascope_longread( diploid_vcf, ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) # Phasing and RepeatModel phased_bed = tmp_dir.joinpath("out_diploid_phased.bed") @@ -269,7 +270,7 @@ def dnascope_longread( out_ext=phased_ext, ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) if tech.upper() == "ONT": run( @@ -298,7 +299,7 @@ def dnascope_longread( read_flag_mask="drop=supplementary", ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) run( f"bcftools view -T {unphased_bed} {phased_vcf} \ @@ -337,7 +338,7 @@ def dnascope_longread( pcr_indel_model=repeat_model, ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) kwargs["gvcf_combine_py"] = str( files("sentieon_cli.scripts").joinpath("gvcf_combine.py") @@ -373,7 +374,7 @@ def dnascope_longread( hap_vcf, ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) # Second pass - unphased regions diploid_unphased_hp = tmp_dir.joinpath( @@ -393,7 +394,7 @@ def dnascope_longread( pcr_indel_model=repeat_model, ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) # Patch DNA and DNAHP variants diploid_unphased_patch = tmp_dir.joinpath( @@ -419,7 +420,7 @@ def dnascope_longread( diploid_unphased, ) ) - run(" ".join(driver.build_cmd())) + run(shlex.join(driver.build_cmd())) # merge calls to create the output run( diff --git a/sentieon_cli/command_strings.py b/sentieon_cli/command_strings.py index c889b69..644c3ec 100644 --- a/sentieon_cli/command_strings.py +++ b/sentieon_cli/command_strings.py @@ -13,6 +13,7 @@ import io import pathlib +import shlex import typing from typing import Any,
Optional, List, Union, Dict from .logging import get_logger @@ -37,13 +38,13 @@ def build_cmd(self) -> List[str]: elif isinstance(v, list): for i in v: cmd.append(f"--{k}") - cmd.append(f"'{i}'") + cmd.append(str(i)) elif isinstance(v, bool): if v: cmd.append(f"--{k}") else: cmd.append(f"--{k}") - cmd.append(f"'{v}'") + cmd.append(str(v)) if "output" in self.__dict__: cmd.append(str(self.__dict__["output"])) @@ -184,13 +185,13 @@ def build_cmd(self) -> List[str]: elif isinstance(v, list): for i in v: cmd.append(f"--{k}") - cmd.append(f"'{i}'") + cmd.append(str(i)) elif isinstance(v, bool): if v: cmd.append(f"--{k}") else: cmd.append(f"--{k}") - cmd.append(f"'{v}'") + cmd.append(str(v)) for algo in self.algo: cmd.extend(algo.build_cmd()) @@ -214,9 +215,15 @@ def cmd_bedtools_subtract( for line in open(name(reference) + ".fai", encoding="utf-8"): toks = line.strip().split("\t") f.write(f"{toks[0]}\t0\t{toks[1]}\n") - cmd = f"bedtools subtract -a {regions_bed} -b {phased_bed} " - cmd += f"> {unphased_bed}" - return cmd + cmd = [ + "bedtools", + "subtract", + "-a", + str(regions_bed), + "-b", + str(phased_bed), + ] + return shlex.join(cmd) + ">" + shlex.quote(str(unphased_bed)) def name(path: typing.Union[str, io.TextIOWrapper, pathlib.Path]) -> str: @@ -272,9 +279,20 @@ def cmd_pyexec_vcf_mod_patch( ) -> str: """Patch DNAscope and DNAscopeHP VCF files""" - cmd = f"sentieon pyexec {kwargs['vcf_mod_py']} -t {cores} " - cmd += f"patch --vcf {vcf} --vcf_hp {vcf_hp} {out_vcf}" - return cmd + cmd = [ + "sentieon", + "pyexec", + str(kwargs["vcf_mod_py"]), + "-t", + str(cores), + "patch", + "--vcf", + str(vcf), + "--vcf_hp", + str(vcf_hp), + str(out_vcf), + ] + return shlex.join(cmd) def cmd_pyexec_gvcf_combine( @@ -286,12 +304,25 @@ def cmd_pyexec_gvcf_combine( ) -> str: """Combine gVCF files""" - cmd = f"sentieon pyexec {kwargs['gvcf_combine_py']} -t {cores} " - cmd += f"{reference} {gvcf} {out_vcf} -" - cmd += " | sentieon util vcfconvert - " + out_vcf.replace( - 
".vcf.gz", ".g.vcf.gz" - ) - return cmd + cmd1 = [ + "sentieon", + "pyexec", + str(kwargs["gvcf_combine_py"]), + "-t", + str(cores), + str(reference), + gvcf, + out_vcf, + "-", + ] + cmd2 = [ + "sentieon", + "util", + "vcfconvert", + "-", + out_vcf.replace(".vcf.gz", ".g.vcf.gz"), + ] + return shlex.join(cmd1) + "|" + shlex.join(cmd2) def cmd_pyexec_vcf_mod_merge( @@ -306,10 +337,23 @@ def cmd_pyexec_vcf_mod_merge( ) -> str: """Merge haploid VCF files""" - cmd = f"sentieon pyexec {kwargs['vcf_mod_py']} -t {cores} " - cmd += ( - f"merge --hap1 {hap1_vcf} --hap2 {hap2_vcf} --unphased {unphased_vcf} " - ) - cmd += f"--phased {phased_vcf} --bed {phased_bed} {out_vcf}" - - return cmd + cmd = [ + "sentieon", + "pyexec", + kwargs["vcf_mod_py"], + "-t", + str(cores), + "merge", + "--hap1", + hap1_vcf, + "--hap2", + hap2_vcf, + "--unphased", + unphased_vcf, + "--phased", + phased_vcf, + "--bed", + phased_bed, + out_vcf, + ] + return shlex.join(cmd)