Skip to content

Commit

Permalink
Use shlex throughout to improve shell lexing
Browse files (browse the repository at this point in the history)
  • Loading branch information
DonFreed committed Feb 14, 2024
1 parent 6f146d2 commit 61ec21b
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 37 deletions.
25 changes: 19 additions & 6 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -42,18 +42,31 @@ jobs:
run: |
curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf -
- name: Download model
run: curl -LO "https://s3.amazonaws.com/sentieon-release/other/DNAscopePacBio2.0.bundle"
run: |
curl -L "https://s3.amazonaws.com/sentieon-release/other/DNAscopePacBio2.1.bundle" \
> "DNAscope PacBio2.1.bundle"
- name: Smoke test
run: |
SENTIEON_AUTH_DATA=$(python3 .github/scripts/license_message.py encrypt --key "$ENCRYPTION_KEY" --message "$LICENSE_MESSAGE")
export SENTIEON_AUTH_DATA
. .venv/bin/activate
export PATH=$(pwd)/sentieon-genomics-$SENTIEON_VERSION/bin:$PATH
gzip -dc tests/smoke/ref.fa.bgz > tests/smoke/ref.fa
sentieon-cli dnascope-longread -r tests/smoke/ref.fa -i tests/smoke/sample.cram -m DNAscopePacBio2.0.bundle --repeat-model tests/smoke/sample_repeat.model -g output_hifi.vcf.gz
sentieon driver -r tests/smoke/ref.fa --algo GVCFtyper -v output_hifi.g.vcf.gz output_hifi_gvcftyper.vcf.gz
sentieon-cli dnascope-longread --tech ONT -r tests/smoke/ref.fa -i tests/smoke/sample.cram -m DNAscopePacBio2.0.bundle --repeat-model tests/smoke/sample_repeat.model -g output_ont.vcf.gz
sentieon driver -r tests/smoke/ref.fa --algo GVCFtyper -v output_ont.g.vcf.gz output_hifi_gvcftyper.vcf.gz
gzip -dc tests/smoke/ref.fa.bgz > "tests/smoke/r ef.fa"
mv tests/smoke/ref.fa.fai "tests/smoke/r ef.fa.fai"
mv tests/smoke/sample.cram "tests/smoke/sam ple.cram"
mv tests/smoke/sample.cram.crai "tests/smoke/sam ple.cram.crai"
sentieon-cli dnascope-longread -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model -g "output hifi.vcf.gz"
sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \
-v "output hifi.g.vcf.gz" output_hifi_gvcftyper.vcf.gz
sentieon-cli dnascope-longread --tech ONT -r "tests/smoke/r ef.fa" \
-i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
--repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz"
sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \
-v "output ont.g.vcf.gz" output_hifi_gvcftyper.vcf.gz
env:
SENTIEON_LICENSE: ${{ secrets.SENTIEON_LICENSE }}
SENTIEON_AUTH_MECH: "GitHub Actions - token"
Expand Down
17 changes: 9 additions & 8 deletions sentieon_cli/__init__.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import sys
import subprocess as sp
import pathlib
import shlex
import shutil
import tempfile
from typing import Callable, Optional, List
@@ -232,7 +233,7 @@ def dnascope_longread(
model=model_bundle.joinpath("diploid_model"),
)
)
run(" ".join(driver.build_cmd()))
run(shlex.join(driver.build_cmd()))

diploid_vcf = tmp_dir.joinpath("out_diploid.vcf.gz")
driver = cmds.Driver(
@@ -246,7 +247,7 @@ def dnascope_longread(
diploid_vcf,
)
)
run(" ".join(driver.build_cmd()))
run(shlex.join(driver.build_cmd()))

# Phasing and RepeatModel
phased_bed = tmp_dir.joinpath("out_diploid_phased.bed")
@@ -269,7 +270,7 @@ def dnascope_longread(
out_ext=phased_ext,
)
)
run(" ".join(driver.build_cmd()))
run(shlex.join(driver.build_cmd()))

if tech.upper() == "ONT":
run(
@@ -298,7 +299,7 @@ def dnascope_longread(
read_flag_mask="drop=supplementary",
)
)
run(" ".join(driver.build_cmd()))
shlex.join(driver.build_cmd())

run(
f"bcftools view -T {unphased_bed} {phased_vcf} \
@@ -337,7 +338,7 @@ def dnascope_longread(
pcr_indel_model=repeat_model,
)
)
run(" ".join(driver.build_cmd()))
run(shlex.join(driver.build_cmd()))

kwargs["gvcf_combine_py"] = str(
files("sentieon_cli.scripts").joinpath("gvcf_combine.py")
@@ -373,7 +374,7 @@ def dnascope_longread(
hap_vcf,
)
)
run(" ".join(driver.build_cmd()))
run(shlex.join(driver.build_cmd()))

# Second pass - unphased regions
diploid_unphased_hp = tmp_dir.joinpath(
@@ -393,7 +394,7 @@ def dnascope_longread(
pcr_indel_model=repeat_model,
)
)
run(" ".join(driver.build_cmd()))
run(shlex.join(driver.build_cmd()))

# Patch DNA and DNAHP variants
diploid_unphased_patch = tmp_dir.joinpath(
@@ -419,7 +420,7 @@ def dnascope_longread(
diploid_unphased,
)
)
run(" ".join(driver.build_cmd()))
run(shlex.join(driver.build_cmd()))

# merge calls to create the output
run(
Expand Down
90 changes: 67 additions & 23 deletions sentieon_cli/command_strings.py
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@

import io
import pathlib
import shlex
import typing
from typing import Any, Optional, List, Union, Dict
from .logging import get_logger
@@ -37,13 +38,13 @@ def build_cmd(self) -> List[str]:
elif isinstance(v, list):
for i in v:
cmd.append(f"--{k}")
cmd.append(f"'{i}'")
cmd.append(str(i))
elif isinstance(v, bool):
if v:
cmd.append(f"--{k}")
else:
cmd.append(f"--{k}")
cmd.append(f"'{v}'")
cmd.append(str(v))

if "output" in self.__dict__:
cmd.append(str(self.__dict__["output"]))
@@ -184,13 +185,13 @@ def build_cmd(self) -> List[str]:
elif isinstance(v, list):
for i in v:
cmd.append(f"--{k}")
cmd.append(f"'{i}'")
cmd.append(str(i))
elif isinstance(v, bool):
if v:
cmd.append(f"--{k}")
else:
cmd.append(f"--{k}")
cmd.append(f"'{v}'")
cmd.append(str(v))

for algo in self.algo:
cmd.extend(algo.build_cmd())
@@ -214,9 +215,15 @@ def cmd_bedtools_subtract(
for line in open(name(reference) + ".fai", encoding="utf-8"):
toks = line.strip().split("\t")
f.write(f"{toks[0]}\t0\t{toks[1]}\n")
cmd = f"bedtools subtract -a {regions_bed} -b {phased_bed} "
cmd += f"> {unphased_bed}"
return cmd
cmd = [
"bedtools",
"subtract",
"-a",
str(regions_bed),
"-b",
str(phased_bed),
]
return shlex.join(cmd) + ">" + shlex.quote(str(unphased_bed))


def name(path: typing.Union[str, io.TextIOWrapper, pathlib.Path]) -> str:
@@ -272,9 +279,20 @@ def cmd_pyexec_vcf_mod_patch(
) -> str:
"""Patch DNAscope and DNAscopeHP VCF files"""

cmd = f"sentieon pyexec {kwargs['vcf_mod_py']} -t {cores} "
cmd += f"patch --vcf {vcf} --vcf_hp {vcf_hp} {out_vcf}"
return cmd
cmd = [
"sentieon",
"pyexec",
str(kwargs["vcf_mod_py"]),
"-t",
str(cores),
"patch",
"--vcf",
str(vcf),
"--vcf_hp",
str(vcf_hp),
str(out_vcf),
]
return shlex.join(cmd)


def cmd_pyexec_gvcf_combine(
@@ -286,12 +304,25 @@ def cmd_pyexec_gvcf_combine(
) -> str:
"""Combine gVCF files"""

cmd = f"sentieon pyexec {kwargs['gvcf_combine_py']} -t {cores} "
cmd += f"{reference} {gvcf} {out_vcf} -"
cmd += " | sentieon util vcfconvert - " + out_vcf.replace(
".vcf.gz", ".g.vcf.gz"
)
return cmd
cmd1 = [
"sentieon",
"pyexec",
str(kwargs["gvcf_combine_py"]),
"-t",
str(cores),
str(reference),
gvcf,
out_vcf,
"-",
]
cmd2 = [
"sentieon",
"util",
"vcfconvert",
"-",
out_vcf.replace(".vcf.gz", ".g.vcf.gz"),
]
return shlex.join(cmd1) + "|" + shlex.join(cmd2)


def cmd_pyexec_vcf_mod_merge(
@@ -306,10 +337,23 @@ def cmd_pyexec_vcf_mod_merge(
) -> str:
"""Merge haploid VCF files"""

cmd = f"sentieon pyexec {kwargs['vcf_mod_py']} -t {cores} "
cmd += (
f"merge --hap1 {hap1_vcf} --hap2 {hap2_vcf} --unphased {unphased_vcf} "
)
cmd += f"--phased {phased_vcf} --bed {phased_bed} {out_vcf}"

return cmd
cmd = [
"sentieon",
"pyexec",
kwargs["vcf_mod_py"],
"-t",
str(cores),
"merge",
"--hap1",
hap1_vcf,
"--hap2",
hap2_vcf,
"--unphased",
unphased_vcf,
"--phased",
phased_vcf,
"--bed",
phased_bed,
out_vcf,
]
return shlex.join(cmd)

0 comments on commit 61ec21b

Please sign in to comment.