diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b698a7..5e15497 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,7 @@ jobs: - name: Run mypy run: poetry run mypy . - name: Run flake8 - run: poetry run flake8 . --ignore E231,E221 --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py # false+ from python 3.12 + run: poetry run flake8 . --extend-ignore E231,E221, --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py # false+ from python 3.12 - name: Run the automated tests (for example) run: poetry run pytest -v - name: Run doct tests diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4e94343..ed9e1db 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -5,16 +5,15 @@ on: - main pull_request: -env: - SENTIEON_VERSION: "202308.01" - jobs: smoke: strategy: fail-fast: true + max-parallel: 1 matrix: - python-version: ["3.12"] + python-version: ["3.8", "3.12"] poetry-version: ["1.5.1"] + sentieon-version: ["202308.01", "202308.02"] os: [ubuntu-22.04] #, macos-latest, windows-latest] runs-on: ${{ matrix.os }} steps: @@ -34,6 +33,11 @@ jobs: run: | sudo apt-get update sudo apt-get install -y bcftools + - name: Install samtools + run: | + curl -L https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 | tar -jxf - + cd samtools-1.19.2 + ./configure && sudo make install - name: Install bedtools run: | sudo curl -L -o /usr/local/bin/bedtools "https://github.com/arq5x/bedtools2/releases/download/v2.31.0/bedtools.static" @@ -41,6 +45,8 @@ jobs: - name: Install sentieon run: | curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf - + env: + SENTIEON_VERSION: ${{ matrix.sentieon-version }} - name: Download model run: | curl -L 
"https://s3.amazonaws.com/sentieon-release/other/DNAscopePacBio2.1.bundle" \ @@ -56,20 +62,50 @@ jobs: mv tests/smoke/sample.cram "tests/smoke/sam ple.cram" mv tests/smoke/sample.cram.crai "tests/smoke/sam ple.cram.crai" - sentieon-cli dnascope-longread -r "tests/smoke/r ef.fa" \ + sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ - --repeat-model tests/smoke/sample_repeat.model -g "output hifi.vcf.gz" - sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \ + --repeat-model tests/smoke/sample_repeat.model -g \ + -b "tests/smoke/diploid.bed" \ + --haploid-bed "tests/smoke/haploid.bed" \ + "output hifi.vcf.gz" + sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \ -v "output hifi.g.vcf.gz" output_hifi_gvcftyper.vcf.gz + if [ ! -f "output hifi.sv.vcf.gz" -o ! -f "output hifi.haploid.vcf.gz" ]; then + exit 1 + fi - sentieon-cli dnascope-longread --tech ONT -r "tests/smoke/r ef.fa" \ + sentieon-cli dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \ -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ --repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz" - sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \ + sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \ -v "output ont.g.vcf.gz" output_hifi_gvcftyper.vcf.gz + if [ ! -f "output ont.sv.vcf.gz" ]; then + exit 1 + fi + + sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ + -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \ + --repeat-model tests/smoke/sample_repeat.model --align \ + --input_ref "tests/smoke/r ef.fa" "output realigned.vcf.gz" + if [ ! -f "output realigned.vcf.gz" -o ! 
-f "output realigned_mm2_sorted_0.cram" ]; then + exit 1 + fi + + samtools fastq --reference "tests/smoke/r ef.fa" \ + "tests/smoke/sam ple.cram" | \ + gzip -c > sample.fq.gz + sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \ + --fastq sample.fq.gz --readgroups '@RG\tID:sample-1\tSM:sample' \ + -m "DNAscope PacBio2.1.bundle" \ + --repeat-model tests/smoke/sample_repeat.model "output fq.vcf.gz" + if [ ! -f "output fq.vcf.gz" -o ! -f "output fq_mm2_sorted_fq_0.cram" -o ! -f "output fq.sv.vcf.gz" ]; then + exit 1 + fi + env: SENTIEON_LICENSE: ${{ secrets.SENTIEON_LICENSE }} SENTIEON_AUTH_MECH: "GitHub Actions - token" ENCRYPTION_KEY: ${{ secrets.ENCRYPTION_KEY }} LICENSE_MESSAGE: ${{ secrets.LICENSE_MESSAGE }} + SENTIEON_VERSION: ${{ matrix.sentieon-version }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4f36f66..41568ce 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,5 @@ __pycache__/ dist/ .vscode/ -poetry.lock secrets.env .*.swp \ No newline at end of file diff --git a/README.md b/README.md index eeff9a1..3dd322f 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,16 @@ A command-line interface for the Sentieon software ## Setup -`sentieon-cli` uses [poetry](https://pypi.org/project/poetry/) for packaging and dependency management. Initially, you will need to install poetry into your environment: +Create a new python virtual environment for the project, if needed: +``` +# Create a new venv, if needed +python3 -m venv /path/to/new/virtual/environment/sentieon_cli + +# Activate the venv +source /path/to/new/virtual/environment/sentieon_cli/bin/activate +``` + +`sentieon-cli` uses [poetry](https://pypi.org/project/poetry/) for packaging and dependency management. Initially, you will need to install poetry: ``` pip install poetry ``` @@ -17,19 +26,13 @@ git clone https://github.com/sentieon/sentieon-cli.git cd sentieon-cli ``` -Use poetry to install the `sentieon-cli` into a virtualenv. 
+Use poetry to install the `sentieon-cli` into the virtual environment: ``` poetry install ``` -You can then run commands through poetry. -``` -poetry run sentieon-cli ... -``` - -Alternatively, you can use `poetry shell` to activate the environment with the `sentieon-cli`. +You can then run commands from the virtual environment: ``` -poetry shell sentieon-cli ... ``` diff --git a/data/hg38_diploid.bed b/data/hg38_diploid.bed new file mode 100644 index 0000000..932f4cf --- /dev/null +++ b/data/hg38_diploid.bed @@ -0,0 +1,24 @@ +chr1 1 248956422 +chr2 1 242193529 +chr3 1 198295559 +chr4 1 190214555 +chr5 1 181538259 +chr6 1 170805979 +chr7 1 159345973 +chr8 1 145138636 +chr9 1 138394717 +chr10 1 133797422 +chr11 1 135086622 +chr12 1 133275309 +chr13 1 114364328 +chr14 1 107043718 +chr15 1 101991189 +chr16 1 90338345 +chr17 1 83257441 +chr18 1 80373285 +chr19 1 58617616 +chr20 1 64444167 +chr21 1 46709983 +chr22 1 50818468 +chrX 1 2781479 +chrX 155701383 156040895 \ No newline at end of file diff --git a/data/hg38_haploid.bed b/data/hg38_haploid.bed new file mode 100644 index 0000000..a3f011f --- /dev/null +++ b/data/hg38_haploid.bed @@ -0,0 +1,2 @@ +chrX 2781480 155701383 +chrY 2781480 56887903 \ No newline at end of file diff --git a/docs/dnascope-longread.md b/docs/dnascope-longread.md index bf9a96f..5b68719 100644 --- a/docs/dnascope-longread.md +++ b/docs/dnascope-longread.md @@ -1,25 +1,17 @@ # DNAscope LongRead -Sentieon DNAscope LongRead is a pipeline for calling germline SNVs and indels from long-read sequence data. The DNAscope LongRead pipeline is able to take advantage of longer read lengths to perform quick and accurate variant calling using specially calibrated machine learning models. +Sentieon DNAscope LongRead is a pipeline for alignment and germline variant calling (SNVs, SVs, and indels) from long-read sequence data. 
The DNAscope LongRead pipeline is able to take advantage of longer read lengths to perform quick and accurate variant calling using specially calibrated machine learning models. -This folder provides a shell script implementing the DNAscope LongRead pipeline along with Python scripts for VCF manipulation. The pipeline expects aligned reads as input in BAM or CRAM format and will output variants in the VCF (and gVCF) formats. +This folder provides a shell script implementing the DNAscope LongRead pipeline. The pipeline will accept as input aligned reads in BAM or CRAM format, or un-aligned data in FASTQ, uBAM, or uCRAM format. The pipeline will output variants in the VCF (or gVCF) formats and aligned reads in BAM or CRAM formats. DNAscope LongRead is implemented using the Sentieon software package, which requires a valid license for use. Please contact info@sentieon.com for access to the Sentieon software and an evaluation license. ## Prerequisites -In order to run the pipeline, you need to use the Sentieon software package version 202308 or higher. The pipeline also requires [Python] versions >2.7 or >3.3, [bcftools] version 1.10 or higher, and [bedtools]. The `sentieon`, `python`, `bcftools`, and `bedtools` executables will be accessed through the user's `PATH` environment variable. +In order to run the pipeline, you need to use the Sentieon software package version 202308.01 or higher and [Python] version >3.8. SNV and indel calling additionally requires [bcftools] version 1.10 or higher and [bedtools]. Alignment of read data in uBAM or uCRAM or re-alignment of read data additionally requires [samtools] version 1.16 or higher. The `sentieon`, `python`, `bcftools`, `bedtools`, and `samtools` executables will be accessed through the user's `PATH` environment variable. ## Input data requirements -### Aligned reads - PacBio HiFi - -The pipeline will accept PacBio HiFi long reads that have been aligned to the reference genome with `minimap2` or `pbmm2`. 
When aligning reads with Sentieon minimap2, using the Sentieon model for HiFi reads is recommended. When aligning reads with the open-source minimap2, `-x map-hifi` is recommended. - -### Aligned reads - ONT - -The pipeline will accept Oxford Nanopore (ONT) long reads that have been aligned to the reference genome with `minimap2`. When aligning reads with Sentieon minimap2, using the Sentieon model for ONT is recommended. When aligning reads with the open-source minimap2, `-x map-ont` is recommended. - ### The Reference genome DNAscope LongRead will call variants present in the sample relative to a high quality reference genome sequence. Besides the reference genome file, a samtools fasta index file (.fai) needs to be present. We recommend aligning to a reference genome without alternate contigs. @@ -27,42 +19,62 @@ DNAscope LongRead will call variants present in the sample relative to a high qu ## Usage -A single command is run to call variants from PacBio HiFi reads: +A single command is run to call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the uBAM, uCRAM, BAM, or CRAM formats: ```sh -sentieon-cli dnascope-longread [-h] -r REFERENCE -i SAMPLE_INPUT -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] [-t NUMBER_THREADS] [-g] [--] VARIANT_VCF +sentieon-cli dnascope-longread [-h] -r REFERENCE -i SAMPLE_INPUT \ + -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] [-t NUMBER_THREADS] [-g] \ + --tech HiFi|ONT [--haploid-bed HAPLOID_BED] [--] sample.vcf.gz ``` -A single command is run to call variants from ONT long reads: +With uBAM, uCRAM, BAM, or CRAM input, the read data can be re-aligned to the reference genome using minimap2 using the `--align` argument. 
+ +A single command is run to call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the FASTQ format: ```sh -sentieon-cli dnascope-longread --tech ONT [-h] -r REFERENCE -i SAMPLE_INPUT -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] [-t NUMBER_THREADS] [-g] [--] VARIANT_VCF +sentieon-cli dnascope-longread [-h] -r REFERENCE --fastq INPUT_FASTQ \ + --readgroups READGROUP -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] \ + [-t NUMBER_THREADS] [-g] --tech HiFi|ONT [--haploid-bed HAPLOID_BED] [--] \ + sample.vcf.gz ``` The Sentieon LongRead pipeline requires the following arguments: - `-r REFERENCE`: the location of the reference FASTA file. You should make sure that the reference is the same as the one used in the mapping stage. A reference fasta index, ".fai" file, is also required. -- `-i SAMPLE_INPUT`: the input sample file in BAM or CRAM format. One or more files can be suppied by repeating the `-i` argument. -- `-m MODEL_BUNDLE`: the location of the model bundle. -- `--tech ONT`: Sequencing technology used to generate the reads. Supported arguments are `ONT` or `HiFi`. +- `-i SAMPLE_INPUT`: the input sample file in BAM or CRAM format. One or more files can be supplied by passing multiple files after the `-i` argument. +- `--fastq INPUT_FASTQ`: the input sample file in FASTQ format. One or more files can be supplied by passing multiple files after the `--fastq` argument. +- `--readgroups READGROUP`: readgroup information for the read data. The `--readgroups` argument is required if the input data is in the fastq format. This argument expects complete readgroup strings and these strings will be passed to `minimap2` through the `-R` argument. An example argument is `--readgroups '@RG\tID:foo\tSM:bar'`. +- `-m MODEL_BUNDLE`: the location of the model bundle. Model bundle files can be found in the [sentieon-models] repository. +- `--tech HiFi|ONT`: Sequencing technology used to generate the reads. Supported arguments are `ONT` or `HiFi`. 
The Sentieon LongRead pipeline accepts the following optional arguments: - `-d DBSNP`: the location of the Single Nucleotide Polymorphism database (dbSNP) used to label known variants in VCF (`.vcf`) or bgzip compressed VCF (`.vcf.gz`) format. Only one file is supported. Supplying this file will annotate variants with their dbSNP refSNP ID numbers. A VCF index file is required. -- `-b DIPLOID_BED`: interval in the reference to restrict variant calling, in BED file format. Supplying this file will limit variant calling to the intervals inside the BED file. +- `-b DIPLOID_BED`: interval in the reference to restrict diploid variant calling, in BED file format. Supplying this file will limit diploid variant calling to the intervals inside the BED file. +- `--haploid-bed HAPLOID_BED`: interval in the reference to restrict haploid variant calling, in BED file format. Supplying this file will perform haploid variant calling across the intervals inside the BED file. - `-t NUMBER_THREADS`: number of computing threads that will be used by the software to run parallel processes. The argument is optional; if omitted, the pipeline will use as many threads as the server has. - `-g`: output variants in the gVCF format, in addition to the VCF output file. The tool will output a bgzip compressed gVCF file with a corresponding index file. +- `--align`: re-align the input read data to the reference genome using Sentieon minimap2. This argument needs to be passed to obtain useful output from uBAM or uCRAM input. - `-h`: print the command-line help and exit. - `--dry-run`: print the pipeline commands, but do not actually execute them. The Sentieon LongRead pipeline requires the following positional arguments: -- `VARIANT_VCF`: the location and filename of the variant calling output. The tool will output a bgzip compressed VCF file with a corresponding index file. +- `sample.vcf.gz`: the location and filename of the variant calling output. 
The filename must end with the suffix, `.vcf.gz`. The tool will output a bgzip compressed VCF file with a corresponding index file. ## Pipeline output The Sentieon LongRead pipeline will output a bgzip compressed file (.vcf.gz) containing variant calls in the standard VCFv4.2 format along with a tabix index file (.vcf.gz.tbi). If the `-g` option is used, the pipeline will also output a bgzip compressed file (.g.vcf.gz) containing variant calls in the gVCF format along with a tabix index file (.g.vcf.gz.tbi). +With the `--haploid-bed HAPLOID_BED` argument, the pipeline will create the following output files: +- `sample.vcf.gz`: SNV and indel variant calls across the diploid regions of the genome (as defined in the `-d DIPLOID_BED` file). +- `sample.haploid.vcf.gz`: SNV and indel variant calls across the haploid regions of the genome (as defined in the `--haploid-bed HAPLOID_BED` file). +- `sample.sv.vcf.gz`: structural variant calls from the Sentieon LongReadSV tool. + +With the `--align` argument or fastq input, the software will also output a CRAM or BAM file for each input file. + ## Other considerations -### Diploid variant calling +### Diploid and haploid variant calling + +The default pipeline is recommended for use with samples from diploid organisms. For samples with both diploid and haploid chromosomes, the `-b DIPLOID_BED` option can be used to limit diploid variant calling to diploid chromosomes and the `--haploid-bed HAPLOID_BED` argument can be used to perform haploid variant calling across haploid chromosomes. Diploid and haploid variants will be output to separate VCF files. -Currently, the pipeline is only recommended for use with samples from diploid organisms. For samples with both diploid and haploid chromosomes, the `-b DIPLOID_BED` option can be used to limit variant calling to diploid chromosomes. +Example diploid and haploid BED files can be found in the [/data](/data) folder in this repository. 
### Modification @@ -77,5 +89,7 @@ The Python scripts in the `sentieon_cli/scripts`` folder perform low-level manip [Python]: https://www.python.org/ [bcftools]: http://samtools.github.io/bcftools/bcftools.html [bedtools]: https://bedtools.readthedocs.io/en/latest/ +[samtools]: https://www.htslib.org/ +[sentieon-models]: https://github.com/Sentieon/sentieon-models [Sentieon DNAscope LongRead – A highly Accurate, Fast, and Efficient Pipeline for Germline Variant Calling from PacBio HiFi reads]: https://www.biorxiv.org/content/10.1101/2022.06.01.494452v1 \ No newline at end of file diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..bb06680 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,367 @@ +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. + +[[package]] +name = "argh" +version = "0.29.4" +description = "An unobtrusive argparse wrapper with natural syntax" +optional = false +python-versions = ">=3.8" +files = [ + {file = "argh-0.29.4-py3-none-any.whl", hash = "sha256:f84adac68095a8a038d5c0affcd1dc919f0d06dbdac45686ce45202b9c637a6e"}, + {file = "argh-0.29.4.tar.gz", hash = "sha256:695c0ae4534270cae2697841b4a56f434a990694a00264ea10ebbbcdc02c13f7"}, +] + +[package.extras] +completion = ["argcomplete (>=2.0)"] +docs = ["readthedocs-sphinx-search (==0.2.0)", "sphinx (>=6.1)", "sphinx-pyproject (==0.1.0)", "sphinx_rtd_theme (>=1.2.0)"] +linters = ["pre-commit (>=3.0.4)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)", "tox (>=4.4)"] + +[[package]] +name = "black" +version = "23.10.1" +description = "The uncompromising code formatter." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "black-23.10.1-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:ec3f8e6234c4e46ff9e16d9ae96f4ef69fa328bb4ad08198c8cee45bb1f08c69"}, + {file = "black-23.10.1-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:1b917a2aa020ca600483a7b340c165970b26e9029067f019e3755b56e8dd5916"}, + {file = "black-23.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c74de4c77b849e6359c6f01987e94873c707098322b91490d24296f66d067dc"}, + {file = "black-23.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:7b4d10b0f016616a0d93d24a448100adf1699712fb7a4efd0e2c32bbb219b173"}, + {file = "black-23.10.1-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:b15b75fc53a2fbcac8a87d3e20f69874d161beef13954747e053bca7a1ce53a0"}, + {file = "black-23.10.1-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:e293e4c2f4a992b980032bbd62df07c1bcff82d6964d6c9496f2cd726e246ace"}, + {file = "black-23.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d56124b7a61d092cb52cce34182a5280e160e6aff3137172a68c2c2c4b76bcb"}, + {file = "black-23.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:3f157a8945a7b2d424da3335f7ace89c14a3b0625e6593d21139c2d8214d55ce"}, + {file = "black-23.10.1-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:cfcce6f0a384d0da692119f2d72d79ed07c7159879d0bb1bb32d2e443382bf3a"}, + {file = "black-23.10.1-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:33d40f5b06be80c1bbce17b173cda17994fbad096ce60eb22054da021bf933d1"}, + {file = "black-23.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:840015166dbdfbc47992871325799fd2dc0dcf9395e401ada6d88fe11498abad"}, + {file = "black-23.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:037e9b4664cafda5f025a1728c50a9e9aedb99a759c89f760bd83730e76ba884"}, + {file = "black-23.10.1-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:7cb5936e686e782fddb1c73f8aa6f459e1ad38a6a7b0e54b403f1f05a1507ee9"}, + {file = 
"black-23.10.1-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:7670242e90dc129c539e9ca17665e39a146a761e681805c54fbd86015c7c84f7"}, + {file = "black-23.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed45ac9a613fb52dad3b61c8dea2ec9510bf3108d4db88422bacc7d1ba1243d"}, + {file = "black-23.10.1-cp39-cp39-win_amd64.whl", hash = "sha256:6d23d7822140e3fef190734216cefb262521789367fbdc0b3f22af6744058982"}, + {file = "black-23.10.1-py3-none-any.whl", hash = "sha256:d431e6739f727bb2e0495df64a6c7a5310758e87505f5f8cde9ff6c0f2d7e4fe"}, + {file = "black-23.10.1.tar.gz", hash = "sha256:1f8ce316753428ff68749c65a5f7844631aa18c8679dfd3ca9dc1a289979c258"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "colorlog" +version = "6.7.0" +description = "Add colours to the output of Python's logging module." +optional = false +python-versions = ">=3.6" +files = [ + {file = "colorlog-6.7.0-py2.py3-none-any.whl", hash = "sha256:0d33ca236784a1ba3ff9c532d4964126d8a2c44f1f0cb1d2b0728196f512f662"}, + {file = "colorlog-6.7.0.tar.gz", hash = "sha256:bd94bd21c1e13fac7bd3153f4bc3a7dc0eb0974b8bc2fdf1a989e474f6e582e5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} + +[package.extras] +development = ["black", "flake8", "mypy", "pytest", "types-colorama"] + +[[package]] +name = "exceptiongroup" +version = "1.1.3" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, + {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "flake8" +version = "5.0.4" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, + {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.9.0,<2.10.0" +pyflakes = 
">=2.5.0,<2.6.0" + +[[package]] +name = "importlib-resources" +version = "5.13.0" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_resources-5.13.0-py3-none-any.whl", hash = "sha256:9f7bd0c97b79972a6cce36a366356d16d5e13b09679c11a58f1014bfdf8e64b2"}, + {file = "importlib_resources-5.13.0.tar.gz", hash = "sha256:82d5c6cca930697dbbd86c93333bb2c2e72861d4789a11c2662b933e5ad2b528"}, +] + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + +[[package]] +name = "mypy" +version = "1.6.1" +description = "Optional static typing for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mypy-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e5012e5cc2ac628177eaac0e83d622b2dd499e28253d4107a08ecc59ede3fc2c"}, + {file = 
"mypy-1.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d8fbb68711905f8912e5af474ca8b78d077447d8f3918997fecbf26943ff3cbb"}, + {file = "mypy-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a1ad938fee7d2d96ca666c77b7c494c3c5bd88dff792220e1afbebb2925b5e"}, + {file = "mypy-1.6.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b96ae2c1279d1065413965c607712006205a9ac541895004a1e0d4f281f2ff9f"}, + {file = "mypy-1.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:40b1844d2e8b232ed92e50a4bd11c48d2daa351f9deee6c194b83bf03e418b0c"}, + {file = "mypy-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:81af8adaa5e3099469e7623436881eff6b3b06db5ef75e6f5b6d4871263547e5"}, + {file = "mypy-1.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8c223fa57cb154c7eab5156856c231c3f5eace1e0bed9b32a24696b7ba3c3245"}, + {file = "mypy-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8032e00ce71c3ceb93eeba63963b864bf635a18f6c0c12da6c13c450eedb183"}, + {file = "mypy-1.6.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4c46b51de523817a0045b150ed11b56f9fff55f12b9edd0f3ed35b15a2809de0"}, + {file = "mypy-1.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:19f905bcfd9e167159b3d63ecd8cb5e696151c3e59a1742e79bc3bcb540c42c7"}, + {file = "mypy-1.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:82e469518d3e9a321912955cc702d418773a2fd1e91c651280a1bda10622f02f"}, + {file = "mypy-1.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d4473c22cc296425bbbce7e9429588e76e05bc7342da359d6520b6427bf76660"}, + {file = "mypy-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59a0d7d24dfb26729e0a068639a6ce3500e31d6655df8557156c51c1cb874ce7"}, + {file = "mypy-1.6.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cfd13d47b29ed3bbaafaff7d8b21e90d827631afda134836962011acb5904b71"}, + {file = "mypy-1.6.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:eb4f18589d196a4cbe5290b435d135dee96567e07c2b2d43b5c4621b6501531a"}, + {file = "mypy-1.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:41697773aa0bf53ff917aa077e2cde7aa50254f28750f9b88884acea38a16169"}, + {file = "mypy-1.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7274b0c57737bd3476d2229c6389b2ec9eefeb090bbaf77777e9d6b1b5a9d143"}, + {file = "mypy-1.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbaf4662e498c8c2e352da5f5bca5ab29d378895fa2d980630656178bd607c46"}, + {file = "mypy-1.6.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bb8ccb4724f7d8601938571bf3f24da0da791fe2db7be3d9e79849cb64e0ae85"}, + {file = "mypy-1.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:68351911e85145f582b5aa6cd9ad666c8958bcae897a1bfda8f4940472463c45"}, + {file = "mypy-1.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:49ae115da099dcc0922a7a895c1eec82c1518109ea5c162ed50e3b3594c71208"}, + {file = "mypy-1.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8b27958f8c76bed8edaa63da0739d76e4e9ad4ed325c814f9b3851425582a3cd"}, + {file = "mypy-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:925cd6a3b7b55dfba252b7c4561892311c5358c6b5a601847015a1ad4eb7d332"}, + {file = "mypy-1.6.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8f57e6b6927a49550da3d122f0cb983d400f843a8a82e65b3b380d3d7259468f"}, + {file = "mypy-1.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:a43ef1c8ddfdb9575691720b6352761f3f53d85f1b57d7745701041053deff30"}, + {file = "mypy-1.6.1-py3-none-any.whl", hash = "sha256:4cbe68ef919c28ea561165206a2dcb68591c50f3bcf777932323bc208d949cf1"}, + {file = "mypy-1.6.1.tar.gz", hash = "sha256:4d01c00d09a0be62a4ca3f933e315455bde83f37f892ba4b08ce92f3cf44bcc1"}, +] + +[package.dependencies] +mypy-extensions = ">=1.0.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = ">=4.1.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +install-types = ["pip"] +reports = ["lxml"] + 
+[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + +[[package]] +name = "packaging" +version = "23.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, + {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, +] + +[[package]] +name = "pathspec" +version = "0.11.2" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"}, + {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, +] + +[[package]] +name = "platformdirs" +version = "3.11.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "platformdirs-3.11.0-py3-none-any.whl", hash = "sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e"}, + {file = "platformdirs-3.11.0.tar.gz", hash = "sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3"}, +] + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] + +[[package]] +name = "pluggy" +version = "1.3.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, + {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pycodestyle" +version = "2.9.1" +description = "Python style guide checker" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, + {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, +] + +[[package]] +name = "pyflakes" +version = "2.5.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, + {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, +] + +[[package]] +name = "pytest" +version = "7.4.3" +description = "pytest: simple powerful testing 
with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.3-py3-none-any.whl", hash = "sha256:0d009c083ea859a71b76adf7c1d502e4bc170b80a8ef002da5806527b9591fac"}, + {file = "pytest-7.4.3.tar.gz", hash = "sha256:d989d136982de4e3b29dabcc838ad581c64e8ed52c11fbe86ddebd9da0818cd5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "typing-extensions" +version = "4.8.0" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"}, + {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, +] + +[[package]] +name = "zipp" +version = "3.17.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, + {file = "zipp-3.17.0.tar.gz", hash = 
"sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + +[metadata] +lock-version = "2.0" +python-versions = "^3.8" +content-hash = "89a3367f8fb0615b0e6b2311f055efc5b8f345986513fba1c7ce46eaa945476b" diff --git a/sentieon_cli/__init__.py b/sentieon_cli/__init__.py index 89aa6b4..ede8029 100644 --- a/sentieon_cli/__init__.py +++ b/sentieon_cli/__init__.py @@ -1,455 +1,6 @@ -import multiprocessing as mp -import argparse -import os -import sys -import subprocess as sp -import pathlib -import shlex -import shutil -import tempfile -from typing import Callable, Optional, List - import argh -import packaging.version - -from argh import arg -from importlib_resources import files -from .logging import get_logger -from . 
import command_strings as cmds - -__version__ = "0.1.0" - -logger = get_logger(__name__) - -TOOL_MIN_VERSIONS = { - "sentieon driver": packaging.version.Version("202308"), - "bcftools": packaging.version.Version("1.10"), - "bedtools": None, -} - - -def tmp(): - """Create a temporary directory for the current process.""" - tmp_base = os.getenv("SENTIEON_TMPDIR") - tmp_dir = tempfile.mkdtemp(dir=tmp_base) - return tmp_dir - - -def check_version(cmd: str, version: Optional[packaging.version.Version]): - """Check the version of an executable""" - cmd_list: List[str] = cmd.split() - exec_file = shutil.which(cmd_list[0]) - if not exec_file: - print(f"Error: no '{cmd}' found in the PATH") - sys.exit(2) - - if version is None: - return - - cmd_list.append("--version") - cmd_version_str = sp.check_output(cmd_list).decode("utf-8").strip() - if cmd_list[0] == "sentieon": - cmd_version_str = cmd_version_str.split("-")[-1] - else: - # handle, e.g. bcftools which outputs multiple lines. - cmd_version_str = ( - cmd_version_str.split("\n")[0].split()[-1].split("-")[0] - ) - cmd_version = packaging.version.Version(cmd_version_str) - if cmd_version < version: - print( - f"Error: the pipeline requires {cmd} version '{version}' or later " - f"but {cmd} '{cmd_version}' was found in the PATH" - ) - sys.exit(2) - return - - -def path_arg( - exists: Optional[bool] = None, - is_dir: Optional[bool] = None, - is_file: Optional[bool] = None, - is_fifo: Optional[bool] = None, -) -> Callable[[str], pathlib.Path]: - """pathlib checked types for argparse""" - - def _path_arg(arg: str) -> pathlib.Path: - p = pathlib.Path(arg) - - attrs = [exists, is_dir, is_file, is_fifo] - attr_names = ["exists", "is_dir", "is_file", "is_fifo"] - - for attr_val, attr_name in zip(attrs, attr_names): - if attr_val is None: # Skip attributes that are not defined - continue - - m = getattr(p, attr_name) - if m() != attr_val: - raise argparse.ArgumentTypeError( - "The supplied path argument needs the attribute" - f" 
{attr_name}={attr_val}, but {attr_name}={m()}" - ) - return p - - return _path_arg - - -@arg( - "-r", - "--reference", - required=True, - help="fasta for reference genome", - type=path_arg(exists=True, is_file=True), -) -@arg( - "-i", - "--sample-input", - required=True, - nargs="+", - help="sample BAM or CRAM file", - type=path_arg(exists=True, is_file=True), -) -@arg( - "-m", - "--model-bundle", - help="The model bundle file", - required=True, - type=path_arg(exists=True, is_file=True), -) -@arg( - "output-vcf", - help="Output VCF File. The file name must end in .vcf.gz", - type=path_arg(), -) -@arg( - "-d", - "--dbsnp", - help="dbSNP vcf file Supplying this file will annotate variants with \ - their dbSNP refSNP ID numbers.", - default=None, - type=path_arg(exists=True, is_file=True), -) -@arg( - "-b", - "--bed", - help="Region BED file. Supplying this file will limit variant calling \ - to the intervals inside the BED file.", - type=path_arg(exists=True, is_file=True), -) -@arg( - "-t", - "--cores", - help="Number of threads/processes to use. %(default)s", -) -@arg( - "-g", - "--gvcf", - help="Generate a gVCF output file along with the VCF." 
- " (default generates only the VCF)", - action="store_true", -) -@arg( - "--tech", - help="Sequencing technology used to generate the reads.", - choices=["HiFi", "ONT"], -) -@arg( - "--dry-run", - help="Print the commands without running them.", -) -@arg( - "--repeat-model", - type=path_arg(exists=True, is_file=True), - help=argparse.SUPPRESS, -) -@arg( - "--skip-version-check", - help=argparse.SUPPRESS, -) -@arg( - "--retain-tmpdir", - help=argparse.SUPPRESS, -) -def dnascope_longread( - output_vcf: pathlib.Path, - reference: Optional[pathlib.Path] = None, - sample_input: Optional[List[pathlib.Path]] = None, - model_bundle: Optional[pathlib.Path] = None, - dbsnp: Optional[pathlib.Path] = None, - bed: Optional[pathlib.Path] = None, - cores: int = mp.cpu_count(), - gvcf: bool = False, - tech: str = "HiFi", - dry_run: bool = False, - repeat_model: Optional[pathlib.Path] = None, - skip_version_check: bool = False, - retain_tmpdir: bool = False, - **kwargs: str, -): - """ - Run sentieon cli with the algo DNAscope command. 
- """ - assert reference - assert sample_input - assert model_bundle - - logger.setLevel(kwargs["loglevel"]) - logger.info("Starting sentieon-cli version: %s", __version__) - - if not skip_version_check: - for cmd, min_version in TOOL_MIN_VERSIONS.items(): - check_version(cmd, min_version) - - tmp_dir_str = tmp() - tmp_dir = pathlib.Path(tmp_dir_str) - - if dry_run: - run = print - else: - from .runner import run # type: ignore[assignment] - - # First pass - diploid calling - diploid_gvcf_fn = tmp_dir.joinpath("out_diploid.g.vcf.gz") - diploid_tmp_vcf = tmp_dir.joinpath("out_diploid_tmp.vcf.gz") - driver = cmds.Driver( - reference=reference, - thread_count=cores, - input=sample_input, - interval=bed, - ) - if gvcf: - driver.add_algo( - cmds.DNAscope( - diploid_gvcf_fn, - dbsnp=dbsnp, - emit_mode="gvcf", - model=model_bundle.joinpath("gvcf_model"), - ) - ) - driver.add_algo( - cmds.DNAscope( - diploid_tmp_vcf, - dbsnp=dbsnp, - model=model_bundle.joinpath("diploid_model"), - ) - ) - run(shlex.join(driver.build_cmd())) - - diploid_vcf = tmp_dir.joinpath("out_diploid.vcf.gz") - driver = cmds.Driver( - reference=reference, - thread_count=cores, - ) - driver.add_algo( - cmds.DNAModelApply( - model_bundle.joinpath("diploid_model"), - diploid_tmp_vcf, - diploid_vcf, - ) - ) - run(shlex.join(driver.build_cmd())) - - # Phasing and RepeatModel - phased_bed = tmp_dir.joinpath("out_diploid_phased.bed") - unphased_bed = tmp_dir.joinpath("out_diploid_unphased.bed") - phased_vcf = tmp_dir.joinpath("out_diploid_phased.vcf.gz") - phased_ext = tmp_dir.joinpath("out_diploid_phased.ext.vcf.gz") - phased_unphased = tmp_dir.joinpath("out_diploid_phased_unphased.vcf.gz") - phased_phased = tmp_dir.joinpath("out_diploid_phased_phased.vcf.gz") - driver = cmds.Driver( - reference=reference, - thread_count=cores, - input=sample_input, - interval=bed, - ) - driver.add_algo( - cmds.VariantPhaser( - diploid_vcf, - phased_vcf, - out_bed=phased_bed, - out_ext=phased_ext, - ) - ) - 
run(shlex.join(driver.build_cmd())) - - if tech.upper() == "ONT": - run( - f"bcftools view -T {phased_bed} {phased_vcf} \ - | sentieon util vcfconvert - {phased_phased}" - ) - run( - cmds.cmd_bedtools_subtract( - bed, phased_bed, unphased_bed, tmp_dir, reference, dry_run - ) - ) - - if not repeat_model: - repeat_model = tmp_dir.joinpath("out_repeat.model") - driver = cmds.Driver( - reference=reference, - thread_count=cores, - input=sample_input, - interval=phased_bed, - read_filter=f"PhasedReadFilter,phased_vcf={phased_ext},phase_select=tag", # noqa: E501 - ) - driver.add_algo( - cmds.RepeatModel( - repeat_model, - phased=True, - read_flag_mask="drop=supplementary", - ) - ) - run(shlex.join(driver.build_cmd())) - - run( - f"bcftools view -T {unphased_bed} {phased_vcf} \ - | sentieon util vcfconvert - {phased_unphased}" - ) - - # Second pass - phased variants - for phase in (1, 2): - hp_std_vcf = tmp_dir.joinpath(f"out_hap{phase}_nohp_tmp.vcf.gz") - hp_vcf = tmp_dir.joinpath(f"out_hap{phase}_tmp.vcf.gz") - driver = cmds.Driver( - reference=reference, - thread_count=cores, - input=sample_input, - interval=phased_bed, - read_filter=( - f"PhasedReadFilter,phased_vcf={phased_ext}" - f",phase_select={phase}" - ), - ) - - if tech.upper() == "HIFI": - # ONT doesn't do DNAscope in 2nd pass. 
- driver.add_algo( - cmds.DNAscope( - hp_std_vcf, - dbsnp=dbsnp, - model=model_bundle.joinpath("haploid_model"), - ) - ) - driver.add_algo( - cmds.DNAscopeHP( - hp_vcf, - dbsnp=dbsnp, - model=model_bundle.joinpath("haploid_hp_model"), - pcr_indel_model=repeat_model, - ) - ) - run(shlex.join(driver.build_cmd())) - - kwargs["gvcf_combine_py"] = str( - files("sentieon_cli.scripts").joinpath("gvcf_combine.py") - ) - kwargs["vcf_mod_py"] = str( - files("sentieon_cli.scripts").joinpath("vcf_mod.py") - ) - - patch_vcfs = [tmp_dir.joinpath(f"out_hap{i}_patch.vcf.gz") for i in (1, 2)] - run( - cmds.cmd_pyexec_vcf_mod_haploid_patch( - str(patch_vcfs[0]), - str(patch_vcfs[1]), - f"{tmp_dir}/out_hap%d_%stmp.vcf.gz", - tech, - str(phased_phased), - cores, - kwargs, - ) - ) - - # apply trained model to the patched vcfs. - hap_vcfs = [tmp_dir.joinpath(f"out_hap{i}.vcf.gz") for i in (1, 2)] - for patch_vcf, hap_vcf in zip(patch_vcfs, hap_vcfs): - driver = cmds.Driver( - reference=reference, - thread_count=cores, - ) - driver.add_algo( - cmds.DNAModelApply( - model_bundle.joinpath("haploid_model"), - patch_vcf, - hap_vcf, - ) - ) - run(shlex.join(driver.build_cmd())) - - # Second pass - unphased regions - diploid_unphased_hp = tmp_dir.joinpath( - "out_diploid_phased_unphased_hp.vcf.gz" - ) - driver = cmds.Driver( - reference=reference, - thread_count=cores, - input=sample_input, - interval=unphased_bed, - ) - driver.add_algo( - cmds.DNAscopeHP( - diploid_unphased_hp, - dbsnp=dbsnp, - model=model_bundle.joinpath("diploid_hp_model"), - pcr_indel_model=repeat_model, - ) - ) - run(shlex.join(driver.build_cmd())) - - # Patch DNA and DNAHP variants - diploid_unphased_patch = tmp_dir.joinpath( - "out_diploid_unphased_patch.vcf.gz" - ) - diploid_unphased = tmp_dir.joinpath("out_diploid_unphased.vcf.gz") - cmd = cmds.cmd_pyexec_vcf_mod_patch( - str(diploid_unphased_patch), - str(phased_unphased), - str(diploid_unphased_hp), - cores, - kwargs, - ) - run(cmd) - driver = cmds.Driver( - 
reference=reference, - thread_count=cores, - ) - driver.add_algo( - cmds.DNAModelApply( - model_bundle.joinpath("diploid_model_unphased"), - diploid_unphased_patch, - diploid_unphased, - ) - ) - run(shlex.join(driver.build_cmd())) - - # merge calls to create the output - run( - cmds.cmd_pyexec_vcf_mod_merge( - str(hap_vcfs[0]), - str(hap_vcfs[1]), - str(diploid_unphased), - str(phased_vcf), - str(phased_bed), - str(output_vcf), - cores, - kwargs, - ) - ) - - if gvcf: - run( - cmds.cmd_pyexec_gvcf_combine( - reference, - str(diploid_gvcf_fn), - str(output_vcf), - cores, - kwargs, - ) - ) - if not retain_tmpdir: - shutil.rmtree(tmp_dir_str) - return +from . import dnascope_longread def main(): @@ -473,7 +24,7 @@ def main(): const="DEBUG", ) - parser.add_commands([dnascope_longread]) + parser.add_commands([dnascope_longread.dnascope_longread]) parser.dispatch() diff --git a/sentieon_cli/command_strings.py b/sentieon_cli/command_strings.py index 644c3ec..39d3aac 100644 --- a/sentieon_cli/command_strings.py +++ b/sentieon_cli/command_strings.py @@ -1,204 +1,19 @@ """ -This module contains the functions that accept kwargs and return the +This module contains the functions that accept arguments and return the command strings. - -We expect that the kwargs are created from user-input (to argh) or a config -file. Where appropriate, we use named args, but for flexibility, kwargs are -also used. - -Command strings may be partial--that is, they could be mixed with other -command strings to get a full viable command. 
- """ import io import pathlib import shlex +import subprocess as sp import typing -from typing import Any, Optional, List, Union, Dict +from typing import Any, Optional, List, Dict from .logging import get_logger logger = get_logger(__name__) -class BaseAlgo: - """A base class for Sentieon algos""" - - name = "BaseAlgo" - - def build_cmd(self) -> List[str]: - """Build a command line for the algo""" - cmd: List[str] = ["--algo", self.name] - - for k, v in self.__dict__.items(): - if k == "output": - continue - elif v is None: - continue - elif isinstance(v, list): - for i in v: - cmd.append(f"--{k}") - cmd.append(str(i)) - elif isinstance(v, bool): - if v: - cmd.append(f"--{k}") - else: - cmd.append(f"--{k}") - cmd.append(str(v)) - - if "output" in self.__dict__: - cmd.append(str(self.__dict__["output"])) - - return cmd - - -class VariantPhaser(BaseAlgo): - """algo VariantPhaser""" - - name = "VariantPhaser" - - def __init__( - self, - vcf: pathlib.Path, - output: pathlib.Path, - out_bed: Optional[pathlib.Path] = None, - out_ext: Optional[pathlib.Path] = None, - max_depth: int = 1000, - ): - self.vcf = vcf - self.output = output - self.out_bed = out_bed - self.out_ext = out_ext - self.max_depth = max_depth - - -class RepeatModel(BaseAlgo): - """algo RepeatModel""" - - name = "RepeatModel" - - def __init__( - self, - output: pathlib.Path, - phased: bool = False, - min_map_qual: int = 1, - min_group_count: int = 10000, - read_flag_mask: Optional[str] = None, - repeat_extension: int = 5, - max_repeat_unit_size: int = 2, - min_repeat_count: int = 6, - ): - self.output = output - self.phased = phased - self.min_map_qual = min_map_qual - self.min_group_count = min_group_count - self.read_flag_mask = read_flag_mask - self.repeat_extension = repeat_extension - self.max_repeat_unit_size = max_repeat_unit_size - self.min_repeat_count = min_repeat_count - - -class DNAModelApply(BaseAlgo): - """algo DNAModelApply""" - - name = "DNAModelApply" - - def __init__( - self, - model: 
pathlib.Path, - vcf: pathlib.Path, - output: pathlib.Path, - ): - self.model = model - self.vcf = vcf - self.output = output - - -class DNAscope(BaseAlgo): - """algo DNAscope""" - - name = "DNAscope" - - def __init__( - self, - output: pathlib.Path, - dbsnp: Optional[pathlib.Path] = None, - emit_mode: str = "variant", - model: Optional[pathlib.Path] = None, - ): - self.output = output - self.dbsnp = dbsnp - self.emit_mode = emit_mode - self.model = model - - -class DNAscopeHP(BaseAlgo): - """algo DNAscopeHP""" - - name = "DNAscopeHP" - - def __init__( - self, - output: pathlib.Path, - dbsnp: Optional[pathlib.Path] = None, - model: Optional[pathlib.Path] = None, - pcr_indel_model: Optional[pathlib.Path] = None, - ): - self.output = output - self.dbsnp = dbsnp - self.model = model - self.pcr_indel_model = pcr_indel_model - - -class Driver: - """Representing the Sentieon driver""" - - def __init__( - self, - reference: Optional[pathlib.Path] = None, - thread_count: Optional[int] = None, - interval: Optional[Union[pathlib.Path, str]] = None, - read_filter: Optional[str] = None, - input: Optional[List[pathlib.Path]] = None, - algo: Optional[List[BaseAlgo]] = None, - ): - self.reference = reference - self.input = input - self.thread_count = thread_count - self.interval = interval - self.read_filter = read_filter - self.algo = algo if algo else [] - - def add_algo(self, algo: BaseAlgo): - """Add an algo to the driver""" - self.algo.append(algo) - - def build_cmd(self) -> List[str]: - """Build a command line for the driver""" - cmd: List[str] = ["sentieon", "driver"] - - for k, v in self.__dict__.items(): - if k == "algo": - continue - elif v is None: - continue - elif isinstance(v, list): - for i in v: - cmd.append(f"--{k}") - cmd.append(str(i)) - elif isinstance(v, bool): - if v: - cmd.append(f"--{k}") - else: - cmd.append(f"--{k}") - cmd.append(str(v)) - - for algo in self.algo: - cmd.extend(algo.build_cmd()) - - return cmd - - def cmd_bedtools_subtract( regions_bed: 
typing.Optional[pathlib.Path], phased_bed: pathlib.Path, @@ -357,3 +172,175 @@ def cmd_pyexec_vcf_mod_merge( out_vcf, ] return shlex.join(cmd) + + +def cmd_pyexec_vcf_mod_haploid_patch2( + out_vcf: str, + vcf: str, + vcf_hp: str, + cores: int, + kwargs: Dict[str, Any], +) -> str: + """Patch a single pair of haploid DNAscope and DNAscopeHP VCFs""" + + cmd = [ + "sentieon", + "pyexec", + str(kwargs["vcf_mod_py"]), + "-t", + str(cores), + "haploid_patch2", + "--vcf", + vcf, + "--vcf_hp", + vcf_hp, + "--patch_vcf", + out_vcf, + ] + return shlex.join(cmd) + + +def get_rg_lines( + input_aln: pathlib.Path, + dry_run: bool, +) -> List[str]: + """Read the @RG lines from an alignment file""" + cmd = shlex.join( + [ + "samtools", + "view", + "-H", + str(input_aln), + ] + ) + rg_lines: List[str] = [] + if not dry_run: + res = sp.run(cmd, shell=True, check=True, stdout=sp.PIPE) + for line in res.stdout.decode("utf-8").split("\n"): + if line.startswith("@RG"): + rg_lines.append(line) + else: + rg_lines.append("@RG\tID:mysample-1\tSM:mysample") + return rg_lines + + +def cmd_samtools_fastq_minimap2( + out_aln: pathlib.Path, + input_aln: pathlib.Path, + reference: pathlib.Path, + model_bundle: pathlib.Path, + cores: int, + rg_lines: List[str], + input_ref: Optional[pathlib.Path] = None, + fastq_taglist: str = "*", + util_sort_args: str = "--cram_write_options version=3.0,compressor=rans", +) -> str: + """Re-align an input BAM/CRAM/uBAM/uCRAM file with minimap2""" + + ref_cmd: List[str] = [] + if input_ref: + ref_cmd = ["--reference", str(reference)] + cmd1 = ( + [ + "samtools", + "fastq", + ] + + ref_cmd + + [ + "-@", + str(cores), + "-T", + fastq_taglist, + str(input_aln), + ] + ) + cmd2 = [ + "sentieon", + "minimap2", + "-y", + "-t", + str(cores), + "-a", + "-x", + f"{model_bundle}/minimap2.model", + str(reference), + "/dev/stdin", + ] + cmd3 = [ + "sentieon", + "util", + "sort", + "-i", + "-", + "-t", + str(cores), + "--reference", + str(reference), + "-o", + str(out_aln), + 
"--sam2bam", + ] + util_sort_args.split() + + # Commands to replace the @RG lines in the header + rg_cmds: List[List[str]] = [] + for rg_line in rg_lines: + rg_cmds.append( + [ + "samtools", + "addreplacerg", + "-r", + str(rg_line), + "-m", + "orphan_only", + "-", + ] + ) + + return " | ".join([shlex.join(x) for x in (cmd1, cmd2, *rg_cmds, cmd3)]) + + +def cmd_fastq_minimap2( + out_aln: pathlib.Path, + fastq: pathlib.Path, + readgroup: str, + reference: pathlib.Path, + model_bundle: pathlib.Path, + cores: int, + unzip: str = "gzip", + util_sort_args: str = "--cram_write_options version=3.0,compressor=rans", +) -> str: + """Align an input fastq file with minimap2""" + + cmd1 = [ + unzip, + "-dc", + str(fastq), + ] + cmd2 = [ + "sentieon", + "minimap2", + "-t", + str(cores), + "-a", + "-x", + f"{model_bundle}/minimap2.model", + "-R", + readgroup, + str(reference), + "/dev/stdin", + ] + cmd3 = [ + "sentieon", + "util", + "sort", + "-i", + "-", + "-t", + str(cores), + "--reference", + str(reference), + "-o", + str(out_aln), + "--sam2bam", + ] + util_sort_args.split() + return " | ".join([shlex.join(x) for x in (cmd1, cmd2, cmd3)]) diff --git a/sentieon_cli/dnascope_longread.py b/sentieon_cli/dnascope_longread.py new file mode 100644 index 0000000..b77629c --- /dev/null +++ b/sentieon_cli/dnascope_longread.py @@ -0,0 +1,718 @@ +""" +Functionality for the DNAscope LongRead pipeline +""" + +import argparse +import multiprocessing as mp +import pathlib +import shlex +import shutil +import sys +from typing import Any, Callable, Dict, List, Optional + +import packaging.version + +from argh import arg +from importlib_resources import files + +from . 
import command_strings as cmds +from .driver import ( + Driver, + DNAscope, + DNAscopeHP, + DNAModelApply, + LongReadSV, + RepeatModel, + VariantPhaser, +) +from .util import ( + __version__, + check_version, + logger, + path_arg, + tmp, + library_preloaded, +) + +TOOL_MIN_VERSIONS = { + "sentieon driver": packaging.version.Version("202308.01"), + "bcftools": packaging.version.Version("1.10"), + "bedtools": None, +} + +SV_MIN_VERSIONS = { + "sentieon driver": packaging.version.Version("202308"), +} + +ALN_MIN_VERSIONS = { + "sentieon driver": packaging.version.Version("202308"), + "samtools": packaging.version.Version("1.16"), +} + +FQ_MIN_VERSIONS = { + "sentieon driver": packaging.version.Version("202308"), +} + + +def align_inputs( + run: Callable[[str], None], + output_vcf: pathlib.Path, + reference: pathlib.Path, + sample_input: List[pathlib.Path], + model_bundle: pathlib.Path, + cores: int = mp.cpu_count(), + dry_run: bool = False, + skip_version_check: bool = False, + bam_format: bool = False, + fastq_taglist: str = "*", + util_sort_args: str = "--cram_write_options version=3.0,compressor=rans", + input_ref: Optional[pathlib.Path] = None, + **_kwargs: Any, +) -> List[pathlib.Path]: + """ + Align reads to the reference genome using minimap2 + """ + if not skip_version_check: + for cmd, min_version in ALN_MIN_VERSIONS.items(): + check_version(cmd, min_version) + + res: List[pathlib.Path] = [] + suffix = "bam" if bam_format else "cram" + for i, input_aln in enumerate(sample_input): + out_aln = pathlib.Path( + str(output_vcf).replace(".vcf.gz", f"_mm2_sorted_{i}.{suffix}") + ) + rg_lines = cmds.get_rg_lines( + input_aln, + dry_run, + ) + + run( + cmds.cmd_samtools_fastq_minimap2( + out_aln, + input_aln, + reference, + model_bundle, + cores, + rg_lines, + input_ref, + fastq_taglist, + util_sort_args, + ) + ) + res.append(out_aln) + return res + + +def align_fastq( + run: Callable[[str], None], + output_vcf: pathlib.Path, + reference: pathlib.Path, + model_bundle: 
pathlib.Path, + cores: int = mp.cpu_count(), + fastq: Optional[List[pathlib.Path]] = None, + readgroups: Optional[List[str]] = None, + skip_version_check: bool = False, + bam_format: bool = False, + util_sort_args: str = "--cram_write_options version=3.0,compressor=rans", + **_kwargs: Any, +) -> List[pathlib.Path]: + """ + Align fastq to the reference genome using minimap2 + """ + res: List[pathlib.Path] = [] + if fastq is None and readgroups is None: + return res + if (not fastq or not readgroups) or (len(fastq) != len(readgroups)): + logger.error( + "The number of readgroups does not equal the number of fastq files" + ) + sys.exit(1) + + if not skip_version_check: + for cmd, min_version in FQ_MIN_VERSIONS.items(): + check_version(cmd, min_version) + + unzip = "igzip" + if not shutil.which(unzip): + logger.warning( + "igzip is recommended for decompression, but is not available. " + "Falling back to gzip." + ) + unzip = "gzip" + + suffix = "bam" if bam_format else "cram" + for i, (fq, rg) in enumerate(zip(fastq, readgroups)): + out_aln = pathlib.Path( + str(output_vcf).replace(".vcf.gz", f"_mm2_sorted_fq_{i}.{suffix}") + ) + run( + cmds.cmd_fastq_minimap2( + out_aln, + fq, + rg, + reference, + model_bundle, + cores, + unzip, + util_sort_args, + ) + ) + res.append(out_aln) + return res + + +def call_variants( + run: Callable[[str], None], + tmp_dir: pathlib.Path, + output_vcf: pathlib.Path, + reference: pathlib.Path, + sample_input: List[pathlib.Path], + model_bundle: pathlib.Path, + dbsnp: Optional[pathlib.Path] = None, + bed: Optional[pathlib.Path] = None, + haploid_bed: Optional[pathlib.Path] = None, + cores: int = mp.cpu_count(), + gvcf: bool = False, + tech: str = "HiFi", + dry_run: bool = False, + repeat_model: Optional[pathlib.Path] = None, + skip_version_check: bool = False, + **_kwargs: Any, +) -> int: + """ + Call SNVs and indels using the DNAscope LongRead pipeline + """ + if haploid_bed and not bed: + logger.error( + "Please supply a BED file of diploid 
regions to distinguish " + "haploid and diploid regions of the genome." + ) + sys.exit(1) + + if not bed: + logger.warning( + "A BED file is recommended to restrict variant calling to diploid " + "regions of the genome." + ) + + if not skip_version_check: + for cmd, min_version in TOOL_MIN_VERSIONS.items(): + check_version(cmd, min_version) + + # First pass - diploid calling + diploid_gvcf_fn = tmp_dir.joinpath("out_diploid.g.vcf.gz") + diploid_tmp_vcf = tmp_dir.joinpath("out_diploid_tmp.vcf.gz") + driver = Driver( + reference=reference, + thread_count=cores, + input=sample_input, + interval=bed, + ) + if gvcf: + driver.add_algo( + DNAscope( + diploid_gvcf_fn, + dbsnp=dbsnp, + emit_mode="gvcf", + model=model_bundle.joinpath("gvcf_model"), + ) + ) + driver.add_algo( + DNAscope( + diploid_tmp_vcf, + dbsnp=dbsnp, + model=model_bundle.joinpath("diploid_model"), + ) + ) + run(shlex.join(driver.build_cmd())) + + diploid_vcf = tmp_dir.joinpath("out_diploid.vcf.gz") + driver = Driver( + reference=reference, + thread_count=cores, + ) + driver.add_algo( + DNAModelApply( + model_bundle.joinpath("diploid_model"), + diploid_tmp_vcf, + diploid_vcf, + ) + ) + run(shlex.join(driver.build_cmd())) + + # Phasing and RepeatModel + phased_bed = tmp_dir.joinpath("out_diploid_phased.bed") + unphased_bed = tmp_dir.joinpath("out_diploid_unphased.bed") + phased_vcf = tmp_dir.joinpath("out_diploid_phased.vcf.gz") + phased_ext = tmp_dir.joinpath("out_diploid_phased.ext.vcf.gz") + phased_unphased = tmp_dir.joinpath("out_diploid_phased_unphased.vcf.gz") + phased_phased = tmp_dir.joinpath("out_diploid_phased_phased.vcf.gz") + driver = Driver( + reference=reference, + thread_count=cores, + input=sample_input, + interval=bed, + ) + driver.add_algo( + VariantPhaser( + diploid_vcf, + phased_vcf, + out_bed=phased_bed, + out_ext=phased_ext, + ) + ) + run(shlex.join(driver.build_cmd())) + + if tech.upper() == "ONT": + run( + f"bcftools view -T {phased_bed} {phased_vcf} \ + | sentieon util vcfconvert - 
{phased_phased}" + ) + run( + cmds.cmd_bedtools_subtract( + bed, phased_bed, unphased_bed, tmp_dir, reference, dry_run + ) + ) + + if not repeat_model: + repeat_model = tmp_dir.joinpath("out_repeat.model") + driver = Driver( + reference=reference, + thread_count=cores, + input=sample_input, + interval=phased_bed, + read_filter=f"PhasedReadFilter,phased_vcf={phased_ext},phase_select=tag", # noqa: E501 + ) + driver.add_algo( + RepeatModel( + repeat_model, + phased=True, + read_flag_mask="drop=supplementary", + ) + ) + run(shlex.join(driver.build_cmd())) + + run( + f"bcftools view -T {unphased_bed} {phased_vcf} \ + | sentieon util vcfconvert - {phased_unphased}" + ) + + # Second pass - phased variants + for phase in (1, 2): + hp_std_vcf = tmp_dir.joinpath(f"out_hap{phase}_nohp_tmp.vcf.gz") + hp_vcf = tmp_dir.joinpath(f"out_hap{phase}_tmp.vcf.gz") + driver = Driver( + reference=reference, + thread_count=cores, + input=sample_input, + interval=phased_bed, + read_filter=( + f"PhasedReadFilter,phased_vcf={phased_ext}" + f",phase_select={phase}" + ), + ) + + if tech.upper() == "HIFI": + # ONT doesn't do DNAscope in 2nd pass. + driver.add_algo( + DNAscope( + hp_std_vcf, + dbsnp=dbsnp, + model=model_bundle.joinpath("haploid_model"), + ) + ) + driver.add_algo( + DNAscopeHP( + hp_vcf, + dbsnp=dbsnp, + model=model_bundle.joinpath("haploid_hp_model"), + pcr_indel_model=repeat_model, + ) + ) + run(shlex.join(driver.build_cmd())) + + kwargs: Dict[str, str] = dict() + kwargs["gvcf_combine_py"] = str( + files("sentieon_cli.scripts").joinpath("gvcf_combine.py") + ) + kwargs["vcf_mod_py"] = str( + files("sentieon_cli.scripts").joinpath("vcf_mod.py") + ) + + patch_vcfs = [tmp_dir.joinpath(f"out_hap{i}_patch.vcf.gz") for i in (1, 2)] + run( + cmds.cmd_pyexec_vcf_mod_haploid_patch( + str(patch_vcfs[0]), + str(patch_vcfs[1]), + f"{tmp_dir}/out_hap%d_%stmp.vcf.gz", + tech, + str(phased_phased), + cores, + kwargs, + ) + ) + + # apply trained model to the patched vcfs. 
+ hap_vcfs = [tmp_dir.joinpath(f"out_hap{i}.vcf.gz") for i in (1, 2)] + for patch_vcf, hap_vcf in zip(patch_vcfs, hap_vcfs): + driver = Driver( + reference=reference, + thread_count=cores, + ) + driver.add_algo( + DNAModelApply( + model_bundle.joinpath("haploid_model"), + patch_vcf, + hap_vcf, + ) + ) + run(shlex.join(driver.build_cmd())) + + # Second pass - unphased regions + diploid_unphased_hp = tmp_dir.joinpath( + "out_diploid_phased_unphased_hp.vcf.gz" + ) + driver = Driver( + reference=reference, + thread_count=cores, + input=sample_input, + interval=unphased_bed, + ) + driver.add_algo( + DNAscopeHP( + diploid_unphased_hp, + dbsnp=dbsnp, + model=model_bundle.joinpath("diploid_hp_model"), + pcr_indel_model=repeat_model, + ) + ) + run(shlex.join(driver.build_cmd())) + + # Patch DNA and DNAHP variants + diploid_unphased_patch = tmp_dir.joinpath( + "out_diploid_unphased_patch.vcf.gz" + ) + diploid_unphased = tmp_dir.joinpath("out_diploid_unphased.vcf.gz") + cmd = cmds.cmd_pyexec_vcf_mod_patch( + str(diploid_unphased_patch), + str(phased_unphased), + str(diploid_unphased_hp), + cores, + kwargs, + ) + run(cmd) + driver = Driver( + reference=reference, + thread_count=cores, + ) + driver.add_algo( + DNAModelApply( + model_bundle.joinpath("diploid_model_unphased"), + diploid_unphased_patch, + diploid_unphased, + ) + ) + run(shlex.join(driver.build_cmd())) + + # merge calls to create the output + run( + cmds.cmd_pyexec_vcf_mod_merge( + str(hap_vcfs[0]), + str(hap_vcfs[1]), + str(diploid_unphased), + str(phased_vcf), + str(phased_bed), + str(output_vcf), + cores, + kwargs, + ) + ) + + if gvcf: + run( + cmds.cmd_pyexec_gvcf_combine( + reference, + str(diploid_gvcf_fn), + str(output_vcf), + cores, + kwargs, + ) + ) + + if haploid_bed: + # Haploid variant calling + haploid_fn = tmp_dir.joinpath("haploid.vcf.gz") + haploid_hp_fn = tmp_dir.joinpath("haploid_hp.vcf.gz") + haploid_out_fn = str(output_vcf).replace(".vcf.gz", ".haploid.vcf.gz") + driver = Driver( + 
reference=reference, + thread_count=cores, + input=sample_input, + interval=haploid_bed, + ) + driver.add_algo( + DNAscope( + haploid_fn, + dbsnp=dbsnp, + model=model_bundle.joinpath("haploid_model"), + ) + ) + driver.add_algo( + DNAscopeHP( + haploid_hp_fn, + dbsnp=dbsnp, + model=model_bundle.joinpath("haploid_hp_model"), + pcr_indel_model=repeat_model, + ) + ) + run(shlex.join(driver.build_cmd())) + + run( + cmds.cmd_pyexec_vcf_mod_haploid_patch2( + haploid_out_fn, + str(haploid_fn), + str(haploid_hp_fn), + cores, + kwargs, + ) + ) + return 0 + + +def call_svs( + run: Callable[[str], None], + output_vcf: pathlib.Path, + reference: pathlib.Path, + sample_input: List[pathlib.Path], + model_bundle: pathlib.Path, + bed: Optional[pathlib.Path] = None, + cores: int = mp.cpu_count(), + skip_version_check: bool = False, + **_kwargs: Any, +) -> int: + """ + Call SVs using Sentieon LongReadSV + """ + if not skip_version_check: + for cmd, min_version in SV_MIN_VERSIONS.items(): + check_version(cmd, min_version) + + sv_vcf = pathlib.Path(str(output_vcf).replace(".vcf.gz", ".sv.vcf.gz")) + driver = Driver( + reference=reference, + thread_count=cores, + input=sample_input, + interval=bed, + ) + driver.add_algo( + LongReadSV( + sv_vcf, + model_bundle.joinpath("longreadsv.model"), + ) + ) + run(shlex.join(driver.build_cmd())) + return 0 + + +@arg( + "-r", + "--reference", + required=True, + help="fasta for reference genome", + type=path_arg(exists=True, is_file=True), +) +@arg( + "-i", + "--sample-input", + nargs="*", + help="sample BAM or CRAM file", + type=path_arg(exists=True, is_file=True), +) +@arg( + "--fastq", + nargs="*", + help="Sample fastq files", + type=path_arg(exists=True, is_file=True), +) +@arg( + "--readgroups", + nargs="*", + help="Readgroup information for the fastq files", +) +@arg( + "-m", + "--model-bundle", + help="The model bundle file", + required=True, + type=path_arg(exists=True, is_file=True), +) +@arg( + "output-vcf", + help="Output VCF File. 
The file name must end in .vcf.gz", + type=path_arg(), +) +@arg( + "-d", + "--dbsnp", + help="dbSNP vcf file Supplying this file will annotate variants with \ + their dbSNP refSNP ID numbers.", + default=None, + type=path_arg(exists=True, is_file=True), +) +@arg( + "-b", + "--bed", + help="Region BED file. Supplying this file will limit variant calling \ + to the intervals inside the BED file.", + type=path_arg(exists=True, is_file=True), +) +@arg( + "--haploid-bed", + help=( + "A BED file of haploid regions. Supplying this file will perform " + "haploid variant calling across these regions." + ), + type=path_arg(exists=True, is_file=True), +) +@arg( + "-t", + "--cores", + help="Number of threads/processes to use. %(default)s", +) +@arg( + "-g", + "--gvcf", + help="Generate a gVCF output file along with the VCF." + " (default generates only the VCF)", + action="store_true", +) +@arg( + "--tech", + help="Sequencing technology used to generate the reads.", + choices=["HiFi", "ONT"], +) +@arg( + "--dry-run", + help="Print the commands without running them.", +) +@arg( + "--skip-small-variants", + help="Skip small variant (SNV/indel) calling", +) +@arg( + "--skip-svs", + help="Skip SV calling", +) +@arg( + "--align", + help="Align the input BAM/CRAM/uBAM file to the reference genome", + action="store_true", +) +@arg( + "--input_ref", + help="Used to decode the input alignment file. Required if the input file" + " is in the CRAM/uCRAM formats", + type=path_arg(exists=True, is_file=True), +) +@arg( + "--fastq_taglist", + help="A comma-separated list of tags to retain. 
Defaults to '%(default)s'" + " and the 'RG' tag is required", +) +@arg( + "--bam_format", + help="Use the BAM format instead of CRAM for output aligned files", + action="store_true", +) +@arg( + "--util_sort_args", + help="Extra arguments for sentieon util sort", +) +@arg( + "--repeat-model", + type=path_arg(exists=True, is_file=True), + help=argparse.SUPPRESS, +) +@arg( + "--skip-version-check", + help=argparse.SUPPRESS, + action="store_true", +) +@arg( + "--retain-tmpdir", + help=argparse.SUPPRESS, + action="store_true", +) +def dnascope_longread( + output_vcf: pathlib.Path, # pylint: disable=W0613 + reference: Optional[pathlib.Path] = None, + sample_input: Optional[List[pathlib.Path]] = None, + fastq: Optional[List[pathlib.Path]] = None, + readgroups: Optional[List[str]] = None, + model_bundle: Optional[pathlib.Path] = None, + dbsnp: Optional[pathlib.Path] = None, # pylint: disable=W0613 + bed: Optional[pathlib.Path] = None, # pylint: disable=W0613 + haploid_bed: Optional[pathlib.Path] = None, # pylint: disable=W0613 + cores: int = mp.cpu_count(), # pylint: disable=W0613 + gvcf: bool = False, # pylint: disable=W0613 + tech: str = "HiFi", # pylint: disable=W0613 + dry_run: bool = False, + skip_small_variants: bool = False, + skip_svs: bool = False, + align: bool = False, + input_ref: Optional[pathlib.Path] = None, + fastq_taglist: str = "*", # pylint: disable=W0613 + bam_format: bool = False, # pylint: disable=W0613 + util_sort_args: str = ( + "--cram_write_options version=3.0,compressor=rans" + ), # pylint: disable=W0613 + repeat_model: Optional[pathlib.Path] = None, # pylint: disable=W0613 + skip_version_check: bool = False, # pylint: disable=W0613 + retain_tmpdir: bool = False, + **kwargs: str, +): + """ + Run sentieon cli with the algo DNAscope command. 
+ """ + assert reference + assert sample_input or fastq + assert model_bundle + + logger.setLevel(kwargs["loglevel"]) + logger.info("Starting sentieon-cli version: %s", __version__) + + if not library_preloaded("libjemalloc.so"): + logger.warning( + "jemalloc is recommended, but is not preloaded. See " + "https://support.sentieon.com/appnotes/jemalloc/" + ) + + tmp_dir_str = tmp() + tmp_dir = pathlib.Path(tmp_dir_str) # type: ignore # pylint: disable=W0641 # noqa: E501 + + if dry_run: + run = print # type: ignore # pylint: disable=W0641 + else: + from .runner import run # type: ignore[assignment] # noqa: F401 + + sample_input = sample_input if sample_input else [] + if align: + sample_input = align_inputs(**locals()) + sample_input.extend(align_fastq(**locals())) + + if not skip_small_variants: + res = call_variants(**locals()) + if res != 0: + logger.error("Small variant calling failed") + return + + if not skip_svs: + res = call_svs(**locals()) + if res != 0: + logger.error("SV calling failed") + return + + if not retain_tmpdir: + shutil.rmtree(tmp_dir_str) + return diff --git a/sentieon_cli/driver.py b/sentieon_cli/driver.py new file mode 100644 index 0000000..e77cbce --- /dev/null +++ b/sentieon_cli/driver.py @@ -0,0 +1,206 @@ +""" +Classes for interacting with the Sentieon driver +""" + +import pathlib +from typing import Optional, List, Union + + +class BaseAlgo: + """A base class for Sentieon algos""" + + name = "BaseAlgo" + + def build_cmd(self) -> List[str]: + """Build a command line for the algo""" + cmd: List[str] = ["--algo", self.name] + + for k, v in self.__dict__.items(): + if k == "output": + continue + elif v is None: + continue + elif isinstance(v, list): + for i in v: + cmd.append(f"--{k}") + cmd.append(str(i)) + elif isinstance(v, bool): + if v: + cmd.append(f"--{k}") + else: + cmd.append(f"--{k}") + cmd.append(str(v)) + + if "output" in self.__dict__: + cmd.append(str(self.__dict__["output"])) + + return cmd + + +class VariantPhaser(BaseAlgo): 
+ """algo VariantPhaser""" + + name = "VariantPhaser" + + def __init__( + self, + vcf: pathlib.Path, + output: pathlib.Path, + out_bed: Optional[pathlib.Path] = None, + out_ext: Optional[pathlib.Path] = None, + max_depth: int = 1000, + ): + self.vcf = vcf + self.output = output + self.out_bed = out_bed + self.out_ext = out_ext + self.max_depth = max_depth + + +class RepeatModel(BaseAlgo): + """algo RepeatModel""" + + name = "RepeatModel" + + def __init__( + self, + output: pathlib.Path, + phased: bool = False, + min_map_qual: int = 1, + min_group_count: int = 10000, + read_flag_mask: Optional[str] = None, + repeat_extension: int = 5, + max_repeat_unit_size: int = 2, + min_repeat_count: int = 6, + ): + self.output = output + self.phased = phased + self.min_map_qual = min_map_qual + self.min_group_count = min_group_count + self.read_flag_mask = read_flag_mask + self.repeat_extension = repeat_extension + self.max_repeat_unit_size = max_repeat_unit_size + self.min_repeat_count = min_repeat_count + + +class DNAModelApply(BaseAlgo): + """algo DNAModelApply""" + + name = "DNAModelApply" + + def __init__( + self, + model: pathlib.Path, + vcf: pathlib.Path, + output: pathlib.Path, + ): + self.model = model + self.vcf = vcf + self.output = output + + +class DNAscope(BaseAlgo): + """algo DNAscope""" + + name = "DNAscope" + + def __init__( + self, + output: pathlib.Path, + dbsnp: Optional[pathlib.Path] = None, + emit_mode: str = "variant", + model: Optional[pathlib.Path] = None, + ): + self.output = output + self.dbsnp = dbsnp + self.emit_mode = emit_mode + self.model = model + + +class DNAscopeHP(BaseAlgo): + """algo DNAscopeHP""" + + name = "DNAscopeHP" + + def __init__( + self, + output: pathlib.Path, + dbsnp: Optional[pathlib.Path] = None, + model: Optional[pathlib.Path] = None, + pcr_indel_model: Optional[pathlib.Path] = None, + ): + self.output = output + self.dbsnp = dbsnp + self.model = model + self.pcr_indel_model = pcr_indel_model + + +class LongReadSV(BaseAlgo): + 
"""algo LongReadSV""" + + name = "LongReadSV" + + def __init__( + self, + output: pathlib.Path, + model: Optional[pathlib.Path] = None, + min_map_qual: Optional[int] = None, + min_sv_size: Optional[int] = None, + min_dp: Optional[int] = None, + min_af: Optional[float] = None, + ): + self.output = output + self.model = model + self.min_map_qual = min_map_qual + self.min_sv_size = min_sv_size + self.min_dp = min_dp + self.min_af = min_af + + +class Driver: + """Representing the Sentieon driver""" + + def __init__( + self, + reference: Optional[pathlib.Path] = None, + thread_count: Optional[int] = None, + interval: Optional[Union[pathlib.Path, str]] = None, + read_filter: Optional[str] = None, + input: Optional[List[pathlib.Path]] = None, + algo: Optional[List[BaseAlgo]] = None, + ): + self.reference = reference + self.input = input + self.thread_count = thread_count + self.interval = interval + self.read_filter = read_filter + self.algo = algo if algo else [] + + def add_algo(self, algo: BaseAlgo): + """Add an algo to the driver""" + self.algo.append(algo) + + def build_cmd(self) -> List[str]: + """Build a command line for the driver""" + cmd: List[str] = ["sentieon", "driver"] + + for k, v in self.__dict__.items(): + if k == "algo": + continue + elif v is None: + continue + elif isinstance(v, list): + for i in v: + cmd.append(f"--{k}") + cmd.append(str(i)) + elif isinstance(v, bool): + if v: + cmd.append(f"--{k}") + else: + cmd.append(f"--{k}") + cmd.append(str(v)) + + for algo in self.algo: + cmd.extend(algo.build_cmd()) + + return cmd diff --git a/sentieon_cli/scripts/vcf_mod.py b/sentieon_cli/scripts/vcf_mod.py index be5705c..14041aa 100644 --- a/sentieon_cli/scripts/vcf_mod.py +++ b/sentieon_cli/scripts/vcf_mod.py @@ -187,6 +187,57 @@ def open_vcfs(input_vcf_fns, output_vcf_fns, update=None, hdr_idx=0): return (input_vcfs, output_vcfs) +def haploid_patch_main(args): + input_vcfs, output_vcfs = open_vcfs( + (args.vcf, args.vcf_hp), + (args.patch_vcf,), + update=( 
+ '##INFO=', + ), + hdr_idx=None, + ) + vcfs = input_vcfs + output_vcfs + result = sharded_run( + args.thread_count, + input_vcfs[0].contigs, + haploid_patch, + 100 * 1000 * 1000, + None, + *vcfs + ) + + for f in vcfs: + if f: + f.close() + return 0 + +def haploid_patch(vcfi1, vcfd1, vcfo1, **kwargs): + for pos, grp, ovl in grouper(vcfi1, vcfd1): + v1, d1 = grp + + if d1 and d1.samples[0].get('GT') is None: + d1 = None + if not v1 or d1 and compatible(v1, d1): + v1 = d1 + if not v1: + continue + + v1 = trim1(v1 == d1 and vcfd1 or vcfi1, v1) + i1, p1 = getpl(v1, None) + + # skip high conf ref calls + if (i1 == 0 and p1 == 0 and + (not v1 or v1.samples[0].get('GQ') >= 20)): + continue + + if v1: + if v1 == d1 or v1.info.get('STR') and v1.info.get('RPA')[0]>=4: + v1.info['DELTA'] = True + v1.info.pop('ML_PROB', None) + v1.filter = [] + v1.line = None + vcfo1.emit(v1) + def merge_main(args): input_vcfs, output_vcfs = open_vcfs( (args.hap1, args.hap2, args.unphased, args.phased), @@ -696,6 +747,21 @@ def parse_args(argv=None): patch_parser.add_argument("output_vcf", help="The patched output VCF") patch_parser.set_defaults(func=patch2_main) + haploid_patch2_parser = subparsers.add_parser( + "haploid_patch2", + help="Patch a single pair of haploid DNAscope and DNAscopeHP VCFs", + ) + haploid_patch2_parser.add_argument( + "--vcf", required=True, help="The DNAscope vcf" + ) + haploid_patch2_parser.add_argument( + "--vcf_hp", required=True, help="The DNAscopeHP vcf" + ) + haploid_patch2_parser.add_argument( + "--patch_vcf", required=True, help="The output patch vcf" + ) + haploid_patch2_parser.set_defaults(func=haploid_patch_main) + return (parser.parse_args(argv), parser, subparsers) diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py new file mode 100644 index 0000000..9072bae --- /dev/null +++ b/sentieon_cli/util.py @@ -0,0 +1,100 @@ +""" +Utility functions +""" + +import argparse +import os +import pathlib +import re +import shutil +import subprocess as sp +import 
sys +import tempfile +from typing import Callable, List, Optional + +import packaging.version + +from .logging import get_logger + +__version__ = "0.1.0" + +logger = get_logger(__name__) + +PRELOAD_SEP = r":| " +PRELOAD_SEP_PAT = re.compile(PRELOAD_SEP) + + +def tmp(): + """Create a temporary directory for the current process.""" + tmp_base = os.getenv("SENTIEON_TMPDIR") + tmp_dir = tempfile.mkdtemp(dir=tmp_base) + return tmp_dir + + +def check_version(cmd: str, version: Optional[packaging.version.Version]): + """Check the version of an executable""" + cmd_list: List[str] = cmd.split() + exec_file = shutil.which(cmd_list[0]) + if not exec_file: + print(f"Error: no '{cmd}' found in the PATH") + sys.exit(2) + + if version is None: + return + + cmd_list.append("--version") + cmd_version_str = sp.check_output(cmd_list).decode("utf-8").strip() + if cmd_list[0] == "sentieon": + cmd_version_str = cmd_version_str.split("-")[-1] + else: + # handle, e.g. bcftools which outputs multiple lines. + cmd_version_str = ( + cmd_version_str.split("\n")[0].split()[-1].split("-")[0] + ) + cmd_version = packaging.version.Version(cmd_version_str) + if cmd_version < version: + print( + f"Error: the pipeline requires {cmd} version '{version}' or later " + f"but {cmd} '{cmd_version}' was found in the PATH" + ) + sys.exit(2) + return + + +def path_arg( + exists: Optional[bool] = None, + is_dir: Optional[bool] = None, + is_file: Optional[bool] = None, + is_fifo: Optional[bool] = None, +) -> Callable[[str], pathlib.Path]: + """pathlib checked types for argparse""" + + def _path_arg(arg: str) -> pathlib.Path: + p = pathlib.Path(arg) + + attrs = [exists, is_dir, is_file, is_fifo] + attr_names = ["exists", "is_dir", "is_file", "is_fifo"] + + for attr_val, attr_name in zip(attrs, attr_names): + if attr_val is None: # Skip attributes that are not defined + continue + + m = getattr(p, attr_name) + if m() != attr_val: + raise argparse.ArgumentTypeError( + "The supplied path argument needs the 
attribute" + f" {attr_name}={attr_val}, but {attr_name}={m()}" + ) + return p + + return _path_arg + + +def library_preloaded(library_name: str) -> bool: + """Check if a shared library is preloaded through LD_PRELOAD""" + ld_preload = os.getenv("LD_PRELOAD", "") + for lib in PRELOAD_SEP_PAT.split(ld_preload): + lib_base = os.path.basename(lib) + if library_name in lib_base: + return True + return False diff --git a/tests/smoke/diploid.bed b/tests/smoke/diploid.bed new file mode 100644 index 0000000..ad5c2ff --- /dev/null +++ b/tests/smoke/diploid.bed @@ -0,0 +1 @@ +chr20 1 20000 diff --git a/tests/smoke/haploid.bed b/tests/smoke/haploid.bed new file mode 100644 index 0000000..04e2eae --- /dev/null +++ b/tests/smoke/haploid.bed @@ -0,0 +1,2 @@ +chr20 20001 25000 + diff --git a/tests/test_all.py b/tests/test_all.py index 78ceedc..5c22617 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -9,6 +9,7 @@ ) import sentieon_cli # NOQA +import sentieon_cli.util # NOQA def test_one(): @@ -17,7 +18,7 @@ def test_one(): def test_tmp(): """basic test for tmp()""" - tmp_dir = sentieon_cli.tmp() + tmp_dir = sentieon_cli.util.tmp() assert os.path.exists(tmp_dir) assert os.path.isdir(tmp_dir) shutil.rmtree(tmp_dir)