Merge pull request #1 from Sentieon/dev

Updates to the DNAscope LongRead pipeline
Sentieon · May 1, 2024 · 62cd5c2 · 62cd5c2
2 parents 508b144 + dcd9245
commit 62cd5c2
Show file tree

Hide file tree

Showing 17 changed files with 1,759 additions and 682 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -47,7 +47,7 @@ jobs:
       - name: Run mypy
         run: poetry run mypy . 
       - name: Run flake8
-        run: poetry run flake8 . --ignore E231,E221 --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py # false+ from python 3.12
+        run: poetry run flake8 . --extend-ignore E231,E221, --exclude .github/scripts/license_message.py,sentieon_cli/scripts/gvcf_combine.py,sentieon_cli/scripts/vcf_mod.py # false+ from python 3.12
       - name: Run the automated tests (for example)
         run: poetry run pytest -v
       - name: Run doct tests

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -5,16 +5,15 @@ on:
       - main
   pull_request:
 
-env:
-  SENTIEON_VERSION: "202308.01"
-
 jobs:
   smoke:
     strategy:
       fail-fast: true
+      max-parallel: 1
       matrix:
-        python-version: ["3.12"]
+        python-version: ["3.8", "3.12"]
         poetry-version: ["1.5.1"]
+        sentieon-version: ["202308.01", "202308.02"]
         os: [ubuntu-22.04] #, macos-latest, windows-latest]
     runs-on: ${{ matrix.os }}
     steps:
@@ -34,13 +33,20 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y bcftools
+      - name: Install samtools
+        run: |
+          curl -L https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 | tar -jxf -
+          cd samtools-1.19.2
+          ./configure && sudo make install
       - name: Install bedtools
         run: |
           sudo curl -L -o /usr/local/bin/bedtools "https://github.com/arq5x/bedtools2/releases/download/v2.31.0/bedtools.static"
           sudo chmod ugo+x /usr/local/bin/bedtools
       - name: Install sentieon
         run: |
           curl -L https://s3.amazonaws.com/sentieon-release/software/sentieon-genomics-$SENTIEON_VERSION.tar.gz | tar -zxf -
+        env:
+          SENTIEON_VERSION: ${{ matrix.sentieon-version }}
       - name: Download model
         run: |
           curl -L "https://s3.amazonaws.com/sentieon-release/other/DNAscopePacBio2.1.bundle" \
@@ -56,20 +62,50 @@ jobs:
           mv tests/smoke/sample.cram "tests/smoke/sam ple.cram"
           mv tests/smoke/sample.cram.crai "tests/smoke/sam ple.cram.crai"
 
-          sentieon-cli dnascope-longread -r "tests/smoke/r ef.fa" \
+          sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
             -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
-            --repeat-model tests/smoke/sample_repeat.model -g "output hifi.vcf.gz"
-          sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \
+            --repeat-model tests/smoke/sample_repeat.model -g \
+            -b "tests/smoke/diploid.bed" \
+            --haploid-bed "tests/smoke/haploid.bed" \
+            "output hifi.vcf.gz"
+          sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \
             -v "output hifi.g.vcf.gz" output_hifi_gvcftyper.vcf.gz
+          if [ ! -f "output hifi.sv.vcf.gz" -o ! -f "output hifi.haploid.vcf.gz" ]; then
+            exit 1
+          fi
 
-          sentieon-cli dnascope-longread --tech ONT -r "tests/smoke/r ef.fa" \
+          sentieon-cli dnascope-longread --tech ONT -t 1 -r "tests/smoke/r ef.fa" \
             -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
             --repeat-model tests/smoke/sample_repeat.model -g "output ont.vcf.gz"
-          sentieon driver -r "tests/smoke/r ef.fa" --algo GVCFtyper \
+          sentieon driver -r "tests/smoke/r ef.fa" -t 1 --algo GVCFtyper \
             -v "output ont.g.vcf.gz" output_hifi_gvcftyper.vcf.gz
+          if [ ! -f "output ont.sv.vcf.gz" ]; then
+            exit 1
+          fi
+
+          sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
+            -i "tests/smoke/sam ple.cram" -m "DNAscope PacBio2.1.bundle" \
+            --repeat-model tests/smoke/sample_repeat.model --align \
+            --input_ref "tests/smoke/r ef.fa" "output realigned.vcf.gz"
+          if [ ! -f "output realigned.vcf.gz" -o ! -f "output realigned_mm2_sorted_0.cram" ]; then
+            exit 1
+          fi
+
+          samtools fastq --reference "tests/smoke/r ef.fa" \
+            "tests/smoke/sam ple.cram" | \
+            gzip -c > sample.fq.gz
+          sentieon-cli dnascope-longread -t 1 -r "tests/smoke/r ef.fa" \
+            --fastq sample.fq.gz --readgroups '@RG\tID:sample-1\tSM:sample' \
+            -m "DNAscope PacBio2.1.bundle" \
+            --repeat-model tests/smoke/sample_repeat.model "output fq.vcf.gz"
+          if [ ! -f "output fq.vcf.gz" -o ! -f "output fq_mm2_sorted_fq_0.cram" -o ! -f "output fq.sv.vcf.gz" ]; then
+            exit 1
+          fi
+
         env:
           SENTIEON_LICENSE: ${{ secrets.SENTIEON_LICENSE }}
           SENTIEON_AUTH_MECH: "GitHub Actions - token"
           ENCRYPTION_KEY: ${{ secrets.ENCRYPTION_KEY }}
           LICENSE_MESSAGE: ${{ secrets.LICENSE_MESSAGE }}
+          SENTIEON_VERSION: ${{ matrix.sentieon-version }}
 
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,5 @@
 __pycache__/
 dist/
 .vscode/
-poetry.lock
 secrets.env
 .*.swp
diff --git a/README.md b/README.md
@@ -6,7 +6,16 @@ A command-line interface for the Sentieon software
 
 ## Setup
 
-`sentieon-cli` uses [poetry](https://pypi.org/project/poetry/) for packaging and dependency management. Initially, you will need to install poetry into your environment:
+Create a new python virtual environment for the project, if needed:
+```
+# Create a new venv, if needed
+python3 -m venv /path/to/new/virtal/environment/sentieon_cli
+
+# Activate the venv
+source /path/to/new/virtal/environment/sentieon_cli/bin/activate
+```
+
+`sentieon-cli` uses [poetry](https://pypi.org/project/poetry/) for packaging and dependency management. Initially, you will need to install poetry:
 ```
 pip install poetry
 ```
@@ -17,19 +26,13 @@ git clone https://github.com/sentieon/sentieon-cli.git
 cd sentieon-cli
 ```
 
-Use poetry to install the `sentieon-cli` into a virtualenv.
+Use poetry to install the `sentieon-cli` into the virtual environment:
 ```
 poetry install
 ```
 
-You can then run commands through poetry.
-```
-poetry run sentieon-cli ...
-```
-
-Alternatively, you can use `poetry shell` to activate the environment with the `sentieon-cli`.
+You can then run commands from the virtual environment:
 ```
-poetry shell
 sentieon-cli ...
 ```
 

diff --git a/data/hg38_diploid.bed b/data/hg38_diploid.bed
@@ -0,0 +1,24 @@
+chr1	1	248956422
+chr2	1	242193529
+chr3	1	198295559
+chr4	1	190214555
+chr5	1	181538259
+chr6	1	170805979
+chr7	1	159345973
+chr8	1	145138636
+chr9	1	138394717
+chr10	1	133797422
+chr11	1	135086622
+chr12	1	133275309
+chr13	1	114364328
+chr14	1	107043718
+chr15	1	101991189
+chr16	1	90338345
+chr17	1	83257441
+chr18	1	80373285
+chr19	1	58617616
+chr20	1	64444167
+chr21	1	46709983
+chr22	1	50818468
+chrX	1	2781479
+chrX	155701383	156040895
diff --git a/data/hg38_haploid.bed b/data/hg38_haploid.bed
@@ -0,0 +1,2 @@
+chrX	2781480	155701383
+chrY	2781480	56887903
diff --git a/docs/dnascope-longread.md b/docs/dnascope-longread.md
@@ -1,68 +1,80 @@
 # DNAscope LongRead
 
-Sentieon DNAscope LongRead is a pipeline for calling germline SNVs and indels from long-read sequence data. The DNAscope LongRead pipeline is able to take advantage of longer read lengths to perform quick and accurate variant calling using specially calibrated machine learning models.
+Sentieon DNAscope LongRead is a pipeline for alignment and germline variant calling (SNVs, SVs, and indels) from long-read sequence data. The DNAscope LongRead pipeline is able to take advantage of longer read lengths to perform quick and accurate variant calling using specially calibrated machine learning models.
 
-This folder provides a shell script implementing the DNAscope LongRead pipeline along with Python scripts for VCF manipulation. The pipeline expects aligned reads as input in BAM or CRAM format and will output variants in the VCF (and gVCF) formats.
+This folder provides a shell script implementing the DNAscope LongRead pipeline. The pipeline will accept as input aligned reads in BAM or CRAM format, or un-aligned data in FASTQ, uBAM, or uCRAM format. The pipeline will output variants in the VCF (or gVCF) formats and aligned reads in BAM or CRAM formats.
 
 DNAscope LongRead is implemented using the Sentieon software package, which requires a valid license for use. Please contact [email protected] for access to the Sentieon software and an evaluation license.
 
 ## Prerequisites
 
-In order to run the pipeline, you need to use the Sentieon software package version 202308 or higher. The pipeline also requires [Python] versions >2.7 or >3.3, [bcftools] version 1.10 or higher, and [bedtools]. The `sentieon`, `python`, `bcftools`, and `bedtools` executables will be accessed through the user's `PATH` environment variable.
+In order to run the pipeline, you need to use the Sentieon software package version 202308.01 or higher and [Python] version >3.8. SNV and indel calling additionally requires [bcftools] version 1.10 or higher and [bedtools]. Alignment of read data in uBAM or uCRAM or re-alignment of read data additional requires [samtools] version 1.16 or higher. The `sentieon`, `python`, `bcftools`, `bedtools`, and `samtools` executables will be accessed through the user's `PATH` environment variable.
 
 ## Input data requirements
 
-### Aligned reads - PacBio HiFi
-
-The pipeline will accept PacBio HiFi long reads that have been aligned to the reference genome with `minimap2` or `pbmm2`. When aligning reads with Sentieon minimap2, using the Sentieon model for HiFi reads is recommended. When aligning reads with the open-source minimap2, `-x map-hifi` is recommended.
-
-### Aligned reads - ONT
-
-The pipeline will accept Oxford Nanopore (ONT) long reads that have been aligned to the reference genome with `minimap2`. When aligning reads with Sentieon minimap2, using the Sentieon model for ONT is recommended. When aligning reads with the open-source minimap2, `-x map-ont` is recommended.
-
 ### The Reference genome
 
 DNAscope LongRead will call variants present in the sample relative to a high quality reference genome sequence. Besides the reference genome file, a samtools fasta index file (.fai) needs to be present. We recommend aligning to a reference genome without alternate contigs.
 
 
 ## Usage
 
-A single command is run to call variants from PacBio HiFi reads:
+A single command is run to call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the uBAM, uCRAM, BAM, or CRAM formats:
 ```sh
-sentieon-cli dnascope-longread [-h] -r REFERENCE -i SAMPLE_INPUT -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] [-t NUMBER_THREADS] [-g]  [--] VARIANT_VCF
+sentieon-cli dnascope-longread [-h] -r REFERENCE -i SAMPLE_INPUT \
+  -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] [-t NUMBER_THREADS] [-g] \
+  --tech HiFi|ONT [--haploid-bed HAPLOID_BED] [--] sample.vcf.gz
 ```
 
-A single command is run to call variants from ONT long reads:
+With uBAM, uCRAM, BAM, or CRAM input, the read data can be re-aligned to the reference genome using minimap2 using the `--align` argument.
+
+A single command is run to call SNVs, indels, and structural variants from PacBio HiFi or ONT reads in the FASTQ format:
 ```sh
-sentieon-cli dnascope-longread --tech ONT [-h] -r REFERENCE -i SAMPLE_INPUT -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] [-t NUMBER_THREADS] [-g]  [--] VARIANT_VCF
+sentieon-cli dnascope-longread [-h] -r REFERENCE --fastq INPUT_FASTQ  \
+  --readgroups READGROUP -m MODEL_BUNDLE [-d DBSNP] [-b DIPLOID_BED] \
+  [-t NUMBER_THREADS] [-g] --tech HiFi|ONT [--haploid-bed HAPLOID_BED] [--] \
+  sample.vcf.gz
 ```
 
 The Sentieon LongRead pipeline requires the following arguments:
 - `-r REFERENCE`: the location of the reference FASTA file. You should make sure that the reference is the same as the one used in the mapping stage. A reference fasta index, ".fai" file, is also required.
-- `-i SAMPLE_INPUT`: the input sample file in BAM or CRAM format. One or more files can be suppied by repeating the `-i` argument.
-- `-m MODEL_BUNDLE`: the location of the model bundle.
-- `--tech ONT`: Sequencing technology used to generate the reads. Supported arguments are `ONT` or `HiFi`. 
+- `-i SAMPLE_INPUT`: the input sample file in BAM or CRAM format. One or more files can be suppied by passing multiple files after the `-i` argument.
+- `--fastq INPUT_FASTQ`: the input sample file in FASTQ format. One or more files can be supplied by passing multiple files after the `--fastq` argument.
+- `--readgroups READGROUP`: readgroup information for the read data. The `--readgroups` argument is required if the input data is in the fastq format. This argument expects complete readgroup strings and these strings will be passed to `minimap2` through the `-R` argument. An example argument is `--readgroups '@RG\tID:foo\tSM:bar'`.
+- `-m MODEL_BUNDLE`: the location of the model bundle. Model bundle files can be found in the [sentieon-models] repository.
+- `--tech HiFi|ONT`: Sequencing technology used to generate the reads. Supported arguments are `ONT` or `HiFi`.
 
 The Sentieon LongRead pipeline accepts the following optional arguments:
 - `-d DBSNP`: the location of the Single Nucleotide Polymorphism database (dbSNP) used to label known variants in VCF (`.vcf`) or bgzip compressed VCF (`.vcf.gz`) format. Only one file is supported. Supplying this file will annotate variants with their dbSNP refSNP ID numbers. A VCF index file is required.
-- `-b DIPLOID_BED`: interval in the reference to restrict variant calling, in BED file format. Supplying this file will limit variant calling to the intervals inside the BED file.
+- `-b DIPLOID_BED`: interval in the reference to restrict diploid variant calling, in BED file format. Supplying this file will limit diploid variant calling to the intervals inside the BED file.
+- `--haploid-bed HAPLOID_BED`: interval in the reference to restrict haploid variant calling, in BED file format. Supplying this file will perform haploid variant calling across the intervals inside the BED file.
 - `-t NUMBER_THREADS`: number of computing threads that will be used by the software to run parallel processes. The argument is optional; if omitted, the pipeline will use as many threads as the server has.
 - `-g`: output variants in the gVCF format, in addition to the VCF output file. The tool will output a bgzip compressed gVCF file with a corresponding index file.
+- `--align`: re-align the input read data to the reference genome using Sentieon minimap2. This argument needs to be passed to obtain useful output from uBAM or uCRAM input.
 - `-h`: print the command-line help and exit.
 - `--dry-run`: print the pipeline commands, but do not actually execute them.
 
 The Sentieon LongRead pipeline requires the following positional arguments:
-- `VARIANT_VCF`: the location and filename of the variant calling output. The tool will output a bgzip compressed VCF file with a corresponding index file.
+- `sample.vcf.gz`: the location and filename of the variant calling output. The filename must end with the suffix, `.vcf.gz`. The tool will output a bgzip compressed VCF file with a corresponding index file.
 
 ## Pipeline output
 
 The Sentieon LongRead pipeline will output a bgzip compressed file (.vcf.gz) containing variant calls in the standard VCFv4.2 format along with a tabix index file (.vcf.gz.tbi). If the `-g` option is used, the pipeline will also output a bgzip compressed file (.g.vcf.gz) containing variant calls in the gVCF format along with a tabix index file (.g.vcf.gz.tbi).
 
+With the `--haploid-bed HAPLOID_BED` argument, the pipeline will create the following output files:
+- `sample.vcf.gz`: SNV and indel variant calls across the diploid regions of the genome (as defined in the `-d DIPLOID_BED` file).
+- `sample.haploid.vcf.gz`: SNV and indel variant calls across the haploid regions of the genome (as defined in the `--haploid-bed HAPLOID_BED` file).
+- `sample.sv.vcf.gz`: structural variant calls from the Sentieon LongReadSV tool.
+
+With the `--align` argument or fastq input, the software will also output a CRAM or BAM file for each input file.
+
 ## Other considerations
 
-### Diploid variant calling
+### Diploid and haploid variant calling
+
+The default pipeline is recommended for use with samples from diploid organisms. For samples with both diploid and haploid chromosomes, the `-b DIPLOID_BED` option can be used to limit diploid variant calling to diploid chromosomes and the `--haploid-bed HAPLOID_BED` argument can be used to perform haploid variant calling across haploid chromosomes. Diploid and haploid variants will be output to separate VCF files.
 
-Currently, the pipeline is only recommended for use with samples from diploid organisms. For samples with both diploid and haploid chromosomes, the `-b DIPLOID_BED` option can be used to limit variant calling to diploid chromosomes.
+Example diploid and haploid BED files can be found in the [/data](/data) folder in this repository.
 
 ### Modification
 
@@ -77,5 +89,7 @@ The Python scripts in the `sentieon_cli/scripts`` folder perform low-level manip
 [Python]: https://www.python.org/
 [bcftools]: http://samtools.github.io/bcftools/bcftools.html
 [bedtools]: https://bedtools.readthedocs.io/en/latest/
+[samtools]: https://www.htslib.org/
+[sentieon-models]: https://github.com/Sentieon/sentieon-models
 
 [Sentieon DNAscope LongRead – A highly Accurate, Fast, and Efficient Pipeline for Germline Variant Calling from PacBio HiFi reads]: https://www.biorxiv.org/content/10.1101/2022.06.01.494452v1