diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 85d867f..534fa92 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- - 'docs/**'
+ - "docs/**"
jobs:
deploy:
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 6e8d31d..b41720d 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -13,82 +13,82 @@ jobs:
Dry_Run_and_Lint:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
- - uses: docker://snakemake/snakemake:v6.8.2
- - name: Tumor-normal FastQ Dry Run
- run: |
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
- /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
- /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
- --output /opt2/output_tn_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init
-
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
- /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
- /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
- --output /opt2/output_tn_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun
-
- - name: Tumor-only FastQ Dry Run
- run: |
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
- /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
- /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
- --output /opt2/output_tonly_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --genome hg38 --mode local --ffpe --runmode init
-
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
- /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
- /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
- --output /opt2/output_tonly_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --genome hg38 --mode local --ffpe --runmode dryrun
-
- - name: Tumor-normal BAM Dry Run
- run: |
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.recal.bam \
- /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
- /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
- --output /opt2/output_tn_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init
-
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.recal.bam \
- /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
- /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
- --output /opt2/output_tn_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun
-
- - name: Tumor-only BAM Dry Run
- run: |
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.recal.bam \
- /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
- /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
- --output /opt2/output_tonly_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --genome hg38 --mode local --ffpe --runmode init
-
- docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
- /opt2/xavier run --input \
- /opt2/.tests/Sample10_ARK1_S37.recal.bam \
- /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
- /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
- --output /opt2/output_tonly_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
- --genome hg38 --mode local --ffpe --runmode dryrun
-
- - name: Lint Workflow
- continue-on-error: true
- run: |
- docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output_tn_fqs || \
- echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.'
+ - uses: actions/checkout@v2
+ - uses: docker://snakemake/snakemake:v6.8.2
+ - name: Tumor-normal FastQ Dry Run
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
+ /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
+ /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
+ --output /opt2/output_tn_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init
+
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
+ /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
+ /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
+ --output /opt2/output_tn_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun
+
+ - name: Tumor-only FastQ Dry Run
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
+ /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
+ /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
+ --output /opt2/output_tonly_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --genome hg38 --mode local --ffpe --runmode init
+
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
+ /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
+ /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
+ --output /opt2/output_tonly_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --genome hg38 --mode local --ffpe --runmode dryrun
+
+ - name: Tumor-normal BAM Dry Run
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.recal.bam \
+ /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
+ /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
+ --output /opt2/output_tn_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init
+
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.recal.bam \
+ /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
+ /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
+ --output /opt2/output_tn_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun
+
+ - name: Tumor-only BAM Dry Run
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.recal.bam \
+ /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
+ /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
+ --output /opt2/output_tonly_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --genome hg38 --mode local --ffpe --runmode init
+
+ docker run -v $PWD:/opt2 snakemake/snakemake:v6.8.2 \
+ /opt2/xavier run --input \
+ /opt2/.tests/Sample10_ARK1_S37.recal.bam \
+ /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
+ /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
+ --output /opt2/output_tonly_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+ --genome hg38 --mode local --ffpe --runmode dryrun
+
+ - name: Lint Workflow
+ continue-on-error: true
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output_tn_fqs || \
+              echo 'There may have been a few warnings or errors. Please read through the log to determine if it's harmless.'
diff --git a/.gitignore b/.gitignore
index 71ada8b..658d64e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,4 +166,4 @@ test_*/
test.sh
# bash history files
-**/.koparde*
\ No newline at end of file
+**/.koparde*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 12dc373..8e93c39 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,6 +20,11 @@ repos:
hooks:
- id: codespell
# https://github.com/codespell-project/codespell/issues/1498
+ exclude: >
+ (?x)^(
+ .*\.svg
+ )$
+ # https://github.com/codespell-project/codespell/issues/1498
# Python formatting
- repo: https://github.com/psf/black
rev: 23.7.0
diff --git a/.tests/README.md b/.tests/README.md
index 97b6f4c..be56f9a 100644
--- a/.tests/README.md
+++ b/.tests/README.md
@@ -1,6 +1,5 @@
# About
-These input files are used for continuous integration purposes, specificially to dry run the pipeline whenever commits have been made to the main, master, or unified branches.
+These input files are used for continuous integration purposes, specifically to dry run the pipeline whenever commits have been made to the main, master, or unified branches.
**Please Note:** Each of the provided FastQ files and BAM files are empty and are not suitable input to the CCBR GATK4 pipeline!
-
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1e74cb9..27c897a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
## development version
- Create `CITATION.cff` to describe how to cite XAVIER. (#68, @kelly-sovacool)
+- Minor documentation improvements. (#78, @kelly-sovacool)
## v3.0.2
diff --git a/README.md b/README.md
index 0f27270..b70253f 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,16 @@
-# XAVIER - e**X**ome **A**nalysis and **V**ariant explor**ER** 🔬 [![tests](https://github.com/CCBR/XAVIER/workflows/tests/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/main.yaml) [![docs](https://github.com/CCBR/XAVIER/workflows/docs/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/docs.yml) [![Docker Pulls](https://img.shields.io/docker/pulls/nciccbr/ccbr_wes_base)](https://hub.docker.com/r/nciccbr/ccbr_wes_base) [![GitHub issues](https://img.shields.io/github/issues/CCBR/XAVIER?color=brightgreen)](https://github.com/CCBR/XAVIER/issues) [![GitHub license](https://img.shields.io/github/license/CCBR/XAVIER)](https://github.com/CCBR/XAVIER/blob/main/LICENSE)
+# XAVIER - e**X**ome **A**nalysis and **V**ariant explor**ER** 🔬 [![tests](https://github.com/CCBR/XAVIER/workflows/tests/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/main.yaml) [![docs](https://github.com/CCBR/XAVIER/workflows/docs/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/docs.yml) [![Docker Pulls](https://img.shields.io/docker/pulls/nciccbr/ccbr_wes_base)](https://hub.docker.com/r/nciccbr/ccbr_wes_base) [![GitHub issues](https://img.shields.io/github/issues/CCBR/XAVIER?color=brightgreen)](https://github.com/CCBR/XAVIER/issues) [![GitHub license](https://img.shields.io/github/license/CCBR/XAVIER)](https://github.com/CCBR/XAVIER/blob/main/LICENSE)
-> ***_XAVIER - eXome Analysis and Variant explorER_***. This is the home of the pipeline, XAVIER. Its long-term goals: to accurately call germline and somatic variants, to infer CNVs, and to boldly annotate variants like no pipeline before!
+> **_*XAVIER - eXome Analysis and Variant explorER*_**. This is the home of the pipeline, XAVIER. Its long-term goals: to accurately call germline and somatic variants, to infer CNVs, and to boldly annotate variants like no pipeline before!
## Overview
+
Welcome to XAVIER! Before getting started, we highly recommend reading through [xavier's documentation](https://CCBR.github.io/XAVIER).
-The **`xavier`** pipeline is composed several inter-related sub commands to setup and run the pipeline across different systems. Each of the available sub commands perform different functions:
+The **`xavier`** pipeline is composed of several inter-related sub commands to set up and run the pipeline across different systems. Each of the available sub commands performs different functions:
- * [xavier run](https://CCBR.github.io/XAVIER/usage/run/): Run the XAVIER pipeline with your input files.
- * [xavier unlock](https://CCBR.github.io/XAVIER/usage/unlock/): Unlocks a previous runs output directory.
- * [xavier cache](https://CCBR.github.io/XAVIER/usage/cache/): Cache remote resources locally, coming soon!
+- [xavier run](https://CCBR.github.io/XAVIER/usage/run/): Run the XAVIER pipeline with your input files.
+- [xavier unlock](https://CCBR.github.io/XAVIER/usage/unlock/): Unlocks a previous run's output directory.
+- [xavier cache](https://CCBR.github.io/XAVIER/usage/cache/): Cache remote resources locally, coming soon!
XAVIER is a comprehensive whole exome-sequencing pipeline following the Broad's set of best practices. It relies on technologies like [Singularity1](https://singularity.lbl.gov/) to maintain the highest-level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake2](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster or cloud provider.
@@ -20,12 +21,15 @@ Before getting started, we highly recommend reading through the [usage](https://
For more information about issues or trouble-shooting a problem, please checkout our [FAQ](faq/questions.md) prior to [opening an issue on Github](https://github.com/CCBR/XAVIER/issues).
## Dependencies
-**Requires:** `singularity>=3.5` `snakemake==6.X`
+
+**Requires:** `singularity>=3.5` `snakemake==6.X`
[Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) and [singularity](https://singularity.lbl.gov/all-releases) must be installed on the target system. Snakemake orchestrates the execution of each step in the pipeline. To guarantee the highest level of reproducibility, each step relies on versioned images from [DockerHub](https://hub.docker.com/orgs/nciccbr/repositories). Snakemake uses singaularity to pull these images onto the local filesystem prior to job execution, and as so, snakemake and singularity are the only two dependencies.
## Run XAVIER pipeline
+
### Biowulf
+
```bash
# XAVIER is configured to use different execution backends: local or slurm
# view the help page for more information
@@ -52,7 +56,7 @@ xavier run \
--pairs pairs.txt \
--targets Targets_hg38.bed \
--mode slurm \
---runmode init
+--runmode init
# Second, do a dry run to visualize outputs
xavier run \
@@ -62,7 +66,7 @@ xavier run \
--pairs pairs.txt \
--targets Targets_hg38.bed \
--mode slurm \
---runmode dryrun
+--runmode dryrun
# Then do a complete run
xavier run \
@@ -72,10 +76,11 @@ xavier run \
--pairs pairs.txt \
--targets Targets_hg38.bed \
--mode slurm \
---runmode run
+--runmode run
```
### FRCE
+
```bash
# grab an interactive node
srun --export all --pty --x11 bash
@@ -103,11 +108,11 @@ xavier run \
```
-## Contribute
-
-This site is a living document, created for and by members like you. XAVIER is maintained by the members of CCBR and is improved by continous feedback! We encourage you to contribute new content and make improvements to existing content via pull request to our [repository](https://github.com/CCBR/XAVIER/pulls).
+## Contribute
+This site is a living document, created for and by members like you. XAVIER is maintained by the members of CCBR and is improved by continuous feedback! We encourage you to contribute new content and make improvements to existing content via pull request to our [repository](https://github.com/CCBR/XAVIER/pulls).
## References
-**1.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459.
-**2.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600.
+
+**1.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459.
+**2.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600.
diff --git a/bin/redirect b/bin/redirect
index dcc729c..ff35c1d 100755
--- a/bin/redirect
+++ b/bin/redirect
@@ -17,7 +17,7 @@ TOOLDIR=$(dirname "$SCRIPTDIRNAME")
TOOLNAME=$(basename "$SCRIPTNAME")
echo $TOOLNAME
-# find out if you are running on biowulf or frce and load conda
+# find out if you are running on biowulf or frce and load conda
nbiowulf=$(scontrol show config | grep -i -c biowulf)
if [[ "$nbiowulf" > 0 ]];then ISBIOWULF=true; else ISBIOWULF=false;fi
nfrce=$(scontrol show config | grep -i -c fsitgl)
@@ -53,4 +53,3 @@ fi
${TOOLDIR}/${TOOLNAME} "$@" || true
-
diff --git a/config/cluster.biowulf.json b/config/cluster.biowulf.json
index 10f4cc1..807bfc6 100644
--- a/config/cluster.biowulf.json
+++ b/config/cluster.biowulf.json
@@ -1,282 +1,281 @@
{
"__default__": {
"partition": "norm",
- "threads":"4",
+ "threads": "4",
"mem": "64G",
"gres": "lscratch:256",
"time": "2-00:00:00",
- "name" : "{rule}.{wildcards}",
- "output" : "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out",
- "error" : "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err"
+ "name": "{rule}.{wildcards}",
+ "output": "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out",
+ "error": "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err"
},
"bam": {
- "threads": "2",
- "time": "12:00:00"
- },
- "filter_cnv": {
- "threads": "2",
- "time": "2:00:00",
- "mem": "1G"
- },
- "samtools_flagstats": {
- "threads": "2",
- "time": "4:00:00"
- },
+ "threads": "2",
+ "time": "12:00:00"
+ },
+ "filter_cnv": {
+ "threads": "2",
+ "time": "2:00:00",
+ "mem": "1G"
+ },
+ "samtools_flagstats": {
+ "threads": "2",
+ "time": "4:00:00"
+ },
+
+ "strelka": {
+ "threads": "16",
+ "time": "16:00:00",
+ "mem": "32G"
+ },
+
+ "strelka_filter": {
+ "threads": "4",
+ "time": "8:00:00",
+ "mem": "16G"
+ },
+ "vardict": {
+ "threads": "16",
+ "time": "16:00:00",
+ "mem": "32G"
+ },
- "strelka": {
- "threads": "16",
- "time": "16:00:00",
- "mem": "32G"
- },
+ "vardict_filter": {
+ "threads": "4",
+ "time": "8:00:00",
+ "mem": "32G"
+ },
+ "varscan": {
+ "threads": "16",
+ "time": "16:00:00",
+ "mem": "32G"
+ },
- "strelka_filter": {
- "threads": "4",
- "time": "8:00:00",
- "mem": "16G"
- },
- "vardict": {
- "threads": "16",
- "time": "16:00:00",
- "mem": "32G"
- },
+ "varscan_filter": {
+ "threads": "4",
+ "time": "8:00:00",
+ "mem": "32G"
+ },
- "vardict_filter": {
- "threads": "4",
- "time": "8:00:00",
- "mem": "32G"
- },
- "varscan": {
- "threads": "16",
- "time": "16:00:00",
- "mem": "32G"
- },
+ "merge_somatic_callers": {
+ "threads": "16",
+ "time": "18:00:00",
+ "mem": "64G"
+ },
- "varscan_filter": {
- "threads": "4",
- "time": "8:00:00",
- "mem": "32G"
- },
-
- "merge_somatic_callers": {
- "threads": "16",
- "time": "18:00:00",
- "mem": "64G"
- },
-
-
- "sobdetect1": {
- "threads": "4",
- "time": "24:00:00",
- "mem": "32G"
- },
- "sobdetect_cohort_params": {
- "threads": "4",
- "time": "12:00:00",
- "mem": "32G"
- },
- "sobdetect2": {
- "threads": "4",
- "time": "24:00:00",
- "mem": "32G"
- },
- "sobdetect_metrics": {
- "threads": "4",
- "time": "12:00:00",
- "mem": "32G"
- },
-
- "collectvariantcallmetrics": {
- "threads": "2",
- "time": "12:00:00"
- },
+ "sobdetect1": {
+ "threads": "4",
+ "time": "24:00:00",
+ "mem": "32G"
+ },
+ "sobdetect_cohort_params": {
+ "threads": "4",
+ "time": "12:00:00",
+ "mem": "32G"
+ },
+ "sobdetect2": {
+ "threads": "4",
+ "time": "24:00:00",
+ "mem": "32G"
+ },
+ "sobdetect_metrics": {
+ "threads": "4",
+ "time": "12:00:00",
+ "mem": "32G"
+ },
- "bam2fastq": {
- "threads": "32",
- "time": "12:00:00",
- "mem": "48G"
- },
- "merge_bams": {
- "threads": "24",
- "time": "12:00:00",
- "mem": "48G"
- },
- "merge_bam": {
- "threads": "16",
- "time": "12:00:00",
- "mem": "48G"
- },
- "trimmomatic": {
- "threads": "24",
- "time": "8:00:00",
- "mem": "24G"
- },
- "presort": {
- "threads": "32",
- "time": "24:00:00",
- "mem": "48G"
- },
- "bwa_mem": {
- "threads": "24",
- "mem": "96G"
- },
- "picard_headers": {
- "threads": "2",
- "time": "24:00:00"
- },
- "markdups": {
- "threads": "32",
- "time": "24:00:00"
- },
- "gatk_realign": {
- "threads": "2",
- "time": "48:00:00"
- },
- "gatk_recal": {
- "threads": "4",
- "time": "48:00:00",
- "mem": "72G"
- },
- "recal_1": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_2": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_3": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_4": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_5": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "novocraft_sort": {
- "threads": "32",
- "time": "12:00:00",
- "mem": "96G"
- },
- "qc": {
- "threads": "2",
- "partition": "norm",
- "time": "12:00:00"
- },
- "ped": {
- "threads": "2",
- "partition": "norm",
- "time": "12:00:00"
- },
- "inbreeding": {
- "threads": "8",
- "partition": "norm"
- },
- "cnv_post": {
- "threads": "8",
- "partition": "norm",
- "time": "12:00:00"
- },
- "hla": {
- "threads": "8",
- "partition": "norm",
- "mem": "96G"
- },
- "qualimap": {
- "threads": "24",
- "mem": "64G",
- "time": "24:00:00"
- },
- "qualiremap": {
- "threads": "24",
- "mem": "72G",
- "time": "12:00:00"
- },
- "varianteval":{
- "threads":"16",
- "mem":"32G",
- "time": "12:00:00"
- },
- "fastqc_raw":{
- "threads":"8",
- "mem":"12G",
- "time": "4:00:00"
- },
- "fastqc_trimmed":{
- "threads":"8",
- "mem":"12G",
- "time": "4:00:00"
- },
- "Gatk_SelectVariants":{
- "threads":"2",
- "mem":"24G",
- "time": "12:00:00"
- },
- "mutect2_filter":{
- "threads":"2",
- "mem":"24G",
- "time": "12:00:00"
- },
- "SNPeff":{
- "mem":"24G",
- "time": "12:00:00"
- },
- "haplotypecaller":{
- "threads":"4",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "mergegvcfs":{
- "threads":"4",
- "mem":"48G",
- "time": "5-00:00:00"
- },
- "merge_chrom":{
- "threads":"8",
- "mem":"48G",
- "time": "1-00:00:00"
- },
- "genotype":{
- "threads":"2",
- "mem":"96G",
- "time": "3-00:00:00"
- },
- "Gatk_Variantfilter":{
- "threads":"2",
- "mem":"32G",
- "time": "24:00:00"
- },
- "apply_snp":{
- "threads":"4",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "apply_indel":{
- "threads":"4",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "gtype_refinement":{
- "threads":"2",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "admixture_germline":{
- "threads":"32",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "hlamut":{
- "threads":"8",
- "mem":"48G",
- "time": "2-00:00:00"
- }
+ "collectvariantcallmetrics": {
+ "threads": "2",
+ "time": "12:00:00"
+ },
+
+ "bam2fastq": {
+ "threads": "32",
+ "time": "12:00:00",
+ "mem": "48G"
+ },
+ "merge_bams": {
+ "threads": "24",
+ "time": "12:00:00",
+ "mem": "48G"
+ },
+ "merge_bam": {
+ "threads": "16",
+ "time": "12:00:00",
+ "mem": "48G"
+ },
+ "trimmomatic": {
+ "threads": "24",
+ "time": "8:00:00",
+ "mem": "24G"
+ },
+ "presort": {
+ "threads": "32",
+ "time": "24:00:00",
+ "mem": "48G"
+ },
+ "bwa_mem": {
+ "threads": "24",
+ "mem": "96G"
+ },
+ "picard_headers": {
+ "threads": "2",
+ "time": "24:00:00"
+ },
+ "markdups": {
+ "threads": "32",
+ "time": "24:00:00"
+ },
+ "gatk_realign": {
+ "threads": "2",
+ "time": "48:00:00"
+ },
+ "gatk_recal": {
+ "threads": "4",
+ "time": "48:00:00",
+ "mem": "72G"
+ },
+ "recal_1": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_2": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_3": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_4": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_5": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "novocraft_sort": {
+ "threads": "32",
+ "time": "12:00:00",
+ "mem": "96G"
+ },
+ "qc": {
+ "threads": "2",
+ "partition": "norm",
+ "time": "12:00:00"
+ },
+ "ped": {
+ "threads": "2",
+ "partition": "norm",
+ "time": "12:00:00"
+ },
+ "inbreeding": {
+ "threads": "8",
+ "partition": "norm"
+ },
+ "cnv_post": {
+ "threads": "8",
+ "partition": "norm",
+ "time": "12:00:00"
+ },
+ "hla": {
+ "threads": "8",
+ "partition": "norm",
+ "mem": "96G"
+ },
+ "qualimap": {
+ "threads": "24",
+ "mem": "64G",
+ "time": "24:00:00"
+ },
+ "qualiremap": {
+ "threads": "24",
+ "mem": "72G",
+ "time": "12:00:00"
+ },
+ "varianteval": {
+ "threads": "16",
+ "mem": "32G",
+ "time": "12:00:00"
+ },
+ "fastqc_raw": {
+ "threads": "8",
+ "mem": "12G",
+ "time": "4:00:00"
+ },
+ "fastqc_trimmed": {
+ "threads": "8",
+ "mem": "12G",
+ "time": "4:00:00"
+ },
+ "Gatk_SelectVariants": {
+ "threads": "2",
+ "mem": "24G",
+ "time": "12:00:00"
+ },
+ "mutect2_filter": {
+ "threads": "2",
+ "mem": "24G",
+ "time": "12:00:00"
+ },
+ "SNPeff": {
+ "mem": "24G",
+ "time": "12:00:00"
+ },
+ "haplotypecaller": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "mergegvcfs": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "5-00:00:00"
+ },
+ "merge_chrom": {
+ "threads": "8",
+ "mem": "48G",
+ "time": "1-00:00:00"
+ },
+ "genotype": {
+ "threads": "2",
+ "mem": "96G",
+ "time": "3-00:00:00"
+ },
+ "Gatk_Variantfilter": {
+ "threads": "2",
+ "mem": "32G",
+ "time": "24:00:00"
+ },
+ "apply_snp": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "apply_indel": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "gtype_refinement": {
+ "threads": "2",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "admixture_germline": {
+ "threads": "32",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "hlamut": {
+ "threads": "8",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ }
}
diff --git a/config/cluster.frce.json b/config/cluster.frce.json
index d6c7a81..7896f0a 100644
--- a/config/cluster.frce.json
+++ b/config/cluster.frce.json
@@ -1,281 +1,280 @@
{
"__default__": {
"partition": "norm",
- "threads":"4",
+ "threads": "4",
"mem": "64G",
"time": "2-00:00:00",
- "name" : "{rule}.{wildcards}",
- "output" : "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out",
- "error" : "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err"
+ "name": "{rule}.{wildcards}",
+ "output": "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out",
+ "error": "logfiles/slurmfiles/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err"
},
"bam": {
- "threads": "2",
- "time": "12:00:00"
- },
- "filter_cnv": {
- "threads": "2",
- "time": "2:00:00",
- "mem": "1G"
- },
- "samtools_flagstats": {
- "threads": "2",
- "time": "4:00:00"
- },
+ "threads": "2",
+ "time": "12:00:00"
+ },
+ "filter_cnv": {
+ "threads": "2",
+ "time": "2:00:00",
+ "mem": "1G"
+ },
+ "samtools_flagstats": {
+ "threads": "2",
+ "time": "4:00:00"
+ },
+
+ "strelka": {
+ "threads": "16",
+ "time": "16:00:00",
+ "mem": "32G"
+ },
+
+ "strelka_filter": {
+ "threads": "4",
+ "time": "8:00:00",
+ "mem": "16G"
+ },
+ "vardict": {
+ "threads": "16",
+ "time": "16:00:00",
+ "mem": "32G"
+ },
- "strelka": {
- "threads": "16",
- "time": "16:00:00",
- "mem": "32G"
- },
+ "vardict_filter": {
+ "threads": "4",
+ "time": "8:00:00",
+ "mem": "32G"
+ },
+ "varscan": {
+ "threads": "16",
+ "time": "16:00:00",
+ "mem": "32G"
+ },
- "strelka_filter": {
- "threads": "4",
- "time": "8:00:00",
- "mem": "16G"
- },
- "vardict": {
- "threads": "16",
- "time": "16:00:00",
- "mem": "32G"
- },
+ "varscan_filter": {
+ "threads": "4",
+ "time": "8:00:00",
+ "mem": "32G"
+ },
- "vardict_filter": {
- "threads": "4",
- "time": "8:00:00",
- "mem": "32G"
- },
- "varscan": {
- "threads": "16",
- "time": "16:00:00",
- "mem": "32G"
- },
+ "merge_somatic_callers": {
+ "threads": "16",
+ "time": "18:00:00",
+ "mem": "64G"
+ },
- "varscan_filter": {
- "threads": "4",
- "time": "8:00:00",
- "mem": "32G"
- },
-
- "merge_somatic_callers": {
- "threads": "16",
- "time": "18:00:00",
- "mem": "64G"
- },
-
-
- "sobdetect1": {
- "threads": "4",
- "time": "24:00:00",
- "mem": "32G"
- },
- "sobdetect_cohort_params": {
- "threads": "4",
- "time": "12:00:00",
- "mem": "32G"
- },
- "sobdetect2": {
- "threads": "4",
- "time": "24:00:00",
- "mem": "32G"
- },
- "sobdetect_metrics": {
- "threads": "4",
- "time": "12:00:00",
- "mem": "32G"
- },
-
- "collectvariantcallmetrics": {
- "threads": "2",
- "time": "12:00:00"
- },
+ "sobdetect1": {
+ "threads": "4",
+ "time": "24:00:00",
+ "mem": "32G"
+ },
+ "sobdetect_cohort_params": {
+ "threads": "4",
+ "time": "12:00:00",
+ "mem": "32G"
+ },
+ "sobdetect2": {
+ "threads": "4",
+ "time": "24:00:00",
+ "mem": "32G"
+ },
+ "sobdetect_metrics": {
+ "threads": "4",
+ "time": "12:00:00",
+ "mem": "32G"
+ },
- "bam2fastq": {
- "threads": "32",
- "time": "12:00:00",
- "mem": "48G"
- },
- "merge_bams": {
- "threads": "24",
- "time": "12:00:00",
- "mem": "48G"
- },
- "merge_bam": {
- "threads": "16",
- "time": "12:00:00",
- "mem": "48G"
- },
- "trimmomatic": {
- "threads": "24",
- "time": "8:00:00",
- "mem": "24G"
- },
- "presort": {
- "threads": "32",
- "time": "24:00:00",
- "mem": "48G"
- },
- "bwa_mem": {
- "threads": "24",
- "mem": "96G"
- },
- "picard_headers": {
- "threads": "2",
- "time": "24:00:00"
- },
- "markdups": {
- "threads": "32",
- "time": "24:00:00"
- },
- "gatk_realign": {
- "threads": "2",
- "time": "48:00:00"
- },
- "gatk_recal": {
- "threads": "4",
- "time": "48:00:00",
- "mem": "72G"
- },
- "recal_1": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_2": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_3": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_4": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "recal_5": {
- "threads": "2",
- "time": "24:00:00",
- "mem": "72G"
- },
- "novocraft_sort": {
- "threads": "32",
- "time": "12:00:00",
- "mem": "96G"
- },
- "qc": {
- "threads": "2",
- "partition": "norm",
- "time": "12:00:00"
- },
- "ped": {
- "threads": "2",
- "partition": "norm",
- "time": "12:00:00"
- },
- "inbreeding": {
- "threads": "8",
- "partition": "norm"
- },
- "cnv_post": {
- "threads": "8",
- "partition": "norm",
- "time": "12:00:00"
- },
- "hla": {
- "threads": "8",
- "partition": "norm",
- "mem": "96G"
- },
- "qualimap": {
- "threads": "24",
- "mem": "64G",
- "time": "24:00:00"
- },
- "qualiremap": {
- "threads": "24",
- "mem": "72G",
- "time": "12:00:00"
- },
- "varianteval":{
- "threads":"16",
- "mem":"32G",
- "time": "12:00:00"
- },
- "fastqc_raw":{
- "threads":"8",
- "mem":"12G",
- "time": "4:00:00"
- },
- "fastqc_trimmed":{
- "threads":"8",
- "mem":"12G",
- "time": "4:00:00"
- },
- "Gatk_SelectVariants":{
- "threads":"2",
- "mem":"24G",
- "time": "12:00:00"
- },
- "mutect2_filter":{
- "threads":"2",
- "mem":"24G",
- "time": "12:00:00"
- },
- "SNPeff":{
- "mem":"24G",
- "time": "12:00:00"
- },
- "haplotypecaller":{
- "threads":"4",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "mergegvcfs":{
- "threads":"4",
- "mem":"48G",
- "time": "5-00:00:00"
- },
- "merge_chrom":{
- "threads":"8",
- "mem":"48G",
- "time": "1-00:00:00"
- },
- "genotype":{
- "threads":"2",
- "mem":"96G",
- "time": "3-00:00:00"
- },
- "Gatk_Variantfilter":{
- "threads":"2",
- "mem":"32G",
- "time": "24:00:00"
- },
- "apply_snp":{
- "threads":"4",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "apply_indel":{
- "threads":"4",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "gtype_refinement":{
- "threads":"2",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "admixture_germline":{
- "threads":"32",
- "mem":"48G",
- "time": "2-00:00:00"
- },
- "hlamut":{
- "threads":"8",
- "mem":"48G",
- "time": "2-00:00:00"
- }
+ "collectvariantcallmetrics": {
+ "threads": "2",
+ "time": "12:00:00"
+ },
+
+ "bam2fastq": {
+ "threads": "32",
+ "time": "12:00:00",
+ "mem": "48G"
+ },
+ "merge_bams": {
+ "threads": "24",
+ "time": "12:00:00",
+ "mem": "48G"
+ },
+ "merge_bam": {
+ "threads": "16",
+ "time": "12:00:00",
+ "mem": "48G"
+ },
+ "trimmomatic": {
+ "threads": "24",
+ "time": "8:00:00",
+ "mem": "24G"
+ },
+ "presort": {
+ "threads": "32",
+ "time": "24:00:00",
+ "mem": "48G"
+ },
+ "bwa_mem": {
+ "threads": "24",
+ "mem": "96G"
+ },
+ "picard_headers": {
+ "threads": "2",
+ "time": "24:00:00"
+ },
+ "markdups": {
+ "threads": "32",
+ "time": "24:00:00"
+ },
+ "gatk_realign": {
+ "threads": "2",
+ "time": "48:00:00"
+ },
+ "gatk_recal": {
+ "threads": "4",
+ "time": "48:00:00",
+ "mem": "72G"
+ },
+ "recal_1": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_2": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_3": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_4": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "recal_5": {
+ "threads": "2",
+ "time": "24:00:00",
+ "mem": "72G"
+ },
+ "novocraft_sort": {
+ "threads": "32",
+ "time": "12:00:00",
+ "mem": "96G"
+ },
+ "qc": {
+ "threads": "2",
+ "partition": "norm",
+ "time": "12:00:00"
+ },
+ "ped": {
+ "threads": "2",
+ "partition": "norm",
+ "time": "12:00:00"
+ },
+ "inbreeding": {
+ "threads": "8",
+ "partition": "norm"
+ },
+ "cnv_post": {
+ "threads": "8",
+ "partition": "norm",
+ "time": "12:00:00"
+ },
+ "hla": {
+ "threads": "8",
+ "partition": "norm",
+ "mem": "96G"
+ },
+ "qualimap": {
+ "threads": "24",
+ "mem": "64G",
+ "time": "24:00:00"
+ },
+ "qualiremap": {
+ "threads": "24",
+ "mem": "72G",
+ "time": "12:00:00"
+ },
+ "varianteval": {
+ "threads": "16",
+ "mem": "32G",
+ "time": "12:00:00"
+ },
+ "fastqc_raw": {
+ "threads": "8",
+ "mem": "12G",
+ "time": "4:00:00"
+ },
+ "fastqc_trimmed": {
+ "threads": "8",
+ "mem": "12G",
+ "time": "4:00:00"
+ },
+ "Gatk_SelectVariants": {
+ "threads": "2",
+ "mem": "24G",
+ "time": "12:00:00"
+ },
+ "mutect2_filter": {
+ "threads": "2",
+ "mem": "24G",
+ "time": "12:00:00"
+ },
+ "SNPeff": {
+ "mem": "24G",
+ "time": "12:00:00"
+ },
+ "haplotypecaller": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "mergegvcfs": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "5-00:00:00"
+ },
+ "merge_chrom": {
+ "threads": "8",
+ "mem": "48G",
+ "time": "1-00:00:00"
+ },
+ "genotype": {
+ "threads": "2",
+ "mem": "96G",
+ "time": "3-00:00:00"
+ },
+ "Gatk_Variantfilter": {
+ "threads": "2",
+ "mem": "32G",
+ "time": "24:00:00"
+ },
+ "apply_snp": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "apply_indel": {
+ "threads": "4",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "gtype_refinement": {
+ "threads": "2",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "admixture_germline": {
+ "threads": "32",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ },
+ "hlamut": {
+ "threads": "8",
+ "mem": "48G",
+ "time": "2-00:00:00"
+ }
}
diff --git a/config/config.json b/config/config.json
index 5ad802d..c4f7d98 100644
--- a/config/config.json
+++ b/config/config.json
@@ -1,37 +1,49 @@
{
"input_params": {
- "FASTQ_SOURCE": "",
- "BAM_SOURCE": "",
- "TN_MODE": "auto",
- "PAIRS_FILE": "",
- "VARIANT_CALLERS": ["mutect2","vardict","mutect","strelka","varscan"],
- "BASE_OUTDIR": "pipeline_output",
- "OTHER_SOMATIC_VCFS": {},
- "EXOME_TARGETS": "/data/CCBR_Pipeliner/db/PipeDB/lib/Agilent_SSv7_allExons_hg38.bed",
- "GERMLINE": "false",
- "FFPE_FILTER": "false",
- "CNV_CALLING": "false",
- "tmpdisk": "",
- "genome": ""
+ "FASTQ_SOURCE": "",
+ "BAM_SOURCE": "",
+ "TN_MODE": "auto",
+ "PAIRS_FILE": "",
+ "VARIANT_CALLERS": [
+ "mutect2",
+ "vardict",
+ "mutect",
+ "strelka",
+ "varscan"
+ ],
+ "BASE_OUTDIR": "pipeline_output",
+ "OTHER_SOMATIC_VCFS": {},
+ "EXOME_TARGETS": "/data/CCBR_Pipeliner/db/PipeDB/lib/Agilent_SSv7_allExons_hg38.bed",
+ "GERMLINE": "false",
+ "FFPE_FILTER": "false",
+ "CNV_CALLING": "false",
+ "tmpdisk": "",
+ "genome": ""
},
"input_params_test": {
- "FASTQ_SOURCE": "/data/tandonm/pl_test_data/human/fastq",
- "BAM_SOURCE": "/data/tandonm/pl_test_data/human/bams",
- "PAIRS_FILE": "pairs.tsv",
- "VARIANT_CALLERS": ["mutect2","mutect","strelka","vardict","varscan"],
- "BASE_OUTDIR": "pipe_out_1",
- "OTHER_SOMATIC_VCFS": {},
- "EXOME_TARGETS": "/data/CCBR_Pipeliner/db/PipeDB/lib/Agilent_SSv7_allExons_hg38.bed",
- "FFPE_FILTER": "false",
- "CNV_CALLING": "false"
+ "FASTQ_SOURCE": "/data/tandonm/pl_test_data/human/fastq",
+ "BAM_SOURCE": "/data/tandonm/pl_test_data/human/bams",
+ "PAIRS_FILE": "pairs.tsv",
+ "VARIANT_CALLERS": [
+ "mutect2",
+ "mutect",
+ "strelka",
+ "vardict",
+ "varscan"
+ ],
+ "BASE_OUTDIR": "pipe_out_1",
+ "OTHER_SOMATIC_VCFS": {},
+ "EXOME_TARGETS": "/data/CCBR_Pipeliner/db/PipeDB/lib/Agilent_SSv7_allExons_hg38.bed",
+ "FFPE_FILTER": "false",
+ "CNV_CALLING": "false"
},
"output_params": {
- "FASTQ": "fastqs",
- "BAM": "bams",
- "MERGED_SOMATIC_OUTDIR": "merged_somatic_variants",
- "GERMLINE_VCF": {
- "GATK": "germline_joint_genotyping"
- }
+ "FASTQ": "fastqs",
+ "BAM": "bams",
+ "MERGED_SOMATIC_OUTDIR": "merged_somatic_variants",
+ "GERMLINE_VCF": {
+ "GATK": "germline_joint_genotyping"
+ }
},
"scripts": {
"vcf2maf_wrapper": "workflow/scripts/vcf2maf_wrapper.bash",
@@ -44,11 +56,11 @@
"correct_target_bed": "workflow/scripts/correct_target_bed.py",
"genderPrediction": "workflow/scripts/RScripts/predictGender.R",
"combineSamples": "workflow/scripts/RScripts/combineAllSampleCompareResults.R",
- "ancestry": "workflow/scripts/RScripts/sampleCompareAncestoryPlots.R"
+ "ancestry": "workflow/scripts/RScripts/sampleCompareAncestryPlots.R"
},
"available_somatic_callers": {
- "paired": ["mutect2","strelka","mutect","vardict","varscan"],
- "tumor_only": ["mutect2","mutect","vardict","varscan"]
+ "paired": ["mutect2", "strelka", "mutect", "vardict", "varscan"],
+ "tumor_only": ["mutect2", "mutect", "vardict", "varscan"]
},
"not_used": {
"1000G": "/data/GRIS_NCBR/resources/ALL.GRCh38_sites.nuclear.20170504.vcf.gz",
@@ -57,6 +69,5 @@
"PATTERNS": "/data/GRIS_NCBR/resources/multiqc_config_file.yaml",
"REGIONS": "/data/GRIS_NCBR/resources/HG19_vcrome2.1_with_PKv1_and_PKv2_with_SNPtrace.bed",
"SMOOVEEXCLUSIONS": "/data/GRIS_NCBR/resources/hg38_smoove_exclusion.bed"
-
}
-}
\ No newline at end of file
+}
diff --git a/config/genomes/hg38.biowulf.json b/config/genomes/hg38.biowulf.json
index b4237b0..640bafb 100644
--- a/config/genomes/hg38.biowulf.json
+++ b/config/genomes/hg38.biowulf.json
@@ -1,49 +1,75 @@
{
- "references": {
- "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.biowulf.conf",
- "KRAKENBACDB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
- "trimmomatic.adapters": "resources/adapters.fa",
- "SNPEFF_GENOME": "GRCh38.86",
- "SNPEFF_CONFIG": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/snpEff/4.3t/snpEff.config",
- "SNPEFF_BUNDLE": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/snpEff/4.3t/",
- "BWAGENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
- "GENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
- "GENOMEDICT": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.dict",
- "DBSNP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
- "KNOWNANCESTRY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_phase3_autosomes.hg38.vcf.gz",
- "KNOWNINDELS": "-known /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -known /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
- "KNOWNRECAL": "--known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
- "KNOWNSNPS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
- "HAPMAP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/hapmap_3.3.hg38.vcf.gz",
- "OMNI": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_omni2.5.hg38.vcf.gz",
- "MILLS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
- "AXIOM": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
- "PON": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
- "COSMIC": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz",
- "DBSNP_COSMIC": "--cosmic /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz --dbsnp /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
- "CONTAMINATION": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ExomeContam.hg38.vcf.gz",
- "GNOMAD": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
- "GERMLINERESOURCE": "--germline-resource /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
- "FREECLENGTHS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/hg38.filtered.fa.fai",
- "FREECCHROMS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/Chromosomes",
- "FREECPILEUP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
- "FREECSNPS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
- "SEQUENZAGC": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz",
- "ADMIXTUREKEY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_superpop_key.txt",
- "ADMIXTUREREFS": "5",
- "MAF_FILTERVCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/VCF2MAF/ExAC_nonTCGA.r0.3.1.sites.vep.GRCh38.vcf.gz",
- "MAF_GENOME": "hg38",
- "chroms" : ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY","chrM"],
- "VCF2MAF": {
- "VEPRESOURCEBUNDLEPATH": "/fdb/VEP/102/cache",
- "GENOME_BUILD": "GRCh38",
- "SPECIES": "homo_sapiens"
- },
- "SOMALIER": {
- "ANCESTRY_DB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/somalier/1kg-somalier",
- "SITES_VCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/somalier/sites.hg38.vcf.gz",
- "NCBI_BUILD": "GRCh38",
- "SPECIES": "homo_sapiens"
- }
- }
+ "references": {
+ "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.biowulf.conf",
+ "KRAKENBACDB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
+ "trimmomatic.adapters": "resources/adapters.fa",
+ "SNPEFF_GENOME": "GRCh38.86",
+ "SNPEFF_CONFIG": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/snpEff/4.3t/snpEff.config",
+ "SNPEFF_BUNDLE": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/snpEff/4.3t/",
+ "BWAGENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
+ "GENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
+ "GENOMEDICT": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.dict",
+ "DBSNP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
+ "KNOWNANCESTRY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_phase3_autosomes.hg38.vcf.gz",
+ "KNOWNINDELS": "-known /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -known /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
+ "KNOWNRECAL": "--known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
+ "KNOWNSNPS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+ "HAPMAP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/hapmap_3.3.hg38.vcf.gz",
+ "OMNI": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_omni2.5.hg38.vcf.gz",
+ "MILLS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
+ "AXIOM": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
+ "PON": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
+ "COSMIC": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz",
+ "DBSNP_COSMIC": "--cosmic /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz --dbsnp /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
+ "CONTAMINATION": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ExomeContam.hg38.vcf.gz",
+ "GNOMAD": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
+ "GERMLINERESOURCE": "--germline-resource /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
+ "FREECLENGTHS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/hg38.filtered.fa.fai",
+ "FREECCHROMS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/Chromosomes",
+ "FREECPILEUP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
+ "FREECSNPS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
+ "SEQUENZAGC": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz",
+ "ADMIXTUREKEY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_superpop_key.txt",
+ "ADMIXTUREREFS": "5",
+ "MAF_FILTERVCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/VCF2MAF/ExAC_nonTCGA.r0.3.1.sites.vep.GRCh38.vcf.gz",
+ "MAF_GENOME": "hg38",
+ "chroms": [
+ "chr1",
+ "chr2",
+ "chr3",
+ "chr4",
+ "chr5",
+ "chr6",
+ "chr7",
+ "chr8",
+ "chr9",
+ "chr10",
+ "chr11",
+ "chr12",
+ "chr13",
+ "chr14",
+ "chr15",
+ "chr16",
+ "chr17",
+ "chr18",
+ "chr19",
+ "chr20",
+ "chr21",
+ "chr22",
+ "chrX",
+ "chrY",
+ "chrM"
+ ],
+ "VCF2MAF": {
+ "VEPRESOURCEBUNDLEPATH": "/fdb/VEP/102/cache",
+ "GENOME_BUILD": "GRCh38",
+ "SPECIES": "homo_sapiens"
+ },
+ "SOMALIER": {
+ "ANCESTRY_DB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/somalier/1kg-somalier",
+ "SITES_VCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/somalier/sites.hg38.vcf.gz",
+ "NCBI_BUILD": "GRCh38",
+ "SPECIES": "homo_sapiens"
+ }
+ }
}
diff --git a/config/genomes/hg38.frce.json b/config/genomes/hg38.frce.json
index ff5e4f4..46e2805 100755
--- a/config/genomes/hg38.frce.json
+++ b/config/genomes/hg38.frce.json
@@ -1,50 +1,76 @@
{
- "references": {
- "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.frce.conf",
- "KRAKENBACDB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
- "trimmomatic.adapters": "resources/adapters.fa",
- "SNPEFF_GENOME": "GRCh38.86",
- "SNPEFF_CONFIG": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/snpEff/4.3t/snpEff.config",
- "SNPEFF_BUNDLE": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/snpEff/4.3t/",
- "BWAGENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
- "GENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
- "GENOMEDICT": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.dict",
- "DBSNP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
- "KNOWNANCESTRY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_phase3_autosomes.hg38.vcf.gz",
- "KNOWNINDELS": "-known /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -known /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
- "KNOWNRECAL": "--known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
- "1000GSNP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
- "KNOWNSNPS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
- "HAPMAP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/hapmap_3.3.hg38.vcf.gz",
- "OMNI": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_omni2.5.hg38.vcf.gz",
- "MILLS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
- "AXIOM": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
- "PON": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
- "COSMIC": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz",
- "DBSNP_COSMIC": "--cosmic /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz --dbsnp /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
- "CONTAMINATION": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ExomeContam.hg38.vcf.gz",
- "GNOMAD": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
- "GERMLINERESOURCE": "--germline-resource /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
- "FREECLENGTHS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/hg38.filtered.fa.fai",
- "FREECCHROMS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/Chromosomes",
- "FREECPILEUP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
- "FREECSNPS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
- "SEQUENZAGC": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz",
- "ADMIXTUREKEY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_superpop_key.txt",
- "ADMIXTUREREFS": "5",
- "MAF_FILTERVCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/VCF2MAF/ExAC_nonTCGA.r0.3.1.sites.vep.GRCh38.vcf.gz",
- "MAF_GENOME": "hg38",
- "chroms" : ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY","chrM"],
- "VCF2MAF": {
- "VEPRESOURCEBUNDLEPATH": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/vcf2maf/VEP_tarballs/.vep",
- "GENOME_BUILD": "GRCh38",
- "SPECIES": "homo_sapiens"
- },
- "SOMALIER": {
- "ANCESTRY_DB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/somalier/1kg-somalier",
- "SITES_VCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/somalier/sites.hg38.vcf.gz",
- "NCBI_BUILD": "GRCh38",
- "SPECIES": "homo_sapiens"
- }
- }
+ "references": {
+ "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.frce.conf",
+ "KRAKENBACDB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
+ "trimmomatic.adapters": "resources/adapters.fa",
+ "SNPEFF_GENOME": "GRCh38.86",
+ "SNPEFF_CONFIG": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/snpEff/4.3t/snpEff.config",
+ "SNPEFF_BUNDLE": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/snpEff/4.3t/",
+ "BWAGENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
+ "GENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.fasta",
+ "GENOMEDICT": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/genome/Homo_sapiens_assembly38.dict",
+ "DBSNP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
+ "KNOWNANCESTRY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_phase3_autosomes.hg38.vcf.gz",
+ "KNOWNINDELS": "-known /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -known /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
+ "KNOWNRECAL": "--known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz",
+ "1000GSNP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+ "KNOWNSNPS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+ "HAPMAP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/hapmap_3.3.hg38.vcf.gz",
+ "OMNI": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/1000G_omni2.5.hg38.vcf.gz",
+ "MILLS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
+ "AXIOM": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
+ "PON": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
+ "COSMIC": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz",
+ "DBSNP_COSMIC": "--cosmic /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/COSMIC/COSMIC_82_hg38.vcf.gz --dbsnp /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz",
+ "CONTAMINATION": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ExomeContam.hg38.vcf.gz",
+ "GNOMAD": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
+ "GERMLINERESOURCE": "--germline-resource /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
+ "FREECLENGTHS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/hg38.filtered.fa.fai",
+ "FREECCHROMS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/Chromosomes",
+ "FREECPILEUP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
+ "FREECSNPS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/FREEC/dbsnp_146.hg38.SingleDiNucl.IDs.vcf",
+ "SEQUENZAGC": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz",
+ "ADMIXTUREKEY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/1k_genomes/1k_genomes_superpop_key.txt",
+ "ADMIXTUREREFS": "5",
+ "MAF_FILTERVCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/VCF2MAF/ExAC_nonTCGA.r0.3.1.sites.vep.GRCh38.vcf.gz",
+ "MAF_GENOME": "hg38",
+ "chroms": [
+ "chr1",
+ "chr2",
+ "chr3",
+ "chr4",
+ "chr5",
+ "chr6",
+ "chr7",
+ "chr8",
+ "chr9",
+ "chr10",
+ "chr11",
+ "chr12",
+ "chr13",
+ "chr14",
+ "chr15",
+ "chr16",
+ "chr17",
+ "chr18",
+ "chr19",
+ "chr20",
+ "chr21",
+ "chr22",
+ "chrX",
+ "chrY",
+ "chrM"
+ ],
+ "VCF2MAF": {
+ "VEPRESOURCEBUNDLEPATH": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/vcf2maf/VEP_tarballs/.vep",
+ "GENOME_BUILD": "GRCh38",
+ "SPECIES": "homo_sapiens"
+ },
+ "SOMALIER": {
+ "ANCESTRY_DB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/somalier/1kg-somalier",
+ "SITES_VCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/somalier/sites.hg38.vcf.gz",
+ "NCBI_BUILD": "GRCh38",
+ "SPECIES": "homo_sapiens"
+ }
+ }
}
diff --git a/config/genomes/mm10.biowulf.json b/config/genomes/mm10.biowulf.json
index e510058..91cc0aa 100644
--- a/config/genomes/mm10.biowulf.json
+++ b/config/genomes/mm10.biowulf.json
@@ -1,42 +1,64 @@
{
- "references": {
- "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.biowulf.conf",
- "KRAKENBACDB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
- "trimmomatic.adapters": "resources/adapters.fa",
- "SNPEFF_GENOME": "GRCm38.86",
- "SNPEFF_CONFIG": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/snpEff/4.3t/snpEff.config",
- "SNPEFF_BUNDLE": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/snpEff/4.3t/",
- "BWAGENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
- "GENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
- "GENOMEDICT": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.dict",
- "DBSNP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
- "DBSNP_COSMIC": "--dbsnp /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
- "KNOWNANCESTRY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
- "KNOWNINDELS": "-known /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz",
- "KNOWNRECAL": "-known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz -known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
- "KNOWNSNPS":"/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_knownSNPs_sites.vcf.gz",
- "GERMLINERESOURCE":"",
- "FREECLENGTHS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/mm10.fa.fai",
- "PON": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_dbSNP_allStrains_compSet_noIND.vcf.gz",
- "FREECCHROMS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/Chromosomes",
- "FREECPILEUP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.bed",
- "FREECSNPS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.txt.gz",
- "SEQUENZAGC": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/SEQUENZA/mm10.gc50Base.wig.gz",
- "ADMIXTUREKEY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/mouse_genomes_strain_key.txt",
- "ADMIXTUREREFS": "39",
- "MAF_GENOME": "mm10",
- "MAF_FILTERVCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
- "chroms": ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chrX","chrY","chrM"],
- "VCF2MAF": {
- "VEPRESOURCEBUNDLEPATH": "/fdb/VEP/102/cache",
- "GENOME_BUILD": "GRCm38",
- "SPECIES": "mus_musculus"
- },
- "SOMALIER": {
- "SITES_VCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/somalier/mm10.sites.vcf.gz",
- "NCBI_BUILD": "GRCm38",
- "SPECIES": "mus_musculus"
- }
- }
+ "references": {
+ "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.biowulf.conf",
+ "KRAKENBACDB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
+ "trimmomatic.adapters": "resources/adapters.fa",
+ "SNPEFF_GENOME": "GRCm38.86",
+ "SNPEFF_CONFIG": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/snpEff/4.3t/snpEff.config",
+ "SNPEFF_BUNDLE": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/snpEff/4.3t/",
+ "BWAGENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
+ "GENOME": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
+ "GENOMEDICT": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.dict",
+ "DBSNP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
+ "DBSNP_COSMIC": "--dbsnp /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
+ "KNOWNANCESTRY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
+ "KNOWNINDELS": "-known /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz",
+ "KNOWNRECAL": "-known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz -known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
+ "KNOWNSNPS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_knownSNPs_sites.vcf.gz",
+ "GERMLINERESOURCE": "",
+ "FREECLENGTHS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/mm10.fa.fai",
+ "PON": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_dbSNP_allStrains_compSet_noIND.vcf.gz",
+ "FREECCHROMS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/Chromosomes",
+ "FREECPILEUP": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.bed",
+ "FREECSNPS": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.txt.gz",
+ "SEQUENZAGC": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/SEQUENZA/mm10.gc50Base.wig.gz",
+ "ADMIXTUREKEY": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/mouse_genomes_strain_key.txt",
+ "ADMIXTUREREFS": "39",
+ "MAF_GENOME": "mm10",
+ "MAF_FILTERVCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
+ "chroms": [
+ "chr1",
+ "chr2",
+ "chr3",
+ "chr4",
+ "chr5",
+ "chr6",
+ "chr7",
+ "chr8",
+ "chr9",
+ "chr10",
+ "chr11",
+ "chr12",
+ "chr13",
+ "chr14",
+ "chr15",
+ "chr16",
+ "chr17",
+ "chr18",
+ "chr19",
+ "chrX",
+ "chrY",
+ "chrM"
+ ],
+ "VCF2MAF": {
+ "VEPRESOURCEBUNDLEPATH": "/fdb/VEP/102/cache",
+ "GENOME_BUILD": "GRCm38",
+ "SPECIES": "mus_musculus"
+ },
+ "SOMALIER": {
+ "SITES_VCF": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/mm10/somalier/mm10.sites.vcf.gz",
+ "NCBI_BUILD": "GRCm38",
+ "SPECIES": "mus_musculus"
+ }
+ }
}
-
diff --git a/config/genomes/mm10.frce.json b/config/genomes/mm10.frce.json
index 57418e2..f6b5a9d 100644
--- a/config/genomes/mm10.frce.json
+++ b/config/genomes/mm10.frce.json
@@ -1,42 +1,64 @@
{
- "references": {
- "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.frce.conf",
- "KRAKENBACDB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
- "trimmomatic.adapters": "resources/adapters.fa",
- "SNPEFF_GENOME": "GRCm38.86",
- "SNPEFF_CONFIG": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/snpEff/4.3t/snpEff.config",
- "SNPEFF_BUNDLE": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/snpEff/4.3t/",
- "BWAGENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
- "GENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
- "GENOMEDICT": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.dict",
- "DBSNP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
- "DBSNP_COSMIC": "--dbsnp /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
- "KNOWNANCESTRY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
- "KNOWNINDELS": "-known/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz",
- "KNOWNRECAL": "-known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz -known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
- "KNOWNSNPS":"/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_knownSNPs_sites.vcf.gz",
- "GERMLINERESOURCE":"",
- "FREECLENGTHS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/mm10.fa.fai",
- "PON": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_dbSNP_allStrains_compSet_noIND.vcf.gz",
- "FREECCHROMS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/Chromosomes",
- "FREECPILEUP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.bed",
- "FREECSNPS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.txt.gz",
- "SEQUENZAGC": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/SEQUENZA/mm10.gc50Base.wig.gz",
- "ADMIXTUREKEY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/mouse_genomes_strain_key.txt",
- "ADMIXTUREREFS": "39",
- "MAF_GENOME": "mm10",
- "MAF_FILTERVCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
- "chroms": ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chrX","chrY","chrM"],
- "VCF2MAF": {
- "VEPRESOURCEBUNDLEPATH": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/vcf2maf",
- "GENOME_BUILD": "GRCm38",
- "SPECIES": "mus_musculus"
- },
- "SOMALIER": {
- "SITES_VCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/somalier/mm10.sites.vcf.gz",
- "NCBI_BUILD": "GRCm38",
- "SPECIES": "mus_musculus"
- }
- }
+ "references": {
+ "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.frce.conf",
+ "KRAKENBACDB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
+ "trimmomatic.adapters": "resources/adapters.fa",
+ "SNPEFF_GENOME": "GRCm38.86",
+ "SNPEFF_CONFIG": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/snpEff/4.3t/snpEff.config",
+ "SNPEFF_BUNDLE": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/snpEff/4.3t/",
+ "BWAGENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
+ "GENOME": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.fa",
+ "GENOMEDICT": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/genome/bwaindex/genome.dict",
+ "DBSNP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
+ "DBSNP_COSMIC": "--dbsnp /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
+ "KNOWNANCESTRY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
+ "KNOWNINDELS": "-known /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz",
+ "KNOWNRECAL": "-known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_indels.vcf.gz -known-sites /mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_known_snps.vcf.gz",
+ "KNOWNSNPS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_knownSNPs_sites.vcf.gz",
+ "GERMLINERESOURCE": "",
+ "FREECLENGTHS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/mm10.fa.fai",
+ "PON": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_dbSNP_allStrains_compSet_noIND.vcf.gz",
+ "FREECCHROMS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/Chromosomes",
+ "FREECPILEUP": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.bed",
+ "FREECSNPS": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/FREEC/mm10_dbSNP137.ucsc.freec.txt.gz",
+ "SEQUENZAGC": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/SEQUENZA/mm10.gc50Base.wig.gz",
+ "ADMIXTUREKEY": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/mouse_genomes_strain_key.txt",
+ "ADMIXTUREREFS": "39",
+ "MAF_GENOME": "mm10",
+ "MAF_FILTERVCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/dbsnp/mm10_allstrains_dbSNP142.vcf.gz",
+ "chroms": [
+ "chr1",
+ "chr2",
+ "chr3",
+ "chr4",
+ "chr5",
+ "chr6",
+ "chr7",
+ "chr8",
+ "chr9",
+ "chr10",
+ "chr11",
+ "chr12",
+ "chr13",
+ "chr14",
+ "chr15",
+ "chr16",
+ "chr17",
+ "chr18",
+ "chr19",
+ "chrX",
+ "chrY",
+ "chrM"
+ ],
+ "VCF2MAF": {
+ "VEPRESOURCEBUNDLEPATH": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/vcf2maf",
+ "GENOME_BUILD": "GRCm38",
+ "SPECIES": "mus_musculus"
+ },
+ "SOMALIER": {
+ "SITES_VCF": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/mm10/somalier/mm10.sites.vcf.gz",
+ "NCBI_BUILD": "GRCm38",
+ "SPECIES": "mus_musculus"
+ }
+ }
}
-
diff --git a/config/pairs.mixed.tsv b/config/pairs.mixed.tsv
index f945458..9d210fc 100644
--- a/config/pairs.mixed.tsv
+++ b/config/pairs.mixed.tsv
@@ -4,4 +4,3 @@ Sample11_ACI_158_S38.test Sample4_CRL1622_S31.test
Sample4_CRL1622_S31.test
Sample10_ARK1_S37.test
Sample11_ACI_158_S38.test
-
diff --git a/config/templates/project.json b/config/templates/project.json
index e271a0f..3f02603 100644
--- a/config/templates/project.json
+++ b/config/templates/project.json
@@ -1,16 +1,16 @@
{
- "project": {
- "filetype": "",
- "id": "CCBR_XAVIER_Pipeline",
- "nends": -1,
- "pairs": {},
- "pfamily": "dnaseq",
- "pipehome": "",
- "userhome": "",
- "pipeline": "XAVIER",
- "samples": {},
- "username": "",
- "version": "",
- "workpath": ""
- }
+ "project": {
+ "filetype": "",
+ "id": "CCBR_XAVIER_Pipeline",
+ "nends": -1,
+ "pairs": {},
+ "pfamily": "dnaseq",
+ "pipehome": "",
+ "userhome": "",
+ "pipeline": "XAVIER",
+ "samples": {},
+ "username": "",
+ "version": "",
+ "workpath": ""
+ }
}
diff --git a/config/templates/tools.json b/config/templates/tools.json
index 289e130..982c6a5 100644
--- a/config/templates/tools.json
+++ b/config/templates/tools.json
@@ -1,5 +1,5 @@
{
- "tools": {
+ "tools": {
"trimmomatic": {
"version": "0.39",
"modname": "trimmomatic/0.39"
diff --git a/docker/README.md b/docker/README.md
index 3e6a9a8..a23ecdf 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -25,7 +25,7 @@ docker image ls
docker push skchronicles/ccbr_example:v0.1.0
docker push skchronicles/ccbr_example:latest
docker push nciccbr/ccbr_example:v0.1.0
-docker push nciccbr/ccbr_example:latest
+docker push nciccbr/ccbr_example:latest
```
### Other Recommended Steps
diff --git a/docker/mutect/Dockerfile b/docker/mutect/Dockerfile
index eb251da..5469283 100644
--- a/docker/mutect/Dockerfile
+++ b/docker/mutect/Dockerfile
@@ -4,12 +4,12 @@ FROM ubuntu:14.04
LABEL maintainer=kuhnsa@nih.gov
-# Create Container filesystem specific
-# working directory and opt directories
+# Create Container filesystem specific
+# working directory and opt directories
RUN mkdir -p /opt2 && mkdir -p /data2
-WORKDIR /opt2
+WORKDIR /opt2
-# Set time zone to US east coast
+# Set time zone to US east coast
ENV TZ=America/New_York
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
&& echo $TZ > /etc/timezone
@@ -17,7 +17,7 @@ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
# This section installs system packages required for your project
# If you need extra system packages add them here.
# Strelka strictly requires python/2.7,
-# it is not compatiable with any other
+# it is not compatible with any other
# version of python.
# Installs python/2.7.16
RUN apt-get update \
@@ -35,10 +35,10 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Install MuTect/1.1.7, from my mirror with a pre-compiled JAR file
-# Requires java7 or jdk 1.7 for the pre-compiled JAR file (already satified)
-# openjdk-7-jdk not a part of ubuntu >= 18.04, so installation using
+# Requires java7 or jdk 1.7 for the pre-compiled JAR file (already satisfied)
+# openjdk-7-jdk not a part of ubuntu >= 18.04, so installation using
# an older LTS version of Ubunutu is required
-# Setting environment variables: MUTECT_JAR and MUTECT_JARPATH to
+# Setting environment variables: MUTECT_JAR and MUTECT_JARPATH to
# mirror HPC module on Biowulf
RUN git clone https://github.com/skchronicles/mirror-muTect-1.1.7.git
ENV PATH="/opt2/mirror-muTect-1.1.7:$PATH"
@@ -51,4 +51,4 @@ RUN ln -fs /usr/lib/jvm/java-7-openjdk-amd64/bin/java /usr/bin/java7
# Reset working directory
ADD Dockerfile /opt2/
RUN chmod -R a+rwX /opt2
-WORKDIR /data2
\ No newline at end of file
+WORKDIR /data2
diff --git a/docker/vcf2maf/Dockerfile b/docker/vcf2maf/Dockerfile
index 388c76b..2ccbc28 100644
--- a/docker/vcf2maf/Dockerfile
+++ b/docker/vcf2maf/Dockerfile
@@ -2,33 +2,33 @@
# hub.docker.com/r/ensemblorg/ensembl-vep/tags
# VEP release 102 matches GENCODE v36 for GRCh38
# This GENCODE release matches the upcoming major
-# relase to GDC GENCODE gene reference model.
-# The GDC will be replacing the files in the
-# GDC data portal that were generated using
-# GENCODE v22 with files that were generated
+# release to GDC GENCODE gene reference model.
+# The GDC will be replacing the files in the
+# GDC data portal that were generated using
+# GENCODE v22 with files that were generated
# using GENCODE v36. GDC will release the
# reprocessed data on 11/30/2021.
# @ubuntu/18.04
-# @Dockerfile:
+# @Dockerfile:
# github.com/Ensembl/ensembl-vep/blob/release/102/docker/Dockerfile
FROM ensemblorg/ensembl-vep:release_102.0
LABEL maintainer=kuhnsa@nih.gov
-# Create Container filesystem specific
+# Create Container filesystem specific
# working directory and opt directories
USER root
RUN mkdir -p /opt2 && mkdir -p /data2
-WORKDIR /opt2
+WORKDIR /opt2
-# Set time zone to US east coast
+# Set time zone to US east coast
ENV TZ=America/New_York
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
&& echo $TZ > /etc/timezone
-# This section installs system
-# packages required for your
-# project. If you need extra
+# This section installs system
+# packages required for your
+# project. If you need extra
# system packages add them here.
# argparse.bash requires argparse
# python package.
@@ -98,4 +98,4 @@ RUN chmod -R a+rX /opt2 \
ENV PATH="$PATH:/opt2/bcftools-1.10:/opt2/samtools-1.10:/opt2/htslib-1.10:/opt2:/opt2/vcf2maf-1.6.21:/opt2/liftOver"
# Reset working directory
-WORKDIR /data2
\ No newline at end of file
+WORKDIR /data2
diff --git a/docker/vcf2maf/build.sh b/docker/vcf2maf/build.sh
index 7f9bfe0..4345c6e 100644
--- a/docker/vcf2maf/build.sh
+++ b/docker/vcf2maf/build.sh
@@ -19,4 +19,4 @@ docker tag ccbr_vcf2maf:v102.0.0 dnousome/ccbr_vcf2maf
#docker push nciccbr/ccbr_vcf2maf:latest
docker push dnousome/ccbr_vcf2maf:v102.0.0
-docker push dnousome/ccbr_vcf2maf:latest
\ No newline at end of file
+docker push dnousome/ccbr_vcf2maf:latest
diff --git a/docker/wes_base/Dockerfile b/docker/wes_base/Dockerfile
index a05e3b4..209eadd 100644
--- a/docker/wes_base/Dockerfile
+++ b/docker/wes_base/Dockerfile
@@ -3,12 +3,12 @@ FROM ubuntu:20.04
MAINTAINER
-# Create Container filesystem specific
-# working directory and opt directories
+# Create Container filesystem specific
+# working directory and opt directories
RUN mkdir -p /opt2 && mkdir -p /data2
-WORKDIR /opt2
+WORKDIR /opt2
-# Set time zone to US east coast
+# Set time zone to US east coast
ENV TZ=America/New_York
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
&& echo $TZ > /etc/timezone
@@ -19,7 +19,7 @@ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
RUN apt-get update \
&& apt-get -y upgrade \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y \
- bc \
+ bc \
build-essential \
bzip2 \
cmake \
@@ -92,8 +92,8 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \
zlibc
# Common bioinformatics tools
-# bwa/0.7.17 bowtie/1.2.3 bowtie2/2.3.5.1
-# bedtools/2.27.1 bedops/2.4.37 samtools/1.10
+# bwa/0.7.17 bowtie/1.2.3 bowtie2/2.3.5.1
+# bedtools/2.27.1 bedops/2.4.37 samtools/1.10
# bcftools/1.10.2 vcftools/0.1.16 trimmomatic/0.39
# tabix/1.10.2
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \
@@ -161,7 +161,7 @@ RUN wget https://github.com/BoevaLab/FREEC/archive/refs/tags/v11.6.zip \
&& cd /opt2/FREEC-11.6/src/ \
&& make
ENV PATH="/opt2/FREEC-11.6/src:$PATH"
-WORKDIR /opt2
+WORKDIR /opt2
# Install Sequenza-Utils/3.0.0 and Sequenza
# Requires R, Python, SAMtools, tabix (already satisfied)
@@ -251,10 +251,10 @@ ENV TRIMMOJAR="/usr/share/java/trimmomatic-0.39.jar"
WORKDIR /data2
# Clean-up step to reduce size
-# and install GNU awk to calculate mean and standard
-# deviation, ensures backward compatibility with
+# and install GNU awk to calculate mean and standard
+# deviation, ensures backward compatibility with
# biowulf installation of awk is a pointer to gawk,
-# and install pandoc (>= 1.12.3 required for Rmarkdown)
+# and install pandoc (>= 1.12.3 required for Rmarkdown)
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \
gawk \
pandoc \
diff --git a/docs/README.md b/docs/README.md
index d81c2b2..549a45b 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,8 +1,9 @@
-# XAVIER Pipeline Documentation
+# XAVIER Pipeline Documentation
> **Please Note:** When a commit is pushed to the `docs/` directory, it triggers a [github actions workflow](https://https://github.com/CCBR/XAVIER/actions) to build the static-site and push it to the gh-pages branch.
### Installation
+
```bash
# Clone the Repository
git clone https://github.com/CCBR/XAVIER.git
@@ -16,8 +17,10 @@ pip install --upgrade pip
pip install -r docs/requirements.txt
```
-### Preview while editing
-MkDocs includes a previewing server, so you can view your updates live and as you write your documentation. The server will automatically rebuild the site upon editing and saving a file.
+### Preview while editing
+
+MkDocs includes a previewing server, so you can view your updates live and as you write your documentation. The server will automatically rebuild the site upon editing and saving a file.
+
```bash
# Activate the virtual environment
. .venv/bin/activate
@@ -25,8 +28,10 @@ MkDocs includes a previewing server, so you can view your updates live and as yo
mkdocs serve
```
-### Build static site
-Once you are content with your changes, you can build the static site:
+### Build static site
+
+Once you are content with your changes, you can build the static site:
+
```bash
mkdocs build
```
diff --git a/docs/css/extra.css b/docs/css/extra.css
index 279fe1b..e1fe0b3 100644
--- a/docs/css/extra.css
+++ b/docs/css/extra.css
@@ -1,12 +1,16 @@
@keyframes heart {
- 0%, 40%, 80%, 100% {
- transform: scale(1);
- }
- 20%, 60% {
- transform: scale(1.15);
- }
+ 0%,
+ 40%,
+ 80%,
+ 100% {
+ transform: scale(1);
+ }
+ 20%,
+ 60% {
+ transform: scale(1.15);
+ }
}
.heart {
- animation: heart 1500ms infinite;
+ animation: heart 1500ms infinite;
}
diff --git a/docs/index.md b/docs/index.md
index 9bfcf48..d7afdb6 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,16 +1,18 @@
-# XAVIER 🔬 [![tests](https://github.com/CCBR/XAVIER/workflows/tests/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/main.yaml) [![docs](https://github.com/CCBR/XAVIER/workflows/docs/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/docs.yml) [![Docker Pulls](https://img.shields.io/docker/pulls/nciccbr/ccbr_wes_base)](https://hub.docker.com/r/nciccbr/ccbr_wes_base) [![GitHub issues](https://img.shields.io/github/issues/CCBR/XAVIER?color=brightgreen)](https://github.com/CCBR/XAVIER/issues) [![GitHub license](https://img.shields.io/github/license/CCBR/XAVIER)](https://github.com/CCBR/XAVIER/blob/main/LICENSE)
+# XAVIER 🔬 [![tests](https://github.com/CCBR/XAVIER/workflows/tests/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/main.yaml) [![docs](https://github.com/CCBR/XAVIER/workflows/docs/badge.svg)](https://github.com/CCBR/XAVIER/actions/workflows/docs.yml) [![Docker Pulls](https://img.shields.io/docker/pulls/nciccbr/ccbr_wes_base)](https://hub.docker.com/r/nciccbr/ccbr_wes_base) [![GitHub issues](https://img.shields.io/github/issues/CCBR/XAVIER?color=brightgreen)](https://github.com/CCBR/XAVIER/issues) [![GitHub license](https://img.shields.io/github/license/CCBR/XAVIER)](https://github.com/CCBR/XAVIER/blob/main/LICENSE)
> **_XAVIER - eXome Analysis and Variant explorER_**. **XAVIER** is an open-source, reproducible, and scalable solution for analyzing Whole Exome sequencing data. Its long-term goals: to accurately call germline and somatic variants, to infer CNVs, and to boldly annotate variants like no pipeline before!
---
+
## Overview
-Welcome to XAVIER's documentation! This guide is the main source of documentation for users that are getting started with the [XAVIER pipeline](https://github.com/CCBR/XAVIER).
-The **`xavier`** pipeline is composed several inter-related sub commands to setup and run the pipeline across different systems. Each of the available sub commands perform different functions:
+Welcome to XAVIER's documentation! This guide is the main source of documentation for users that are getting started with the [XAVIER pipeline](https://github.com/CCBR/XAVIER).
+
+The **`xavier`** pipeline is composed of several inter-related sub commands to set up and run the pipeline across different systems. Each of the available sub commands performs a different function:
- * [xavier run](usage/run.md): Run the XAVIER pipeline with your input files.
- * [xavier unlock](usage/unlock.md): Unlocks a previous runs output directory.
- * [xavier cache](usage/cache.md): Cache remote resources locally, coming soon!
+- [xavier run](usage/run.md): Run the XAVIER pipeline with your input files.
+- [xavier unlock](usage/unlock.md): Unlocks a previous run's output directory.
+- [xavier cache](usage/cache.md): Cache remote resources locally, coming soon!
XAVIER is a comprehensive whole exome-sequencing pipeline following the Broad's set of best practices. It relies on technologies like [Singularity1](https://singularity.lbl.gov/) to maintain the highest-level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake2](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster or cloud provider.
@@ -20,11 +22,11 @@ Before getting started, we highly recommend reading through the [usage](usage/ru
For more information about issues or trouble-shooting a problem, please checkout our [FAQ](faq/questions.md) prior to [opening an issue on Github](https://github.com/CCBR/XAVIER/issues).
-## Contribute
-
-This site is a living document, created for and by members like you. XAVIER is maintained by the members of CCBR and is improved by continous feedback! We encourage you to contribute new content and make improvements to existing content via pull request to our [GitHub repository :octicons-heart-fill-24:{ .heart }](https://github.com/CCBR/XAVIER/).
+## Contribute
+This site is a living document, created for and by members like you. XAVIER is maintained by the members of CCBR and is improved by continuous feedback! We encourage you to contribute new content and make improvements to existing content via pull request to our [GitHub repository :octicons-heart-fill-24:{ .heart }](https://github.com/CCBR/XAVIER/).
## References
-**1.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459.
-**2.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600.
+
+**1.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459.
+**2.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600.
diff --git a/docs/license.md b/docs/license.md
index 2f658c6..f75c7d6 100644
--- a/docs/license.md
+++ b/docs/license.md
@@ -1,6 +1,6 @@
# MIT License
-*Copyright (c) 2021 CCBR*
+_Copyright (c) 2021 CCBR_
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/docs/pipeline-details/assets/XAVIER_workflow.svg b/docs/pipeline-details/assets/XAVIER_workflow.svg
index 7bc7b16..e1e5199 100644
--- a/docs/pipeline-details/assets/XAVIER_workflow.svg
+++ b/docs/pipeline-details/assets/XAVIER_workflow.svg
@@ -1 +1 @@
-
\ No newline at end of file
+
diff --git a/docs/pipeline-details/methods.md b/docs/pipeline-details/methods.md
index b5ed553..409e711 100644
--- a/docs/pipeline-details/methods.md
+++ b/docs/pipeline-details/methods.md
@@ -1 +1,60 @@
-# Methods description
This page contains a description of all methods used in the pipeline, along with references for important tools.
**Note that depending on the settings used, not all of these methods may be applicable, so please adapt this text appropriately for your application.**
You can also download this text as a Word document (.docx) that contains an EndNote traveling library using the button below.
[![Download DOCX](https://custom-icon-badges.herokuapp.com/badge/-Download-blue?style=for-the-badge&logo=download&logoColor=white "Download Methods Document")](assets/wes_pipeline_methods.docx)
---
## Data preprocessing
Low-quality and adapters sequences are trimmed from the raw sequencing reads using Trimmomatic (v. 0.39)[^1]. Trimmed reads are then aligned to the human hg38 reference genome using BWA mapping software (v. 0.7.17)[^2]. Duplicate reads are marked using Samblaster (v. 0.1.25)[^3] and sorted using samtools (v. 1.8). Finally, base quality score recalibration is performed as indicated in the GATK4 (v. 4.2.2.0) best practices [^4].
## Germline variant calling
HaplotypeCaller from GATK4 (v. 4.2.2.0) is used to call germline variants, parallelized across chromosomes, and all samples in the cohort are joint genotyped together [^4],[^5].
## Somatic variant calling
Somatic variant calling (SNPs and Indels) is performed using Mutect (v. 1.1.7)[^6], Mutect2 (GATK v. 4.2.0)[^7], Strelka2 (v. 2.9.0)[^8], and VarDict (v. 1.4)[^9] in tumor-normal mode. Variants from all callers are merged using the CombineVariants tool from GATK version 3.8-1. Genomic, functional and consequence annotations are added using Variant Effect Predictor (VEP v. 99)[^10] and converted to Mutation Annotation Format (MAF) using the vcf2maf tool (v. 1.6.16)[^11].
For Copy Number Variants (CNVs), Control-Freec (v. 11.6)[^12] is used to generate pileups, which are used as input for the R package 'sequenza' (v. 3.0.0)[^13]. The complete Control-Freec workflow is then re-run using ploidy and cellularity estimates from 'sequenza'.
## FFPE Artifact filtering
SOBDetector is a tool that scores variants based on strand-orientation bias, which can be a sign of DNA damage caused by fixation of tissue. This pipeline runs SOBDetector in a two-pass method. The first pass uses parameters provided with the software (calculated from publicly available data from TCGA), then cohort-specific bias metrics are computed from those results, and SOBDetector is re-run using these cohort-specific values.
## Quality and identity metrics
Ancestry and relatedness scores are generated using Somalier (v. 0.2.13)[^14]. Contamination analyses are performed against viral and bacterial genomes from NCBI using Kraken2 (v. 2.1.2)[^15], as well as against mouse, human, and UniVec databases using FastQ Screen (v. 0.14.1)[^16]. Sequence, mapping and variant statistics are computed using FastQC (v. 0.11.9), Qualimap (v. 2.2.1)[^17] and SNPeff (v. 4.3t)[^18]. All of these metrics are combined into an interactive HTML report using MultiQC (v. 1.11)[^19].
## Pipeline Orchestration
Job execution and management is done using Snakemake (v. 6.8.2)[^20] using custom-built Singularity (v. 3.8.5) containers for reproducibility.
## References
[^1]: Bolger, A.M., M. Lohse, and B. Usadel, Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics, 2014. 30(15): p. 2114-20.
[^2]: Li, H. and R. Durbin, Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 2009. 25(14): p. 1754-60.
[^3]: Faust, G.G. and I.M. Hall, SAMBLASTER: fast duplicate marking and structural variant read extraction. Bioinformatics, 2014. 30(17): p. 2503-5.
[^4]: Van der Auwera, G.A. and B.D. O'Connor, Genomics in the cloud : using Docker, GATK, and WDL in Terra. First edition. ed. 2020, Sebastopol, CA: O'Reilly Media.
[^5]: Poplin, R., et al., Scaling accurate genetic variant discovery to tens of thousands of samples. bioRxiv, 2018: p. 201178.
[^6]: Cibulskis, K., et al., Sensitive detection of somatic point mutations in impure and heterogeneous cancer samples. Nat Biotechnol, 2013. 31(3): p. 213-9.
[^7]: Benjamin, D., et al., Calling Somatic SNVs and Indels with Mutect2. bioRxiv, 2019: p. 861054.
[^8]: Kim, S., et al., Strelka2: fast and accurate calling of germline and somatic variants. Nat Methods, 2018. 15(8): p. 591-594.
[^9]: Lai, Z., et al., VarDict: a novel and versatile variant caller for next-generation sequencing in cancer research. Nucleic Acids Res, 2016. 44(11): p. e108.
[^10]: McLaren, W., et al., The Ensembl Variant Effect Predictor. Genome Biol, 2016. 17(1): p. 122.
[^11]: Memorial Sloan Kettering Cancer Center. vcf2maf. 2013; Available from: https://github.com/mskcc/vcf2maf.
[^12]: Boeva, V., et al., Control-FREEC: a tool for assessing copy number and allelic content using next-generation sequencing data. Bioinformatics, 2012. 28(3): p. 423-5.
[^13]: Favero, F., et al., Sequenza: allele-specific copy number and mutation profiles from tumor sequencing data. Ann Oncol, 2015. 26(1): p. 64-70.
[^14]: Pedersen, B. somalier: extract informative sites, evaluate relatedness, and perform quality-control on BAM/CRAM/BCF/VCF/GVCF. 2018; Available from: https://github.com/brentp/somalier.
[^15]: Wood, D.E., J. Lu, and B. Langmead, Improved metagenomic analysis with Kraken 2. Genome Biol, 2019. 20(1): p. 257.
[^16]: Wingett, S.W. and S. Andrews, FastQ Screen: A tool for multi-genome mapping and quality control. F1000Res, 2018. 7: p. 1338.
[^17]: Okonechnikov, K., A. Conesa, and F. Garcia-Alcalde, Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics, 2016. 32(2): p. 292-4.
[^18]: Cingolani, P., et al., A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3. Fly (Austin), 2012. 6(2): p. 80-92.
[^19]: Ewels, P., et al., MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 2016. 32(19): p. 3047-8.
[^20]: Koster, J. and S. Rahmann, Snakemake-a scalable bioinformatics workflow engine. Bioinformatics, 2018. 34(20): p. 3600.
\ No newline at end of file
+# Methods description
+
+This page contains a description of all methods used in the pipeline, along with references for important tools.
+
+**Note that depending on the settings used, not all of these methods may be applicable, so please adapt this text appropriately for your application.**
+
+You can also download this text as a Word document (.docx) that contains an EndNote traveling library using the button below.
+
+[![Download DOCX](https://custom-icon-badges.herokuapp.com/badge/-Download-blue?style=for-the-badge&logo=download&logoColor=white "Download Methods Document")](assets/wes_pipeline_methods.docx)
+
+---
+
+## Data preprocessing
+
+Low-quality and adapter sequences are trimmed from the raw sequencing reads using Trimmomatic (v. 0.39)[^1]. Trimmed reads are then aligned to the human hg38 reference genome using BWA mapping software (v. 0.7.17)[^2]. Duplicate reads are marked using Samblaster (v. 0.1.25)[^3] and sorted using samtools (v. 1.8). Finally, base quality score recalibration is performed as indicated in the GATK4 (v. 4.2.2.0) best practices [^4].
+
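+For illustration, a minimal hand-run sketch of this preprocessing chain is shown below; file names, read-group values, thread counts, and trimming thresholds are placeholders, and the pipeline's actual commands are defined in its Snakemake rules.
+
+```bash
+# Adapter/quality trimming (Trimmomatic, paired-end)
+java -jar trimmomatic-0.39.jar PE -threads 8 \
+    sample.R1.fastq.gz sample.R2.fastq.gz \
+    sample.R1.trimmed.fastq.gz sample.R1.unpaired.fastq.gz \
+    sample.R2.trimmed.fastq.gz sample.R2.unpaired.fastq.gz \
+    ILLUMINACLIP:adapters.fa:2:30:10 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:20 MINLEN:20
+
+# Align to hg38, mark duplicates, and coordinate-sort
+bwa mem -t 8 -R '@RG\tID:sample\tSM:sample\tPL:ILLUMINA' Homo_sapiens_assembly38.fasta \
+    sample.R1.trimmed.fastq.gz sample.R2.trimmed.fastq.gz \
+    | samblaster \
+    | samtools sort -@ 8 -o sample.sorted.bam -
+
+# Base quality score recalibration (GATK4)
+gatk BaseRecalibrator -I sample.sorted.bam -R Homo_sapiens_assembly38.fasta \
+    --known-sites dbsnp_138.hg38.vcf.gz -O sample.recal.table
+gatk ApplyBQSR -I sample.sorted.bam -R Homo_sapiens_assembly38.fasta \
+    --bqsr-recal-file sample.recal.table -O sample.recal.bam
+```
+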
+## Germline variant calling
+
+HaplotypeCaller from GATK4 (v. 4.2.2.0) is used to call germline variants, parallelized across chromosomes, and all samples in the cohort are joint genotyped together [^4],[^5].
+
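+A rough sketch of the per-sample and joint-genotyping steps is given below; sample names and the interval are placeholders (the pipeline parallelizes these steps per chromosome), and the exact arguments used by the workflow may differ.
+
+```bash
+# Per-sample GVCF for one chromosome
+gatk HaplotypeCaller -R Homo_sapiens_assembly38.fasta -I sample.recal.bam \
+    -L chr1 -ERC GVCF -O sample.chr1.g.vcf.gz
+
+# Combine GVCFs across the cohort and joint-genotype
+gatk CombineGVCFs -R Homo_sapiens_assembly38.fasta \
+    -V sample1.g.vcf.gz -V sample2.g.vcf.gz -O cohort.g.vcf.gz
+gatk GenotypeGVCFs -R Homo_sapiens_assembly38.fasta -V cohort.g.vcf.gz -O cohort.vcf.gz
+```
+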
+## Somatic variant calling
+
+Somatic variant calling (SNPs and Indels) is performed using Mutect (v. 1.1.7)[^6], Mutect2 (GATK v. 4.2.0)[^7], Strelka2 (v. 2.9.0)[^8], and VarDict (v. 1.4)[^9] in tumor-normal mode. Variants from all callers are merged using the CombineVariants tool from GATK version 3.8-1. Genomic, functional and consequence annotations are added using Variant Effect Predictor (VEP v. 99)[^10] and converted to Mutation Annotation Format (MAF) using the vcf2maf tool (v. 1.6.16)[^11].
+
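+As one example, the tumor-normal Mutect2 call could look roughly like the sketch below; sample names and resource files are placeholders, and the other callers and the CombineVariants merge follow the same per-pair pattern.
+
+```bash
+gatk Mutect2 -R Homo_sapiens_assembly38.fasta \
+    -I tumor.recal.bam -I normal.recal.bam \
+    -normal NORMAL_SAMPLE_NAME \
+    --germline-resource af-only-gnomad.hg38.vcf.gz \
+    --panel-of-normals pon.vcf.gz \
+    -O tumor_vs_normal.mutect2.vcf.gz
+```
+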
+For Copy Number Variants (CNVs), Control-Freec (v. 11.6)[^12] is used to generate pileups, which are used as input for the R package 'sequenza' (v. 3.0.0)[^13]. The complete Control-Freec workflow is then re-run using ploidy and cellularity estimates from 'sequenza'.
+
+## FFPE Artifact filtering
+
+SOBDetector is a tool that scores variants based on strand-orientation bias, which can be a sign of DNA damage caused by fixation of tissue. This pipeline runs SOBDetector in a two-pass method. The first pass uses parameters provided with the software (calculated from publicly available data from TCGA), then cohort-specific bias metrics are computed from those results, and SOBDetector is re-run using these cohort-specific values.
+
+## Quality and identity metrics
+
+Ancestry and relatedness scores are generated using Somalier (v. 0.2.13)[^14]. Contamination analyses are performed against viral and bacterial genomes from NCBI using Kraken2 (v. 2.1.2)[^15], as well as against mouse, human, and UniVec databases using FastQ Screen (v. 0.14.1)[^16]. Sequence, mapping and variant statistics are computed using FastQC (v. 0.11.9), Qualimap (v. 2.2.1)[^17] and SNPeff (v. 4.3t)[^18]. All of these metrics are combined into an interactive HTML report using MultiQC (v. 1.11)[^19].
+
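+Two representative QC commands are sketched below with placeholder inputs; the sites VCF and Kraken2 database correspond to the `SOMALIER` and `KRAKENBACDB` entries in the genome configuration files.
+
+```bash
+# Relatedness and ancestry (Somalier)
+somalier extract -d somalier/ --sites sites.hg38.vcf.gz \
+    -f Homo_sapiens_assembly38.fasta sample.recal.bam
+somalier relate somalier/*.somalier
+
+# Bacterial/viral contamination screen (Kraken2)
+kraken2 --db 20180907_standard_kraken2 --paired \
+    --report sample.kraken2.report.txt --output sample.kraken2.out.txt \
+    sample.R1.trimmed.fastq.gz sample.R2.trimmed.fastq.gz
+```
+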
+## Pipeline Orchestration
+
+Job execution and management is done using Snakemake (v. 6.8.2)[^20] using custom-built Singularity (v. 3.8.5) containers for reproducibility.
+
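+Conceptually, each stage above is a Snakemake rule executed inside a Singularity container; a hand-run equivalent would look something like the sketch below, where the config file, bind path, and core count are placeholders.
+
+```bash
+snakemake --use-singularity --singularity-args "-B /data" \
+    --configfile config.json --cores 32
+```
+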
+## References
+
+[^1]: Bolger, A.M., M. Lohse, and B. Usadel, Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics, 2014. 30(15): p. 2114-20.
+[^2]: Li, H. and R. Durbin, Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 2009. 25(14): p. 1754-60.
+[^3]: Faust, G.G. and I.M. Hall, SAMBLASTER: fast duplicate marking and structural variant read extraction. Bioinformatics, 2014. 30(17): p. 2503-5.
+[^4]: Van der Auwera, G.A. and B.D. O'Connor, Genomics in the cloud : using Docker, GATK, and WDL in Terra. First edition. ed. 2020, Sebastopol, CA: O'Reilly Media.
+[^5]: Poplin, R., et al., Scaling accurate genetic variant discovery to tens of thousands of samples. bioRxiv, 2018: p. 201178.
+[^6]: Cibulskis, K., et al., Sensitive detection of somatic point mutations in impure and heterogeneous cancer samples. Nat Biotechnol, 2013. 31(3): p. 213-9.
+[^7]: Benjamin, D., et al., Calling Somatic SNVs and Indels with Mutect2. bioRxiv, 2019: p. 861054.
+[^8]: Kim, S., et al., Strelka2: fast and accurate calling of germline and somatic variants. Nat Methods, 2018. 15(8): p. 591-594.
+[^9]: Lai, Z., et al., VarDict: a novel and versatile variant caller for next-generation sequencing in cancer research. Nucleic Acids Res, 2016. 44(11): p. e108.
+[^10]: McLaren, W., et al., The Ensembl Variant Effect Predictor. Genome Biol, 2016. 17(1): p. 122.
+[^11]: Memorial Sloan Kettering Cancer Center. vcf2maf. 2013; Available from: https://github.com/mskcc/vcf2maf.
+[^12]: Boeva, V., et al., Control-FREEC: a tool for assessing copy number and allelic content using next-generation sequencing data. Bioinformatics, 2012. 28(3): p. 423-5.
+[^13]: Favero, F., et al., Sequenza: allele-specific copy number and mutation profiles from tumor sequencing data. Ann Oncol, 2015. 26(1): p. 64-70.
+[^14]: Pedersen, B. somalier: extract informative sites, evaluate relatedness, and perform quality-control on BAM/CRAM/BCF/VCF/GVCF. 2018; Available from: https://github.com/brentp/somalier.
+[^15]: Wood, D.E., J. Lu, and B. Langmead, Improved metagenomic analysis with Kraken 2. Genome Biol, 2019. 20(1): p. 257.
+[^16]: Wingett, S.W. and S. Andrews, FastQ Screen: A tool for multi-genome mapping and quality control. F1000Res, 2018. 7: p. 1338.
+[^17]: Okonechnikov, K., A. Conesa, and F. Garcia-Alcalde, Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics, 2016. 32(2): p. 292-4.
+[^18]: Cingolani, P., et al., A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3. Fly (Austin), 2012. 6(2): p. 80-92.
+[^19]: Ewels, P., et al., MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 2016. 32(19): p. 3047-8.
+[^20]: Koster, J. and S. Rahmann, Snakemake-a scalable bioinformatics workflow engine. Bioinformatics, 2018. 34(20): p. 3600.
diff --git a/docs/pipeline-details/output.md b/docs/pipeline-details/output.md
index 147e9c4..fb80adb 100644
--- a/docs/pipeline-details/output.md
+++ b/docs/pipeline-details/output.md
@@ -5,6 +5,7 @@
The output files and their locations are broken down here for the XAVIER pipeline. Pre-processing and germline variant calling steps are common but somatic variant calling is dependent on whether the pipeline was run in either (A) tumor-normal pair or (B) tumor-only analysis mode. All file locations are relative to the output directory specified during the job submission.
The output directory after a complete XAVIER run should look like:
+
```bash
xavier_output/
├── bams
@@ -33,12 +34,13 @@ xavier_output/
└── workflow
```
-Below we describe the different folders that contain specific outputs obtained for all samples from the XAVIER pipeline
+
+Below we describe the different folders that contain specific outputs obtained for all samples from the XAVIER pipeline.
### 1. `QC`
-The `QC` folder contains all the Quality-Control analyses performed at different steps of the pipeline for each sample to assess sequencing quality before and after adapter trimming, microbial taxonomic composition, contamination, variant calling, etc. The final summary report and data is available `finalQC` folder. \
-The MultiQC report also contains results from other analysis like mapping statistics, ancestry and relatdness, etc. It is recommended to study the MultiQC report first to get a birds eye view of the sequence data quality.
+The `QC` folder contains all the Quality-Control analyses performed at different steps of the pipeline for each sample to assess sequencing quality before and after adapter trimming, microbial taxonomic composition, contamination, variant calling, etc. The final summary report and data are available in the `finalQC` folder. \
+The MultiQC report also contains results from other analyses like mapping statistics, ancestry and relatedness, etc. It is recommended to study the MultiQC report first to get a bird's-eye view of the sequence data quality.
```bash
QC/
@@ -63,7 +65,6 @@ QC/
└── raw_variants.variant_calling_summary_metrics
```
-
### 2. `bams`
The `bams` folder contain two subfolders `chrom_split` and `final_bams`. `final_bams` contains the final processed BAM files for each sample in the run and the `chrom_split` folder contains all the sample BAM files split by each chromosome.
@@ -89,13 +90,14 @@ bams/
### 3. `germline`
This folder contains the output from the GATK Best Practices pipeline to obtain germline variants with a few alterations detailed below. Briefly, joint SNP and INDEL variant detection is conducted across all samples included in a pipeline run using the GATK Haplotypcaller under default settings. Raw variants are then subsequently filtered based on several GATK annotations: \
-A strict set of criteria (QD < 2.0, FS > 60.0, MQ < 40.0, MQRankSum < -12.5, ReadPosRankSum < -8.0 for SNPs; QD < 2.0, FS > 200.0, ReadPosRankSum < -20.0 for INDELs) generates the 'combined.strictFilter.vcf'.
+A strict set of criteria (QD < 2.0, FS > 60.0, MQ < 40.0, MQRankSum < -12.5, ReadPosRankSum < -8.0 for SNPs; QD < 2.0, FS > 200.0, ReadPosRankSum < -20.0 for INDELs) generates the 'combined.strictFilter.vcf'.
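+For reference, filters of this form can be applied with GATK VariantFiltration; a minimal sketch for the SNP criteria is shown below (file names are placeholders).
+
+```bash
+gatk VariantFiltration -V combined_snps.vcf.gz \
+    --filter-expression "QD < 2.0" --filter-name "QD2" \
+    --filter-expression "FS > 60.0" --filter-name "FS60" \
+    --filter-expression "MQ < 40.0" --filter-name "MQ40" \
+    --filter-expression "MQRankSum < -12.5" --filter-name "MQRankSum-12.5" \
+    --filter-expression "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8" \
+    -O combined_snps.strictFilter.vcf.gz
+```
+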
This call set is highly stringent, maximizing the true positive rate at the expense of an elevated false negative rate. This call set is really only intended for more general population genetic scale analyses (e.g., burden tests, admixture, linkage/pedigree based analysis, etc.) where false positives can be significantly confounding.
In case of human sequence data, a basic analyses of sample relatedness and ancestry (e.g., % European, African, etc.) is also performed using somalier.
The output folder looks like:
+
```bash
germline/
├── gVCFs
@@ -105,7 +107,8 @@ germline/
├── somalier # only for hg38 genome
└── VCF
```
-The `VCF` folder contains the final filtered germline variants (SNPs and INDELs) for all samples combined. The folder also contains raw variants for each sample, all samples combined, and also combined raw variants split by chromosome.
+
+The `VCF` folder contains the final filtered germline variants (SNPs and INDELs) for all samples combined. The folder also contains raw variants for each sample, all samples combined, and also combined raw variants split by chromosome.
```bash
VCF/
@@ -129,7 +132,6 @@ VCF/
└── snp_indel.filtered.vcf.gz[.tbi]
```
-
### 4. `logfiles`
This folder contains the snakemake log files and computational statistics for the XAVIER run. All the log files (i.e., standard output and error) for each individual step are in the `slurmfiles` folder. These logfiles are important to diagnose errors in case the pipeline fails.
@@ -157,9 +159,8 @@ For Mutect2, we use a panel of normals (PON) developed from the ExAC (excluding
For Copy Number Variants (CNVs), two tools are employed in tandem. First, Control-FREEC is run with default parameters. This generates pileup files that can be used by Sequenza, primarily for jointly estimating contamination and ploidy. These value are used to run Freec a second time for improved performance.
-
-
The output directory should look like:
+
```bash
somatic_paired/
├── CNV # only if CNVs analyzed
@@ -168,7 +169,7 @@ somatic_paired/
├── ffpe_filter # only if FFPE filter applied
├── qc
└── SNP_Indels
- ├── merged_somatic_variants
+ ├── merged_somatic_variants
│  ├── cohort_summary
│  ├── maf # Final merged MAFs for each sample
│  └── vcf
@@ -217,4 +218,4 @@ somatic_tumor_only/
│  └── vcf
├── vardict_out
└── varscan_out
-```
\ No newline at end of file
+```
diff --git a/docs/pipeline-details/overview.md b/docs/pipeline-details/overview.md
index 36adecc..3b3a6f4 100644
--- a/docs/pipeline-details/overview.md
+++ b/docs/pipeline-details/overview.md
@@ -1,4 +1,4 @@
-# Pipeline Overview
+# Pipeline Overview
![Pipeline Diagram](assets/XAVIER_workflow.svg)
diff --git a/docs/release-guide.md b/docs/release-guide.md
index 61f85ba..c91efe3 100644
--- a/docs/release-guide.md
+++ b/docs/release-guide.md
@@ -8,19 +8,19 @@ Only approve or merge PRs that either update the changelog or have no user-facin
1. Determine the new version number according to [semantic versioning guidelines](https://semver.org/).
1. Update `CHANGELOG.md`:
- - Edit the heading for the development version to match the new version.
- - If needed, clean up the changelog -- fix any typos, optionally create subheadings for 'New features' and 'Bug fixes' if there are lots of changes, etc.
+ - Edit the heading for the development version to match the new version.
+ - If needed, clean up the changelog -- fix any typos, optionally create subheadings for 'New features' and 'Bug fixes' if there are lots of changes, etc.
1. Update the version in [`src/__init__.py`](https://github.com/CCBR/XAVIER/blob/main/src/__init__.py).
1. On GitHub, go to "Releases" and click "Draft a new release".
- - Choose a tag: same as the version number.
- - Choose the target: most likely this should be the main branch, or a specific commit hash.
- - Set the title as the new version number, e.g. **v3.0.2**
- - Copy and paste the release notes from the CHANGELOG into the description box.
- - Check the box "Set as the latest release".
- - Click "Publish release".
+ - Choose a tag: same as the version number.
+ - Choose the target: most likely this should be the main branch, or a specific commit hash.
+ - Set the title as the new version number, e.g. **v3.0.2**
+ - Copy and paste the release notes from the CHANGELOG into the description box.
+ - Check the box "Set as the latest release".
+ - Click "Publish release".
1. Post release chores:
- - Add a new "development version" heading to the top of `CHANGELOG.md`.
- - Bump the version number in `src/__init__.py` to include `-dev`, e.g. `v3.0.2-dev` if you just released `v3.0.2`.
+ - Add a new "development version" heading to the top of `CHANGELOG.md`.
+ - Bump the version number in `src/__init__.py` to include `-dev`, e.g. `v3.0.2-dev` if you just released `v3.0.2`.
## How to install a release on biowulf
@@ -52,6 +52,7 @@ If the new pipeline release only increments the patch number, `ccbrpipeliner` wi
If you need to release a new major or minor version of a pipeline on biowulf, contact [Kelly](mailto:kelly.sovacool@nih.gov) or [Vishal](mailto:vishal.koparde@nih.gov).
Verify that `ccbrpipeliner` uses the latest version with:
+
```sh
module load ccbrpipeliner && xavier --version
```
diff --git a/docs/usage/cache.md b/docs/usage/cache.md
index dbefcc9..8f6f848 100644
--- a/docs/usage/cache.md
+++ b/docs/usage/cache.md
@@ -1,20 +1,21 @@
# xavier cache
-## 1. About
+## 1. About
+
The `xavier` executable is composed of several inter-related sub commands. Please see `xavier -h` for all available options.
-This part of the documentation describes options and concepts for xavier cache sub command in more detail. With minimal configuration, the **`cache`** sub command enables you to cache remote resources for the xavier pipeline. Caching remote resources allows the pipeline to run in an offline mode. The cache sub command can also be used to pull our pre-built reference bundles onto a new cluster or target system.
+This part of the documentation describes options and concepts for xavier cache sub command in more detail. With minimal configuration, the **`cache`** sub command enables you to cache remote resources for the xavier pipeline. Caching remote resources allows the pipeline to run in an offline mode. The cache sub command can also be used to pull our pre-built reference bundles onto a new cluster or target system.
The cache sub command creates local cache on the filesysytem for resources hosted on DockerHub or AWS S3. These resources are normally pulled onto the filesystem when the pipeline runs; however, due to network issues or DockerHub pull rate limits, it may make sense to pull the resources once so a shared cache can be created and re-used. It is worth noting that a singularity cache cannot normally be shared across users. Singularity strictly enforces that its cache is owned by the user. To get around this issue, the cache subcommand can be used to create local SIFs on the filesystem from images on DockerHub.
## 2. Synopsis
-Coming Soon!
+Coming Soon!
\ No newline at end of file
+-->
diff --git a/docs/usage/gui.md b/docs/usage/gui.md
index 9ff9f93..5c73ddf 100644
--- a/docs/usage/gui.md
+++ b/docs/usage/gui.md
@@ -1,19 +1,23 @@
# Getting started
## 1. Synopsis
+
XAVIER pipeline can be executed from either using the graphical user interface (GUI) or the command line interface (CLI). GUI offers a more interactive way for the user to provide input and adjust parameter settings.
This part of the documentation describes how to run xavier using the GUI (with screenshots). See [Command Line](run.md) tab to read more about the `xavier` executable and running XAVIER pipeline using the CLI.
## 2. Setting up XAVIER
### 2.1 Login to cluster
+
```bash
# Setup Step 1.) ssh into cluster's head node
# example below for Biowulf cluster
ssh -Y $USER@biowulf.nih.gov
```
+
### 2.2 Grab an interactive node
-```bash
+
+```bash
# Setup Step 2.) Please do not run XAVIER on the head node!
# Grab an interactive node first
sinteractive --time=12:00:00 --mem=8gb --cpus-per-task=4
@@ -23,7 +27,7 @@ sinteractive --time=12:00:00 --mem=8gb --cpus-per-task=4
_NOTE: `ccbrpipeliner` is a custom module created on biowulf which contains various NGS data analysis pipelines developed, tested, and benchmarked by experts at CCBR._
-```bash
+```bash
# Setup Step 3.) Add ccbrpipeliner module
module purge # to reset the module environment
module load ccbrpipeliner
@@ -32,7 +36,7 @@ module load ccbrpipeliner
If the module was loaded correctly, the greetings message should be displayed.
```bash
-[+] Loading ccbrpipeliner 5 ...
+[+] Loading ccbrpipeliner 5 ...
###########################################################################
CCBR Pipeliner
###########################################################################
@@ -56,7 +60,8 @@ Thank you for using CCBR Pipeliner
###########################################################################
```
-To check the current version of XAVIER, enter:
+To check the current version of XAVIER, enter:
+
```bash
xavier --version
```
@@ -66,15 +71,19 @@ xavier --version
### 3.1 Launching XAVIER GUI
To run the XAVIER pipeline from the GUI, simply enter:
+
```bash
xavier_gui
```
+
and it will launch the XAVIER window.
+
> Note: Please wait until `window created!` message appears on the terminal.
![xavier_launchscreen](images/gui_launch.png)
### 3.2 Folder paths and reference genomes
+
To enter the location of the input folder containing FASTQ files and the location where the output folders should be created, either simply type the absolute paths
![xavier_enterPath](images/gui_path.png)
@@ -100,6 +109,7 @@ XAVIER pipeline can be run in two different modes:\
#### 3.3a Tumor-normal pair analysis
In case of tumor-normal pairs, a tab-delimited text file is neeed that contains the list of normal and tumor samples. For example,
+
```bash
Normal Tumor
sample1-normal sample1-tumor
@@ -124,7 +134,7 @@ In case the paired normal samples are unavailable, XAVIER pipeline can be run in
After all the information is filled out, press **Submit**.
-If the pipeline detects no errors and the run was submitted, a new window appears that has the output of a "dry-run" which summarizes each step of the pipeline.
+If the pipeline detects no errors and the run was submitted, a new window appears that has the output of a "dry-run" which summarizes each step of the pipeline.
![xavier_dryrun](images/gui_dryrun.png)
@@ -145,7 +155,6 @@ The XAVIER gui will ask to submit another job.
Click **Yes** to start again or **No** to close the XAVIER gui.
-
### 3.5 Additional settings
Users can input certain additional settings for the pipeline run including running an additional step to correct strand orientation bias in Formalin-Fixed Paraffin-Embedded (FFPE) samples and to provide a custom exome targets BED file. This file can be obtained from the manufacturer of the target capture kit that was used.
@@ -154,7 +163,6 @@ Users can input certain additional settings for the pipeline run including runni
![gui_additionalSettings2](images/gui_additionalSettings2.png)
-
## 4. Special instructions for Biowulf
The XAVIER GUI natively uses the X11 Window System to run the XAVIER pipeline and display graphics on a personal desktop or laptop. However, if running XAVIER specifically on NIH's Biowulf cluster, the HPC staff recommends using NoMachine (NX) to run graphics applications.
@@ -171,4 +179,4 @@ and start an interactive session.
Similar to the instructions above, load `ccbrpipeliner` module and enter `xavier_gui` to launch the XAVIER gui.
-![gui_nx_xavier](images/gui_nx_xavier.png)
\ No newline at end of file
+![gui_nx_xavier](images/gui_nx_xavier.png)
diff --git a/docs/usage/run.md b/docs/usage/run.md
index d8a5565..9141570 100644
--- a/docs/usage/run.md
+++ b/docs/usage/run.md
@@ -1,13 +1,15 @@
# xavier run
-## 1. About
+## 1. About
+
The `xavier` executable is composed of several inter-related sub commands. Please see `xavier -h` for all available options.
-This part of the documentation describes options and concepts for xavier run sub command in more detail. With minimal configuration, the **`run`** sub command enables you to start running xavier pipeline.
+This part of the documentation describes options and concepts for the xavier run sub command in more detail. With minimal configuration, the **`run`** sub command enables you to start running the xavier pipeline.
-Setting up the xavier pipeline is fast and easy! In its most basic form, xavier run only has *four required inputs*.
+Setting up the xavier pipeline is fast and easy! In its most basic form, xavier run only has _four required inputs_.
## 2. Synopsis
+
```text
$ xavier run [--help] \
[--mode {local, slurm}] \
@@ -24,198 +26,229 @@ $ xavier run [--help] \
--input INPUT [INPUT ...] \
--output OUTPUT \
--genome {hg38, ...} \
- --targets TARGETS
+ --targets TARGETS
```
The synopsis for each command shows its parameters and their usage. Optional parameters are shown in square brackets.
-A user **must** provide a list of FastQ or BAM files (globbing is supported) to analyze via `--input` argument, an output directory to store results via `--output` argument, an exome targets BED file for the samples' capture kit, and select reference genome for alignment and annotation via the `--genome` argument.
+A user **must** provide a list of FastQ or BAM files (globbing is supported) to analyze via the `--input` argument, an output directory to store results via the `--output` argument, an exome targets BED file for the samples' capture kit via the `--targets` argument, and a reference genome for alignment and annotation via the `--genome` argument.
-Use you can always use the `-h` option for information on a specific command.
+You can always use the `-h` option for information on a specific command.
### 2.1 Required Arguments
Each of the following arguments is required. Failure to provide a required argument will result in a non-zero exit code.
- `--input INPUT [INPUT ...]`
+`--input INPUT [INPUT ...]`
+
> **Input FastQ or BAM file(s) to process.**
-> *type: file(s)*
->
-> One or more FastQ files can be provided. The pipeline does NOT support single-end WES data. Please provide either a set of FastQ files or a set of BAM files. The pipeline does NOT support processing a mixture of FastQ files and BAM files. From the command-line, each input file should seperated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should be gzipp-ed.
->
-> ***Example:*** `--input .tests/*.R?.fastq.gz`
-
----
- `--output OUTPUT`
-> **Path to an output directory.**
-> *type: path*
->
+> _type: file(s)_
+>
+> One or more FastQ files can be provided. The pipeline does NOT support single-end WES data. Please provide either a set of FastQ files or a set of BAM files. The pipeline does NOT support processing a mixture of FastQ files and BAM files. From the command-line, each input file should be separated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should be gzipped.
+>
+> **_Example:_** `--input .tests/*.R?.fastq.gz`
+
+---
+
+`--output OUTPUT`
+
+> **Path to an output directory.**
+> _type: path_
+>
> This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be initialized automatically.
->
-> ***Example:*** `--output /data/$USER/WES_hg38`
+>
+> **_Example:_** `--output /data/$USER/WES_hg38`
+
+---
+
+`--runmode {init,dryrun,run}`
----
- `--runmode {init,dryrun,run}` `
> **Execution Process.**
-> *type: string*
->
+> _type: string_
+>
> User should initialize the pipeline folder by first running `--runmode init`
> User should then perform a dry-run to list all steps the pipeline will take: `--runmode dryrun`
-> User should then perform the full run `--runmode run`
->
-> ***Example:*** `--runmode init` *THEN* `--runmode dryrun` *THEN* `--runmode run`
-
----
- `--genome {hg38, custom.json}`
-> **Reference genome.**
-> *type: string/file*
->
+> User should then perform the full run: `--runmode run`
+>
+> **_Example:_** `--runmode init` _THEN_ `--runmode dryrun` _THEN_ `--runmode run`
+
+---
+
+`--genome {hg38, custom.json}`
+
+> **Reference genome.**
+> _type: string/file_
+>
> This option defines the reference genome for your set of samples. On Biowulf, xavier comes bundled with pre-built reference files for human samples; however, it is worth noting that the pipeline also accepts a pre-built resource bundle pulled with the cache sub command (coming soon). Currently, the pipeline only supports the human reference hg38; however, support for the mouse reference mm10 will be added soon.
>
-> ***Pre built Option***
+> **_Pre-built Option_**
> Here is a list of available pre-built genomes on Biowulf: hg38.
>
-> ***Custom Option***
+> **_Custom Option_**
> For users running the pipeline outside of Biowulf, a pre-built resource bundle can be pulled with the cache sub command (coming soon). Please supply the custom reference JSON file that was generated by the cache sub command.
->
-> ***Example:*** `--genome hg38` *OR* `--genome /data/${USER}/hg38/hg38.json`
-
----
- `--targets TARGETS`
-> **Exome targets BED file.**
-> *type: file*
->
+>
+> **_Example:_** `--genome hg38` _OR_ `--genome /data/${USER}/hg38/hg38.json`
+
+---
+
+`--targets TARGETS`
+
+> **Exome targets BED file.**
+> _type: file_
+>
> This file can be obtained from the manufacturer of the target capture kit that was used.
->
-> ***Example:*** `--targets /data/$USER/Agilent_SSv7_allExons_hg38.bed`
+>
+> **_Example:_** `--targets /data/$USER/Agilent_SSv7_allExons_hg38.bed`
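Putting the required arguments together, a minimal first invocation might look like the sketch below (paths are illustrative; see the Example section further down for a complete walk-through):

```bash
# Minimal sketch combining the required arguments (illustrative paths)
xavier run --input .tests/*.R?.fastq.gz \
           --output /data/$USER/WES_hg38 \
           --genome hg38 \
           --targets /data/$USER/Agilent_SSv7_allExons_hg38.bed \
           --runmode init
```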
### 2.2 Options
-Each of the following arguments are optional and do not need to be provided.
+Each of the following arguments is optional and does not need to be provided.
+
+`-h, --help`
- `-h, --help`
> **Display Help.**
-> *type: boolean flag*
->
+> _type: boolean flag_
+>
> Shows the command's synopsis, help message, and an example command
->
-> ***Example:*** `--help`
+>
+> **_Example:_** `--help`
+
+---
+
+`--silent`
----
- `--silent`
> **Silence standard output.**
-> *type: boolean flag*
->
+> _type: boolean flag_
+>
> Reduces the amount of information directed to standard output when submitting the master job to the job scheduler. Only the job ID of the master job is returned.
>
-> ***Example:*** `--silent`
+> **_Example:_** `--silent`
+
+---
+
+`--mode {local,slurm}`
----
- `--mode {local,slurm}`
> **Execution Method.**
-> *type: string*
-> *default: slurm*
->
-> Execution Method. Defines the mode or method of execution. Vaild mode options include: local or slurm.
->
-> ***local***
-> Local executions will run serially on compute instance. This is useful for testing, debugging, or when a users does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode.
->
-> ***slurm***
+> _type: string_
+> _default: slurm_
+>
+> Defines the mode or method of execution. Valid mode options include: local or slurm.
+>
+> **_local_**
+> Local executions will run serially on the compute instance. This is useful for testing, debugging, or when a user does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode.
+>
+> **_slurm_**
> The slurm execution method will submit jobs to a cluster using a singularity backend. It is recommended to run xavier in this mode, as execution will be significantly faster in a distributed environment.
->
-> ***Example:*** `--mode slurm`
+>
+> **_Example:_** `--mode slurm`
+
+---
+
+`--job-name JOB_NAME`
----
- `--job-name JOB_NAME`
> **Set the name of the pipeline's master job.**
-> *type: string*
-> *default: pl:xavier*
->
+> _type: string_
+> _default: pl:xavier_
+>
> When submitting the pipeline to a job scheduler, like SLURM, this option allows you to set the name of the pipeline's master job. By default, the name of the pipeline's master job is set to "pl:xavier".
->
-> ***Example:*** `--job-name xavier_run1`
+>
+> **_Example:_** `--job-name xavier_run1`
+
+---
+
+`--callers CALLERS [CALLERS ...]`
----
- `--callers CALLERS [CALLERS ...]`
> **Variant Callers.**
-> *type: string(s)*
-> *default: mutect2, mutect, strelka, vardict, varscan*
->
+> _type: string(s)_
+> _default: mutect2, mutect, strelka, vardict, varscan_
+>
> List of variant callers to detect mutations. Please select from one or more of the following options: [mutect2, mutect, strelka, vardict, varscan]. Defaults to using all variant callers.
->
-> ***Example:*** `--callers mutect2 strelka varscan`
+>
+> **_Example:_** `--callers mutect2 strelka varscan`
+
+---
+
+`--pairs PAIRS`
----
- `--pairs PAIRS`
> **Tumor normal pairs file.**
-> *type: file*
->
-> This tab delimited file contains two columns with the names of tumor and normal pairs, one per line. The header of the file needs to be `Tumor` for the tumor column and `Normal` for the normal column. The base name of each sample should be listed in the pairs file. The base name of a given sample can be determined by removing the following extension from the sample's R1 FastQ file: `.R1.fastq.gz`.
+> _type: file_
+>
+> This tab-delimited file contains two columns with the names of tumor and normal pairs, one per line. The header of the file needs to be `Tumor` for the tumor column and `Normal` for the normal column. The base name of each sample should be listed in the pairs file. The base name of a given sample can be determined by removing the following extension from the sample's R1 FastQ file: `.R1.fastq.gz`.
> **Contents of example pairs file:**
+>
> ```
> Normal Tumor
> Sample4_CRL1622_S31 Sample10_ARK1_S37
> Sample4_CRL1622_S31 Sample11_ACI_158_S38
-> ```
-> ***Example:*** `--pairs /data/$USER/pairs.tsv`
+> ```
+>
+> **_Example:_** `--pairs /data/$USER/pairs.tsv`
+
+---
+
+`--ffpe`
----
- `--ffpe`
> **Apply FFPE correction.**
-> *type: boolean flag*
->
+> _type: boolean flag_
+>
> Runs additional steps to correct strand orientation bias in Formalin-Fixed Paraffin-Embedded (FFPE) samples. Do NOT use this option with non-FFPE samples.
->
-> ***Example:*** `--ffpe`
+>
+> **_Example:_** `--ffpe`
+
+---
+
+`--cnv`
----
- `--cnv`
> **Call copy number variations (CNVs).**
-> *type: boolean flag*
->
+> _type: boolean flag_
+>
> CNVs will only be called from tumor-normal pairs. If this option is provided without providing a --pairs file, CNVs will NOT be called.
->
-> ***Example:*** `--cnv`
+>
+> **_Example:_** `--cnv`
+
+---
+
+`--singularity-cache SINGULARITY_CACHE`
----
- `--singularity-cache SINGULARITY_CACHE`
> **Overrides the $SINGULARITY_CACHEDIR environment variable.**
-> *type: path*
-> *default: `--output OUTPUT/.singularity`*
+> _type: path_
+> _default: `--output OUTPUT/.singularity`_
>
-> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pull an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource.
->
-> ***Example:*** `--singularity-cache /data/$USER/.singularity`
+> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pulling an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces that you own the cache directory and will return a non-zero exit code if you do not own it! See the `--sif-cache` option to create a shareable resource.
+>
+> **_Example:_** `--singularity-cache /data/$USER/.singularity`
+
+---
+
+`--sif-cache SIF_CACHE`
----
- `--sif-cache SIF_CACHE`
> **Path where a local cache of SIFs are stored.**
-> *type: path*
+> _type: path_
>
> Uses a local cache of SIFs on the filesystem. This SIF cache can be shared across users if permissions are set correctly. If a SIF does not exist in the SIF cache, the image will be pulled from DockerHub and a warning message will be displayed. The `xavier cache` subcommand can be used to create a local SIF cache. Please see `xavier cache` for more information. This command is extremely useful for avoiding DockerHub pull rate limits. It also removes any potential errors that could occur due to network issues or DockerHub being temporarily unavailable. We recommend running xavier with this option whenever possible.
->
-> ***Example:*** `--singularity-cache /data/$USER/SIFs`
+>
+> **_Example:_** `--sif-cache /data/$USER/SIFs`
+
+---
+
+`--threads THREADS`
----
- `--threads THREADS`
> **Max number of threads for each process.**
-> *type: int*
-> *default: 2*
->
-> Max number of threads for each process. This option is more applicable when running the pipeline with `--mode local`. It is recommended setting this vaule to the maximum number of CPUs available on the host machine.
->
-> ***Example:*** `--threads 12`
+> _type: int_
+> _default: 2_
+>
+> Max number of threads for each process. This option is more applicable when running the pipeline with `--mode local`. It is recommended to set this value to the maximum number of CPUs available on the host machine.
+>
+> **_Example:_** `--threads 12`
## 3. Example
-```bash
+
+```bash
# Step 1.) Grab an interactive node
# Do not run on head node!
sinteractive --mem=8g --cpus-per-task=4
module purge
module load ccbrpipeliner
-# Step 2A.) Initialize the all resources to the output folder
+# Step 2A.) Initialize all resources to the output folder
xavier run --input .tests/*.R?.fastq.gz \
--output /data/$USER/xavier_hg38 \
--genome hg38 \
@@ -241,4 +274,4 @@ xavier run --input .tests/*.R?.fastq.gz \
--mode slurm \
--runmode run
-```
\ No newline at end of file
+```
diff --git a/docs/usage/unlock.md b/docs/usage/unlock.md
index 154feb0..1700f09 100644
--- a/docs/usage/unlock.md
+++ b/docs/usage/unlock.md
@@ -1,51 +1,55 @@
# xavier unlock
-## 1. About
+## 1. About
+
The `xavier` executable is composed of several inter-related sub commands. Please see `xavier -h` for all available options.
-This part of the documentation describes options and concepts for xavier unlock sub command in more detail. With minimal configuration, the **`unlock`** sub command enables you to unlock a pipeline output directory.
+This part of the documentation describes options and concepts for the xavier unlock sub command in more detail. With minimal configuration, the **`unlock`** sub command enables you to unlock a pipeline output directory.
-If the pipeline fails ungracefully, it maybe required to unlock the working directory before proceeding again. Snakemake will inform a user when it maybe necessary to unlock a working directory with an error message stating: `Error: Directory cannot be locked`.
+If the pipeline fails ungracefully, it may be required to unlock the working directory before proceeding again. Snakemake will inform a user when it may be necessary to unlock a working directory with an error message stating: `Error: Directory cannot be locked`.
Please verify that the pipeline is not running before running this command. If the pipeline is currently running, the workflow manager will report that the working directory is locked. This is the default behavior of snakemake, and it is normal. Do NOT run this command if the pipeline is still running! Please kill the master job and its child jobs prior to running this command.
-Unlocking xavier pipeline output directory is fast and easy! In its most basic form, xavier unlock only has *one required input*.
+Unlocking xavier pipeline output directory is fast and easy! In its most basic form, xavier unlock only has _one required input_.
## 2. Synopsis
+
```text
$ xavier unlock [-h] --output OUTPUT
```
The synopsis for this command shows its parameters and their usage. Optional parameters are shown in square brackets.
-A user **must** provide an output directory to unlock via `--output` argument. After running the unlock sub command, you can resume the build or run pipeline from where it left off by re-running it.
+A user **must** provide an output directory to unlock via the `--output` argument. After running the unlock sub command, you can resume the build or run pipeline from where it left off by re-running it.
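For example, a typical recovery sequence might look like the sketch below (paths are illustrative):

```bash
# Hypothetical recovery after an ungraceful failure
xavier unlock --output /data/$USER/xavier_hg38
# ...then re-submit the original run command to resume where it left off, e.g.
xavier run --input .tests/*.R?.fastq.gz \
           --output /data/$USER/xavier_hg38 \
           --genome hg38 \
           --targets /data/$USER/Agilent_SSv7_allExons_hg38.bed \
           --runmode run
```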
-Use you can always use the `-h` option for information on a specific command.
+You can always use the `-h` option for information on a specific command.
-### 2.1 Required Arguments
+### 2.1 Required Arguments
+
+`--output OUTPUT`
- `--output OUTPUT`
> **Output directory to unlock.**
-> *type: path*
->
-> Path to a previous run's output directory. This will remove a lock on the working directory. Please verify that the pipeline is not running before running this command.
-> ***Example:*** `--output /data/$USER/WES_hg38`
+> _type: path_
+>
+> Path to a previous run's output directory. This will remove a lock on the working directory. Please verify that the pipeline is not running before running this command.
+> **_Example:_** `--output /data/$USER/WES_hg38`
### 2.2 Options
-Each of the following arguments are optional and do not need to be provided.
+Each of the following arguments is optional and does not need to be provided.
+
+`-h, --help`
- `-h, --help`
> **Display Help.**
-> *type: boolean*
->
+> _type: boolean_
+>
+> Shows the command's synopsis, help message, and an example command
->
-> ***Example:*** `--help`
-
+>
+> **_Example:_** `--help`
## 3. Example
-```bash
+
+```bash
# Step 0.) Grab an interactive node (do not run on head node)
sinteractive --mem=8g -N 1 -n 4
module purge
@@ -53,4 +57,4 @@ module load ccbrpipeliner
# Step 1.) Unlock a pipeline output directory
xavier unlock --output /data/$USER/xavier_hg38
-```
\ No newline at end of file
+```
diff --git a/mkdocs.yml b/mkdocs.yml
index 8248c86..0bbf15f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -3,7 +3,7 @@ site_name: XAVIER Documentation
site_author: CCBR
site_description: >-
XAVIER is an open-source, reproducible, and scalable best
- practices pipeline for analyzing Whole Exome DNA-sequencing data.
+ practices pipeline for analyzing Whole Exome DNA-sequencing data.
# Repository
repo_name: CCBR/XAVIER
@@ -23,7 +23,7 @@ theme:
features:
- navigation.tabs
- navigation.top
- - toc.integrate
+ - toc.integrate
palette:
- scheme: default
primary: indigo
@@ -47,7 +47,6 @@ plugins:
- minify:
minify_html: true
-
# Customization
extra:
social:
@@ -60,7 +59,6 @@ extra:
version:
provider: mike
-
# Extensions
markdown_extensions:
- markdown.extensions.admonition
@@ -102,15 +100,15 @@ nav:
- About: index.md
- Getting Started: usage/gui.md
- Command Line:
- - xavier run: usage/run.md
- - xavier unlock: usage/unlock.md
- - xavier cache: usage/cache.md
- - Pipeline Details:
- - Overview: pipeline-details/overview.md
- - Methods: pipeline-details/methods.md
- - Tools: pipeline-details/tools.md
- - Settings: pipeline-details/settings.md
- - Output Files: pipeline-details/output.md
+ - xavier run: usage/run.md
+ - xavier unlock: usage/unlock.md
+ - xavier cache: usage/cache.md
+ - Pipeline Details:
+ - Overview: pipeline-details/overview.md
+ - Methods: pipeline-details/methods.md
+ - Tools: pipeline-details/tools.md
+ - Settings: pipeline-details/settings.md
+ - Output Files: pipeline-details/output.md
- FAQ:
- - General Questions: faq/questions.md
+ - General Questions: faq/questions.md
- License: license.md
diff --git a/resources/adapters.fa b/resources/adapters.fa
index bdf472a..b79ee40 100644
--- a/resources/adapters.fa
+++ b/resources/adapters.fa
@@ -69,7 +69,7 @@ TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
>nextera_transposase_seq_read2
GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
>pacbio
-AAGCAGTGGTATCAACGCAGAGTAC
+AAGCAGTGGTATCAACGCAGAGTAC
#adapters for ccbr647,648 Vanderweele
>universal_primer_rc
GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
diff --git a/resources/bundler b/resources/bundler
index a4086b2..462542b 100644
--- a/resources/bundler
+++ b/resources/bundler
@@ -1,6 +1,6 @@
#!/bin/bash
-set -euo pipefail
+set -euo pipefail
usage="Usage: $0 "
@@ -9,7 +9,7 @@ function fatal() { cat <<< "$@" 1>&2; err "$usage"; exit 1; }
function abspath() { readlink -f "$1"; }
function bundle() { tar -hczvf "$1" "$2"; }
-
+
function check() {
die=false
if [ -z "${1:-}" ]; then die=true; err "Error: Failed to provide directory to archive."; fi
@@ -21,8 +21,8 @@ function check() {
function main() {
# Checks for required positional
- # command line arguments
- check "${1:-}" "${2:-}"
+ # command line arguments
+ check "${1:-}" "${2:-}"
# Converts any relative paths to
# absolute paths, creates uninit
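As an aside, the `bundle` helper defined above is effectively a single dereferencing tar call; a standalone equivalent might look like this (archive and directory names are illustrative):

```bash
# Roughly what bundle() does: follow symlinks (-h), create an archive (-c),
# gzip-compress it (-z), be verbose (-v), and write to the given file (-f)
tar -hczvf hg38_resource_bundle.tar.gz /data/$USER/hg38_resources
```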
diff --git a/resources/cacher b/resources/cacher
index f34dbe3..14b6b84 100644
--- a/resources/cacher
+++ b/resources/cacher
@@ -8,7 +8,7 @@ USAGE:
SYNOPSIS:
This script submits the cacher master job to the cluster. This main process dictates
how subsequent resources are pulled onto the cluster's filesystem. cacher utilizes SLURM
-to avoid pull into resources on a compute node but support for additional job schedulers
+to pull resources onto a compute node, but support for additional job schedulers
(i.e. PBS, SGE, LSF, Tibanna) may be added in the near future.
The main entry point of the pipeline 'xavier' calls this job submission wrapper script.
As such, this script can be used to manually bypass 'xavier' for a previously failed cache.
@@ -25,18 +25,18 @@ Required Positional Argument:
most of the steps are computationally intensive.
Required Arguments:
-s, --sif-cache [Type: Path] Path to output directory to cache remote resources.
- -i, --image-uris [Type: Str] Image(s) to pull from Dockerhub. Multiple images
- are seperated by a comma.
+ -i, --image-uris [Type: Str] Image(s) to pull from Dockerhub. Multiple images
+ are separated by a comma.
OPTIONS:
-t, --tmp-dir [Type: Path] Path to tmp singularity dir. Singularity uses this
directory when images are pulled from DockerHub
- and coverted into SIFs. If not provided, the
- location to the temp dir will default to the
- following "/tmp/$USER/cacher/.singularity/"
+ and converted into SIFs. If not provided, the
+ location to the temp dir will default to the
+ following "/tmp/$USER/cacher/.singularity/"
directory.
- -d, --docker-login [Type: Str] Path to a file with DockerHub Credentials. Allows for
- authenticated or non-anonomous pulls from DockerHub
- to avoid Dockerhub pull rate limits or to pull from
+ -d, --docker-login [Type: Str] Path to a file with DockerHub Credentials. Allows for
+                                 authenticated or non-anonymous pulls from DockerHub
+ to avoid Dockerhub pull rate limits or to pull from
a private DockerHub repository.
-h, --help [Type: Bool] Displays usage and help information.
Example:
@@ -108,19 +108,19 @@ function check(){
function retry() {
- # Tries to run a cmd 5 times before failing
+ # Tries to run a cmd 5 times before failing
# If a command is successful, it will break out of attempt loop
- # Failed attempts are padding with the following exponential
+    # Failed attempts are padded with the following exponential
# back-off strategy {4, 16, 64, 256, 1024} in seconds
- # @INPUTS "$@"" = cmd to run
+    # @INPUTS "$@" = cmd to run
# @CALLS fatal() if command cannot be run in 5 attempts
local n=1
local max=5
local attempt=true # flag for while loop
while $attempt; do
- # Attempt command and break if successful
+ # Attempt command and break if successful
"$@" && attempt=false || {
- # Try again up to 5 times
+ # Try again up to 5 times
if [[ $n -le $max ]]; then
err "Command failed: $@"
delay=$(( 4**$n ))
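The `delay` above grows as 4 raised to the attempt number; a minimal standalone sketch of the same retry-with-exponential-back-off idea might look like this (the command shown is a placeholder):

```bash
# Minimal sketch of the retry pattern used above: up to 5 attempts,
# waiting 4, 16, 64, 256, then 1024 seconds between failed attempts
for n in 1 2 3 4 5; do
    some_command && break
    sleep $(( 4**n ))
done
```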
@@ -145,17 +145,17 @@ function _pull(){
# Check if singularity in $PATH
# If not, try to module load singularity as a last resort
- command -V singularity &> /dev/null || {
- command -V module &> /dev/null &&
+ command -V singularity &> /dev/null || {
+ command -V module &> /dev/null &&
module purge && module load singularity
} || fatal "Fail to find or load 'singularity', not installed on target system."
# cache executor
executor=${1}
- # Goto Pipeline Ouput directory
+ # Goto Pipeline Output directory
# Create a local singularity cache in output directory
- # cache can be re-used instead of re-pulling from DockerHub everytime
+ # cache can be re-used instead of re-pulling from DockerHub every time
cd "$2" && export SINGULARITY_CACHEDIR="${3}"
    # unsetting XDG_RUNTIME_DIR to avoid some unsightly but harmless warnings
@@ -165,13 +165,13 @@ function _pull(){
    # Expected contents of credentials file:
# export SINGULARITY_DOCKER_USERNAME=
# export SINGULARITY_DOCKER_PASSWORD=
- if [[ ! -z "${5}" ]]; then source "${5}"; fi
+ if [[ ! -z "${5}" ]]; then source "${5}"; fi
# Run the workflow with specified executor
case "$executor" in
slurm)
# Create directory for logfiles
- for image in ${4//,/$'\t'}; do
+ for image in ${4//,/$'\t'}; do
# Try to pull image from URI with 5 max attempt
echo "Singularity pulling ${image}"
retry singularity pull -F ${image}
@@ -221,7 +221,7 @@ function main(){
echo -e "Running with the following parameters:"
for key in "${!Arguments[@]}"; do echo -e "\t${key}\t${Arguments["$key"]}"; done
- # Pull remote resources into XAVIER cache
+ # Pull remote resources into XAVIER cache
# Cache remote image from DockerHub
# INPUT $1 = Snakemake Mode of execution
# INPUT $2 = Cache output directory
diff --git a/resources/fastq_screen.biowulf.conf b/resources/fastq_screen.biowulf.conf
index 13d77b3..0d5cbf3 100644
--- a/resources/fastq_screen.biowulf.conf
+++ b/resources/fastq_screen.biowulf.conf
@@ -3,7 +3,7 @@
###########
## Bowtie #
###########
-## If the bowtie binary is not in your PATH then you can
+## If the bowtie binary is not in your PATH then you can
## set this value to tell the program where to find it.
## Uncomment the line below and set the appropriate location
##
@@ -39,9 +39,9 @@ THREADS 8
## This section allows you to configure multiple databases
## to search against in your screen. For each database
## you need to provide a database name (which can't contain
-## spaces) and the location of the bowtie indices which
+## spaces) and the location of the bowtie indices which
## you created for that database.
-##
+##
## The default entries shown below are only suggested examples
## you can add as many DATABASE sections as you like, and you
## can comment out or remove as many of the existing entries
diff --git a/resources/fastq_screen.frce.conf b/resources/fastq_screen.frce.conf
index 57d8eb8..e2a4720 100755
--- a/resources/fastq_screen.frce.conf
+++ b/resources/fastq_screen.frce.conf
@@ -3,7 +3,7 @@
###########
## Bowtie #
###########
-## If the bowtie binary is not in your PATH then you can
+## If the bowtie binary is not in your PATH then you can
## set this value to tell the program where to find it.
## Uncomment the line below and set the appropriate location
##
@@ -39,9 +39,9 @@ THREADS 8
## This section allows you to configure multiple databases
## to search against in your screen. For each database
## you need to provide a database name (which can't contain
-## spaces) and the location of the bowtie indices which
+## spaces) and the location of the bowtie indices which
## you created for that database.
-##
+##
## The default entries shown below are only suggested examples
## you can add as many DATABASE sections as you like, and you
## can comment out or remove as many of the existing entries
diff --git a/resources/gather_cluster_stats b/resources/gather_cluster_stats
index c53c6bd..9895d47 100755
--- a/resources/gather_cluster_stats
+++ b/resources/gather_cluster_stats
@@ -20,7 +20,7 @@ function get_sacct_info {
attribute=$2
x=$(sacct -j $jobid --noheader --format="${attribute}%500"|head -n1|awk '{print $1}')
echo $x
-}
+}
function displaytime {
local T=$1
@@ -86,7 +86,7 @@ END {
print str
}
}' > /dev/shm/${jobid}.sacct.batchline
-#batch line variables
+#batch line variables
jobdataarray["elapsed"]=$(get_batchline_variable "Elapsed")
jobdataarray["reqcpus"]=$(get_batchline_variable "ReqCPUS")
@@ -119,7 +119,7 @@ END {
jobdataarray["runtime"]=$(displaytime $rt)
jobdataarray["job_name"]=$(get_secondline_variable "JobName")
jobdataarray["time_limit"]=$(get_secondline_variable "Timelimit")
- jobdataarray["node_list"]=$(get_secondline_variable "NodeList")
+ jobdataarray["node_list"]=$(get_secondline_variable "NodeList")
jobdataarray["run_node_partition"]=$(get_secondline_variable "Partition")
jobdataarray["qos"]=$(get_secondline_variable "QOS")
jobdataarray["username"]=$(get_secondline_variable "User")
@@ -151,4 +151,4 @@ echo -ne "##SubmitTime\tHumanSubmitTime\tJobID:JobState:JobName\tNode;Partition:
while read jid;do
print_jobid_stats $jid
done < $externalidslst |sort -k1,1n
-rm -f $externalidslst /dev/shm/${jobid}.sacct*
\ No newline at end of file
+rm -f $externalidslst /dev/shm/${jobid}.sacct*
diff --git a/resources/jobby b/resources/jobby
index cb5f584..c75ed5d 100755
--- a/resources/jobby
+++ b/resources/jobby
@@ -4,14 +4,14 @@
# -*- coding: UTF-8 -*-
"""
-ABOUT:
+ABOUT:
`jobby` will take your past jobs and display their job information.
- Why? We have pipelines running on several different clusters and
+ Why? We have pipelines running on several different clusters and
job schedulers. `jobby` is an attempt to centralize and abstract
the process of querying different job schedulers. On each supported
- target system, `jobby` will attempt to determine the best method for
- getting job information to return to the user in a standardized
- format and unified cli.
+ target system, `jobby` will attempt to determine the best method for
+ getting job information to return to the user in a standardized
+ format and unified cli.
REQUIRES:
- python>=3.5
@@ -22,26 +22,26 @@ DISCLAIMER:
National Institute of Allergy and Infectious Diseases (NIAID)
This software/database is a "United States Government Work" under
- the terms of the United States Copyright Act. It was written as
+ the terms of the United States Copyright Act. It was written as
part of the author's official duties as a United States Government
employee and thus cannot be copyrighted. This software is freely
available to the public for use.
-
+
Although all reasonable efforts have been taken to ensure the
accuracy and reliability of the software and data, NCBR do not and
- cannot warrant the performance or results that may be obtained by
+ cannot warrant the performance or results that may be obtained by
using this software or data. NCBR and NIH disclaim all warranties,
- express or implied, including warranties of performance,
+ express or implied, including warranties of performance,
merchantability or fitness for any particular purpose.
-
- Please cite the author and NIH resources like the "Biowulf Cluster"
+
+ Please cite the author and NIH resources like the "Biowulf Cluster"
in any work or product based on this material.
USAGE:
$ jobby [OPTIONS] JOB_ID [JOB_ID ...]
EXAMPLE:
- $ jobby 18627545 15627516 58627597
+ $ jobby 18627545 15627516 58627597
"""
# Python standard library
@@ -50,52 +50,53 @@ import sys, os, subprocess, math, re
from subprocess import PIPE
import argparse # added in python/3.5
import textwrap # added in python/3.5
-import tempfile # added in python/3.5
+import tempfile # added in python/3.5
# Jobby metadata
-__version__ = 'v0.2.0'
-__authors__ = 'Skyler Kuhn'
-__email__ = 'skyler.kuhn@nih.gov'
-__home__ = os.path.dirname(os.path.abspath(__file__))
+__version__ = "v0.2.0"
+__authors__ = "Skyler Kuhn"
+__email__ = "skyler.kuhn@nih.gov"
+__home__ = os.path.dirname(os.path.abspath(__file__))
_name = os.path.basename(sys.argv[0])
-_description = 'Will take your job(s)... and display their information!'
+_description = "Will take your job(s)... and display their information!"
# Classes
-class Colors():
- """Class encoding for ANSI escape sequeces for styling terminal text.
+class Colors:
+ """Class encoding for ANSI escape sequences for styling terminal text.
Any string that is formatting with these styles must be terminated with
the escape sequence, i.e. `Colors.end`.
"""
+
# Escape sequence
- end = '\33[0m'
+ end = "\33[0m"
# Formatting options
- bold = '\33[1m'
- italic = '\33[3m'
- url = '\33[4m'
- blink = '\33[5m'
- higlighted = '\33[7m'
+ bold = "\33[1m"
+ italic = "\33[3m"
+ url = "\33[4m"
+ blink = "\33[5m"
+ highlighted = "\33[7m"
# Text Colors
- black = '\33[30m'
- red = '\33[31m'
- green = '\33[32m'
- yellow = '\33[33m'
- blue = '\33[34m'
- pink = '\33[35m'
- cyan = '\33[96m'
- white = '\33[37m'
+ black = "\33[30m"
+ red = "\33[31m"
+ green = "\33[32m"
+ yellow = "\33[33m"
+ blue = "\33[34m"
+ pink = "\33[35m"
+ cyan = "\33[96m"
+ white = "\33[37m"
# Background fill colors
- bg_black = '\33[40m'
- bg_red = '\33[41m'
- bg_green = '\33[42m'
- bg_yellow = '\33[43m'
- bg_blue = '\33[44m'
- bg_pink = '\33[45m'
- bg_cyan = '\33[46m'
- bg_white = '\33[47m'
+ bg_black = "\33[40m"
+ bg_red = "\33[41m"
+ bg_green = "\33[42m"
+ bg_yellow = "\33[43m"
+ bg_blue = "\33[44m"
+ bg_pink = "\33[45m"
+ bg_cyan = "\33[46m"
+ bg_white = "\33[47m"
-# Helper Functions
+# Helper Functions
def which(cmd, path=None):
"""Checks if an executable is in $PATH
@param cmd :
@@ -120,7 +121,7 @@ def which(cmd, path=None):
def err(*message, **kwargs):
"""Prints any provided args to standard error.
- kwargs can be provided to modify print functions
+ kwargs can be provided to modify print functions
behavior.
@param message :
Values printed to standard error
@@ -130,7 +131,6 @@ def err(*message, **kwargs):
print(*message, file=sys.stderr, **kwargs)
-
def fatal(*message, **kwargs):
"""Prints any provided args to standard error
and exits with an exit code of 1.
@@ -146,44 +146,42 @@ def fatal(*message, **kwargs):
def get_toolkit(tool_list):
"""Finds the best suited tool from a list of
possible choices. Assumes tool list is already
- ordered from the best to worst choice. The first
+ ordered from the best to worst choice. The first
tool found in a user's $PATH is returned.
@param tool_list list[]:
List of ordered tools to find
@returns best_choice :
First tool found in tool_list
"""
- best_choice = None
+ best_choice = None
for exe in tool_list:
if which(exe):
best_choice = exe
break
-
+
# Did not find any tools
# to potentially use
if not best_choice:
- err(
- 'Error: Did not find any tools to get job information!'
- )
+ err("Error: Did not find any tools to get job information!")
fatal(
- 'Expected one of the following tools to be in $PATH:'
- '\t{0}'.format(tool_list)
+ "Expected one of the following tools to be in $PATH:"
+ "\t{0}".format(tool_list)
)
-
+
return best_choice
def add_missing(linelist, insertion_dict):
- """Adds missing information to a list. This can be used
- to add missing job information fields to the results of
+ """Adds missing information to a list. This can be used
+ to add missing job information fields to the results of
job querying tool.
@param linelist list[]:
List containing job information for each field of interest
@param insertion_dict dict[] = str
Dictionary used to insert missing information to a given
- index, where the keys are indices of the `linelist` and the
+ index, where the keys are indices of the `linelist` and the
values are information to add. Please note that the indices
- should be zero based. Note that multiple consequetive values
+ should be zero based. Note that multiple consecutive values
should be inserted at once as a list, see example below:
Example:
add_missing([0,1,2,3,4], {3:['+','++'], 1:'-', 4:'@'})
@@ -192,16 +190,16 @@ def add_missing(linelist, insertion_dict):
# Get the order of indices
# add missing information
# starting from largest to
- # smallest, if we insert
- # missing values in this
+ # smallest, if we insert
+ # missing values in this
# order we do not need to
- # calculate the offset of
+ # calculate the offset of
# new indices
tmp_list = linelist
indices = sorted(list(insertion_dict.keys()), reverse=True)
for i in indices:
# Check if multiple values
- # need to be inserted at a
+ # need to be inserted at a
# given index
if isinstance(insertion_dict[i], list):
for v in reversed(insertion_dict[i]):
@@ -212,17 +210,12 @@ def add_missing(linelist, insertion_dict):
def convert_size(size_bytes):
- """Converts bytes to a human readable format.
- """
- # Sizes range from B to YiB,
+ """Converts bytes to a human readable format."""
+ # Sizes range from B to YiB,
# warning larger sizes storage
- # may results in blackhole
- size_name = (
- "B", "KiB", "MiB",
- "GiB", "TiB", "PiB",
- "EiB", "ZiB", "YiB"
- )
- if size_bytes == 0:
+    # may result in a black hole
+ size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")
+ if size_bytes == 0:
return "0B"
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
@@ -234,36 +227,53 @@ def to_bytes(size):
"""Convert a human readable size unit into bytes.
Returns None if cannot convert/parse provided size."""
size2bytes = {
- "b":1, "bytes":1, "byte":1,
- "k":1024, "kib":1024, "kb":1000,
- "m": 1024**2, "mib": 1024**2, "mb": 1000**2,
- "g": 1024**3, "gib": 1024**3, "gb": 1000**3,
- "t": 1024**4, "tib": 1024**4, "tb": 1000**4,
- "p": 1024**5, "pib": 1024**5, "pb": 1000**5,
- "e": 1024**6, "eib": 1024**6, "eb": 1000**6,
- "z": 1024**7, "zib": 1024**7, "zb": 1000**7,
- "y": 1024**8, "yib": 1024**8, "yb": 1000**8
+ "b": 1,
+ "bytes": 1,
+ "byte": 1,
+ "k": 1024,
+ "kib": 1024,
+ "kb": 1000,
+ "m": 1024**2,
+ "mib": 1024**2,
+ "mb": 1000**2,
+ "g": 1024**3,
+ "gib": 1024**3,
+ "gb": 1000**3,
+ "t": 1024**4,
+ "tib": 1024**4,
+ "tb": 1000**4,
+ "p": 1024**5,
+ "pib": 1024**5,
+ "pb": 1000**5,
+ "e": 1024**6,
+ "eib": 1024**6,
+ "eb": 1000**6,
+ "z": 1024**7,
+ "zib": 1024**7,
+ "zb": 1000**7,
+ "y": 1024**8,
+ "yib": 1024**8,
+ "yb": 1000**8,
}
-
- size = size.replace(' ','')
- match = re.search('(?P[0-9.]+)(?P[a-zA-Z]+)$', size)
-
+
+ size = size.replace(" ", "")
+ match = re.search("(?P[0-9.]+)(?P[a-zA-Z]+)$", size)
+
if match:
- human_units = match.group('units').lower()
+ human_units = match.group("units").lower()
human_units = human_units.lstrip().rstrip()
scaling_factor = size2bytes[human_units]
- bytes = int(math.ceil(scaling_factor * float(match.group('size'))))
+ bytes = int(math.ceil(scaling_factor * float(match.group("size"))))
else:
# Cannot parse units,
# cannot convert value
# into bytes
return None
-
- return bytes
+ return bytes
-# Core logic for getting
+# Core logic for getting
# job information
def sge(jobs, threads, tmp_dir):
"""Displays SGE job information to standard output.
@@ -281,19 +291,19 @@ def uge(jobs, threads, tmp_dir):
Parsed command-line arguments
@return None
"""
- # NOTE: add later for LOCUS cluster
+ # NOTE: add later for LOCUS cluster
pass
def dashboard_cli(jobs, threads=1, tmp_dir=None):
"""Biowulf-specific tool to get SLURM job information.
- HPC staff recommend using this over the default slurm
- `sacct` command for performance reasons. By default,
+ HPC staff recommend using this over the default slurm
+ `sacct` command for performance reasons. By default,
the `dashboard_cli` returns information for the following
fields:
- jobid state submit_time partition nodes
- cpus mem timelimit gres dependency
- queued_time state_reason start_time elapsed_time end_time
+ jobid state submit_time partition nodes
+ cpus mem timelimit gres dependency
+ queued_time state_reason start_time elapsed_time end_time
cpu_max mem_max eval
Runs command:
$ dashboard_cli jobs \\
@@ -302,41 +312,50 @@ def dashboard_cli(jobs, threads=1, tmp_dir=None):
--tab --archive
"""
fields = [
- "jobid","jobname",
- "state","partition",
- "gres","cpus","mem",
- "cpu_max","mem_max",
- "timelimit","queued_time",
- "start_time","end_time",
- "elapsed_time","nodelist",
- "user", "std_out", "std_err",
- "work_dir"
+ "jobid",
+ "jobname",
+ "state",
+ "partition",
+ "gres",
+ "cpus",
+ "mem",
+ "cpu_max",
+ "mem_max",
+ "timelimit",
+ "queued_time",
+ "start_time",
+ "end_time",
+ "elapsed_time",
+ "nodelist",
+ "user",
+ "std_out",
+ "std_err",
+ "work_dir",
]
-
+
# Display header information,
# --tab option does not print
# the header
- print('\t'.join(fields))
+ print("\t".join(fields))
# Display job information
cmd = subprocess.run(
- 'dashboard_cli jobs --archive --tab --joblist {0} --fields {1}'.format(
- ','.join(jobs),
- ','.join(fields)
+ "dashboard_cli jobs --archive --tab --joblist {0} --fields {1}".format(
+ ",".join(jobs), ",".join(fields)
),
stdout=PIPE,
stderr=PIPE,
universal_newlines=True,
- shell=True
+ shell=True,
)
# Check for failure
# of the last command
if cmd.returncode != 0:
err("\nError: Failed to get job information with 'dashboard_cli'!")
- err('Please see error message below:')
- fatal(' └── ', cmd.stderr)
+ err("Please see error message below:")
+ fatal(" └── ", cmd.stderr)
- print(cmd.stdout.rstrip('\n'))
+ print(cmd.stdout.rstrip("\n"))
def sacct(jobs, threads=1, tmp_dir=None):
@@ -344,11 +363,11 @@ def sacct(jobs, threads=1, tmp_dir=None):
`sacct` should be available on all SLURM clusters.
The `dashboard_cli` is prioritized over using `sacct`
    due to performance reasons; however, this method will be
- portable across different SLURM clusters. To get maximum
- memory usage for a job, we will need to parse the MaxRSS
+ portable across different SLURM clusters. To get maximum
+ memory usage for a job, we will need to parse the MaxRSS
field from the `$SLURM_JOBID.batch` lines.
Returns job information for the following fields:
- jobid jobname state partition reqtres
+ jobid jobname state partition reqtres
alloccpus reqmem maxrss timelimit reserved
start end elapsed nodelist user
workdir
@@ -357,49 +376,70 @@ def sacct(jobs, threads=1, tmp_dir=None):
Runs command:
$ sacct -j 12345679,12345680 \\
--fields FIELD,FIELD,FIELD \\
- -P --delimiter $'\t'
+ -P --delimiter $'\t'
"""
- header = [
- "jobid","jobname","state","partition",
- "gres","cpus","mem","cpu_max","mem_max",
- "timelimit","queued_time","start_time",
- "end_time","elapsed_time","nodelist",
- "user","std_out","std_err", "work_dir"
+ header = [
+ "jobid",
+ "jobname",
+ "state",
+ "partition",
+ "gres",
+ "cpus",
+ "mem",
+ "cpu_max",
+ "mem_max",
+ "timelimit",
+ "queued_time",
+ "start_time",
+ "end_time",
+ "elapsed_time",
+ "nodelist",
+ "user",
+ "std_out",
+ "std_err",
+ "work_dir",
]
fields = [
- "jobid", "jobname",
- "state", "partition",
- "reqtres", "alloccpus",
- "reqmem", "maxrss",
- "timelimit", "reserved",
- "start", "end",
- "elapsed", "nodelist",
- "user", "workdir"
+ "jobid",
+ "jobname",
+ "state",
+ "partition",
+ "reqtres",
+ "alloccpus",
+ "reqmem",
+ "maxrss",
+ "timelimit",
+ "reserved",
+ "start",
+ "end",
+ "elapsed",
+ "nodelist",
+ "user",
+ "workdir",
]
# Missing std_out and std_err
- missing_fields = {15:['-','-']}
+ missing_fields = {15: ["-", "-"]}
# Display header information,
- print('\t'.join(header))
+ print("\t".join(header))
# Display job information
cmd = subprocess.run(
"sacct -j {0} -P --delimiter $'\\t' --format={1}".format(
- ','.join(jobs),
- ','.join(fields)
+ ",".join(jobs), ",".join(fields)
),
- stdout=PIPE,
+ stdout=PIPE,
stderr=PIPE,
universal_newlines=True,
- shell=True
+ shell=True,
)
# Check for failure
# of the last command
if cmd.returncode != 0:
err("\nError: Failed to get job information with 'dashboard_cli'!")
- err('Please see error message below:')
- fatal(' └── ', cmd.stderr)
-
+ err("Please see error message below:")
+ fatal(" └── ", cmd.stderr)
+
# Get max memory information,
# Stored as $SLURM_JOBID.batch
# in the MaxRSS field
@@ -407,22 +447,22 @@ def sacct(jobs, threads=1, tmp_dir=None):
# Remove trailing newline from
# standard output and split lines
# on remaining newline characters
- job_information = cmd.stdout.rstrip('\n').split('\n')
+ job_information = cmd.stdout.rstrip("\n").split("\n")
for i, line in enumerate(job_information):
if i < 1:
# skip over header
continue
- linelist = line.lstrip().rstrip().split('\t')
- if linelist[0].endswith('.batch'):
- jobid = linelist[0].strip().split('.')[0]
- maxmem = linelist[7].replace(' ', '')
+ linelist = line.lstrip().rstrip().split("\t")
+ if linelist[0].endswith(".batch"):
+ jobid = linelist[0].strip().split(".")[0]
+ maxmem = linelist[7].replace(" ", "")
mem_bytes = to_bytes(maxmem)
if not mem_bytes:
# Could not convert
- # max_mem value into
+ # max_mem value into
# bytes
- j2m[jobid] = '-'
- continue # goto next line
+ j2m[jobid] = "-"
+ continue # goto next line
human_readable_mem = convert_size(mem_bytes)
j2m[jobid] = human_readable_mem
@@ -432,22 +472,22 @@ def sacct(jobs, threads=1, tmp_dir=None):
if i < 1:
# skip over header
continue
- linelist = line.lstrip().rstrip().split('\t')
+ linelist = line.lstrip().rstrip().split("\t")
jobid = linelist[0].strip()
- if '.' not in jobid:
+ if "." not in jobid:
try:
max_mem = j2m[jobid]
except KeyError:
- # Job maybe still be
+ # Job maybe still be
# running or in a non-
# completed state.
- max_mem = '-'
- status = linelist[2].split(' ')[0]
+ max_mem = "-"
+ status = linelist[2].split(" ")[0]
linelist[2] = status
missing_fields[8] = max_mem
linelist = add_missing(linelist, missing_fields)
- linelist = [info if info else '-' for info in linelist]
- print('\t'.join(linelist))
+ linelist = [info if info else "-" for info in linelist]
+ print("\t".join(linelist))
def slurm(jobs, threads, tmp_dir):
@@ -456,11 +496,11 @@ def slurm(jobs, threads, tmp_dir):
Parsed command-line arguments
@return None
"""
- # Try to use the following tools in this
+ # Try to use the following tools in this
# order to get job information!
# [1] `dashboard_cli` is Biowulf-specific
# [2] `sacct` should always be there
- tool_priority = ['dashboard_cli', 'sacct']
+ tool_priority = ["dashboard_cli", "sacct"]
job_tool = get_toolkit(tool_priority)
# Get information about each job
# must use eval() to make string
@@ -470,65 +510,57 @@ def slurm(jobs, threads, tmp_dir):
def jobby(args):
"""
- Wrapper to each supported job scheduler: slurm, etc.
+ Wrapper to each supported job scheduler: slurm, etc.
Each scheduler has a custom handler to most effectively
- get and parse job information.
+ get and parse job information.
@param sub_args :
Parsed command-line arguments
@return None
"""
# Get command line options
abstract_handler = None
- job_ids = args.JOB_ID
+ job_ids = args.JOB_ID
scheduler = args.scheduler
- threads = args.threads
- tmp_dir = args.tmp_dir
+ threads = args.threads
+ tmp_dir = args.tmp_dir
- # Set handler for each
- # supported scheduler
- if scheduler == 'slurm':
+ # Set handler for each
+ # supported scheduler
+ if scheduler == "slurm":
abstract_handler = slurm
else:
# Unsupported job scheduler,
# needs to be implemented
- fatal(
- 'Error: "{0}" is an unsupported job scheduler!'.format(scheduler)
- )
-
- # Display job(s) information
+ fatal('Error: "{0}" is an unsupported job scheduler!'.format(scheduler))
+
+ # Display job(s) information
# to standard output
- abstract_handler(
- jobs=job_ids,
- threads=threads,
- tmp_dir=tmp_dir
- )
+ abstract_handler(jobs=job_ids, threads=threads, tmp_dir=tmp_dir)
# Parse command-line arguments
def parsed_arguments(name, description):
- """Parses user-provided command-line arguments. This requires
- argparse and textwrap packages. To create custom help formatting
- a text wrapped docstring is used to create the help message for
- required options. As so, the help message for require options
- must be suppressed. If a new required argument is added to the
+ """Parses user-provided command-line arguments. This requires
+ argparse and textwrap packages. To create custom help formatting
+ a text wrapped docstring is used to create the help message for
+ required options. As so, the help message for require options
+ must be suppressed. If a new required argument is added to the
cli, it must be updated in the usage statement docstring below.
@param name :
- Name of the pipeline or command-line tool
+ Name of the pipeline or command-line tool
@param description :
- Short description of pipeline or command-line tool
+ Short description of pipeline or command-line tool
"""
# Add styled name and description
c = Colors
- styled_name = "{0}{1}{2}{3}{4}".format(
- c.bold, c.bg_black,
- c.cyan, name, c.end
- )
+ styled_name = "{0}{1}{2}{3}{4}".format(c.bold, c.bg_black, c.cyan, name, c.end)
description = "{0}{1}{2}".format(c.bold, description, c.end)
temp = tempfile.gettempdir()
# Please note: update the usage statement
# below if a new option is added!
- usage_statement = textwrap.dedent("""\
+ usage_statement = textwrap.dedent(
+ """\
{0}: {1}
{3}{4}Synopsis:{5}
@@ -538,138 +570,136 @@ def parsed_arguments(name, description):
{3}{4}Description:{5}
- {2} will take your past jobs and display their job information
+ {2} will take your past jobs and display their job information
in a standardized format. Why???! We have pipelines running on several
- different clusters (using different job schedulers). {2} centralizes
+ different clusters (using different job schedulers). {2} centralizes
and abstracts the process of querying different job schedulers within
- a unified command-line interface.
-
+ a unified command-line interface.
+
For each supported scheduler, jobby will determine the best method
- on a given target system for getting job information to return to the
+ on a given target system for getting job information to return to the
user in a common output format.
{3}{4}Required Positional Arguments:{5}
- Identiers of past jobs. One or more JOB_IDs
- can be provided. Multiple JOB_IDs should be
- seperated by a space. Information for each
- of the JOB_IDs will be displayed to standard
- output. Please see example section below for
+    Identifiers of past jobs. One or more JOB_IDs
+ can be provided. Multiple JOB_IDs should be
+ separated by a space. Information for each
+ of the JOB_IDs will be displayed to standard
+ output. Please see example section below for
more information.
{3}{4}Options:{5}
- -s,--scheduler {{slurm | ...}}
+ -s,--scheduler {{slurm | ...}}
@Default: slurm
- Job scheduler. Defines the job scheduler
+ Job scheduler. Defines the job scheduler
of the target system. Additional support
- for more schedulers coming soon!
+ for more schedulers coming soon!
@Example: --scheduler slurm
- -n, --threads THREADS
+ -n, --threads THREADS
@Default: 1
- Number of threads to query the scheduler
+ Number of threads to query the scheduler
in parallel.
@Example: --threads: 8
- -t, --tmp-dir TMP_DIR
+ -t, --tmp-dir TMP_DIR
@Default: {7}/
- Temporary directory. Path on the filesystem
- for writing temporary output files. Ideally,
- this path should point to a dedicated space
- on the filesystem for writing tmp files. If
- you need to inject a variable into this path
- that should NOT be expanded, please quote the
- options value in single quotes. The default
+ Temporary directory. Path on the filesystem
+ for writing temporary output files. Ideally,
+ this path should point to a dedicated space
+ on the filesystem for writing tmp files. If
+ you need to inject a variable into this path
+ that should NOT be expanded, please quote the
+ options value in single quotes. The default
location of this option is set to the system
default via the $TMPDIR environment variable.
@Example: --tmp-dir '/scratch/$USER/'
-
+
-h, --help Shows help and usage information and exits.
@Example: --help
-
+
-v, --version Displays version information and exits.
@Example: --version
- """.format(styled_name, description, name, c.bold, c.url, c.end, c.italic, temp))
+ """.format(
+ styled_name, description, name, c.bold, c.url, c.end, c.italic, temp
+ )
+ )
# Display example usage in epilog
- run_epilog = textwrap.dedent("""\
+ run_epilog = textwrap.dedent(
+ """\
{2}{3}Example:{4}
- # Please avoid running jobby
+ # Please avoid running jobby
# on a cluster's head node!
./jobby -s slurm -n 4 18627542 13627516 58627597 48627666
{2}{3}Version:{4}
{1}
- """.format(name, __version__, c.bold, c.url, c.end))
+ """.format(
+ name, __version__, c.bold, c.url, c.end
+ )
+ )
# Create a top-level parser
parser = argparse.ArgumentParser(
- usage = argparse.SUPPRESS,
+ usage=argparse.SUPPRESS,
formatter_class=argparse.RawDescriptionHelpFormatter,
- description = usage_statement,
- epilog = run_epilog,
- add_help=False
+ description=usage_statement,
+ epilog=run_epilog,
+ add_help=False,
)
# Required Positional Arguments
# List of JOB_IDs, 1 ... N_JOB_IDS
- parser.add_argument(
- 'JOB_ID',
- nargs = '+',
- help = argparse.SUPPRESS
- )
+ parser.add_argument("JOB_ID", nargs="+", help=argparse.SUPPRESS)
# Options
- # Adding verison information
+ # Adding version information
parser.add_argument(
- '-v', '--version',
- action = 'version',
- version = '%(prog)s {}'.format(__version__),
- help = argparse.SUPPRESS
+ "-v",
+ "--version",
+ action="version",
+ version="%(prog)s {}".format(__version__),
+ help=argparse.SUPPRESS,
)
# Add custom help message
- parser.add_argument(
- '-h', '--help',
- action='help',
- help=argparse.SUPPRESS
- )
+ parser.add_argument("-h", "--help", action="help", help=argparse.SUPPRESS)
- # Base directory to write
- # temporary/intermediate files
+ # Base directory to write
+ # temporary/intermediate files
parser.add_argument(
- '-t', '--tmp-dir',
- type = str,
- required = False,
- default = temp,
- help = argparse.SUPPRESS
+ "-t",
+ "--tmp-dir",
+ type=str,
+ required=False,
+ default=temp,
+ help=argparse.SUPPRESS,
)
- # Number of threads for the
+ # Number of threads for the
    # pipeline's main process
- # This is only applicable for
- # local rules or when running
+ # This is only applicable for
+ # local rules or when running
# in local mode.
parser.add_argument(
- '-n', '--threads',
- type = int,
- required = False,
- default = 1,
- help = argparse.SUPPRESS
+ "-n", "--threads", type=int, required=False, default=1, help=argparse.SUPPRESS
)
# Job scheduler to query,
# available: SLURM, ...
# More coming soon!
parser.add_argument(
- '-s', '--scheduler',
- type = lambda s: str(s).lower(),
- required = False,
- default = "slurm",
- choices = ['slurm'],
- help = argparse.SUPPRESS
+ "-s",
+ "--scheduler",
+ type=lambda s: str(s).lower(),
+ required=False,
+ default="slurm",
+ choices=["slurm"],
+ help=argparse.SUPPRESS,
)
# Define handlers for each sub-parser
- parser.set_defaults(func = jobby)
+ parser.set_defaults(func=jobby)
# Parse command-line args
args = parser.parse_args()
@@ -680,20 +710,17 @@ def main():
# Sanity check for usage
if len(sys.argv) == 1:
# Nothing was provided
- fatal('Invalid usage: {} [-h] [--version] ...'.format(_name))
-
+ fatal("Invalid usage: {} [-h] [--version] ...".format(_name))
+
# Collect args for sub-command
- args = parsed_arguments(
- name = _name,
- description = _description
- )
-
+ args = parsed_arguments(name=_name, description=_description)
+
# Display version information
- err('{} ({})'.format(_name, __version__))
- # Mediator method to call the
+ err("{} ({})".format(_name, __version__))
+ # Mediator method to call the
# default handler function
args.func(args)
-if __name__ == '__main__':
- main()
\ No newline at end of file
+if __name__ == "__main__":
+ main()
diff --git a/resources/py311.conda_env.yaml b/resources/py311.conda_env.yaml
index 621273c..0d2595c 100644
--- a/resources/py311.conda_env.yaml
+++ b/resources/py311.conda_env.yaml
@@ -33,16 +33,16 @@ dependencies:
- xxhash=0.8.1=h0b41bf4_0
- xz=5.2.6=h166bdaf_0
- pip:
- - cowsay==5.0
- - et-xmlfile==1.1.0
- - fortune==1.1.1
- - grizzled-python==2.2.0
- - numpy==1.24.3
- - openpyxl==3.1.2
- - pandas==2.0.2
- - pysimplegui==4.60.5
- - python-dateutil==2.8.2
- - pytz==2023.3
- - six==1.16.0
- - tzdata==2023.3
+ - cowsay==5.0
+ - et-xmlfile==1.1.0
+ - fortune==1.1.1
+ - grizzled-python==2.2.0
+ - numpy==1.24.3
+ - openpyxl==3.1.2
+ - pandas==2.0.2
+ - pysimplegui==4.60.5
+ - python-dateutil==2.8.2
+ - pytz==2023.3
+ - six==1.16.0
+ - tzdata==2023.3
prefix: /data/CCBR_Pipeliner/db/PipeDB/Conda/envs/py311
diff --git a/resources/runner b/resources/runner
index d47b148..a6403eb 100755
--- a/resources/runner
+++ b/resources/runner
@@ -12,20 +12,20 @@ USAGE:
-t TMP_DIR
SYNOPSIS:
- This script creates/submits the pipeline's master job to the
-cluster. The master job acts as the pipeline's main controller or
-its main process. This main job dictates how subsequent jobs are
+ This script creates/submits the pipeline's master job to the
+cluster. The master job acts as the pipeline's main controller or
+its main process. This main job dictates how subsequent jobs are
submitted to the cluster via the job scheduler, SLURM. Support for
additional job schedulers (i.e. PBS, SGE, LSF, Tibanna) may be added
in the future.
- The main entry point of the pipeline 'xavier' calls this job
-submission wrapper script. As so, this script can be used to by-pass
-'xavier' for a previously failed run; meaning, it can be used to
+ The main entry point of the pipeline 'xavier' calls this job
+submission wrapper script. As such, this script can be used to by-pass
+'xavier' for a previously failed run; meaning, it can be used to
re-run the pipeline to pick back off where the last failure occurred
or re-start the pipeline.
- Please Note: it is highly recommended to use 'xavier'; it is the
-main entry point and preferred entry point of the xavier pipeline.
-If you are experiencing an error, it maybe due to improperly mounting
+ Please Note: it is highly recommended to use 'xavier'; it is the
+main entry point and preferred entry point of the xavier pipeline.
+If you are experiencing an error, it may be due to improperly mounting
singularity bind paths which 'xavier' will internally handle.
Required Positional Argument:
@@ -33,8 +33,8 @@ Required Positional Argument:
Valid mode options include:
slurm: uses slurm and singularity/snakemake
backend. This EXECUTOR will submit child
- jobs to the cluster. It is recommended
- running xavier in this mode as most
+ jobs to the cluster. It is recommended
+ to run xavier in this mode as most
of the steps are computationally intensive.
Required Arguments:
@@ -43,31 +43,31 @@ Required Arguments:
where all output files will be generated.
-j, --job-name [Type: Str] Name of pipeline's master job.
-b, --bind-paths [Type:Path] Singularity bind paths. The pipeline uses
- singularity images for exection. Bind
- paths are used to mount the host file
- system to the container's file system.
- Multiple bind paths can be provided as
- a comma seperated list. The main entry
+ singularity images for execution. Bind
+ paths are used to mount the host file
+ system to the container's file system.
+ Multiple bind paths can be provided as
+ a comma-separated list. The main entry
point of the pipeline internally collects
and aggregates bind paths to mount to the
- container's filesystem.
+ container's filesystem.
If you are manually running this script
or by-passing xavier, you will need
to provide the bind paths of the rawdata
directory(s) along with the pipeline's
- output directory and any directories for
+ output directory and any directories for
reference files. Please see example usage
below.
-t, --tmp-dir [Type:Path] Temporary directory. The pipeline generates
- intermediate, temporary output files. Any
+ intermediate, temporary output files. Any
temporary output files will be written to
- this location. On Biowulf, it should be
+ this location. On Biowulf, it should be
set to '/lscratch/\$SLURM_JOBID/'. On FRCE,
- this value should be set to the following:
- '/scratch/cluster_scratch/\$USER/'.
+ this value should be set to the following:
+ '/scratch/cluster_scratch/\$USER/'.
OPTIONS:
- -c, --cache [Type: Path] Path to singularity cache. If not provided,
+ -c, --cache [Type: Path] Path to singularity cache. If not provided,
the path will default to the current working
directory of this script.
[Default: $(dirname "$0")/.singularity/]
@@ -180,9 +180,9 @@ function submit(){
# more may be added in the future, TBA
executor=${1}
- # Goto Pipeline Ouput directory
+ # Goto Pipeline Output directory
# Create a local singularity cache in output directory
- # cache can be re-used instead of re-pulling from DockerHub everytime
+ # cache can be re-used instead of re-pulling from DockerHub every time
cd "$3" && export SINGULARITY_CACHEDIR="${5}"
# unsetting XDG_RUNTIME_DIR to avoid some unsightly but harmless warnings
@@ -196,27 +196,27 @@ function submit(){
CLUSTER_OPTS="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}"
snakemakeVer=$(snakemake --version 2>/dev/null)
# Submit the master job to the cluster
- # sbatch --parsable -J {jobname} --time=5-00:00:00 --mail-type=BEGIN,END,FAIL
- # --cpus-per-task=24 --mem=96g --gres=lscratch:500
- # --output os.path.join({outdir}, 'logfiles', 'snakemake.log') --error os.path.join({outdir}, 'logfiles', 'snakemake.log')
- # snakemake -pr --latency-wait 120 -d {outdir} --configfile=config.json
- # --cluster-config os.path.join({outdir}, 'config', 'cluster.json')
- # --cluster {CLUSTER_OPTS} --stats os.path.join({outdir}, 'logfiles', 'runtime_statistics.json')
- # --printshellcmds --keep-going --rerun-incomplete
- # --keep-remote --restart-times 3 -j 500 --use-singularity
+ # sbatch --parsable -J {jobname} --time=5-00:00:00 --mail-type=BEGIN,END,FAIL
+ # --cpus-per-task=24 --mem=96g --gres=lscratch:500
+ # --output os.path.join({outdir}, 'logfiles', 'snakemake.log') --error os.path.join({outdir}, 'logfiles', 'snakemake.log')
+ # snakemake -pr --latency-wait 120 -d {outdir} --configfile=config.json
+ # --cluster-config os.path.join({outdir}, 'config', 'cluster.json')
+ # --cluster {CLUSTER_OPTS} --stats os.path.join({outdir}, 'logfiles', 'runtime_statistics.json')
+ # --printshellcmds --keep-going --rerun-incomplete
+ # --keep-remote --restart-times 3 -j 500 --use-singularity
# --singularity-args -B {}.format({bindpaths}) --local-cores 24
# Check if NOT running on Biowulf
- # Assumes other clusters do NOT
+ # Assumes other clusters do NOT
# have GRES for local node disk,
- # long term it might be worth
- # adding a new option to allow
- # a user to decide whether to
+ # long term it might be worth
+ # adding a new option to allow
+ # a user to decide whether to
# use GRES at job submission,
# trying to infer this because
# most users will not even know
# what GRES is and how or why
# it should be used and by default
- # SLURM is not configured to use
+ # SLURM is not configured to use
# GRES, remove prefix single quote
#if [[ ${6#\'} != /lscratch* ]]; then
# CLUSTER_OPTS="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name={params.rname} -e $SLURM_DIR/slurm-%j_{params.rname}.out -o $SLURM_DIR/slurm-%j_{params.rname}.out"
@@ -241,7 +241,7 @@ function submit(){
# Create sbatch script to build index
cat << EOF > kickoff.sh
#!/usr/bin/env bash
-#SBATCH --cpus-per-task=32
+#SBATCH --cpus-per-task=32
#SBATCH --mem=96g
#SBATCH --time=5-00:00:00
#SBATCH --parsable
diff --git a/src/__init__.py b/src/__init__.py
index fea378e..5c85348 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1,10 +1,11 @@
import os, sys
+
# Makes relative imports to work in Python 3.6
# without the need of '.' before the name of the
# package or py file.
-# Allows for consistent syntax of relative imports
+# Allows for consistent syntax of relative imports
# across python2 and python3.
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
# Ground source of truth for version information
-version ='v3.0.2-dev'
+version = "v3.0.2-dev"
diff --git a/src/options.py b/src/options.py
index 0927fe0..e4e5686 100644
--- a/src/options.py
+++ b/src/options.py
@@ -7,10 +7,11 @@
# Local imports
from utils import os, fatal, err, permissions
-import os # required for call in line 30: 'permissions(parser, user_option, os.R_OK)'
+import os # required for call in line 30: 'permissions(parser, user_option, os.R_OK)'
+
def genome_options(parser, user_option, prebuilt):
- """Dynamically checks if --genome option is a vaild choice. Compares against a
+ """Dynamically checks if --genome option is a valid choice. Compares against a
list of prebuilt or bundled genome reference genomes and accepts a custom reference
JSON file.
@param parser :
@@ -18,29 +19,33 @@ def genome_options(parser, user_option, prebuilt):
@param user_option :
Provided value to the xavier run, --genome argument
@param prebuilt list[]:
- List of prebuilt or builded reference genomes
+ List of pre-built reference genomes
return user_option :
Provided value to the xavier run, --genome argument
- If vaule is not valid or custom reference genome JSON file not readable,
+ If value is not valid or custom reference genome JSON file not readable,
an exception is raised.
"""
# Checks for custom built genomes using xavier build
- if user_option.endswith('.json'):
+ if user_option.endswith(".json"):
# Check file is readable or accessible
permissions(parser, user_option, os.R_OK)
- # Checks against vaild pre-built options
+ # Checks against valid pre-built options
# TODO: makes this more dynamic in the future to have it check against
# a list of genomes (files) in config/genomes/*.json
elif not user_option in prebuilt:
- # User did NOT provide a vaild choice
- parser.error("""provided invalid choice, '{}', to --genome argument!\n
+ # User did NOT provide a valid choice
+ parser.error(
+ """provided invalid choice, '{}', to --genome argument!\n
Choose from one of the following pre-built genome options: \n
\t{}\n
or supply a custom reference genome JSON file generated from xavier build.
- """.format(user_option, prebuilt))
+ """.format(
+ user_option, prebuilt
+ )
+ )
return user_option
-if __name__ == '__main__':
+if __name__ == "__main__":
pass
diff --git a/src/run.py b/src/run.py
index 5c23814..8ec0734 100644
--- a/src/run.py
+++ b/src/run.py
@@ -7,19 +7,16 @@
import os, re, json, sys, subprocess
# Local imports
-from utils import (git_commit_hash,
- join_jsons,
- fatal,
- which,
- exists,
- err)
+from utils import git_commit_hash, join_jsons, fatal, which, exists, err
from . import version as __version__
-def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'config']):
+def init(
+ repo_path, output_path, links=[], required=["workflow", "resources", "config"]
+):
"""Initialize the output directory. If user provides a output
- directory path that already exists on the filesystem as a file
+ directory path that already exists on the filesystem as a file
(small chance of happening but possible), an OSError is raised. If the
output directory PATH already EXISTS, it will not try to create the directory.
@param repo_path :
@@ -37,25 +34,28 @@ def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'c
os.makedirs(output_path)
elif exists(output_path) and os.path.isfile(output_path):
- # Provided Path for pipeline
+ # Provided Path for pipeline
# output directory exists as file
- raise OSError("""\n\tFatal: Failed to create provided pipeline output directory!
+ raise OSError(
+ """\n\tFatal: Failed to create provided pipeline output directory!
User provided --output PATH already exists on the filesystem as a file.
Please run {} again with a different --output PATH.
- """.format(sys.argv[0])
+ """.format(
+ sys.argv[0]
+ )
)
# Copy over templates are other required resources
- copy_safe(source = repo_path, target = output_path, resources = required)
+ copy_safe(source=repo_path, target=output_path, resources=required)
- # Create renamed symlinks for each rawdata
+ # Create renamed symlinks for each rawdata
# file provided as input to the pipeline
- inputs = sym_safe(input_data = links, target = output_path)
+ inputs = sym_safe(input_data=links, target=output_path)
return inputs
-def copy_safe(source, target, resources = []):
+def copy_safe(source, target, resources=[]):
"""Private function: Given a list paths it will recursively copy each to the
target location. If a target path already exists, it will NOT over-write the
existing paths data.
@@ -85,7 +85,7 @@ def sym_safe(input_data, target):
@return input_fastqs list[]:
List of renamed input FastQs
"""
- input_fastqs = [] # store renamed fastq file names
+ input_fastqs = [] # store renamed fastq file names
for file in input_data:
filename = os.path.basename(file)
renamed = os.path.join(target, rename(filename))
@@ -102,7 +102,7 @@ def sym_safe(input_data, target):
def rename(filename):
"""Dynamically renames FastQ file to have one of the following extensions: *.R1.fastq.gz, *.R2.fastq.gz
To automatically rename the fastq files, a few assumptions are made. If the extension of the
- FastQ file cannot be infered, an exception is raised telling the user to fix the filename
+ FastQ file cannot be inferred, an exception is raised telling the user to fix the filename
of the fastq files.
@param filename :
Original name of file to be renamed
@@ -121,12 +121,14 @@ def rename(filename):
".R2.(?P...).f(ast)?q.gz$": ".R2.fastq.gz",
# Matches: _[12].fastq.gz, _[12].fq.gz, _[12]_fastq_gz, etc.
"_1.f(ast)?q.gz$": ".R1.fastq.gz",
- "_2.f(ast)?q.gz$": ".R2.fastq.gz"
+ "_2.f(ast)?q.gz$": ".R2.fastq.gz",
}
- if (filename.endswith('.R1.fastq.gz') or
- filename.endswith('.R2.fastq.gz') or
- filename.endswith('.bam')):
+ if (
+ filename.endswith(".R1.fastq.gz")
+ or filename.endswith(".R2.fastq.gz")
+ or filename.endswith(".bam")
+ ):
# Filename is already in the correct format
return filename
@@ -137,10 +139,11 @@ def rename(filename):
# regex matches with a pattern in extensions
converted = True
filename = re.sub(regex, new_ext, filename)
- break # only rename once
+ break # only rename once
if not converted:
- raise NameError("""\n\tFatal: Failed to rename provided input '{}'!
+ raise NameError(
+ """\n\tFatal: Failed to rename provided input '{}'!
Cannot determine the extension of the user provided input file.
Please rename the file list above before trying again.
Here is example of acceptable input file extensions:
@@ -149,13 +152,15 @@ def rename(filename):
sampleName_1.fastq.gz sampleName_2.fastq.gz
Please also check that your input files are gzipped?
If they are not, please gzip them before proceeding again.
- """.format(filename, sys.argv[0])
+ """.format(
+ filename, sys.argv[0]
+ )
)
return filename
-def setup(sub_args, repo_path, output_path, create_nidap_folder_YN = 'no',links=[]):
+def setup(sub_args, repo_path, output_path, create_nidap_folder_YN="no", links=[]):
"""Setup the pipeline for execution and creates config file from templates
@param sub_args :
Parsed arguments for run sub-command
@@ -170,8 +175,8 @@ def setup(sub_args, repo_path, output_path, create_nidap_folder_YN = 'no',links=
"""
# Check for mixed inputs,
# inputs which are a mixture
- # of FastQ and BAM files
- ifiles = sym_safe(input_data = links, target = output_path)
+ # of FastQ and BAM files
+ ifiles = sym_safe(input_data=links, target=output_path)
mixed_inputs(ifiles)
hpcget = subprocess.run(
@@ -187,32 +192,39 @@ def setup(sub_args, repo_path, output_path, create_nidap_folder_YN = 'no',links=
print("Thank you for running XAVIER on FRCE")
else:
shorthostname = "biowulf"
- print("%s unknown host. Configuration files for references may not be correct. Defaulting to Biowulf config"%(hpcget))
+ print(
+ "%s unknown host. Configuration files for references may not be correct. Defaulting to Biowulf config"
+ % (hpcget)
+ )
- genome_config = os.path.join(repo_path,'config','genomes', sub_args.genome + '.' + shorthostname + '.json')
-
- if sub_args.genome.endswith('.json'):
+ genome_config = os.path.join(
+ repo_path, "config", "genomes", sub_args.genome + "." + shorthostname + ".json"
+ )
+
+ if sub_args.genome.endswith(".json"):
# Provided a custom reference genome generated by rna-seek build
genome_config = os.path.abspath(sub_args.genome)
required = {
# Base configuration file
- "base": os.path.join(repo_path,'config','config.json'),
+ "base": os.path.join(repo_path, "config", "config.json"),
# Template for project-level information
- "project": os.path.join(repo_path,'config','templates','project.json'),
+ "project": os.path.join(repo_path, "config", "templates", "project.json"),
# Template for genomic reference files
# User provided argument --genome is used to select the template
"genome": genome_config,
# Template for tool information
- "tools": os.path.join(repo_path,'config','templates','tools.json'),
+ "tools": os.path.join(repo_path, "config", "templates", "tools.json"),
}
- cluster_config = os.path.join(repo_path,'config', 'cluster' + '.' + shorthostname + '.json')
- cluster_output = os.path.join(output_path, 'cluster.json')
- copyfile(cluster_config,cluster_output)
+ cluster_config = os.path.join(
+ repo_path, "config", "cluster" + "." + shorthostname + ".json"
+ )
+ cluster_output = os.path.join(output_path, "cluster.json")
+ copyfile(cluster_config, cluster_output)
# Global config file for pipeline, config.json
- config = join_jsons(required.values()) # uses templates in the rna-seek repo
+ config = join_jsons(required.values()) # uses templates in the rna-seek repo
config = add_user_information(config)
config = add_rawdata_information(sub_args, config, ifiles)
@@ -221,31 +233,30 @@ def setup(sub_args, repo_path, output_path, create_nidap_folder_YN = 'no',links=
config = image_cache(sub_args, config, repo_path)
# Add other cli collected info
- config['project']['annotation'] = sub_args.genome
- config['project']['version'] = __version__
- config['project']['workpath'] = os.path.abspath(sub_args.output)
+ config["project"]["annotation"] = sub_args.genome
+ config["project"]["version"] = __version__
+ config["project"]["workpath"] = os.path.abspath(sub_args.output)
# Add optional cli workflow steps
- config['input_params']['CNV_CALLING'] = str(sub_args.cnv).lower()
- config['input_params']['FFPE_FILTER'] = str(sub_args.ffpe).lower()
- config['input_params']['EXOME_TARGETS'] = str(sub_args.targets)
- config['input_params']['VARIANT_CALLERS'] = sub_args.callers
- config['input_params']['PAIRS_FILE'] = str(sub_args.pairs)
- config['input_params']['BASE_OUTDIR'] = str(sub_args.output)
- config['input_params']['tmpdisk'] = str(sub_args.tmp_dir)
- config['input_params']['create_nidap_folder'] = str(create_nidap_folder_YN)
+ config["input_params"]["CNV_CALLING"] = str(sub_args.cnv).lower()
+ config["input_params"]["FFPE_FILTER"] = str(sub_args.ffpe).lower()
+ config["input_params"]["EXOME_TARGETS"] = str(sub_args.targets)
+ config["input_params"]["VARIANT_CALLERS"] = sub_args.callers
+ config["input_params"]["PAIRS_FILE"] = str(sub_args.pairs)
+ config["input_params"]["BASE_OUTDIR"] = str(sub_args.output)
+ config["input_params"]["tmpdisk"] = str(sub_args.tmp_dir)
+ config["input_params"]["create_nidap_folder"] = str(create_nidap_folder_YN)
# Get latest git commit hash
git_hash = git_commit_hash(repo_path)
- config['project']['git_commit_hash'] = git_hash
- config['project']['pipehome'] = repo_path
- config['project']['pairs'] = str(sub_args.pairs)
+ config["project"]["git_commit_hash"] = git_hash
+ config["project"]["pipehome"] = repo_path
+ config["project"]["pairs"] = str(sub_args.pairs)
if sub_args.runmode == "init":
# Save config to output directory
- with open(os.path.join(output_path, 'config.json'), 'w') as fh:
- json.dump(config, fh, indent = 4, sort_keys = True)
-
+ with open(os.path.join(output_path, "config.json"), "w") as fh:
+ json.dump(config, fh, indent=4, sort_keys=True)
return config
@@ -254,7 +265,7 @@ def unpacked(nested_dict):
"""Generator to recursively retrieves all values in a nested dictionary.
@param nested_dict dict[]:
Nested dictionary to unpack
- @yields value in dictionary
+ @yields value in dictionary
"""
# Iterate over all values of given dictionary
for value in nested_dict.values():
@@ -270,7 +281,7 @@ def unpacked(nested_dict):
yield value
-def get_fastq_screen_paths(fastq_screen_confs, match = 'DATABASE', file_index = -1):
+def get_fastq_screen_paths(fastq_screen_confs, match="DATABASE", file_index=-1):
"""Parses fastq_screen.conf files to get the paths of each fastq_screen database.
This path contains bowtie2 indices for reference genome to screen against.
The paths are added as singularity bind points.
@@ -285,11 +296,11 @@ def get_fastq_screen_paths(fastq_screen_confs, match = 'DATABASE', file_index =
"""
databases = []
for file in fastq_screen_confs:
- with open(file, 'r') as fh:
+ with open(file, "r") as fh:
for line in fh:
if line.startswith(match):
- db_path = line.strip().split()[file_index]
- databases.append(db_path)
+ db_path = line.strip().split()[file_index]
+ databases.append(db_path)
return databases
@@ -312,25 +323,27 @@ def resolve_additional_bind_paths(search_paths):
# Skip over resources with remote URI and
# skip over strings that are not file PATHS as
# build command creates absolute resource PATHS
- if ref.lower().startswith('sftp://') or \
- ref.lower().startswith('s3://') or \
- ref.lower().startswith('gs://') or \
- not ref.lower().startswith(os.sep):
+ if (
+ ref.lower().startswith("sftp://")
+ or ref.lower().startswith("s3://")
+ or ref.lower().startswith("gs://")
+ or not ref.lower().startswith(os.sep)
+ ):
continue
# Break up path into directory tokens
path_list = os.path.abspath(ref).split(os.sep)
- try: # Create composite index from first two directories
+ try: # Create composite index from first two directories
# Avoids issues created by shared /gpfs/ PATHS
index = path_list[1:3]
index = tuple(index)
except IndexError:
- index = path_list[1] # ref startswith /
+ index = path_list[1] # ref startswith /
if index not in indexed_paths:
indexed_paths[index] = []
- # Create an INDEX to find common PATHS for each root
- # child directory like /scratch or /data. This prevents
- # issues when trying to find the common path betweeen
+ # Create an INDEX to find common PATHS for each root
+ # child directory like /scratch or /data. This prevents
+ # issues when trying to find the common path between
# these two different directories (resolves to /)
indexed_paths[index].append(str(os.sep).join(path_list))
@@ -338,8 +351,8 @@ def resolve_additional_bind_paths(search_paths):
# Find common paths for each path index
p = os.path.dirname(os.path.commonprefix(paths))
if p == os.sep:
- # Aviods adding / to bind list when
- # given /tmp or /scratch as input
+ # Avoids adding / to bind list when
+ # given /tmp or /scratch as input
p = os.path.commonprefix(paths)
common_paths.append(p)
@@ -353,7 +366,7 @@ def bind(sub_args, config):
@param configfile dict[]:
Config dictionary generated by setup command.
@return bindpaths list[]:
- List of singularity/docker bind paths
+ List of singularity/docker bind paths
"""
bindpaths = []
for value in unpacked(config):
@@ -364,18 +377,24 @@ def bind(sub_args, config):
value = os.path.dirname(value)
if value not in bindpaths:
bindpaths.append(value)
-
+
# Get FastQ Screen Database paths
# and other reference genome file paths
- rawdata_bind_paths = [os.path.realpath(p) for p in config['project']['datapath'].split(',')]
- working_directory = os.path.realpath(config['project']['workpath'])
- fqscreen_cfg = config['references']['FASTQ_SCREEN_CONFIG']
- fq_screen_paths = get_fastq_screen_paths([os.path.join(sub_args.output, fqscreen_cfg)])
- kraken_db_path = config['references']['KRAKENBACDB']
+ rawdata_bind_paths = [
+ os.path.realpath(p) for p in config["project"]["datapath"].split(",")
+ ]
+ working_directory = os.path.realpath(config["project"]["workpath"])
+ fqscreen_cfg = config["references"]["FASTQ_SCREEN_CONFIG"]
+ fq_screen_paths = get_fastq_screen_paths(
+ [os.path.join(sub_args.output, fqscreen_cfg)]
+ )
+ kraken_db_path = config["references"]["KRAKENBACDB"]
# Add Bindpath for VCF2maf
- vep_db_path= config['references']['VCF2MAF']['VEPRESOURCEBUNDLEPATH']
- genome_bind_paths = resolve_additional_bind_paths(bindpaths + fq_screen_paths + [kraken_db_path] + [vep_db_path])
- bindpaths = [working_directory] + rawdata_bind_paths + genome_bind_paths
+ vep_db_path = config["references"]["VCF2MAF"]["VEPRESOURCEBUNDLEPATH"]
+ genome_bind_paths = resolve_additional_bind_paths(
+ bindpaths + fq_screen_paths + [kraken_db_path] + [vep_db_path]
+ )
+ bindpaths = [working_directory] + rawdata_bind_paths + genome_bind_paths
bindpaths = list(set([p for p in bindpaths if p != os.sep]))
return bindpaths
@@ -392,31 +411,35 @@ def mixed_inputs(ifiles):
fastqs = False
bams = False
for file in ifiles:
- if file.endswith('.R1.fastq.gz') or file.endswith('.R2.fastq.gz'):
- fastqs = True
+ if file.endswith(".R1.fastq.gz") or file.endswith(".R2.fastq.gz"):
+ fastqs = True
fq_files.append(file)
- elif file.endswith('.bam'):
+ elif file.endswith(".bam"):
bams = True
bam_files.append(file)
if fastqs and bams:
# User provided a mix of FastQs and BAMs
- raise TypeError("""\n\tFatal: Detected a mixture of --input data types.
+ raise TypeError(
+ """\n\tFatal: Detected a mixture of --input data types.
A mixture of BAM and FastQ files were provided; however, the pipeline
does NOT support processing a mixture of input FastQ and BAM files.
Input FastQ Files:
{}
Input BAM Files:
- {}
+ {}
Please do not run the pipeline with a mixture of FastQ and BAM files.
This feature is currently not supported within '{}', and it is not
recommended to process samples in this way either. If this is a priority
- for your project, please run the set of FastQ and BAM files separately
+ for your project, please run the set of FastQ and BAM files separately
(in two separate output directories). If you feel like this functionality
should exist, feel free to open an issue on Github.
- """.format(" ".join(fq_files), " ".join(bam_files), sys.argv[0])
+ """.format(
+ " ".join(fq_files), " ".join(bam_files), sys.argv[0]
+ )
)
+
def add_user_information(config):
"""Adds username and user's home directory to config.
@params config :
@@ -432,8 +455,8 @@ def add_user_information(config):
username = os.path.split(home)[-1]
# Update config with home directory and username
- config['project']['userhome'] = home
- config['project']['username'] = username
+ config["project"]["userhome"] = home
+ config["project"]["username"] = username
return config
@@ -452,20 +475,21 @@ def add_rawdata_information(sub_args, config, ifiles):
@return config :
Updated config dictionary containing user information (username and home directory)
"""
-
+
# Determine whether dataset is paired-end or single-end
# Updates config['project']['nends']: 1 = single-end, 2 = paired-end, -1 = bams
- convert = {1: 'single-end', 2: 'paired-end', -1: 'bam'}
+ convert = {1: "single-end", 2: "paired-end", -1: "bam"}
nends = get_nends(ifiles) # Checks PE data for both mates (R1 and R2)
- config['project']['nends'] = nends
- config['project']['filetype'] = convert[nends]
+ config["project"]["nends"] = nends
+ config["project"]["filetype"] = convert[nends]
# Finds the set of rawdata directories to bind
- rawdata_paths = get_rawdata_bind_paths(input_files = sub_args.input)
- config['project']['datapath'] = ','.join(rawdata_paths)
+ rawdata_paths = get_rawdata_bind_paths(input_files=sub_args.input)
+ config["project"]["datapath"] = ",".join(rawdata_paths)
return config
+
def image_cache(sub_args, config, repo_path):
"""Adds Docker Image URIs, or SIF paths to config if singularity cache option is provided.
If singularity cache option is provided and a local SIF does not exist, a warning is
@@ -479,22 +503,30 @@ def image_cache(sub_args, config, repo_path):
@return config :
Updated config dictionary containing user information (username and home directory)
"""
- images = os.path.join(repo_path, 'config','containers', 'images.json')
+ images = os.path.join(repo_path, "config", "containers", "images.json")
- # Read in config for docker image uris
- with open(images, 'r') as fh:
+ # Read in config for docker image uris
+ with open(images, "r") as fh:
data = json.load(fh)
- # Check if local sif exists
- for image, uri in data['images'].items():
+ # Check if local sif exists
+ for image, uri in data["images"].items():
if sub_args.sif_cache:
- sif = os.path.join(sub_args.sif_cache, '{}.sif'.format(os.path.basename(uri).replace(':', '_')))
+ sif = os.path.join(
+ sub_args.sif_cache,
+ "{}.sif".format(os.path.basename(uri).replace(":", "_")),
+ )
if not exists(sif):
- # If local sif does not exist on in cache, print warning
+ # If local sif does not exist in the cache, print warning
# and default to pulling from URI in config/containers/images.json
- print('Warning: Local image "{}" does not exist in singularity cache'.format(sif), file=sys.stderr)
+ print(
+ 'Warning: Local image "{}" does not exist in singularity cache'.format(
+ sif
+ ),
+ file=sys.stderr,
+ )
else:
# Change pointer to image from Registry URI to local SIF
- data['images'][image] = sif
+ data["images"][image] = sif
config.update(data)
@@ -515,22 +547,22 @@ def get_nends(ifiles):
bam_files = False
nends_status = 1
for file in ifiles:
- if file.endswith('.bam'):
+ if file.endswith(".bam"):
bam_files = True
nends_status = -1
break
- elif file.endswith('.R2.fastq.gz'):
+ elif file.endswith(".R2.fastq.gz"):
paired_end = True
nends_status = 2
- break # dataset is paired-end
+ break # dataset is paired-end
- # Check to see if both mates (R1 and R2)
+ # Check to see if both mates (R1 and R2)
# are present paired-end data
if paired_end:
- nends = {} # keep count of R1 and R2 for each sample
+ nends = {} # keep count of R1 and R2 for each sample
for file in ifiles:
# Split sample name on file extension
- sample = re.split('\.R[12]\.fastq\.gz', os.path.basename(file))[0]
+ sample = re.split("\.R[12]\.fastq\.gz", os.path.basename(file))[0]
if sample not in nends:
nends[sample] = 0
@@ -540,7 +572,8 @@ def get_nends(ifiles):
missing_mates = [sample for sample, count in nends.items() if count == 1]
if missing_mates:
# Missing an R1 or R2 for a provided input sample
- raise NameError("""\n\tFatal: Detected pair-end data but user failed to provide
+ raise NameError(
+ """\n\tFatal: Detected pair-end data but user failed to provide
both mates (R1 and R2) for the following samples:\n\t\t{}\n
Please check that the basename for each sample is consistent across mates.
Here is an example of a consistent basename across mates:
@@ -550,19 +583,24 @@ def get_nends(ifiles):
Please do not run the pipeline with a mixture of single-end and paired-end
samples. This feature is currently not supported within {}, and it is
not recommended either. If this is a priority for your project, please run
- paired-end samples and single-end samples separately (in two separate output
- directories). If you feel like this functionality should exist, feel free to
+ paired-end samples and single-end samples separately (in two separate output
+ directories). If you feel like this functionality should exist, feel free to
open an issue on Github.
- """.format(missing_mates, sys.argv[0])
+ """.format(
+ missing_mates, sys.argv[0]
+ )
)
elif not bam_files:
# Provided only single-end data
# not supported or recommended
- raise TypeError("""\n\tFatal: Single-end data detected.
+ raise TypeError(
+ """\n\tFatal: Single-end data detected.
{} does not support single-end data. Calling variants from single-end
- data is not recommended either. If you feel like this functionality should
+ data is not recommended either. If you feel like this functionality should
exist, feel free to open an issue on Github.
- """.format(sys.argv[0])
+ """.format(
+ sys.argv[0]
+ )
)
return nends_status
@@ -586,46 +624,65 @@ def get_rawdata_bind_paths(input_files):
return bindpaths
-def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Snakefile')):
- """Dryruns the pipeline to ensure there are no errors prior to runnning.
+def dryrun(
+ outdir, config="config.json", snakefile=os.path.join("workflow", "Snakefile")
+):
+ """Dryruns the pipeline to ensure there are no errors prior to running.
@param outdir :
Pipeline output PATH
@return dryrun_output :
Byte string representation of dryrun command
"""
try:
- dryrun_output = subprocess.check_output([
- 'snakemake', '-npr',
- '--rerun-incomplete',
- '-s', str(snakefile),
- '--use-singularity',
- '--cores', str(1),
- '--configfile={}'.format(config)
- ], cwd = outdir,
- stderr=subprocess.STDOUT)
+ dryrun_output = subprocess.check_output(
+ [
+ "snakemake",
+ "-npr",
+ "--rerun-incomplete",
+ "-s",
+ str(snakefile),
+ "--use-singularity",
+ "--cores",
+ str(1),
+ "--configfile={}".format(config),
+ ],
+ cwd=outdir,
+ stderr=subprocess.STDOUT,
+ )
except OSError as e:
# Catch: OSError: [Errno 2] No such file or directory
# Occurs when command returns a non-zero exit-code
- if e.errno == 2 and not which('snakemake'):
+ if e.errno == 2 and not which("snakemake"):
# Failure caused because snakemake is NOT in $PATH
- err('\n\x1b[6;37;41mError: Are snakemake AND singularity in your $PATH?\x1b[0m')
- fatal('\x1b[6;37;41mPlease check before proceeding again!\x1b[0m')
+ err(
+ "\n\x1b[6;37;41mError: Are snakemake AND singularity in your $PATH?\x1b[0m"
+ )
+ fatal("\x1b[6;37;41mPlease check before proceeding again!\x1b[0m")
else:
# Failure caused by unknown cause, raise error
raise e
except subprocess.CalledProcessError as e:
print(e, e.output)
- raise(e)
+ raise (e)
return dryrun_output
-def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None,
- threads=2, jobname='pl:xavier', submission_script='runner',
- tmp_dir = '/lscratch/$SLURM_JOBID/', wait = ''):
+def runner(
+ mode,
+ outdir,
+ alt_cache,
+ logger,
+ additional_bind_paths=None,
+ threads=2,
+ jobname="pl:xavier",
+ submission_script="runner",
+ tmp_dir="/lscratch/$SLURM_JOBID/",
+ wait="",
+):
"""Runs the pipeline via selected executor: local or slurm.
If 'local' is selected, the pipeline is executed locally on a compute node/instance.
- If 'slurm' is selected, jobs will be submited to the cluster using SLURM job scheduler.
+ If 'slurm' is selected, jobs will be submitted to the cluster using SLURM job scheduler.
Support for additional job schedulers (i.e. PBS, SGE, LSF) may be added in the future.
@param outdir :
Pipeline output PATH
@@ -648,61 +705,72 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None,
@return masterjob :
"""
# Add additional singularity bind PATHs
- # to mount the local filesystem to the
- # containers filesystem, NOTE: these
+ # to mount the local filesystem to the
+ # containers filesystem, NOTE: these
# PATHs must be an absolute PATHs
outdir = os.path.abspath(outdir)
- # Add any default PATHs to bind to
- # the container's filesystem, like
+ # Add any default PATHs to bind to
+ # the container's filesystem, like
# tmp directories, /lscratch
- bindpaths = "{},{}".format(outdir, os.path.dirname(tmp_dir.rstrip('/')))
- # Set ENV variable 'SINGULARITY_CACHEDIR'
+ bindpaths = "{},{}".format(outdir, os.path.dirname(tmp_dir.rstrip("/")))
+ # Set ENV variable 'SINGULARITY_CACHEDIR'
# to output directory
- my_env = {}; my_env.update(os.environ)
+ my_env = {}
+ my_env.update(os.environ)
cache = os.path.join(outdir, ".singularity")
- my_env['SINGULARITY_CACHEDIR'] = cache
+ my_env["SINGULARITY_CACHEDIR"] = cache
if alt_cache:
- # Override the pipeline's default
+ # Override the pipeline's default
# cache location
- my_env['SINGULARITY_CACHEDIR'] = alt_cache
+ my_env["SINGULARITY_CACHEDIR"] = alt_cache
cache = alt_cache
if additional_bind_paths:
# Add Bind PATHs for rawdata directories
- bindpaths = "{},{}".format(additional_bind_paths,bindpaths)
+ bindpaths = "{},{}".format(additional_bind_paths, bindpaths)
- if not exists(os.path.join(outdir, 'logfiles')):
+ if not exists(os.path.join(outdir, "logfiles")):
# Create directory for logfiles
- os.makedirs(os.path.join(outdir, 'logfiles'))
-
- # Create .singularity directory for
+ os.makedirs(os.path.join(outdir, "logfiles"))
+
+ # Create .singularity directory for
# installations of snakemake without
# setuid which creates a sandbox in
# the SINGULARITY_CACHEDIR
if not exists(cache):
- # Create directory for sandbox
+ # Create directory for sandbox
# and image layers
os.makedirs(cache)
# Run on compute node or instance
# without submitting jobs to a scheduler
- if mode == 'local':
+ if mode == "local":
# Run pipeline's main process
- # Look into later: it maybe worth
+ # Look into later: it may be worth
# replacing Popen subprocess with a direct
# snakemake API call: https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html
- masterjob = subprocess.Popen([
- 'snakemake', '-pr', '--rerun-incomplete',
- '--use-singularity',
- '--singularity-args', "'-B {}'".format(bindpaths),
- '--cores', str(threads),
- '--configfile=config.json'
- ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)
+ masterjob = subprocess.Popen(
+ [
+ "snakemake",
+ "-pr",
+ "--rerun-incomplete",
+ "--use-singularity",
+ "--singularity-args",
+ "'-B {}'".format(bindpaths),
+ "--cores",
+ str(threads),
+ "--configfile=config.json",
+ ],
+ cwd=outdir,
+ stderr=subprocess.STDOUT,
+ stdout=logger,
+ env=my_env,
+ )
# Submitting jobs to cluster via SLURM's job scheduler
- elif mode == 'slurm':
+ elif mode == "slurm":
# Run pipeline's main process
- # Look into later: it maybe worth
+ # Look into later: it may be worth
# replacing Popen subprocess with a direct
# snakemake API call: https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html
# CLUSTER_OPTS="'sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} -p {cluster.partition} \
@@ -716,19 +784,48 @@ def runner(mode, outdir, alt_cache, logger, additional_bind_paths = None,
# --cluster "${CLUSTER_OPTS}" --keep-going --restart-times 3 -j 500 \
# --rerun-incomplete --stats "$3"/logfiles/runtime_statistics.json \
# --keep-remote --local-cores 30 2>&1 | tee -a "$3"/logfiles/master.log
- if wait=='':
- masterjob = subprocess.Popen([
- str(os.path.join(outdir, 'resources', str(submission_script))), mode,
- '-j', jobname, '-b', str(bindpaths),
- '-o', str(outdir), '-c', str(cache),
- '-t', "'{}'".format(tmp_dir)
- ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)
+ if wait == "":
+ masterjob = subprocess.Popen(
+ [
+ str(os.path.join(outdir, "resources", str(submission_script))),
+ mode,
+ "-j",
+ jobname,
+ "-b",
+ str(bindpaths),
+ "-o",
+ str(outdir),
+ "-c",
+ str(cache),
+ "-t",
+ "'{}'".format(tmp_dir),
+ ],
+ cwd=outdir,
+ stderr=subprocess.STDOUT,
+ stdout=logger,
+ env=my_env,
+ )
else:
- masterjob = subprocess.Popen([
- str(os.path.join(outdir, 'resources', str(submission_script))), mode,
- '-j', jobname, '-b', str(bindpaths),
- '-o', str(outdir), '-c', str(cache), str(wait),
- '-t', "'{}'".format(tmp_dir)
- ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)
+ masterjob = subprocess.Popen(
+ [
+ str(os.path.join(outdir, "resources", str(submission_script))),
+ mode,
+ "-j",
+ jobname,
+ "-b",
+ str(bindpaths),
+ "-o",
+ str(outdir),
+ "-c",
+ str(cache),
+ str(wait),
+ "-t",
+ "'{}'".format(tmp_dir),
+ ],
+ cwd=outdir,
+ stderr=subprocess.STDOUT,
+ stdout=logger,
+ env=my_env,
+ )
return masterjob
diff --git a/src/shells.py b/src/shells.py
index 2fd2de1..3160bc7 100644
--- a/src/shells.py
+++ b/src/shells.py
@@ -12,59 +12,61 @@
def set_options(strict):
"""
- Changes behavior of default shell and get overrides options
- to run bash in a strict mode.
+ Changes behavior of the default shell and gets override options
+ to run bash in a strict mode.
@param strict :
- Overrides default shell options and runs shell in strict or
+ Overrides default shell options and runs shell in strict or
less permissive mode.
@return prefix :
Returns override options to run bash in strict mode
"""
- prefix = '' # permissive shell option
- if strict:
+ prefix = "" # permissive shell option
+ if strict:
# Changes behavior of default shell
# set -e: exit immediately upon error
# set -u: treats unset variables as an error
# set -o pipefail: exits if a error occurs in any point of a pipeline
- prefix = 'set -euo pipefail; '
+ prefix = "set -euo pipefail; "
return prefix
-def bash(cmd, interpreter='/bin/bash', strict=set_options(True), cwd=os.getcwd(), **kwargs):
+def bash(
+ cmd, interpreter="/bin/bash", strict=set_options(True), cwd=os.getcwd(), **kwargs
+):
"""
Interface to run a process or bash command. Using subprocess.check_call()
due to portability across most python versions. It was introduced in python 2.5
- and it is also interoperabie across all python 3 versions.
+ and it is also interoperable across all python 3 versions.
@param cmd :
Shell command to run
@param interpreter :
Interpreter for command to run [default: bash]
@param strict :
Prefixes any command with 'set -euo pipefail' to ensure process fail with
- the expected exit-code
+ the expected exit-code
@params kwargs :
Keyword arguments to modify subprocess.check_call() behavior
@return exitcode :
Returns the exit code of the run command, failures return non-zero exit codes
"""
try:
- exitcode = subprocess.check_call(strict + cmd,
- shell=True,
- executable=interpreter,
- cwd=cwd,
- **kwargs
+ exitcode = subprocess.check_call(
+ strict + cmd, shell=True, executable=interpreter, cwd=cwd, **kwargs
)
except CalledProcessError as e:
exitcode = e.returncode
- err("""WARNING: Failed to run '{}' command!
- └── Command returned a non-zero exitcode of '{}'.""".format(strict + cmd, exitcode)
+ err(
+ """WARNING: Failed to run '{}' command!
+ └── Command returned a non-zero exitcode of '{}'.""".format(
+ strict + cmd, exitcode
+ )
)
return exitcode
-if __name__ == '__main__':
+if __name__ == "__main__":
# Tests
- bash('ls -la /home/')
- bash('ls -la /fake/dne/path')
+ bash("ls -la /home/")
+ bash("ls -la /fake/path")
diff --git a/src/utils.py b/src/utils.py
index 81f3544..6f69aa9 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -8,9 +8,9 @@
import subprocess, json
-def md5sum(filename, first_block_only = False, blocksize = 65536):
+def md5sum(filename, first_block_only=False, blocksize=65536):
"""Gets md5checksum of a file in memory-safe manner.
- The file is read in blocks/chunks defined by the blocksize parameter. This is
+ The file is read in blocks/chunks defined by the blocksize parameter. This is
a safer option to reading the entire file into memory if the file is very large.
@param filename :
Input file on local filesystem to find md5 checksum
@@ -22,12 +22,12 @@ def md5sum(filename, first_block_only = False, blocksize = 65536):
MD5 checksum of the file's contents
"""
hasher = hashlib.md5()
- with open(filename, 'rb') as fh:
+ with open(filename, "rb") as fh:
buf = fh.read(blocksize)
if first_block_only:
- # Calculate MD5 of first block or chunck of file.
- # This is a useful heuristic for when potentially
- # calculating an MD5 checksum of thousand or
+ # Calculate MD5 of first block or chunk of file.
+ # This is a useful heuristic for when potentially
+ # calculating an MD5 checksum of thousands or
# millions of files.
hasher.update(buf)
return hasher.hexdigest()
@@ -38,6 +38,7 @@ def md5sum(filename, first_block_only = False, blocksize = 65536):
return hasher.hexdigest()
+
## copied directly from rna-seek
def check_cache(parser, cache, *args, **kwargs):
"""Check if provided SINGULARITY_CACHE is valid. Singularity caches cannot be
@@ -55,24 +56,36 @@ def check_cache(parser, cache, *args, **kwargs):
os.makedirs(cache)
elif os.path.isfile(cache):
# Cache directory exists as file, raise error
- parser.error("""\n\t\x1b[6;37;41mFatal: Failed to provided a valid singularity cache!\x1b[0m
+ parser.error(
+ """\n\t\x1b[6;37;41mFatal: Failed to provided a valid singularity cache!\x1b[0m
The provided --singularity-cache already exists on the filesystem as a file.
Please run {} again with a different --singularity-cache location.
- """.format(sys.argv[0]))
+ """.format(
+ sys.argv[0]
+ )
+ )
elif os.path.isdir(cache):
# Provided cache exists as a directory
# Check that the user owns the child cache directory
- # May revert to os.getuid() if user id is not sufficent
- if exists(os.path.join(cache, 'cache')) and os.stat(os.path.join(cache, 'cache')).st_uid != os.getuid():
- # User does NOT own the cache directory, raise error
- parser.error("""\n\t\x1b[6;37;41mFatal: Failed to provided a valid singularity cache!\x1b[0m
+ # May revert to os.getuid() if user id is not sufficient
+ if (
+ exists(os.path.join(cache, "cache"))
+ and os.stat(os.path.join(cache, "cache")).st_uid != os.getuid()
+ ):
+ # User does NOT own the cache directory, raise error
+ parser.error(
+ """\n\t\x1b[6;37;41mFatal: Failed to provided a valid singularity cache!\x1b[0m
The provided --singularity-cache already exists on the filesystem with a different owner.
Singularity strictly enforces that the cache directory is not shared across users.
Please run {} again with a different --singularity-cache location.
- """.format(sys.argv[0]))
+ """.format(
+ sys.argv[0]
+ )
+ )
return cache
+
def permissions(parser, path, *args, **kwargs):
"""Checks permissions using os.access() to see the user is authorized to access
a file/directory. Checks for existence, readability, writability and executability via:
@@ -85,9 +98,13 @@ def permissions(parser, path, *args, **kwargs):
Returns abs path if it exists and permissions are correct
"""
if not exists(path):
- parser.error("Path '{}' does not exists! Failed to provide vaild input.".format(path))
+ parser.error(
+ "Path '{}' does not exists! Failed to provide valid input.".format(path)
+ )
if not os.access(path, *args, **kwargs):
- parser.error("Path '{}' exists, but cannot read path due to permissions!".format(path))
+ parser.error(
+ "Path '{}' exists, but cannot read path due to permissions!".format(path)
+ )
return os.path.abspath(path)
@@ -105,7 +122,7 @@ def standard_input(parser, path, *args, **kwargs):
if not sys.stdin.isatty():
# Standard input provided, set path as an
# empty string to prevent searching of '-'
- path = ''
+ path = ""
return path
# Checks for positional arguments as paths
@@ -125,7 +142,7 @@ def exists(testpath):
"""
does_exist = True
if not os.path.exists(testpath):
- does_exist = False # File or directory does not exist on the filesystem
+ does_exist = False # File or directory does not exist on the filesystem
return does_exist
@@ -141,7 +158,7 @@ def ln(files, outdir):
for file in files:
ln = os.path.join(outdir, os.path.basename(file))
if not exists(ln):
- os.symlink(os.path.abspath(os.path.realpath(file)), ln)
+ os.symlink(os.path.abspath(os.path.realpath(file)), ln)
def which(cmd, path=None):
@@ -167,7 +184,7 @@ def which(cmd, path=None):
def err(*message, **kwargs):
"""Prints any provided args to standard error.
- kwargs can be provided to modify print functions
+ kwargs can be provided to modify print functions
behavior.
@param message :
Values printed to standard error
@@ -177,7 +194,6 @@ def err(*message, **kwargs):
print(*message, file=sys.stderr, **kwargs)
-
def fatal(*message, **kwargs):
"""Prints any provided args to standard error
and exits with an exit code of 1.
@@ -205,16 +221,20 @@ def require(cmds, suggestions, path=None):
available = which(cmds[i])
if not available:
error = True
- err("""\x1b[6;37;41m\n\tFatal: {} is not in $PATH and is required during runtime!
- └── Solution: please 'module load {}' and run again!\x1b[0m""".format(cmds[i], suggestions[i])
+ err(
+ """\x1b[6;37;41m\n\tFatal: {} is not in $PATH and is required during runtime!
+ └── Solution: please 'module load {}' and run again!\x1b[0m""".format(
+ cmds[i], suggestions[i]
+ )
)
- if error: fatal()
+ if error:
+ fatal()
- return
+ return
-def safe_copy(source, target, resources = []):
+def safe_copy(source, target, resources=[]):
"""Private function: Given a list paths it will recursively copy each to the
target location. If a target path already exists, it will NOT over-write the
existing paths data.
@@ -241,15 +261,21 @@ def git_commit_hash(repo_path):
Latest git commit hash
"""
try:
- githash = subprocess.check_output(['git', 'rev-parse', 'HEAD'], stderr=subprocess.STDOUT, cwd = repo_path).strip().decode('utf-8')
+ githash = (
+ subprocess.check_output(
+ ["git", "rev-parse", "HEAD"], stderr=subprocess.STDOUT, cwd=repo_path
+ )
+ .strip()
+ .decode("utf-8")
+ )
# Typecast to fix python3 TypeError (Object of type bytes is not JSON serializable)
# subprocess.check_output() returns a byte string
githash = str(githash)
except Exception as e:
# Github releases are missing the .git directory,
- # meaning you cannot get a commit hash, set the
+ # meaning you cannot get a commit hash, set the
# commit hash to indicate its from a GH release
- githash = 'github_release'
+ githash = "github_release"
return githash
@@ -266,16 +292,20 @@ def join_jsons(templates):
aggregated = {}
for file in templates:
- with open(os.path.join(repo_path, file), 'r') as fh:
+ with open(os.path.join(repo_path, file), "r") as fh:
aggregated.update(json.load(fh))
return aggregated
-if __name__ == '__main__':
- # Calculate MD5 checksum of entire file
- print('{} {}'.format(md5sum(sys.argv[0]), sys.argv[0]))
- # Calcualte MD5 cehcksum of 512 byte chunck of file,
- # which is similar to following unix command:
- # dd if=utils.py bs=512 count=1 2>/dev/null | md5sum
- print('{} {}'.format(md5sum(sys.argv[0], first_block_only = True, blocksize = 512), sys.argv[0]))
+if __name__ == "__main__":
+ # Calculate MD5 checksum of entire file
+ print("{} {}".format(md5sum(sys.argv[0]), sys.argv[0]))
+ # Calculate MD5 checksum of 512 byte chunk of file,
+ # which is similar to following unix command:
+ # dd if=utils.py bs=512 count=1 2>/dev/null | md5sum
+ print(
+ "{} {}".format(
+ md5sum(sys.argv[0], first_block_only=True, blocksize=512), sys.argv[0]
+ )
+ )
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 0e57b04..98ab439 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -14,7 +14,7 @@ import uuid
def rename(filename):
"""Dynamically renames FastQ file to have one of the following extensions: *.R1.fastq.gz, *.R2.fastq.gz
To automatically rename the fastq files, a few assumptions are made. If the extension of the
- FastQ file cannot be infered, an exception is raised telling the user to fix the filename
+ FastQ file cannot be inferred, an exception is raised telling the user to fix the filename
of the fastq files.
@param filename :
Original name of file to be renamed
@@ -100,49 +100,49 @@ def _sym_safe_(input_data, target):
return input_fastqs
-def read_pairsfile(tn_mode="auto", pairs_filepath="", sample_names=[]):
+def read_pairsfile(tn_mode="auto", pairs_filepath="", sample_names=[]):
## Make sure tn_mode is valid
if not tn_mode in ["auto","paired","tumor_only"]:
raise NameError("""\n\tFatal: tn_mode must be one of 'auto', 'paired', or 'tumor_only'
Argument received: {}
""".format(tn_mode, sys.argv[0])
)
-
+
## Initialize some empty variables
tumor_ids = []
normal_ids = []
paired_ids={}
-
+
## If pairs file exists, try to use it
if os.path.isfile(pairs_filepath):
## Read pairs file as data frame
df = pd.read_csv(pairs_filepath, header=0, sep='\t')
df.columns = df.columns.str.lower() ## Make column names case-insensitive
-
+
## Make sure it contains a "tumor" column
if not "tumor" in df:
raise NameError("""\n\tFatal: Pairs file must contain at least a 'tumor' column
Columns found: {}
""".format(df.columns.tolist(), sys.argv[0])
)
-
+
df = df[pd.notna(df["tumor"])] ## Remove rows where tumor id is empty/na
tumor_ids = df["tumor"]
-
+
if "normal" in df:
normal_ids = df["normal"]
-
+
## Make sure normal ids are not empty/na
if any(pd.notna(normal_ids)):
t_pair=tumor_ids[pd.notna(normal_ids)]
n_pair=normal_ids[pd.notna(normal_ids)]
- paired_ids=dict(zip(t_pair.tolist(), n_pair.tolist()))
-
+ paired_ids=dict(zip(t_pair.tolist(), n_pair.tolist()))
+
## If pairs file not found, try to use provided sample names as tumor-only IDs
else:
if tn_mode == "paired":
print("WARNING: Paired mode selected without a valid pairs file!!!")
-
+
if not sample_names:
raise NameError("""\n\tFatal: Either a valid pairs file or sample names must be provided.
Pairs file path provided: {}
@@ -151,25 +151,25 @@ def read_pairsfile(tn_mode="auto", pairs_filepath="", sample_names=[]):
)
else:
tumor_ids=sample_names
-
+
## Overlap with given sample names
if sample_names:
overlapped_pairs = {k: paired_ids[k] for k in sample_names if k in paired_ids}
overlapped_tumors = list(set(tumor_ids) & set(sample_names))
-
+
# print(str(len(overlapped_pairs)) + " of " + str(len(paired_ids)) + " pairs in pairs file matched given sample names")
# print(str(len(overlapped_tumors)) + " of " + str(len(tumor_ids)) + " tumors in pairs file matched given sample names")
-
+
paired_ids=overlapped_pairs
tumor_ids=overlapped_tumors
-
+
out_dict={"paired":paired_ids, "tumor_only": dict.fromkeys(set(tumor_ids))}
-
+
if tn_mode=="paired":
out_dict["tumor_only"]=[]
elif tn_mode=="tumor_only":
out_dict["paired"]=[]
-
+
return(out_dict)
configfile:"config.json"
@@ -182,7 +182,7 @@ def get_nidap_folder_input_files(wildcards):
rightf=os.path.join(BASEDIR,"nidap_right")
left=open(leftf,'w')
right=open(rightf,'w')
-
+
for f in list(expand(os.path.join(SOBDetector_out,"{vc_outdir}","pass2","{samples}.artifact_filtered.vcf.gz"), samples=ffpe_sample_list, vc_outdir=ffpe_caller_list)):
nidap_files.append(f)
left.write("%s\n"%(f))
@@ -194,13 +194,13 @@ def get_nidap_folder_input_files(wildcards):
left.write("%s\n"%(f))
for f in list(expand(os.path.join(NIDAP_OUTDIR,"{vc_outdir}_all_somatic_variants.maf"), vc_outdir=ffpe_caller_list)):
right.write("%s\n"%(f))
-
+
for f in list(expand(os.path.join(SOBDetector_out,"{vc_outdir}","metrics","all_metrics.txt"), vc_outdir=ffpe_caller_list)):
nidap_files.append(f)
left.write("%s\n"%(f))
for f in list(expand(os.path.join(NIDAP_OUTDIR,"{vc_outdir}_all_metrics.txt"), vc_outdir=ffpe_caller_list)):
right.write("%s\n"%(f))
-
+
for f in list(expand(os.path.join(output_somatic_cnv,"freec_out","pass2","{samples}.recal.bam_CNVs.p.value.txt"), samples=cnv_sample_list)):
nidap_files.append(f)
left.write("%s\n"%(f))
@@ -212,22 +212,22 @@ def get_nidap_folder_input_files(wildcards):
left.write("%s\n"%(f))
for f in list(expand(os.path.join(NIDAP_OUTDIR,"{samples}.contamination.table"), samples=samples_for_caller_merge)):
right.write("%s\n"%(f))
-
+
f=os.path.join(output_qcdir,"finalQC","MultiQC_Report.html")
nidap_files.append(f)
left.write("%s\n"%(f))
- f=os.path.join(NIDAP_OUTDIR,"MultiQC_Report.html")
+ f=os.path.join(NIDAP_OUTDIR,"MultiQC_Report.html")
right.write("%s\n"%(f))
-
+
left.close()
right.close()
outfile = os.path.join(BASEDIR,"NIDAP_files.tsv")
cmd = "paste " + leftf + " " + rightf + " > " + outfile + " && rm -f " + leftf + " && rm -f " + rightf
os.system(cmd)
-
+
nidap_files.append(outfile)
-
+
return nidap_files
def get_nidap_folder_output_files(wildcards):
@@ -239,7 +239,7 @@ def get_nidap_folder_output_files(wildcards):
for f in list(expand(os.path.join(NIDAP_OUTDIR,"{vc_outdir}_all_somatic_variants.maf"), vc_outdir=ffpe_caller_list)):
nidap_files.append(f)
for f in list(expand(os.path.join(NIDAP_OUTDIR,"{vc_outdir}_all_metrics.txt"), vc_outdir=ffpe_caller_list)):
- nidap_files.append(f)
+ nidap_files.append(f)
for f in list(expand(os.path.join(NIDAP_OUTDIR,"{samples}.recal.bam_CNVs.p.value.txt"), samples=cnv_sample_list)):
nidap_files.append(f)
for f in list(expand(os.path.join(NIDAP_OUTDIR,"{samples}.contamination.table"), samples=samples_for_caller_merge)):
@@ -274,14 +274,14 @@ if fqs_found:
name_suffix=".R[1,2].fastq.gz"
if not os.path.exists(input_fqdir):
# print("making"+output_fqdir)
- os.makedirs(input_fqdir)
+ os.makedirs(input_fqdir)
name_symlinks=_sym_safe_(fqs_found, input_fqdir)
else:
name_symlinks=glob.glob(os.path.join(input_fqdir,'*.fastq.gz'))
elif bams_found:
name_suffix=".input.bam"
if not os.path.exists(input_bamdir):
- os.makedirs(input_bamdir)
+ os.makedirs(input_bamdir)
if (len(os.listdir(input_bamdir))==0):
bam_symlinks=_sym_safe_(bams_found, input_bamdir)
name_symlinks=glob.glob(os.path.join(input_bamdir,'*.input.bam'))
@@ -327,7 +327,7 @@ output_somatic_base=os.path.join(BASEDIR,"somatic_"+tn_mode)
output_somatic_snpindels=os.path.join(output_somatic_base,"SNP_Indels")
output_somatic_cnv=os.path.join(output_somatic_base,"CNV")
-#Convert chroms into the config.json file
+#Convert chroms into the config.json file
chroms=config['references']['chroms']
intervals_file=os.path.join(BASEDIR,"intervals.list")
@@ -338,7 +338,7 @@ if not os.path.isfile(intervals_file):
# Check if user provided at least
-# one useable variant caller
+# one usable variant caller
caller_list=[caller_name.lower() for caller_name in config['input_params']['VARIANT_CALLERS']]
caller_list=list(set(caller_list) & set(config['available_somatic_callers'][tn_mode]))
@@ -428,22 +428,22 @@ rule all:
expand(os.path.join(input_bamdir,"{samples}.input.bam"), samples=samples),
expand(os.path.join(output_bamdir,"final_bams","{samples}.bam"), samples=samples),
expand(os.path.join(output_germline_base,"VCF","{samples}.germline.vcf.gz"), samples=samples),
-
-
+
+
expand(os.path.join(output_somatic_snpindels,merge_outdir,"maf","{samples}.maf"),samples=samples_for_caller_merge),
# expand(os.path.join("{vc_outdir}","maf","{samples}.maf"), samples=pairs_ids, vc_outdir=somatic_callers),
-
+
expand(os.path.join(output_somatic_snpindels,"{vc_outdir}","cohort_summary","all_somatic_variants.maf"), vc_outdir=somatic_callers_dirs),
# expand(os.path.join(SOBDetector_out,"{vc_outdir}","cohort_summary","all_somatic_variants.maf"), samplespairs_ids, vc_outdir=somatic_callers),
-
+
expand(os.path.join(SOBDetector_out,"{vc_outdir}","pass2","{samples}.artifact_filtered.vcf.gz"), samples=ffpe_sample_list, vc_outdir=ffpe_caller_list),
expand(os.path.join(SOBDetector_out,"{vc_outdir}","cohort_summary","all_somatic_variants.maf"), vc_outdir=ffpe_caller_list),
# expand(os.path.join(SOBDetector_out,"{vc_outdir}","pass2","{samples}.sobdetect.vcf"), samplespairs_ids, vc_outdir=somatic_callers_dirs),
expand(os.path.join(SOBDetector_out,"{vc_outdir}","metrics","all_metrics.txt"), vc_outdir=ffpe_caller_list),
-
-
+
+
expand(os.path.join(output_somatic_cnv,"freec_out","pass2","{samples}.recal.bam_CNVs.p.value.txt"), samples=cnv_sample_list),
-
+
expand(os.path.join(output_somatic_base,"qc","gatk_contamination","{samples}.contamination.table"), samples=samples_for_caller_merge),
# expand(os.path.join(output_fqdir,"{samples}.fastq.info.txt"), samples=samples),
@@ -452,10 +452,10 @@ rule all:
# # expand(os.path.join(output_qcdir,"{samples}_fastqc.zip"), samples=samples),
# expand(os.path.join(output_qcdir,"{samples}","genome_results.txt"), samples=samples),
# expand(os.path.join(output_qcdir,"{samples}.samtools_flagstat.txt"), samples=samples),
- # os.path.join(output_qcdir,"raw_variants.het"),
+ # os.path.join(output_qcdir,"raw_variants.het"),
# os.path.join(output_qcdir,"raw_variants.variant_calling_detail_metrics"),
os.path.join(output_qcdir,"finalQC","MultiQC_Report.html"),
-
+
# expand(os.path.join(output_qcdir,"{samples}.germline.bcftools_stats.txt"), samples=samples),
# expand(os.path.join(output_qcdir,"{samples}.germline.eval.grp"), samples=samples),
# expand(os.path.join(output_qcdir,"{samples}.germline.snpeff.ann.html"), samples=samples),
@@ -504,4 +504,3 @@ onerror:
shell(jobby_cmd)
print(spook_cmd)
shell(spook_cmd)
-
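The left/right bookkeeping in get_nidap_folder_input_files above writes two temporary lists (pipeline outputs on the left, their destinations under the NIDAP folder on the right) and pastes them into NIDAP_files.tsv, which the nidap rule later walks to create hard links. A minimal Python sketch of the same bookkeeping; the paths are illustrative and the basename-only renaming is a simplification of the real caller-prefixed names:

    import os

    def write_nidap_manifest(source_files, nidap_outdir, manifest_path):
        # Pair every source file with its destination under the NIDAP folder
        # and write the two-column TSV consumed by the nidap rule.
        with open(manifest_path, "w") as tsv:
            for src in source_files:
                dst = os.path.join(nidap_outdir, os.path.basename(src))
                tsv.write(f"{src}\t{dst}\n")
        return manifest_path

    # e.g. write_nidap_manifest(maf_files, "NIDAP", "NIDAP_files.tsv")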
diff --git a/workflow/rules/cnv.smk b/workflow/rules/cnv.smk
index fdb5a4c..e2a4e4f 100644
--- a/workflow/rules/cnv.smk
+++ b/workflow/rules/cnv.smk
@@ -1,12 +1,12 @@
# Rules to predict copy number variation
rule freec_exome_somatic_pass1:
- input:
+ input:
normal = lambda w: [os.path.join(output_bamdir,"final_bams", pairs_dict[w.samples] + ".bam")],
tumor = os.path.join(output_bamdir,"final_bams","{samples}.bam"),
targets = os.path.join(output_qcdir, "exome_targets.bed"),
- output:
+ output:
cnvs = os.path.join(output_somatic_cnv, "freec_out", "pass1", "{samples}.recal.bam_CNVs.p.value.txt"),
- params:
+ params:
normalsample = lambda w: [pairs_dict[w.samples]],
tumorsample = "{samples}",
fasta = config['references']['GENOME'],
@@ -18,7 +18,7 @@ rule freec_exome_somatic_pass1:
sig_script = config['scripts']['freec_significance'],
plot_script = config['scripts']['freec_plot'],
rname = 'freec1',
- envmodules:
+ envmodules:
config['tools']['freec']['modname'],
config['tools']['samtools']['modname'],
config['tools']['bedtools']['modname'],
@@ -27,7 +27,7 @@ rule freec_exome_somatic_pass1:
shell: """
myoutdir="$(dirname {output.cnvs})/{params.tumorsample}"
if [ ! -d "$myoutdir" ]; then mkdir -p "$myoutdir"; fi
-
+
perl "{params.config_script}" \\
"$myoutdir" \\
{params.lengths} \\
@@ -56,11 +56,11 @@ rule freec_exome_somatic_pass1:
rule sequenza:
- input:
+ input:
freeccnvs = os.path.join(output_somatic_cnv, "freec_out", "pass1", "{samples}.recal.bam_CNVs.p.value.txt"),
- output:
+ output:
fit = os.path.join(output_somatic_cnv, "sequenza_out", "{samples}_alternative_solutions.txt"),
- params:
+ params:
normalsample = lambda w: [pairs_dict[w.samples]],
tumorsample = "{samples}",
gc = config['references']['SEQUENZAGC'],
@@ -80,7 +80,7 @@ rule sequenza:
> "$myoutdir/{params.normalsample}.recal.bam_minipileup.pileup.gz"
gzip -c "$(dirname {input.freeccnvs})/{params.tumorsample}/{params.tumorsample}.bam_minipileup.pileup" \\
> "$myoutdir/{params.tumorsample}.recal.bam_minipileup.pileup.gz"
-
+
sequenza-utils bam2seqz \\
-p \\
-gc {params.gc} \\
@@ -153,7 +153,7 @@ rule freec_exome_somatic_pass2:
R --slave \\
--args $myoutdir/{params.tumorsample}.bam_CNVs \\
$myoutdir/{params.tumorsample}.bam_ratio.txt
-
+
mv $myoutdir/{params.tumorsample}.bam_CNVs.p.value.txt {output.cnvs}
cat "{params.plot_script}" | \\
R --slave \\
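The CNV rules above resolve each tumor's matched normal through an input function over pairs_dict (tumor ID mapped to normal ID). A standalone sketch of that lookup; the sample names, the bamdir default, and the SimpleNamespace stand-in for Snakemake's wildcards object are illustrative:

    import os

    pairs_dict = {"Tumor_A": "Normal_A"}  # tumor sample -> matched normal

    def matched_normal_bam(wildcards, bamdir="BAM/final_bams"):
        # Mirrors the lambda used as the rules' `normal` input.
        return [os.path.join(bamdir, pairs_dict[wildcards.samples] + ".bam")]

    # Example:
    # from types import SimpleNamespace
    # matched_normal_bam(SimpleNamespace(samples="Tumor_A"))
    # -> ["BAM/final_bams/Normal_A.bam"]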
diff --git a/workflow/rules/ffpe.smk b/workflow/rules/ffpe.smk
index da300ea..1719953 100644
--- a/workflow/rules/ffpe.smk
+++ b/workflow/rules/ffpe.smk
@@ -1,7 +1,7 @@
# Rules for correcting stand orientation bias in FFPE samples
rule sobdetect_get:
- input:
- output:
+ input:
+ output:
SOBDetector_jar = SOBDetector_JARFILE
params:
rname = 'get_sobdetector'
@@ -26,22 +26,22 @@ rule sobdetect_pass1:
config['tools']['samtools']['modname'],
config['tools']['bcftools']['modname']
container:
- config['images']['wes_base']
+ config['images']['wes_base']
shell: """
- if [ ! -d "$(dirname {output.pass1_vcf})" ]; then
+ if [ ! -d "$(dirname {output.pass1_vcf})" ]; then
mkdir -p "$(dirname {output.pass1_vcf})"
fi
echo "Running SOBDetector..."
- # Try/catch for running SOB Dectetor
+ # Try/catch for running SOB Detector
- # with an empty input VCF file
+ # with an empty input VCF file
java -jar {input.SOBDetector_jar} \\
--input-type VCF \\
--input-variants "{input.vcf}" \\
--input-bam {input.bam} \\
--output-variants {output.pass1_vcf} \\
--only-passed false || {{
- # Compare length of VCF header to
+ # Compare length of VCF header to
# the total length of the file
header_length=$(grep '^#' "{input.vcf}" | wc -l)
file_length=$(cat "{input.vcf}" | wc -l)
@@ -51,7 +51,7 @@ rule sobdetect_pass1:
# problem so pipeline can continue
cat "{input.vcf}" > {output.pass1_vcf}
else
- # SOB Dectector failed for another reason
+ # SOB Detector failed for another reason
echo "SOB Detector Failed... exiting now!" 1>&2
exit 1
fi
@@ -60,7 +60,7 @@ rule sobdetect_pass1:
bcftools query \\
-f '%INFO/numF1R2Alt\\t%INFO/numF2R1Alt\\t%INFO/numF1R2Ref\\t%INFO/numF2R1Ref\\t%INFO/numF1R2Other\\t%INFO/numF2R1Other\\t%INFO/SOB\\n' \\
{output.pass1_vcf} \\
- | awk '{{if ($1 != "."){{tum_alt=$1+$2; tum_depth=$1+$2+$3+$4+$5+$6; if (tum_depth==0){{tum_af=1}} else {{tum_af=tum_alt/tum_depth }}; print tum_alt,tum_depth,tum_af,$7}}}}' > {output.pass1_info}
+ | awk '{{if ($1 != "."){{tum_alt=$1+$2; tum_depth=$1+$2+$3+$4+$5+$6; if (tum_depth==0){{tum_af=1}} else {{tum_af=tum_alt/tum_depth }}; print tum_alt,tum_depth,tum_af,$7}}}}' > {output.pass1_info}
"""
@@ -73,17 +73,17 @@ rule sobdetect_cohort_params:
params:
rname = 'sobdetect_params'
container:
- config['images']['wes_base']
+ config['images']['wes_base']
shell: """
echo -e "#TUMOR.alt\\tTUMOR.depth\\tTUMOR.AF\\tSOB\\tFS\\tSOR\\tTLOD\\tReadPosRankSum" > {output.all_info_file}
cat {input.info_files} >> {output.all_info_file}
-
+
- # Try/catch for running calculating
- # mean and standard deviation with
+ # Try/catch for calculating
+ # mean and standard deviation
# with a set of empty input VCF files
all_length=$(tail -n+2 {output.all_info_file} | wc -l)
- if [ $all_length -eq 0 ]; then
- echo 'WARNING: All SOB Dectect pass1 samples contained no variants.' \\
+ if [ $all_length -eq 0 ]; then
+ echo 'WARNING: All SOB Detect pass1 samples contained no variants.' \\
| tee {output.params_file}
else
# Calculate mean and standard deviation
@@ -92,7 +92,7 @@ rule sobdetect_cohort_params:
fi
"""
-
+
rule sobdetect_pass2:
input:
vcf = os.path.join(output_somatic_snpindels, "{vc_outdir}", "vcf", "{samples}.FINAL.norm.vcf"),
@@ -112,9 +112,9 @@ rule sobdetect_pass2:
config['tools']['samtools']['modname'],
config['tools']['bcftools']['modname']
container:
- config['images']['wes_base']
+ config['images']['wes_base']
shell: """
- if [ ! -d "$(dirname {output.pass2_vcf})" ]; then
+ if [ ! -d "$(dirname {output.pass2_vcf})" ]; then
mkdir -p "$(dirname {output.pass2_vcf})"
fi
@@ -129,7 +129,7 @@ rule sobdetect_pass2:
--output-variants "{output.pass2_vcf}" \\
--only-passed true \\
--standardization-parameters "{input.params_file}" || {{
- # Compare length of VCF header to
+ # Compare length of VCF header to
# the total length of the file
header_length=$(grep '^#' "{input.vcf}" | wc -l)
file_length=$(cat "{input.vcf}" | wc -l)
@@ -139,12 +139,12 @@ rule sobdetect_pass2:
# problem so pipeline can continue
cat "{input.vcf}" > {output.pass2_vcf}
else
- # SOB Dectector failed for another reason
+ # SOB Detector failed for another reason
echo "SOB Detector Failed... exiting now!" 1>&2
exit 1
fi
}}
-
+
echo "Making info table..."
bcftools query \\
-f '%INFO/numF1R2Alt\\t%INFO/numF2R1Alt\\t%INFO/numF1R2Ref\\t%INFO/numF2R1Ref\\t%INFO/numF1R2Other\\t%INFO/numF2R1Other\\t%INFO/SOB\\n' \\
@@ -187,24 +187,24 @@ rule sobdetect_metrics:
envmodules:
config['tools']['bcftools']['modname']
container:
- config['images']['wes_base']
+ config['images']['wes_base']
shell: """
echo -e "#ID\\tDefaultParam\\tCohortParam\\tTotalVariants" > {output.count_table}
echo -e "#SAMPLE_ID\\tParam\\tCHROM\\tPOS\\tnumF1R2Alt\\tnumF2R1Alt\\tnumF1R2Ref\\tnumF2R1Ref\\tnumF1R2Other\\tnumF2R1Other\\tSOB\\tpArtifact\\tFS\\tSOR\\tTLOD\\tReadPosRankSum" > {output.full_metric_table}
-
+
P1FILES=({input.pass1_vcf})
P2FILES=({input.pass2_vcf})
for (( i=0; i<${{#P1FILES[@]}}; i++ )); do
MYID=$(basename -s ".sobdetect.vcf" ${{P1FILES[$i]}})
echo "Collecting metrics from $MYID..."
- # grep may fail if input files do not contain any variants
+ # grep may fail if input files do not contain any variants
total_count=$(grep -v ^# ${{P1FILES[$i]}} | wc -l) || total_count=0
count_1p=$(bcftools query -f '%INFO/pArtifact\n' ${{P1FILES[$i]}} | awk '{{if ($1 != "." && $1 < 0.05){{print}}}}' | wc -l)
count_2p=$(bcftools query -f '%INFO/pArtifact\n' ${{P2FILES[$i]}} | awk '{{if ($1 != "." && $1 < 0.05){{print}}}}' | wc -l)
-
+
echo -e "$MYID\\t$count_1p\\t$count_2p\\t$total_count" >> {output.count_table}
-
+
bcftools query -f '%CHROM\\t%POS\\t%INFO/numF1R2Alt\\t%INFO/numF2R1Alt\\t%INFO/numF1R2Ref\\t%INFO/numF2R1Ref\\t%INFO/numF1R2Other\\t%INFO/numF2R1Other\\t%INFO/SOB\\t%INFO/pArtifact\n' ${{P1FILES[$i]}} | awk -v id=$MYID 'BEGIN{{OFS="\t"}}{{print id,"PASS_1",$0}}' >> {output.full_metric_table}
bcftools query -f '%CHROM\\t%POS\\t%INFO/numF1R2Alt\\t%INFO/numF2R1Alt\\t%INFO/numF1R2Ref\\t%INFO/numF2R1Ref\\t%INFO/numF1R2Other\\t%INFO/numF2R1Other\\t%INFO/SOB\\t%INFO/pArtifact\n' ${{P2FILES[$i]}} | awk -v id=$MYID 'BEGIN{{OFS="\t"}}{{print id,"PASS_2",$0}}' >> {output.full_metric_table}
done
@@ -228,12 +228,12 @@ rule ffpefilter_mafs:
vcf2maf_script = VCF2MAF_WRAPPER
threads: 4
container:
- config['images']['vcf2maf']
+ config['images']['vcf2maf']
shell: """
filetype=$(file -b --mime-type {input.filtered_vcf})
if [ $filetype == "application/gzip" ] ; then
zcat {input.filtered_vcf} > {output.filtered_vcf}
- else
+ else
- {input.filtered_vcf} > {output.filtered_vcf}
+ cat {input.filtered_vcf} > {output.filtered_vcf}
fi
@@ -260,9 +260,9 @@ rule collect_ffpefilter_mafs:
params:
rname = "combine_maf"
container:
- config['images']['wes_base']
- shell: """
+ config['images']['wes_base']
+ shell: """
echo "Combining MAFs..."
head -2 {input.mafs[0]} > {output.maf}
awk 'FNR>2 {{print}}' {input.mafs} >> {output.maf}
- """
\ No newline at end of file
+ """
diff --git a/workflow/rules/gatk_vqsr.smk b/workflow/rules/gatk_vqsr.smk
index 7d383b0..a7e8792 100644
--- a/workflow/rules/gatk_vqsr.smk
+++ b/workflow/rules/gatk_vqsr.smk
@@ -1,20 +1,20 @@
-rule gatk_vqsr:
+rule gatk_vqsr:
"""
##Hold off on implementing due to mouse genomes not having all the same resources for VQSR and the sample sizes required.
- ##Implement Deepvariant instead
+ ##Implement Deepvariant instead
Run GATK VQSR on the SNP and INDEls
@Input:
Multi-sample gVCF with all chromosomes combined
@Output:
Variants scored by VQSLOD
"""
- input:
+ input:
vcf = os.path.join(output_germline_base,"VCF","raw_variants.vcf.gz"),
- output:
+ output:
indelvcf = os.path.join(output_germline_base,"VCF","indel.recalibrated.vcf.gz"),
snpindelvcf = os.path.join(output_germline_base,"VCF","snp_indel.recalibrated.vcf.gz")
- params:
- genome=config['references']['GENOME'],
+ params:
+ genome=config['references']['GENOME'],
mills=config['references']['MILLS'],
axiom=config['references']['AXIOM'],
dbsnp=config['references']['DBSNP'],
@@ -40,7 +40,7 @@ rule gatk_vqsr:
-resource:axiomPoly,known=false,training=true,truth=false,prior=10 {params.axiom} \\
-resource:dbsnp,known=true,training=false,truth=false,prior=2 {params.dbsnp} \\
--tranches-file cohort_indels.tranches \\
- -O cohort_indels.recal
+ -O cohort_indels.recal
gatk --java-options VariantRecalibrator \\
-R {params.genome} \\
@@ -66,7 +66,7 @@ rule gatk_vqsr:
--create-output-variant-index true \\
-mode INDEL \\
-O {output.indelvcf}
-
+
gatk --java-options ApplyVQSR \\
-V indel.recalibrated.vcf.gz \\
--recal-file cohort_snps.recal \\
@@ -76,4 +76,4 @@ rule gatk_vqsr:
-mode SNP \\
-O {output.snpindelvcf}
- """
\ No newline at end of file
+ """
diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk
index 7712130..dbeae57 100644
--- a/workflow/rules/germline.smk
+++ b/workflow/rules/germline.smk
@@ -8,13 +8,13 @@ rule haplotypecaller:
@Output:
Single-sample gVCF
"""
- input:
+ input:
bam = os.path.join(output_bamdir,"final_bams","{samples}.bam"),
bai = os.path.join(output_bamdir,"final_bams","{samples}.bai"),
output:
gzvcf = temp(os.path.join(output_germline_base,"gVCFs","{samples}.{chroms}.g.vcf.gz")),
index = temp(os.path.join(output_germline_base,"gVCFs","{samples}.{chroms}.g.vcf.gz.tbi")),
- params:
+ params:
sample = "{samples}",
genome = config['references']['GENOME'],
snpsites=config['references']['DBSNP'],
@@ -28,7 +28,7 @@ rule haplotypecaller:
"""
myoutdir="$(dirname {output.gzvcf})"
if [ ! -d "$myoutdir" ]; then mkdir -p "$myoutdir"; fi
-
+
gatk --java-options '-Xmx24g' HaplotypeCaller \\
--reference {params.genome} \\
--input {input.bam} \\
@@ -58,17 +58,17 @@ rule mergegvcfs:
output:
gzvcf = os.path.join(output_germline_base,"gVCFs","merged.{chroms}.g.vcf.gz"),
index = os.path.join(output_germline_base,"gVCFs","merged.{chroms}.g.vcf.gz.tbi"),
- params:
+ params:
genome = config['references']['GENOME'],
ver_gatk=config['tools']['gatk4']['version'],
rname = "mergegvcfs"
message: "Running GATK4 CombineGVCFs on '{input.gzvcf}' input file"
envmodules: config['tools']['gatk4']['modname']
- container: config['images']['wes_base']
+ container: config['images']['wes_base']
shell:
"""
input_str="--variant $(echo "{input.gzvcf}" | sed -e 's/ / --variant /g')"
-
+
gatk --java-options '-Xmx24g' CombineGVCFs \\
--reference {params.genome} \\
--annotation-group StandardAnnotation \\
@@ -89,7 +89,7 @@ rule genotype:
@Output:
Multi-sample gVCF, scattered across chromosomes (with joint genotyping updates)
"""
- input:
+ input:
gzvcf = os.path.join(output_germline_base,"gVCFs","merged.{chroms}.g.vcf.gz"),
index = os.path.join(output_germline_base,"gVCFs","merged.{chroms}.g.vcf.gz.tbi"),
output:
@@ -107,7 +107,7 @@ rule genotype:
"""
myoutdir="$(dirname {output.vcf})"
if [ ! -d "$myoutdir" ]; then mkdir -p "$myoutdir"; fi
-
+
gatk --java-options '-Xmx96g' GenotypeGVCFs \\
--reference {params.genome} \\
--use-jdk-inflater \\
@@ -138,7 +138,7 @@ rule germline_merge_chrom:
rname = "merge_chrom", genome = config['references']['GENOME']
message: "Running GATK4 MergeVcfs on all chrom split VCF files"
envmodules: config['tools']['gatk4']['modname']
- container: config['images']['wes_base']
+ container: config['images']['wes_base']
shell:
"""
# Avoids ARG_MAX issue which limits max length of a command
@@ -151,7 +151,7 @@ rule germline_merge_chrom:
"""
-rule Gatk_Variantfilter:
+rule Gatk_Variantfilter:
"""
Hard filters on
@Input:
@@ -159,15 +159,15 @@ rule Gatk_Variantfilter:
@Output:
Variants filtered by QD, QUAL, SOR, FS, MQ, MQRankSum, ReadPosRankSum, Indels: QD, QUAL, FS, ReadPosRankSum
"""
- input:
+ input:
vcf = os.path.join(output_germline_base,"VCF","raw_variants.vcf.gz"),
- output:
+ output:
indelvcf = os.path.join(output_germline_base,"VCF","indel.filterd.vcf.gz"),
snpvcf = os.path.join(output_germline_base,"VCF","snp.filtered.vcf.gz"),
vcf = os.path.join(output_germline_base,"VCF","snp_indel.filtered.vcf.gz")
- params:
- genome=config['references']['GENOME'],
+ params:
+ genome=config['references']['GENOME'],
rname="gatk_hardfilters",
ver_gatk=config['tools']['gatk4']['version']
message: "Running GATK4 hard filters on Cohort VCF input file"
@@ -179,7 +179,7 @@ rule Gatk_Variantfilter:
-V {input.vcf} \\
-select-type SNP \\
-O snps.vcf.gz
-
+
gatk SelectVariants \\
-V {input.vcf} \\
-select-type INDEL \\
@@ -203,7 +203,7 @@ rule Gatk_Variantfilter:
-filter "FS > 200.0" --filter-name "FS200" \\
-filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20" \\
-O {output.indelvcf}
-
+
gatk MergeVcfs \\
-R {params.genome} \\
--INPUT {output.indelvcf} \\
@@ -219,13 +219,13 @@ rule Gatk_SelectVariants:
@Output:
Single-sample VCF with unfiltered germline variants
"""
- input:
+ input:
vcf = os.path.join(output_germline_base,"VCF","snp_indel.filtered.vcf.gz"),
- output:
+ output:
vcf = os.path.join(output_germline_base,"VCF","{samples}.germline.vcf.gz")
- params:
- genome=config['references']['GENOME'],
- Sname = "{samples}",
+ params:
+ genome=config['references']['GENOME'],
+ Sname = "{samples}",
rname="varselect",
ver_gatk=config['tools']['gatk4']['version'],
targets=exome_targets_bed
@@ -242,4 +242,4 @@ rule Gatk_SelectVariants:
--exclude-filtered \\
--exclude-non-variants \\
--output {output.vcf}
- """
\ No newline at end of file
+ """
diff --git a/workflow/rules/nidap.smk b/workflow/rules/nidap.smk
index 59f0daf..6827979 100644
--- a/workflow/rules/nidap.smk
+++ b/workflow/rules/nidap.smk
@@ -13,13 +13,13 @@ rule nidap:
outdir=os.path.join(NIDAP_OUTDIR)
shell:"""
set -exo pipefail
-if [ -d {params.outdir} ];then rm -rf {params.outdir};fi
+if [ -d {params.outdir} ];then rm -rf {params.outdir};fi
mkdir -p {params.outdir}
cd {params.outdir}
# last file in inputs is NIDAP_files.tsv ... col1 is file ... col2 is the same file hardlinked in the NIDAP folder
# this file is created in get_nidap_folder_input_files function
linking_file=$(echo {input}|awk '{{print $NF}}')
-while read a b;do
+while read a b;do
ln $a $b
done < $linking_file
-"""
\ No newline at end of file
+"""
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index 497fc67..b5f1eaa 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -6,7 +6,7 @@ rule fc_lane:
SRA have a different format than newer FastQ files generated with the
current version of Casava. It is worth noting that FastQ files downloaded from SRA
or FastQ files generated with Casava version < 1.8 do not have Flowcell
- IDs in its sequence indentifer. If a FastQ file does not have Flowcell IDs,
+ IDs in their sequence identifiers. If a FastQ file does not have Flowcell IDs,
the Machine or Instrument ID is grabbed instead.
@Input:
Raw FastQ R1 file (scatter)
@@ -23,10 +23,10 @@ rule fc_lane:
envmodules: config['tools']['python']['modname']
container: config['images']['python']
shell: """
- if [ ! -d "$(dirname {output.txt})" ]; then
+ if [ ! -d "$(dirname {output.txt})" ]; then
mkdir -p "$(dirname {output.txt})"
fi
-
+
python {params.get_flowcell_lanes} \\
{input.r1} \\
{wildcards.samples} > {output.txt}
@@ -55,7 +55,7 @@ rule fastq_screen:
params:
rname = "fqscreen",
outdir = os.path.join(output_qcdir,"FQscreen"),
- # Exposed Parameters: modify resources/fastq_screen.conf to change
+ # Exposed Parameters: modify resources/fastq_screen.conf to change
# default locations to bowtie2 indices
fastq_screen_config = config['references']['FASTQ_SCREEN_CONFIG'],
envmodules: config['tools']['fastq_screen']['modname']
@@ -81,7 +81,7 @@ rule kraken:
@Input:
Trimmed FastQ files (scatter)
@Output:
- Kraken logfile and interative krona report
+ Kraken logfile and interactive krona report
"""
input:
fq1 = os.path.join(output_fqdir,"{samples}.R1.trimmed.fastq.gz"),
@@ -102,7 +102,7 @@ rule kraken:
threads: 24
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -146,7 +146,7 @@ rule fastqc_bam:
fastqc -t {threads} \\
-f bam \\
-o {params.outdir} \\
- {input.bam}
+ {input.bam}
"""
localrules: reformat_targets_bed
@@ -163,7 +163,7 @@ rule reformat_targets_bed:
"""
input:
targets=exome_targets_bed,
- output:
+ output:
bed=os.path.join(output_qcdir, "exome_targets.bed"),
params:
script_path_reformat_bed=config['scripts']['reformat_bed'],
@@ -179,11 +179,11 @@ rule reformat_targets_bed:
python3 {params.script_path_correct_target_bed} {output.bed}.temp {output.bed}
rm -f {output.bed}.temp
"""
-
-
+
+
rule qualimap_bamqc:
"""
- Quality-control step to assess various post-alignment metrics
+ Quality-control step to assess various post-alignment metrics
and a secondary method to calculate insert size. Please see
QualiMap's website for more information about BAM QC:
http://qualimap.conesalab.org/
@@ -195,7 +195,7 @@ rule qualimap_bamqc:
input:
bam = os.path.join(output_bamdir,"final_bams","{samples}.bam"),
bed=os.path.join(output_qcdir, "exome_targets.bed"),
- output:
+ output:
txt = os.path.join(output_qcdir,"{samples}","genome_results.txt"),
html = os.path.join(output_qcdir,"{samples}","qualimapReport.html")
params:
@@ -222,9 +222,9 @@ rule qualimap_bamqc:
rule samtools_flagstats:
"""
- Quality-control step to assess alignment quality. Flagstat provides
- counts for each of 13 categories based primarily on bit flags in the
- FLAG field. Information on the meaning of the flags is given in the
+ Quality-control step to assess alignment quality. Flagstat provides
+ counts for each of 13 categories based primarily on bit flags in the
+ FLAG field. Information on the meaning of the flags is given in the
SAM specification: https://samtools.github.io/hts-specs/SAMv1.pdf
@Input:
Recalibrated BAM file (scatter)
@@ -235,7 +235,7 @@ rule samtools_flagstats:
bam = os.path.join(output_bamdir,"final_bams","{samples}.bam"),
output:
txt = os.path.join(output_qcdir,"{samples}.samtools_flagstat.txt")
- params:
+ params:
rname = "samtools_flagstats"
message: "Running SAMtools flagstat on '{input}' input file"
envmodules: config['tools']['samtools']['modname']
@@ -247,21 +247,21 @@ rule samtools_flagstats:
rule vcftools:
"""
- Quality-control step to calculates a measure of heterozygosity on
+ Quality-control step to calculate a measure of heterozygosity on
a per-individual basis. The inbreeding coefficient, F, is estimated
for each individual using a method of moments. Please see VCFtools
- documentation for more information:
+ documentation for more information:
https://vcftools.github.io/man_latest.html
@Input:
Multi-sample gVCF file (indirect-gather-due-to-aggregation)
@Output:
Text file containing a measure of heterozygosity
"""
- input:
+ input:
vcf = os.path.join(output_germline_base,"VCF","raw_variants.vcf.gz"),
- output:
+ output:
het = os.path.join(output_qcdir,"raw_variants.het"),
- params:
+ params:
prefix = os.path.join(output_qcdir,"raw_variants"),
rname = "vcftools",
message: "Running VCFtools on '{input.vcf}' input file"
@@ -281,13 +281,13 @@ rule collectvariantcallmetrics:
@Input:
Multi-sample gVCF file (indirect-gather-due-to-aggregation)
@Output:
- Text file containing a collection of metrics relating to snps and indels
+ Text file containing a collection of metrics relating to SNPs and indels
"""
- input:
+ input:
vcf = os.path.join(output_germline_base,"VCF","raw_variants.vcf.gz"),
- output:
+ output:
metrics = os.path.join(output_qcdir,"raw_variants.variant_calling_detail_metrics"),
- params:
+ params:
dbsnp=config['references']['DBSNP'],
prefix = os.path.join(output_qcdir,"raw_variants"),
rname="varcallmetrics",
@@ -307,20 +307,20 @@ rule bcftools_stats:
"""
Quality-control step to collect summary statistics from bcftools stats.
When bcftools stats is run with one VCF file then stats by non-reference
- allele frequency, depth distribution, stats by quality and per-sample
- counts, singleton statsistics are calculated. Please see bcftools'
- documentation for more information:
+ allele frequency, depth distribution, stats by quality and per-sample
+ counts, and singleton statistics are calculated. Please see bcftools'
+ documentation for more information:
http://samtools.github.io/bcftools/bcftools.html#stats
@Input:
Per sample gVCF file (scatter)
@Output:
Text file containing a collection of summary statistics
"""
- input:
+ input:
vcf = os.path.join(output_germline_base,"VCF","{samples}.germline.vcf.gz"),
- output:
+ output:
txt = os.path.join(output_qcdir,"{samples}.germline.bcftools_stats.txt"),
- params:
+ params:
rname="bcfstats",
message: "Running BCFtools on '{input.vcf}' input file"
envmodules: config['tools']['bcftools']['modname']
@@ -332,20 +332,20 @@ rule bcftools_stats:
rule gatk_varianteval:
"""
- Quality-control step to calculate various quality control metrics from a
- variant callset. These metrics include the number of raw or filtered SNP
+ Quality-control step to calculate various quality control metrics from a
+ variant callset. These metrics include the number of raw or filtered SNP
counts; ratio of transition mutations to transversions; concordance of a
particular sample's calls to a genotyping chip; number of s per sample.
- Please see GATK's documentation for more information:
+ Please see GATK's documentation for more information:
https://gatk.broadinstitute.org/hc/en-us/articles/360040507171-VariantEval
@Input:
Per sample gVCF file (scatter)
@Output:
Evaluation table containing a collection of summary statistics
"""
- input:
- vcf = os.path.join(output_germline_base,"VCF","{samples}.germline.vcf.gz"),
- output:
+ input:
+ vcf = os.path.join(output_germline_base,"VCF","{samples}.germline.vcf.gz"),
+ output:
grp = os.path.join(output_qcdir,"{samples}.germline.eval.grp"),
params:
rname = "vareval",
@@ -361,7 +361,7 @@ rule gatk_varianteval:
-R {params.genome} \\
-O {output.grp} \\
--dbsnp {params.dbsnp} \\
- --eval {input.vcf}
+ --eval {input.vcf}
"""
@@ -369,20 +369,20 @@ rule snpeff:
"""
Data processing and quality-control step to annotate variants, predict its
functional effects, and collect various summary statistics about variants and
- their annotations. Please see SnpEff's documentation for more information:
+ their annotations. Please see SnpEff's documentation for more information:
https://pcingola.github.io/SnpEff/
@Input:
Per sample gVCF file (scatter)
@Output:
Evaluation table containing a collection of summary statistics
"""
- input:
+ input:
vcf = os.path.join(output_germline_base,"VCF","{samples}.germline.vcf.gz")
- output:
+ output:
vcf = os.path.join(output_qcdir,"{samples}.germline.snpeff.ann.vcf"),
csv = os.path.join(output_qcdir,"{samples}.germline.snpeff.ann.csv"),
html = os.path.join(output_qcdir,"{samples}.germline.snpeff.ann.html"),
- params:
+ params:
rname = "snpeff",
genome = config['references']['SNPEFF_GENOME'],
config = config['references']['SNPEFF_CONFIG'],
@@ -412,14 +412,14 @@ if config['project']['annotation']=='hg38':
input:
bam = os.path.join(output_bamdir,"final_bams","{samples}.bam"),
bai = os.path.join(output_bamdir,"final_bams","{samples}.bai"),
- output:
+ output:
somalierOut = os.path.join(output_germline_base,"somalier","{samples}.somalier")
params:
sites_vcf = config['references']['SOMALIER']['SITES_VCF'],
genomeFasta = config['references']['GENOME'],
rname = 'somalier_extract'
container: config['images']['wes_base']
- shell: """
+ shell: """
echo "Extracting sites to estimate ancestry"
somalier extract \\
-d "$(dirname {output.somalierOut})" \\
@@ -458,12 +458,12 @@ if config['project']['annotation']=='hg38':
script_path_pca = config['scripts']['ancestry'],
rname = 'somalier_analysis'
container: config['images']['wes_base']
- shell: """
+ shell: """
echo "Estimating relatedness"
somalier relate \\
-o "$(dirname {output.relatedness})/relatedness" \\
{input.somalier}
-
+
echo "Estimating ancestry"
somalier ancestry \\
-o "$(dirname {output.relatedness})/ancestry" \\
@@ -472,12 +472,12 @@ if config['project']['annotation']=='hg38':
{input.somalier}
Rscript {params.script_path_gender} \\
{output.relatednessSamples} \\
- {output.finalFileGender}
-
+ {output.finalFileGender}
+
Rscript {params.script_path_samples} \\
{output.relatedness} \\
{output.finalFilePairs}
-
+
Rscript {params.script_path_pca} \\
{output.ancestry} \\
{output.finalFilePairs} \\
@@ -490,16 +490,16 @@ if config['project']['annotation']=='hg38':
rule multiqc:
"""
Reporting step to aggregate sample summary statistics and quality-control
- information across all samples. This will be one of the last steps of the
- pipeline. The inputs listed here are to ensure that this step runs last.
- During runtime, MultiQC will recurively crawl through the working directory
+ information across all samples. This will be one of the last steps of the
+ pipeline. The inputs listed here are to ensure that this step runs last.
+ During runtime, MultiQC will recursively crawl through the working directory
and parse files that it supports.
@Input:
List of files to ensure this step runs last (gather)
@Output:
- Interactive MulitQC report and a QC metadata table
+ Interactive MultiQC report and a QC metadata table
"""
- input:
+ input:
expand(os.path.join(output_fqdir,"{samples}.fastq.info.txt"), samples=samples),
expand(os.path.join(output_qcdir,"FQscreen","{samples}.R2.trimmed_screen.txt"), samples=samples),
expand(os.path.join(output_qcdir,"kraken","{samples}.trimmed.kraken_bacteria.krona.html"), samples=samples),
@@ -509,12 +509,12 @@ if config['project']['annotation']=='hg38':
expand(os.path.join(output_qcdir,"{samples}.germline.bcftools_stats.txt"), samples=samples),
expand(os.path.join(output_qcdir,"{samples}.germline.eval.grp"), samples=samples),
expand(os.path.join(output_qcdir,"{samples}.germline.snpeff.ann.html"), samples=samples),
- os.path.join(output_qcdir,"raw_variants.het"),
+ os.path.join(output_qcdir,"raw_variants.het"),
os.path.join(output_qcdir,"raw_variants.variant_calling_detail_metrics"),
os.path.join(output_germline_base,"somalier","ancestry.somalier-ancestry.tsv"),
- output:
+ output:
report = os.path.join(output_qcdir,"finalQC","MultiQC_Report.html"),
- params:
+ params:
rname = "multiqc",
workdir = os.path.join(BASEDIR)
envmodules: config['tools']['multiqc']['modname']
@@ -542,14 +542,14 @@ if config['project']['annotation']=='mm10':
input:
bam = os.path.join(output_bamdir,"final_bams","{samples}.bam"),
bai = os.path.join(output_bamdir,"final_bams","{samples}.bai"),
- output:
+ output:
somalierOut = os.path.join(output_germline_base,"somalier","{samples}.somalier")
params:
sites_vcf = config['references']['SOMALIER']['SITES_VCF'],
genomeFasta = config['references']['GENOME'],
rname = 'somalier_extract'
container: config['images']['wes_base']
- shell: """
+ shell: """
echo "Extracting sites to estimate ancestry"
somalier extract \\
-d "$(dirname {output.somalierOut})" \\
@@ -587,7 +587,7 @@ if config['project']['annotation']=='mm10':
script_path_pca = config['scripts']['ancestry'],
rname = 'somalier_analysis'
container: config['images']['wes_base']
- shell: """
+ shell: """
echo "Estimating relatedness"
somalier relate \\
-o "$(dirname {output.relatedness})/relatedness" \\
@@ -595,27 +595,27 @@ if config['project']['annotation']=='mm10':
Rscript {params.script_path_gender} \\
{output.relatednessSamples} \\
- {output.finalFileGender}
-
+ {output.finalFileGender}
+
Rscript {params.script_path_samples} \\
{output.relatedness} \\
{output.finalFilePairs}
-
+
"""
rule multiqc:
"""
Reporting step to aggregate sample summary statistics and quality-control
- information across all samples. This will be one of the last steps of the
- pipeline. The inputs listed here are to ensure that this step runs last.
- During runtime, MultiQC will recurively crawl through the working directory
+ information across all samples. This will be one of the last steps of the
+ pipeline. The inputs listed here are to ensure that this step runs last.
+ During runtime, MultiQC will recursively crawl through the working directory
and parse files that it supports.
@Input:
List of files to ensure this step runs last (gather)
@Output:
- Interactive MulitQC report and a QC metadata table
+ Interactive MultiQC report and a QC metadata table
"""
- input:
+ input:
expand(os.path.join(output_fqdir,"{samples}.fastq.info.txt"), samples=samples),
expand(os.path.join(output_qcdir,"FQscreen","{samples}.R2.trimmed_screen.txt"), samples=samples),
expand(os.path.join(output_qcdir,"kraken","{samples}.trimmed.kraken_bacteria.krona.html"), samples=samples),
@@ -625,11 +625,11 @@ if config['project']['annotation']=='mm10':
expand(os.path.join(output_qcdir,"{samples}.germline.bcftools_stats.txt"), samples=samples),
expand(os.path.join(output_qcdir,"{samples}.germline.eval.grp"), samples=samples),
expand(os.path.join(output_qcdir,"{samples}.germline.snpeff.ann.html"), samples=samples),
- os.path.join(output_qcdir,"raw_variants.het"),
+ os.path.join(output_qcdir,"raw_variants.het"),
os.path.join(output_qcdir,"raw_variants.variant_calling_detail_metrics"),
- output:
+ output:
report = os.path.join(output_qcdir,"finalQC","MultiQC_Report.html"),
- params:
+ params:
rname = "multiqc",
workdir = os.path.join(BASEDIR)
envmodules: config['tools']['multiqc']['modname']
diff --git a/workflow/rules/somatic_snps.common.smk b/workflow/rules/somatic_snps.common.smk
index 20400c6..27e6989 100644
--- a/workflow/rules/somatic_snps.common.smk
+++ b/workflow/rules/somatic_snps.common.smk
@@ -14,7 +14,7 @@ rule split_bam_by_chrom:
envmodules:
config['tools']['samtools']['modname']
container:
- config['images']['wes_base']
+ config['images']['wes_base']
shell: """
if [ ! -d "$(dirname {output.split_bam})" ]; then
mkdir -p "$(dirname {output.split_bam})"
@@ -25,11 +25,11 @@ rule split_bam_by_chrom:
-o {output.split_bam} \\
-@ {threads} \\
{input.bam} {wildcards.chroms}
-
+
samtools index \\
-@ {threads} \\
{output.split_bam} {output.split_bam_idx}
-
+
cp {output.split_bam_idx} {output.split_bam}.bai
"""
@@ -50,7 +50,7 @@ rule LearnReadOrientationModel:
config['images']['wes_base']
shell: """
input_str="--input $(echo "{input.read_orientation_file}" | sed -e 's/ / --input /g')"
-
+
gatk LearnReadOrientationModel \\
--output {output.model} \\
$input_str
@@ -83,12 +83,12 @@ rule mutect2_filter:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
statfiles="--stats $(echo "{input.statsfiles}" | sed -e 's/ / --stats /g')"
-
+
gatk MergeMutectStats \\
$statfiles \\
-O {output.final}.stats
@@ -106,7 +106,7 @@ rule mutect2_filter:
--variant {output.marked_vcf} \\
--exclude-filtered \\
--output {output.final}
-
+
# VarScan can output ambiguous IUPAC bases/codes
# the awk one-liner resets them to N, from:
# https://github.com/fpbarthel/GLASS/issues/23
@@ -115,7 +115,7 @@ rule mutect2_filter:
| awk '{{gsub(/\y[W|K|Y|R|S|M|B|D|H|V]\y/,"N",$4); OFS = "\t"; print}}' \\
| sed '/^$/d' > {output.norm}
"""
-
+
rule somatic_merge_chrom:
input:
vcf = expand(os.path.join(output_somatic_snpindels, "{{vc_out}}", "chrom_split", "{{samples}}.{chroms}.vcf"), chroms=chroms),
@@ -144,7 +144,7 @@ rule somatic_merge_chrom:
rule somatic_merge_callers:
input:
vcf = expand(os.path.join(output_somatic_snpindels, "{vc_outdir}_out", "vcf", "{{samples}}.FINAL.norm.vcf"), vc_outdir=caller_list)
- output:
+ output:
mergedvcf = os.path.join(output_somatic_snpindels, "merged_somatic_variants", "vcf", "{samples}.FINAL.norm.vcf"),
params:
genome = config['references']['GENOME'],
@@ -160,7 +160,7 @@ rule somatic_merge_callers:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -185,7 +185,7 @@ rule somatic_mafs:
filtered_vcf = os.path.join(output_somatic_snpindels, "{vc_outdir}", "vcf", "{samples}.FINAL.norm.vcf")
output:
maf = os.path.join(output_somatic_snpindels, "{vc_outdir}", "maf", "{samples}.maf")
- params:
+ params:
tumorsample = '{samples}',
genome = config['references']['GENOME'],
build= config['references']['VCF2MAF']['GENOME_BUILD'],
@@ -194,10 +194,10 @@ rule somatic_mafs:
rname = 'vcf2maf',
normalsample = lambda w: "--normal-id {0}".format(
pairs_dict[w.samples]
- ) if pairs_dict[w.samples] else "",
+ ) if pairs_dict[w.samples] else "",
threads: 4
container:
- config['images']['vcf2maf']
+ config['images']['vcf2maf']
shell: """
vcf2maf.pl \\
@@ -212,15 +212,15 @@ rule somatic_mafs:
--ref-fasta {params.genome} \\
--retain-info "set" \\
--vep-overwrite
-
+
"""
localrules: collect_cohort_mafs
rule collect_cohort_mafs:
- input:
+ input:
mafs = expand(os.path.join(output_somatic_snpindels, "{{vc_outdir}}", "maf", "{samples}"+".maf"), samples=samples_for_caller_merge)
- output:
+ output:
maf = os.path.join(output_somatic_snpindels, "{vc_outdir}", "cohort_summary", "all_somatic_variants.maf")
params:
rname = 'combine_maf'
@@ -228,4 +228,4 @@ rule collect_cohort_mafs:
echo "Combining MAFs..."
head -2 {input.mafs[0]} > {output.maf}
awk 'FNR>2 {{print}}' {input.mafs} >> {output.maf}
- """
\ No newline at end of file
+ """
diff --git a/workflow/rules/somatic_snps.paired.smk b/workflow/rules/somatic_snps.paired.smk
index 00b5c73..d19a616 100644
--- a/workflow/rules/somatic_snps.paired.smk
+++ b/workflow/rules/somatic_snps.paired.smk
@@ -1,6 +1,6 @@
# Somatic SNP calling rules for tumor/normal pairs
rule gatk_mutect2:
- input:
+ input:
normal = lambda w: [os.path.join(output_bamdir, "chrom_split", pairs_dict[w.samples] + ".{chroms}.split.bam")],
tumor = os.path.join(output_bamdir, "chrom_split", "{samples}.{chroms}.split.bam")
output:
@@ -22,7 +22,7 @@ rule gatk_mutect2:
container:
config['images']['wes_base']
shell: """
- if [ ! -d "$(dirname {output.vcf})" ];
+ if [ ! -d "$(dirname {output.vcf})" ];
then mkdir -p "$(dirname {output.vcf})";
fi
gatk Mutect2 \\
@@ -59,7 +59,7 @@ rule pileup_paired:
container:
config['images']['wes_base']
shell: """
- # Run GetPileupSummaries in bg concurrently for a tumor/normal pair
+ # Run GetPileupSummaries in bg concurrently for a tumor/normal pair
gatk --java-options '-Xmx48g' GetPileupSummaries \\
-I {input.tumor} \\
-V {params.germsource} \\
@@ -102,7 +102,7 @@ rule contamination_paired:
"""
-
+
rule strelka:
input:
normal = lambda w: [os.path.join(output_bamdir, "chrom_split", pairs_dict[w.samples] + ".{chroms}.split.bam")],
@@ -130,7 +130,7 @@ rule strelka:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -138,7 +138,7 @@ rule strelka:
myoutdir="$(dirname {output.vcf})/{wildcards.samples}/{wildcards.chroms}"
if [ -d "$myoutdir" ]; then rm -r "$myoutdir"; fi
mkdir -p "$myoutdir"
-
+
configureStrelkaSomaticWorkflow.py \\
--ref={params.genome} \\
--tumor={input.tumor} \\
@@ -187,7 +187,7 @@ rule strelka_filter:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -199,12 +199,12 @@ rule strelka_filter:
--output {output.filtered}
echo -e "TUMOR\t{params.tumorsample}\nNORMAL\t{params.normalsample}" > "{output.samplesfile}"
-
+
echo "Reheading VCFs with sample names..."
bcftools reheader \\
-o "{output.final}" \\
-s "{output.samplesfile}" "{output.filtered}"
-
+
# VarScan can output ambiguous IUPAC bases/codes
# the awk one-liner resets them to N, from:
# https://github.com/fpbarthel/GLASS/issues/23
@@ -238,12 +238,12 @@ rule mutect_paired:
config['images']['mutect']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
if [ ! -d "$(dirname {output.vcf})" ]; then mkdir -p "$(dirname {output.vcf})"; fi
-
+
java -Xmx8g -Djava.io.tmpdir=${{tmp}} -jar ${{MUTECT_JAR}} \\
--analysis_type MuTect \\
--reference_sequence {params.genome} \\
@@ -264,7 +264,7 @@ rule mutect_filter:
output:
final = os.path.join(output_somatic_snpindels, "mutect_out", "vcf", "{samples}.FINAL.vcf"),
norm = os.path.join(output_somatic_snpindels, "mutect_out", "vcf", "{samples}.FINAL.norm.vcf"),
- params:
+ params:
normalsample = lambda w: [pairs_dict[w.samples]],
tumorsample = '{samples}',
genome = config['references']['GENOME'],
@@ -281,7 +281,7 @@ rule mutect_filter:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -307,7 +307,7 @@ rule vardict_paired:
tumor = os.path.join(output_bamdir, "chrom_split", "{samples}.{chroms}.split.bam"),
output:
vcf = os.path.join(output_somatic_snpindels, "vardict_out", "chrom_split", "{samples}.{chroms}.vcf"),
- params:
+ params:
normalsample = lambda w: [pairs_dict[w.samples]],
tumorsample = "{samples}",
genome = config['references']['GENOME'],
@@ -368,7 +368,7 @@ rule vardict_filter:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -382,7 +382,7 @@ rule vardict_filter:
--discordance {params.pon} \\
--exclude-filtered \\
--output {output.final}
-
+
# VarScan can output ambiguous IUPAC bases/codes
# the awk one-liner resets them to N, from:
# https://github.com/fpbarthel/GLASS/issues/23
@@ -394,7 +394,7 @@ rule vardict_filter:
rule varscan_paired:
- """Note: Refactor formatting of shell command for readability to
+ """Note: Refactor formatting of shell command for readability to
be more snake-thonic."""
input:
normal = lambda w: [os.path.join(output_bamdir, "chrom_split", pairs_dict[w.samples] + ".{chroms}.split.bam")],
@@ -418,17 +418,17 @@ rule varscan_paired:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
if [ ! -d "$(dirname {output.vcf})" ]; then mkdir -p "$(dirname {output.vcf})"; fi
-
+
tumor_purity=$( echo "1-$(printf '%.6f' $(tail -n -1 {input.tumor_summary} | cut -f2 ))" | bc -l)
normal_purity=$( echo "1-$(printf '%.6f' $(tail -n -1 {input.normal_summary} | cut -f2 ))" | bc -l)
varscan_opts="--strand-filter 1 --min-var-freq 0.01 --min-avg-qual 30 --somatic-p-value 0.05 --output-vcf 1 --normal-purity $normal_purity --tumor-purity $tumor_purity"
dual_pileup="samtools mpileup -d 10000 -q 15 -Q 15 -f {params.genome} {input.normal} {input.tumor}"
- varscan_cmd="varscan somatic <($dual_pileup) {output.vcf} $varscan_opts --mpileup 1"
+ varscan_cmd="varscan somatic <($dual_pileup) {output.vcf} $varscan_opts --mpileup 1"
eval "$varscan_cmd"
# VarScan can output ambiguous IUPAC bases/codes
@@ -446,7 +446,7 @@ rule varscan_paired:
--variant {output.vcf}.indel_temp \\
--assumeIdenticalSamples \\
--filteredrecordsmergetype KEEP_UNCONDITIONAL \\
- -o {output.vcf}
+ -o {output.vcf}
"""
@@ -480,14 +480,14 @@ rule varscan_filter:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
varscan filter \\
{input.vcf} \\
{params.filter_settings} > {output.filtered1}
-
+
gatk SelectVariants \\
-R {params.genome} \\
--variant {output.filtered1} \\
diff --git a/workflow/rules/somatic_snps.tumor_only.smk b/workflow/rules/somatic_snps.tumor_only.smk
index fccb8a5..3160ef2 100644
--- a/workflow/rules/somatic_snps.tumor_only.smk
+++ b/workflow/rules/somatic_snps.tumor_only.smk
@@ -55,7 +55,7 @@ rule pileup_single:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -69,7 +69,7 @@ rule pileup_single:
localrules: contamination_single
rule contamination_single:
- input:
+ input:
pileup = os.path.join(output_somatic_snpindels, "mutect2_out", "pileup_summaries", "{samples}.pileup.table")
output:
tumor_summary = os.path.join(output_somatic_base, "qc", "gatk_contamination", "{samples}.contamination.table")
@@ -77,7 +77,7 @@ rule contamination_single:
genome = config['references']['GENOME'],
germsource = config['references']['KNOWNSNPS'],
ver_gatk = config['tools']['gatk4']['version'],
- chroms = chroms,
+ chroms = chroms,
rname = 'contamination'
envmodules:
config['tools']['gatk4']['modname']
@@ -109,11 +109,11 @@ rule mutect_single:
config['images']['mutect']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
- if [ ! -d "$(dirname {output.vcf})" ]; then
+ if [ ! -d "$(dirname {output.vcf})" ]; then
mkdir -p "$(dirname {output.vcf})"
fi
@@ -129,7 +129,7 @@ rule mutect_single:
--out {output.stats} \\
-rf BadCigar
"""
-
+
rule mutect_filter_single:
input:
@@ -152,7 +152,7 @@ rule mutect_filter_single:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -173,11 +173,11 @@ rule mutect_filter_single:
rule vardict_single:
- input:
+ input:
tumor = os.path.join(output_bamdir, "chrom_split", "{samples}.{chroms}.split.bam"),
output:
vcf = os.path.join(output_somatic_snpindels, "vardict_out", "chrom_split", "{samples}.{chroms}.vcf"),
- params:
+ params:
genome = config['references']['GENOME'],
targets = exome_targets_bed,
pon = config['references']['PON'],
@@ -189,7 +189,7 @@ rule vardict_single:
container:
config['images']['wes_base']
shell: """
- if [ ! -d "$(dirname {output.vcf})" ]; then
+ if [ ! -d "$(dirname {output.vcf})" ]; then
mkdir -p "$(dirname {output.vcf})"
fi
@@ -217,7 +217,7 @@ rule vardict_single:
rule vardict_filter_single:
- input:
+ input:
vcf = os.path.join(output_somatic_snpindels, "vardict_out", "vcf", "{samples}.collected.vcf"),
output:
final = os.path.join(output_somatic_snpindels, "vardict_out", "vcf", "{samples}.FINAL.vcf"),
@@ -229,7 +229,7 @@ rule vardict_filter_single:
pon = config['references']['PON'],
ver_gatk = config['tools']['gatk4']['version'],
ver_bcftools = config['tools']['bcftools']['version'],
- rname = 'vardict_filter',
+ rname = 'vardict_filter',
set_tmp = set_tmp(),
threads: 4
envmodules:
@@ -239,7 +239,7 @@ rule vardict_filter_single:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -249,7 +249,7 @@ rule vardict_filter_single:
--discordance {params.pon} \\
--exclude-filtered \\
--output {output.final}
-
+
# VarScan can output ambiguous IUPAC bases/codes
# the awk one-liner resets them to N, from:
# https://github.com/fpbarthel/GLASS/issues/23
@@ -326,7 +326,7 @@ rule varscan_filter_single:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -356,4 +356,4 @@ rule varscan_filter_single:
| bcftools norm --threads {threads} --check-ref s -f {params.genome} -O v \\
| awk '{{gsub(/\y[W|K|Y|R|S|M|B|D|H|V]\y/,"N",$4); OFS = "\t"; print}}' \\
| sed '/^$/d' > {output.norm}
- """
\ No newline at end of file
+ """
diff --git a/workflow/rules/trim_map_preprocess.smk b/workflow/rules/trim_map_preprocess.smk
index 89eeb8b..907af6b 100644
--- a/workflow/rules/trim_map_preprocess.smk
+++ b/workflow/rules/trim_map_preprocess.smk
@@ -29,7 +29,7 @@ rule bam2fastq:
config['images']['wes_base']
shell: """
# Setups temporary directory for
- # intermediate files with built-in
+ # intermediate files with built-in
# mechanism for deletion on exit
{params.set_tmp}
@@ -66,7 +66,7 @@ rule trimmomatic:
adapterfile = config['references']['trimmomatic.adapters'],
ver = config['tools']['trimmomatic']['version'],
rname = 'trimmomatic'
- envmodules:
+ envmodules:
config['tools']['trimmomatic']['modname']
container:
config['images']['wes_base']
@@ -84,7 +84,7 @@ rule trimmomatic:
LEADING:10 \\
TRAILING:10 \\
SLIDINGWINDOW:4:20 \\
- MINLEN:20
+ MINLEN:20
"""
@@ -107,12 +107,12 @@ rule bwa_mem:
ver_samtools = config['tools']['samtools']['version'],
ver_bwa = config['tools']['bwa']['version'],
rname = 'bwamem'
- envmodules:
+ envmodules:
config['tools']['samtools']['modname'],
config['tools']['bwa']['modname'],
config['tools']['samblaster']['modname']
container:
- config['images']['wes_base']
+ config['images']['wes_base']
threads: 24
shell: """
myoutdir="$(dirname {output})"
@@ -145,9 +145,9 @@ rule raw_index:
params:
ver_samtools = config['tools']['samtools']['version'],
rname = 'raw_index'
- envmodules:
+ envmodules:
config['tools']['samtools']['modname']
- container:
+ container:
config['images']['wes_base']
shell: """
samtools index -@ 2 {input.bam} {output.bai}
@@ -173,7 +173,7 @@ rule gatk_recal:
output:
bam = os.path.join(input_bamdir, "{samples}.input.bam"),
re = temp(os.path.join(output_bamdir, "preprocessing", "{samples}_recal_data.grp"))
- params:
+ params:
genome = config['references']['GENOME'],
knowns = config['references']['KNOWNRECAL'],
ver_gatk = config['tools']['gatk4']['version'],
@@ -192,7 +192,7 @@ rule gatk_recal:
{params.knowns} \\
--output {output.re} \\
--intervals {params.intervals}
-
+
gatk --java-options '-Xmx48g' ApplyBQSR \\
--reference {params.genome} \\
--input {input.bam} \\
@@ -207,7 +207,7 @@ rule bam_check:
"""
This is a checkpoint to make sure BAMs are ready for variant calling.
The read group (RG) tags are checked to make sure they match the sample ID
- inferred from the file name, and the bam is indexed. This rule needs to be
+ inferred from the file name, and the bam is indexed. This rule needs to be
- refactored at some point. It is not making great use of snake-thonic.
+ refactored at some point. It is not very snake-thonic.
@Input:
Aligned reads in BAM format (scatter)
@@ -233,8 +233,8 @@ rule bam_check:
sample={wildcards.samples}
ID=$sample
PL="ILLUMINA" # exposed as a config param
- LB="na" # exposed as a config param
-
+ LB="na" # exposed as a config param
+
# Check if there is no header or any of the info
HEADER=`samtools view -H {input.bam} | grep ^@RG`
if [[ "$HEADER" != "" ]]; then
@@ -258,7 +258,7 @@ rule bam_check:
--RGPL ${{PL}} \\
--RGSM ${{ID}} \\
--RGPU na
-
+
samtools index -@ 2 {output.bam} {output.bai}
cp {output.bai} {output.bai2}
- """
\ No newline at end of file
+ """
diff --git a/workflow/scripts/RScripts/combineAllSampleCompareResults.R b/workflow/scripts/RScripts/combineAllSampleCompareResults.R
index 6b646b6..d0fa96a 100644
--- a/workflow/scripts/RScripts/combineAllSampleCompareResults.R
+++ b/workflow/scripts/RScripts/combineAllSampleCompareResults.R
@@ -1,57 +1,52 @@
library(dplyr)
-###VerifyBAM Output
-args = commandArgs(trailingOnly=TRUE)
-user.input.1=args[1]
-#user.input.2=args[2]
-user.input.3=args[2]
+### VerifyBAM Output
+args <- commandArgs(trailingOnly = TRUE)
+user.input.1 <- args[1]
+# user.input.2=args[2]
+user.input.3 <- args[2]
-#SomaliaOutput
-somaliaDistance<-read.table(user.input.1,sep = "\t",header = F)
-#somaliaDistance<-read.table("~/relatedness.pairs.tsv",sep = "\t",header = F)
-predictedPairs<-list()
-for(sample in sort(unique(c(somaliaDistance$V1,somaliaDistance$V2))))
+# SomaliaOutput
+somaliaDistance <- read.table(user.input.1, sep = "\t", header = F)
+# somaliaDistance<-read.table("~/relatedness.pairs.tsv",sep = "\t",header = F)
+predictedPairs <- list()
+for (sample in sort(unique(c(somaliaDistance$V1, somaliaDistance$V2))))
{
- samplePairs<-somaliaDistance %>% dplyr::filter(V1 %in% sample | V2 %in% sample)
- maxRelatedess<-which(samplePairs$V3 == max(samplePairs$V3,na.rm = T))
- maxHomCon<-which(samplePairs$V6 ==max(samplePairs$V6,na.rm = T))
- m<-intersect(maxRelatedess,maxHomCon)
- if(length(m)>0)
- {
- for(i in m)
+ samplePairs <- somaliaDistance %>% dplyr::filter(V1 %in% sample | V2 %in% sample)
+ maxRelatedess <- which(samplePairs$V3 == max(samplePairs$V3, na.rm = T))
+ maxHomCon <- which(samplePairs$V6 == max(samplePairs$V6, na.rm = T))
+ m <- intersect(maxRelatedess, maxHomCon)
+ if (length(m) > 0) {
+ for (i in m)
{
- if(sample == unlist(samplePairs[i,]$V2))
- {
- t<-unlist(samplePairs[i,c(2,1,3,6)])
- names(t)<-NULL
- predictedPairs[[sample]]<-(t)
- }else
- {
- predictedPairs[[sample]]<-(unlist(samplePairs[i,c(1,2,3,6)]))
+ if (sample == unlist(samplePairs[i, ]$V2)) {
+ t <- unlist(samplePairs[i, c(2, 1, 3, 6)])
+ names(t) <- NULL
+ predictedPairs[[sample]] <- (t)
+ } else {
+ predictedPairs[[sample]] <- (unlist(samplePairs[i, c(1, 2, 3, 6)]))
}
}
- }else{
- print(paste0("No consensous between the Relatedness and Homology for sample:",sample))
- maxVal<-c(maxRelatedess,maxHomCon)
- for(i in maxVal)
+ } else {
+ print(paste0("No consensous between the Relatedness and Homology for sample:", sample))
+ maxVal <- c(maxRelatedess, maxHomCon)
+ for (i in maxVal)
{
- if(sample == unlist(samplePairs[i,]$V2))
- {
- t<-unlist(samplePairs[i,c(2,1,3,6)])
- names(t)<-NULL
- predictedPairs[[sample]]<-(t)
- }else
- {
- predictedPairs[[sample]]<-(unlist(samplePairs[i,c(1,2,3,6)]))
+ if (sample == unlist(samplePairs[i, ]$V2)) {
+ t <- unlist(samplePairs[i, c(2, 1, 3, 6)])
+ names(t) <- NULL
+ predictedPairs[[sample]] <- (t)
+ } else {
+ predictedPairs[[sample]] <- (unlist(samplePairs[i, c(1, 2, 3, 6)]))
}
}
}
}
-finalPredPairs<-data.frame(do.call("rbind",predictedPairs))
-colnames(finalPredPairs)<-c("Sample1","Sample2","Som:relatedness","Som:hom_concordance")
+finalPredPairs <- data.frame(do.call("rbind", predictedPairs))
+colnames(finalPredPairs) <- c("Sample1", "Sample2", "Som:relatedness", "Som:hom_concordance")
-#VerifyBAMID
+# VerifyBAMID
# verifyBAMID<-read.table(user.input.2,sep = "\t",header = T)
-#verifyBAMID<-read.table("~/IntendedSamplesPCs.cor.pairs.tsv",sep = "\t",header = T)
+# verifyBAMID<-read.table("~/IntendedSamplesPCs.cor.pairs.tsv",sep = "\t",header = T)
# predictedPairsVerifyBAMID<-list()
# repSam<-c()
# for(sample in sort(unique(c(verifyBAMID$Sample1,verifyBAMID$Sample2))))
@@ -74,8 +69,7 @@ colnames(finalPredPairs)<-c("Sample1","Sample2","Som:relatedness","Som:hom_conco
# }
# finalpredictedPairsVerifyBAMID<-do.call("rbind",predictedPairsVerifyBAMID)
-##Combine the output from both the tools
-#mergedDF<-merge(x=finalPredPairs,y=finalpredictedPairsVerifyBAMID,by = "Sample1",all = TRUE)
-#write.table(mergedDF[,c(1:4,6)],file = user.input.3,sep = "\t",quote = FALSE,row.names = FALSE)
-write.table(finalPredPairs,file = user.input.3,sep = "\t",quote = FALSE,row.names = FALSE)
-
+## Combine the output from both the tools
+# mergedDF<-merge(x=finalPredPairs,y=finalpredictedPairsVerifyBAMID,by = "Sample1",all = TRUE)
+# write.table(mergedDF[,c(1:4,6)],file = user.input.3,sep = "\t",quote = FALSE,row.names = FALSE)
+write.table(finalPredPairs, file = user.input.3, sep = "\t", quote = FALSE, row.names = FALSE)
diff --git a/workflow/scripts/RScripts/combineVerifyBAMIDResults.R b/workflow/scripts/RScripts/combineVerifyBAMIDResults.R
index 68354ef..5319546 100644
--- a/workflow/scripts/RScripts/combineVerifyBAMIDResults.R
+++ b/workflow/scripts/RScripts/combineVerifyBAMIDResults.R
@@ -1,22 +1,29 @@
library(dplyr)
-###VerifyBAM Output
-args = commandArgs(trailingOnly=TRUE)
-user.input.1=args[1]
-user.input.2=args[2]
-sample_files<-scan(user.input.1,character())
-#mywd<-"/Users/jaina13/myPART/"
-#sample_files <- list.files(path = mywd,pattern = ".Ancestry",recursive = F, full.names = T)
-fileNames<-unlist(lapply(sample_files, function(currfile){unlist(stringr::str_split(fs::path_file(currfile),pattern = ".Ancestry"))[1]}))
-names(sample_files)<-fileNames
+### VerifyBAM Output
+args <- commandArgs(trailingOnly = TRUE)
+user.input.1 <- args[1]
+user.input.2 <- args[2]
+sample_files <- scan(user.input.1, character())
+# mywd<-"/Users/jaina13/myPART/"
+# sample_files <- list.files(path = mywd,pattern = ".Ancestry",recursive = F, full.names = T)
+fileNames <- unlist(lapply(sample_files, function(currfile) {
+ unlist(stringr::str_split(fs::path_file(currfile), pattern = ".Ancestry"))[1]
+}))
+names(sample_files) <- fileNames
verifyBAMIdResults <- lapply(sample_files, function(currfile) {
- r<-read.table(currfile,header = T,sep = "\t")
- t<-r[,3,drop=FALSE]
- colnames(t)<-names(currfile)
+ r <- read.table(currfile, header = T, sep = "\t")
+ t <- r[, 3, drop = FALSE]
+ colnames(t) <- names(currfile)
return(t)
})
-v<-do.call("cbind",verifyBAMIdResults)
-corRes<-cor(v)
-finalSamplePairsCorrelation <- cor(v) %>% as.data.frame() %>% dplyr::mutate(var1 = rownames(.)) %>% tidyr::gather(var2, value, -var1) %>%
- dplyr::arrange(desc(value)) %>% dplyr::group_by(value) %>% dplyr::filter(var1 != var2) #%>% dplyr::filter(row_number()==1)
-colnames(finalSamplePairsCorrelation)<-c("Sample1","Sample2","VerfiyBAMId:Correlation")
-write.table(finalSamplePairsCorrelation,file = user.input.2,sep = "\t",quote = F,row.names = F)
+v <- do.call("cbind", verifyBAMIdResults)
+corRes <- cor(v)
+finalSamplePairsCorrelation <- cor(v) %>%
+ as.data.frame() %>%
+ dplyr::mutate(var1 = rownames(.)) %>%
+ tidyr::gather(var2, value, -var1) %>%
+ dplyr::arrange(desc(value)) %>%
+ dplyr::group_by(value) %>%
+ dplyr::filter(var1 != var2) # %>% dplyr::filter(row_number()==1)
+colnames(finalSamplePairsCorrelation) <- c("Sample1", "Sample2", "VerfiyBAMId:Correlation")
+write.table(finalSamplePairsCorrelation, file = user.input.2, sep = "\t", quote = F, row.names = F)
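
combineVerifyBAMIDResults.R column-binds the third column of every *.Ancestry table, computes all pairwise Pearson correlations, and melts the matrix into sample pairs. A rough pandas equivalent of that correlate-and-melt step (illustrative; the column position is assumed from the R code above):

import pandas as pd

def correlate_ancestry(files):
    # files: dict mapping sample name -> path to its .Ancestry table (TSV)
    cols = {name: pd.read_table(path).iloc[:, 2] for name, path in files.items()}
    corr = pd.DataFrame(cols).corr()                      # pairwise Pearson correlation
    pairs = (corr.stack()                                 # wide matrix -> long pairs
                 .rename_axis(["Sample1", "Sample2"])
                 .reset_index(name="correlation"))
    pairs = pairs[pairs.Sample1 != pairs.Sample2]         # drop self-correlations
    return pairs.sort_values("correlation", ascending=False)
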
diff --git a/workflow/scripts/RScripts/predictGender.R b/workflow/scripts/RScripts/predictGender.R
index a214e6e..31a4ddf 100644
--- a/workflow/scripts/RScripts/predictGender.R
+++ b/workflow/scripts/RScripts/predictGender.R
@@ -1,12 +1,18 @@
require(dplyr)
-args = commandArgs(trailingOnly=TRUE)
-user.input.1=args[1]
-user.input.2=args[2]
+args <- commandArgs(trailingOnly = TRUE)
+user.input.1 <- args[1]
+user.input.2 <- args[2]
-somaliaDistance<-read.table(user.input.1,sep = "\t",header = T,comment.char = "")
-sampleSexChrMD<-somaliaDistance[,c("sample_id","X_depth_mean","Y_depth_mean")]
-sampleSexChrMD$Scale_X_depth_mean<-unlist(scale(sampleSexChrMD$X_depth_mean)[,1])
-sampleSexChrMD$Scale_Y_depth_mean<-unlist(scale(sampleSexChrMD$Y_depth_mean)[,1])
-sampleSexChrMD$Gender<-unlist(lapply(sampleSexChrMD$Y_depth_mean, FUN=function(x){if(x>0) return("Male") else return("Female")}))
-write.table(sampleSexChrMD[,c("sample_id","Gender")],user.input.2,quote = F,sep = "\t",row.names = F)
+somaliaDistance <- read.table(user.input.1, sep = "\t", header = T, comment.char = "")
+sampleSexChrMD <- somaliaDistance[, c("sample_id", "X_depth_mean", "Y_depth_mean")]
+sampleSexChrMD$Scale_X_depth_mean <- unlist(scale(sampleSexChrMD$X_depth_mean)[, 1])
+sampleSexChrMD$Scale_Y_depth_mean <- unlist(scale(sampleSexChrMD$Y_depth_mean)[, 1])
+sampleSexChrMD$Gender <- unlist(lapply(sampleSexChrMD$Y_depth_mean, FUN = function(x) {
+ if (x > 0) {
+ return("Male")
+ } else {
+ return("Female")
+ }
+}))
+write.table(sampleSexChrMD[, c("sample_id", "Gender")], user.input.2, quote = F, sep = "\t", row.names = F)
diff --git a/workflow/scripts/RScripts/sampleCompareAncestoryPlots.R b/workflow/scripts/RScripts/sampleCompareAncestoryPlots.R
deleted file mode 100644
index 3c21dc0..0000000
--- a/workflow/scripts/RScripts/sampleCompareAncestoryPlots.R
+++ /dev/null
@@ -1,40 +0,0 @@
-library(ggplot2)
-library(dplyr)
-library(plotly)
-library(htmlwidgets)
-library(tidyr)
-
-args = commandArgs(trailingOnly=TRUE)
-user.input.1=args[1]
-user.input.2=args[2]
-user.input.3=args[3]
-user.input.4=args[4]
-
-t<-read.table(user.input.1,sep = "\t",header = T,comment.char = "")
-pairs<-read.table(user.input.2,sep = "\t",header = T)
-pairs<-pairs %>% mutate(key = paste0(pmin(Sample1, Sample2), pmax(Sample1, Sample2), sep = ""))
-pairs<-pairs[duplicated(pairs[,"key"]),]
-samples<-unique(c(pairs$Sample1,pairs$Sample2))
-
-mapping<-list("EUR"="European","EAS"="East Asian","AMR"="American","SAS"="South Asian","AFR"="African")
-t$predAncestry<-unlist(mapping[t$predicted_ancestry])
-t$color<-t$predAncestry
-t$color[t$X.sample_id %in% samples]<-"UserSamples"
-p <- plot_ly(t, x = ~PC1, y = ~PC2, color = as.factor(t$color),colors = c('#0C4B8E','#FF0000','#f1a340','#43a2ca','#8856a7','grey'),
- hoverinfo = 'text',text = ~paste(' Sample Id:',t$X.sample_id,' Ancestory:', t$predAncestry), type = 'scatter', mode = 'markers') %>%
- #add_trace(marker = list(size = 12)) %>%
- layout(scene = list(xaxis = list(title = 'PC1'),yaxis = list(title = 'PC2')))
-htmlwidgets::saveWidget(p,user.input.3)
-
-samplesAncestory<-t[t$X.sample_id %in% samples,c(1,4:8)]
-mapping<-list("EUR_prob"="European","EAS_prob"="East Asian","AMR_prob"="American","SAS_prob"="South Asian","AFR_prob"="African")
-colnames(samplesAncestory)<-c(colnames(samplesAncestory)[1],unlist(mapping[colnames(samplesAncestory)[2:6]]))
-d<-data.frame(rbind(cbind(pairs$Sample1,c(rep("Sample1",length(pairs$Sample1))),paste0(pairs$Sample1,"\nvs\n",pairs$Sample2)),
- cbind(pairs$Sample2,c(rep("Sample2",length(pairs$Sample2))),paste0(pairs$Sample1,"\nvs\n",pairs$Sample2))))
-mDF<-merge(x=d,y=samplesAncestory,by.x="X1",by.y="X.sample_id",all.x=TRUE,all.y=FALSE)
-gData<-mDF %>% tidyr::pivot_longer(c(4:8), names_to = "Ancestory", values_to = "Somalier.Score")
-g<-ggplot(gData, aes(fill=Ancestory, y=Somalier.Score, x=X2)) +
- geom_bar(position="stack", stat="identity") +
- theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(),strip.text = element_text(size = 5))+
- facet_wrap(~X3)
-ggsave(user.input.4,g)
diff --git a/workflow/scripts/RScripts/sampleCompareAncestryPlots.R b/workflow/scripts/RScripts/sampleCompareAncestryPlots.R
new file mode 100644
index 0000000..73dca87
--- /dev/null
+++ b/workflow/scripts/RScripts/sampleCompareAncestryPlots.R
@@ -0,0 +1,44 @@
+library(ggplot2)
+library(dplyr)
+library(plotly)
+library(htmlwidgets)
+library(tidyr)
+
+args <- commandArgs(trailingOnly = TRUE)
+user.input.1 <- args[1]
+user.input.2 <- args[2]
+user.input.3 <- args[3]
+user.input.4 <- args[4]
+
+t <- read.table(user.input.1, sep = "\t", header = T, comment.char = "")
+pairs <- read.table(user.input.2, sep = "\t", header = T)
+pairs <- pairs %>% mutate(key = paste0(pmin(Sample1, Sample2), pmax(Sample1, Sample2), sep = ""))
+pairs <- pairs[duplicated(pairs[, "key"]), ]
+samples <- unique(c(pairs$Sample1, pairs$Sample2))
+
+mapping <- list("EUR" = "European", "EAS" = "East Asian", "AMR" = "American", "SAS" = "South Asian", "AFR" = "African")
+t$predAncestry <- unlist(mapping[t$predicted_ancestry])
+t$color <- t$predAncestry
+t$color[t$X.sample_id %in% samples] <- "UserSamples"
+p <- plot_ly(t,
+ x = ~PC1, y = ~PC2, color = as.factor(t$color), colors = c("#0C4B8E", "#FF0000", "#f1a340", "#43a2ca", "#8856a7", "grey"),
+ hoverinfo = "text", text = ~ paste(" Sample Id:", t$X.sample_id, " Ancestry:", t$predAncestry), type = "scatter", mode = "markers"
+) %>%
+ # add_trace(marker = list(size = 12)) %>%
+ layout(scene = list(xaxis = list(title = "PC1"), yaxis = list(title = "PC2")))
+htmlwidgets::saveWidget(p, user.input.3)
+
+samplesAncestry <- t[t$X.sample_id %in% samples, c(1, 4:8)]
+mapping <- list("EUR_prob" = "European", "EAS_prob" = "East Asian", "AMR_prob" = "American", "SAS_prob" = "South Asian", "AFR_prob" = "African")
+colnames(samplesAncestry) <- c(colnames(samplesAncestry)[1], unlist(mapping[colnames(samplesAncestry)[2:6]]))
+d <- data.frame(rbind(
+ cbind(pairs$Sample1, c(rep("Sample1", length(pairs$Sample1))), paste0(pairs$Sample1, "\nvs\n", pairs$Sample2)),
+ cbind(pairs$Sample2, c(rep("Sample2", length(pairs$Sample2))), paste0(pairs$Sample1, "\nvs\n", pairs$Sample2))
+))
+mDF <- merge(x = d, y = samplesAncestry, by.x = "X1", by.y = "X.sample_id", all.x = TRUE, all.y = FALSE)
+gData <- mDF %>% tidyr::pivot_longer(c(4:8), names_to = "Ancestry", values_to = "Somalier.Score")
+g <- ggplot(gData, aes(fill = Ancestry, y = Somalier.Score, x = X2)) +
+ geom_bar(position = "stack", stat = "identity") +
+ theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), strip.text = element_text(size = 5)) +
+ facet_wrap(~X3)
+ggsave(user.input.4, g)
diff --git a/workflow/scripts/assess_significance.R b/workflow/scripts/assess_significance.R
index 4193d97..e32f7f7 100644
--- a/workflow/scripts/assess_significance.R
+++ b/workflow/scripts/assess_significance.R
@@ -1,62 +1,72 @@
-#!/usr/bin/env Rscript
-
-library(rtracklayer)
-
-args <- commandArgs()
-
-dataTable <-read.table(args[5], header=TRUE);
-ratio<-data.frame(dataTable)
-
-dataTable <- read.table(args[4], header=FALSE)
-cnvs<- data.frame(dataTable)
-
-ratio$Ratio[which(ratio$Ratio==-1)]=NA
-
-cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3]))
-ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio)
-
-overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed)
-normals <- setdiff(ratio.bed,cnvs.bed)
-normals <- subsetByOverlaps(ratio.bed,normals)
-
-#mu <- mean(score(normals),na.rm=TRUE)
-#sigma<- sd(score(normals),na.rm=TRUE)
-
-#hist(score(normals),n=500,xlim=c(0,2))
-#hist(log(score(normals)),n=500,xlim=c(-1,1))
-
-#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
-#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
-#qqline(score(normals)[which(!is.na(score(normals)))], col = 2)
-
-#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
-#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
-#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)
-
-numberOfCol=length(cnvs)
-
-for (i in c(1:length(cnvs[,1]))) {
- values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i]))
- #wilcox.test(values,mu=mu)
- W <- function(values,normals){resultw <- try(wilcox.test(values,score(normals)), silent = TRUE)
- if(class(resultw)=="try-error") return(list("statistic"=NA,"parameter"=NA,"p.value"=NA,"null.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultw}
- KS <- function(values,normals){resultks <- try(ks.test(values,score(normals)), silent = TRUE)
- if(class(resultks)=="try-error") return(list("statistic"=NA,"p.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultks}
- #resultks <- try(KS <- ks.test(values,score(normals)), silent = TRUE)
- # if(class(resultks)=="try-error") NA) else resultks
- cnvs[i,numberOfCol+1]=W(values,normals)$p.value
- cnvs[i,numberOfCol+2]=KS(values,normals)$p.value
- }
-
-if (numberOfCol==5) {
- names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
-}
-if (numberOfCol==7) {
- names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
-}
-if (numberOfCol==9) {
- names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
-}
-write.table(cnvs, file=paste(args[4],".p.value.txt",sep=""),sep="\t",quote=F,row.names=F)
-
-
+#!/usr/bin/env Rscript
+
+library(rtracklayer)
+
+args <- commandArgs()
+
+dataTable <- read.table(args[5], header = TRUE)
+ratio <- data.frame(dataTable)
+
+dataTable <- read.table(args[4], header = FALSE)
+cnvs <- data.frame(dataTable)
+
+ratio$Ratio[which(ratio$Ratio == -1)] <- NA
+
+cnvs.bed <- GRanges(cnvs[, 1], IRanges(cnvs[, 2], cnvs[, 3]))
+ratio.bed <- GRanges(ratio$Chromosome, IRanges(ratio$Start, ratio$Start), score = ratio$Ratio)
+
+overlaps <- subsetByOverlaps(ratio.bed, cnvs.bed)
+normals <- setdiff(ratio.bed, cnvs.bed)
+normals <- subsetByOverlaps(ratio.bed, normals)
+
+# mu <- mean(score(normals),na.rm=TRUE)
+# sigma<- sd(score(normals),na.rm=TRUE)
+
+# hist(score(normals),n=500,xlim=c(0,2))
+# hist(log(score(normals)),n=500,xlim=c(-1,1))
+
+# shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
+# qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
+# qqline(score(normals)[which(!is.na(score(normals)))], col = 2)
+
+# shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
+# qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
+# qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)
+
+numberOfCol <- length(cnvs)
+
+for (i in c(1:length(cnvs[, 1]))) {
+ values <- score(subsetByOverlaps(ratio.bed, cnvs.bed[i]))
+ # wilcox.test(values,mu=mu)
+ W <- function(values, normals) {
+ resultw <- try(wilcox.test(values, score(normals)), silent = TRUE)
+ if (class(resultw) == "try-error") {
+ return(list("statistic" = NA, "parameter" = NA, "p.value" = NA, "null.value" = NA, "alternative" = NA, "method" = NA, "data.name" = NA))
+ } else {
+ resultw
+ }
+ }
+ KS <- function(values, normals) {
+ resultks <- try(ks.test(values, score(normals)), silent = TRUE)
+ if (class(resultks) == "try-error") {
+ return(list("statistic" = NA, "p.value" = NA, "alternative" = NA, "method" = NA, "data.name" = NA))
+ } else {
+ resultks
+ }
+ }
+ # resultks <- try(KS <- ks.test(values,score(normals)), silent = TRUE)
+ # if(class(resultks)=="try-error") NA) else resultks
+ cnvs[i, numberOfCol + 1] <- W(values, normals)$p.value
+ cnvs[i, numberOfCol + 2] <- KS(values, normals)$p.value
+}
+
+if (numberOfCol == 5) {
+ names(cnvs) <- c("chr", "start", "end", "copy number", "status", "WilcoxonRankSumTestPvalue", "KolmogorovSmirnovPvalue")
+}
+if (numberOfCol == 7) {
+ names(cnvs) <- c("chr", "start", "end", "copy number", "status", "genotype", "uncertainty", "WilcoxonRankSumTestPvalue", "KolmogorovSmirnovPvalue")
+}
+if (numberOfCol == 9) {
+ names(cnvs) <- c("chr", "start", "end", "copy number", "status", "genotype", "uncertainty", "somatic/germline", "precentageOfGermline", "WilcoxonRankSumTestPvalue", "KolmogorovSmirnovPvalue")
+}
+write.table(cnvs, file = paste(args[4], ".p.value.txt", sep = ""), sep = "\t", quote = F, row.names = F)
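
assess_significance.R scores each CNV call by comparing the copy-ratio bins inside the call against the bins outside any call, using a Wilcoxon rank-sum test and a Kolmogorov-Smirnov test, and falls back to NA when a test cannot be run. The scipy sketch below illustrates the same comparison; it is not the pipeline's code:

from scipy import stats

def cnv_significance(cnv_ratios, normal_ratios):
    # returns (Wilcoxon rank-sum p-value, Kolmogorov-Smirnov p-value)
    try:
        w_p = stats.mannwhitneyu(cnv_ratios, normal_ratios, alternative="two-sided").pvalue
    except ValueError:
        w_p = float("nan")
    try:
        ks_p = stats.ks_2samp(cnv_ratios, normal_ratios).pvalue
    except ValueError:
        ks_p = float("nan")
    return w_p, ks_p
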
diff --git a/workflow/scripts/correct_target_bed.py b/workflow/scripts/correct_target_bed.py
index 0a7e379..cad812f 100644
--- a/workflow/scripts/correct_target_bed.py
+++ b/workflow/scripts/correct_target_bed.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
-__desc__="""
-# FREEC errors out with
+__desc__ = """
+# FREEC errors out with
##################################################################################
# Error: your BED file with coordinates of targeted regions may contain duplicates
##################################################################################
@@ -17,16 +17,16 @@
import subprocess
import os
-if len(sys.argv)!=3: # 2 arguments are required
- print(__desc__)
- print("Usage: python3 "+sys.argv[0]+" ")
- exit()
+if len(sys.argv) != 3: # 2 arguments are required
+ print(__desc__)
+ print("Usage: python3 " + sys.argv[0] + " ")
+ exit()
inputBed = sys.argv[1]
outputBed = sys.argv[2]
-iBed = open(inputBed,'r')
-oBed = open(outputBed+".tmp",'w')
+iBed = open(inputBed, "r")
+oBed = open(outputBed + ".tmp", "w")
# collapse redundant regions
# regions with identical chrom/start/end but with possibly different annotations
@@ -34,26 +34,29 @@
annotations = dict()
for line in iBed.readlines():
- line = line.strip().split("\t")
- region = line[0] + "##" + line[1] + "##" + line[2]
- if not region in annotations:
- annotations[region] = dict()
- annotations[region]['1'] = list()
- annotations[region]['2'] = list()
- a1 = line[3].split(",")
- a2 = line[4].split(",")
- annotations[region]['1'].extend(a1)
- annotations[region]['2'].extend(a2)
+ line = line.strip().split("\t")
+ region = line[0] + "##" + line[1] + "##" + line[2]
+ if not region in annotations:
+ annotations[region] = dict()
+ annotations[region]["1"] = list()
+ annotations[region]["2"] = list()
+ a1 = line[3].split(",")
+ a2 = line[4].split(",")
+ annotations[region]["1"].extend(a1)
+ annotations[region]["2"].extend(a2)
iBed.close()
-for k,v in annotations.items():
- region = k.split("##")
- oBed.write("%s\t%s\t%s\t%s\t%s\t.\n"%(region[0],region[1],region[2],','.join(v['1']),','.join(v['2'])))
+for k, v in annotations.items():
+ region = k.split("##")
+ oBed.write(
+ "%s\t%s\t%s\t%s\t%s\t.\n"
+ % (region[0], region[1], region[2], ",".join(v["1"]), ",".join(v["2"]))
+ )
oBed.close()
# sort the collapsed file
-sortcmd = "sort -k1,1 -k2,2n -k3,3n " + outputBed+".tmp" + " > " + outputBed+".tmp2"
+sortcmd = "sort -k1,1 -k2,2n -k3,3n " + outputBed + ".tmp" + " > " + outputBed + ".tmp2"
subprocess.run(sortcmd, shell=True, check=True)
# Based off of the suggestions here --> https://github.com/BoevaLab/FREEC/issues/43
@@ -61,16 +64,23 @@
# different ends and annotations
# Solution: increment start by 1 to create "non-duplicate" entry for FREEC
-incrementcmd = "awk -F\"\\t\" -v OFS=\"\\t\" \'{seen[$1\"##\"$2]+=1;if (seen[$1\"##\"$2]!=1){$2=$2+seen[$1\"##\"$2]-1};print}' " + outputBed+".tmp2" + ">" + outputBed+".tmp3"
-subprocess.run(incrementcmd,shell=True,check=True)
+incrementcmd = (
+ 'awk -F"\\t" -v OFS="\\t" \'{seen[$1"##"$2]+=1;if (seen[$1"##"$2]!=1){$2=$2+seen[$1"##"$2]-1};print}\' '
+ + outputBed
+ + ".tmp2"
+ + ">"
+ + outputBed
+ + ".tmp3"
+)
+subprocess.run(incrementcmd, shell=True, check=True)
# resort the corrected file
-sortcmd = "sort -k1,1 -k2,2n -k3,3n " + outputBed+".tmp3" + " > " + outputBed
+sortcmd = "sort -k1,1 -k2,2n -k3,3n " + outputBed + ".tmp3" + " > " + outputBed
subprocess.run(sortcmd, shell=True, check=True)
# delete intermediate files
-os.remove(outputBed+".tmp")
-os.remove(outputBed+".tmp2")
-os.remove(outputBed+".tmp3")
+os.remove(outputBed + ".tmp")
+os.remove(outputBed + ".tmp2")
+os.remove(outputBed + ".tmp3")
diff --git a/workflow/scripts/freec/make_freec_config.py b/workflow/scripts/freec/make_freec_config.py
index 08dcb2f..b971918 100644
--- a/workflow/scripts/freec/make_freec_config.py
+++ b/workflow/scripts/freec/make_freec_config.py
@@ -3,18 +3,39 @@
import argparse
import os
-parser = argparse.ArgumentParser(description='Make a config file for running CONTROL-Freec.')
-parser.add_argument("-t","--tumor", help="Tumor BAM file")
-parser.add_argument("-n","--normal", help="Normal BAM file")
-parser.add_argument("-g","--genome-fasta", help="Full genome fasta")
-parser.add_argument("-l","--chrom-lengths", help="File with chromosome lengths (an .fai file)")
-parser.add_argument("-s","--chrom-seqs", help="Folder with chromosome fastqs")
-parser.add_argument("-r","--capture-regions", help="BED file with exome capture regions",default=None)
-parser.add_argument("-o","--output-config", help="Filename of the config file to be generated", default="freec_config.txt")
-parser.add_argument("--snps-file", help="Passed to 'minCNAlength' argument", default="3")
-parser.add_argument("--degree", help="Polynomial degree used for GC normalization used for 'degree' argument", default="3")
-parser.add_argument("--min-length", help="Passed to 'minCNAlength' argument", default="3")
-parser.add_argument("--min-read-count", help="Passed to 'readCountThreshold' argument", default="50")
+parser = argparse.ArgumentParser(
+ description="Make a config file for running CONTROL-Freec."
+)
+parser.add_argument("-t", "--tumor", help="Tumor BAM file")
+parser.add_argument("-n", "--normal", help="Normal BAM file")
+parser.add_argument("-g", "--genome-fasta", help="Full genome fasta")
+parser.add_argument(
+ "-l", "--chrom-lengths", help="File with chromosome lengths (an .fai file)"
+)
+parser.add_argument("-s", "--chrom-seqs", help="Folder with chromosome fastqs")
+parser.add_argument(
+ "-r", "--capture-regions", help="BED file with exome capture regions", default=None
+)
+parser.add_argument(
+ "-o",
+ "--output-config",
+ help="Filename of the config file to be generated",
+ default="freec_config.txt",
+)
+parser.add_argument(
+ "--snps-file", help="Passed to 'minCNAlength' argument", default="3"
+)
+parser.add_argument(
+ "--degree",
+ help="Polynomial degree used for GC normalization used for 'degree' argument",
+ default="3",
+)
+parser.add_argument(
+ "--min-length", help="Passed to 'minCNAlength' argument", default="3"
+)
+parser.add_argument(
+ "--min-read-count", help="Passed to 'readCountThreshold' argument", default="50"
+)
parser.add_argument("--pileup-vcf", help="Passed to 'makePileup' argument", default="")
parser.add_argument("--ploidy", help="Estimated ploidy", default="")
parser.add_argument("--contamination", help="Estimated contamination", default="")
@@ -83,9 +104,8 @@
config_contents.append("")
-with open(args.output_config, 'w') as config_out:
+with open(args.output_config, "w") as config_out:
config_out.write("\n".join(config_contents))
-
########################################################################################################
@@ -93,12 +113,12 @@
# #!/usr/bin/perl -w
# use strict;
# use List::Util 'shuffle';
-#
+#
# #INPUT
-#
+#
# #my $mergedmaf = $ARGV[1] . '_out/oncotator_out/' . $ARGV[1] . '_merged.maf'; #to fix...
# #open C, ">$mergedmaf";
-#
+#
# my $outfile = $ARGV[0] . '/freec_exome_config.txt';
# my $chrLenFile = $ARGV[1];
# my $chrFiles = $ARGV[2];
@@ -112,7 +132,7 @@
# my $contamination='';
# my $ploidy='';
# my $rep=0;
-#
+#
# my $infile=$ARGV[9];
# open G, "<$infile";
# while (<G>){
@@ -126,35 +146,35 @@
# $rep++;
# }
# }
-#
+#
# open C, ">$outfile";
-#
+#
# print C '[general]' . "\n\n";
-#
+#
# print C "BedGraphOutput = TRUE\ndegree = 1\nforceGCcontentNormalization = 1\nminCNAlength = 3\nnoisyData = TRUE\nreadCountThreshold = 50\n";
# print C "chrLenFile = $chrLenFile\n";
# print C "ploidy = $ploidy\ncontamination=$contamination\nbreakPointThreshold = 0.8\nwindow = 0\n";
# print C "chrFiles = $chrFiles\n";
# print C "minimalSubclonePresence = 30\nprintNA = FALSE\ncontaminationAdjustment = TRUE\nmaxThreads = 24\nnumberOfProcesses = 24\n";
# print C "outputDir = $ARGV[0]\n\n";
-#
+#
# print C '[sample]' . "\n\n";
-#
+#
# print C "mateFile = $tumormateFile\n";
# print C "inputFormat = BAM\nmateOrientation = FR\n\n";
-#
+#
# print C '[control]' . "\n\n";
-#
+#
# print C "mateFile = $controlmateFile\n";
# print C "inputFormat = BAM\nmateOrientation = FR\n\n";
-#
+#
# print C '[target]' . "\n\n";
-#
+#
# print C "captureRegions = $targets\n\n";
-#
+#
# print C '[BAF]' . "\n\n";
-#
+#
# print C "makePileup = $makePileup\n";
# print C "fastaFile = $fastaFile\n";
# print C "minimalCoveragePerPosition = 20\nminimalQualityPerPosition = 20\n";
-# print C "SNPfile = $SNPfile";
\ No newline at end of file
+# print C "SNPfile = $SNPfile";
diff --git a/workflow/scripts/freec/py_config/config.txt b/workflow/scripts/freec/py_config/config.txt
index ffb1015..b7c3ee4 100644
--- a/workflow/scripts/freec/py_config/config.txt
+++ b/workflow/scripts/freec/py_config/config.txt
@@ -3,8 +3,8 @@ degree = 3
minCNAlength = 3
readCountThreshold = 50
chrLenFile = /data/CCBR_Pipeliner/db/PipeDB/lib/hg38.filtered.fa.fai
-ploidy =
-contamination =
+ploidy =
+contamination =
BedGraphOutput = TRUE
forceGCcontentNormalization = 1
noisyData = TRUE
@@ -16,4 +16,4 @@ printNA = FALSE
contaminationAdjustment = TRUE
maxThreads = 2
numberOfProcesses = 2
-outputDir = py_config
\ No newline at end of file
+outputDir = py_config
diff --git a/workflow/scripts/get_flowcell_lanes.py b/workflow/scripts/get_flowcell_lanes.py
index 4d1dc59..51b97a0 100644
--- a/workflow/scripts/get_flowcell_lanes.py
+++ b/workflow/scripts/get_flowcell_lanes.py
@@ -17,7 +17,7 @@
# +
# AAAFFJJFJJJJJJFJJJJJJJJJJFJAJJJJJFJJJJJFFJJAJJJJ7JJ
-# Input 2 (SRA doesn't store FC ID, use intrument name instead)
+# Input 2 (SRA doesn't store FC ID, use instrument name instead)
# @SRR5351039.1 SN608:8:1101:31.20:96.50 length=51
# NTTTANNNNNNGNGCNCTGNNNNNNNNGNNNNNAAGGGNTNNNNNNNNNNN
# +SRR5351039.1 SN608:8:1101:31.20:96.50 length=51
@@ -29,12 +29,17 @@
# +SRR6755966.1 1 length=101
# CC@FFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJHIJJJJI
-def usage(message = '', exitcode = 0):
+
+def usage(message="", exitcode=0):
"""Displays help and usage information. If provided invalid usage
returns non-zero exit-code. Additional message can be displayed with
the 'message' parameter.
"""
- print('Usage: python {} sampleName.R1.fastq.gz sampleName > sampleName.flowcell_lanes.txt'.format(sys.argv[0]))
+ print(
+ "Usage: python {} sampleName.R1.fastq.gz sampleName > sampleName.flowcell_lanes.txt".format(
+ sys.argv[0]
+ )
+ )
if message:
print(message)
sys.exit(exitcode)
@@ -45,7 +50,7 @@ def reader(fname):
or non-gzipped FastQ files based on the file extension. Assumes
gzipped files endwith the '.gz' extension.
"""
- if fname.endswith('.gz'):
+ if fname.endswith(".gz"):
# Opens up file with gzip handler
return gzip.open
else:
@@ -53,20 +58,20 @@ def reader(fname):
return open
-def get_flowcell_lane(sequence_identifer):
+def get_flowcell_lane(sequence_identifier):
"""Returns flowcell and lane information for different fastq formats.
FastQ files generated with older versions of Casava or downloaded from
SRA have a different format than newer FastQ files generated with the
current version of Casava. It is worth noting that FastQ files downloaded from SRA
or FastQ files generated with Casava version < 1.8 do not have Flowcell
- IDs in its sequence indentifer.
+ IDs in its sequence identifier.
For more information visit: https://en.wikipedia.org/wiki/FASTQ_format
"""
- id_list = sequence_identifer.strip().split(':')
+ id_list = sequence_identifier.strip().split(":")
if len(id_list) < 7:
# No Flowcell IDs in this format
# Return next instrument id instead (next best thing)
- if sequence_identifer.startswith('@SRR'):
+ if sequence_identifier.startswith("@SRR"):
# SRA format or downloaded SRA FastQ file
# SRA format 1: contains machine and lane information
# @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36
@@ -79,20 +84,20 @@ def get_flowcell_lane(sequence_identifer):
except IndexError:
# SRA format 2
id1 = id_list[0].split()[0].split(".")[0]
- id2 = id1.lstrip('@')
- return id1,id2
+ id2 = id1.lstrip("@")
+ return id1, id2
else:
# Casava < 1.8 (fastq format)
# @HWUSI-EAS100R:6:73:941:1973#0/1
- return id_list[0],id_list[1]
+ return id_list[0], id_list[1]
else:
# Casava >= 1.8
# Normal FastQ format
# @J00170:88:HNYVJBBXX:8:1101:6390:1244 1:N:0:ACTTGA
- return id_list[2],id_list[3]
+ return id_list[2], id_list[3]
-def md5sum(filename, blocksize = 65536):
+def md5sum(filename, blocksize=65536):
"""Gets md5checksum of a file in memory-safe manner.
The file is read in blocks defined by the blocksize parameter. This is a safer
option to reading the entire file into memory if the file is very large.
@@ -106,7 +111,7 @@ def md5sum(filename, blocksize = 65536):
import hashlib
hasher = hashlib.md5()
- with open(filename, 'rb') as fh:
+ with open(filename, "rb") as fh:
buf = fh.read(blocksize)
while len(buf) > 0:
hasher.update(buf)
@@ -115,13 +120,15 @@ def md5sum(filename, blocksize = 65536):
return hasher.hexdigest()
-if __name__ == '__main__':
-
+if __name__ == "__main__":
# Check Usage
- if '-h' in sys.argv or '--help' in sys.argv or '-help' in sys.argv:
- usage(exitcode = 0)
+ if "-h" in sys.argv or "--help" in sys.argv or "-help" in sys.argv:
+ usage(exitcode=0)
elif len(sys.argv) != 3:
- usage(message = 'Error: failed to provide all required positional arguments!', exitcode = 1)
+ usage(
+ message="Error: failed to provide all required positional arguments!",
+ exitcode=1,
+ )
# Get file name and sample name prefix
filename = sys.argv[1]
@@ -131,22 +138,33 @@ def md5sum(filename, blocksize = 65536):
# Get Flowcell and Lane information
handle = reader(filename)
- meta = {'flowcell': [], 'lane': [], 'flowcell_lane': []}
+ meta = {"flowcell": [], "lane": [], "flowcell_lane": []}
i = 0 # keeps track of line number
- with handle(filename, 'r') as file:
- print('sample_name\ttotal_read_pairs\tflowcell_ids\tlanes\tflowcell_lanes\tmd5_checksum')
+ with handle(filename, "r") as file:
+ print(
+ "sample_name\ttotal_read_pairs\tflowcell_ids\tlanes\tflowcell_lanes\tmd5_checksum"
+ )
for line in file:
line = line.strip()
- if i%4 == 0: # read id or sequence identifer
+ if i % 4 == 0: # read id or sequence identifier
fc, lane = get_flowcell_lane(line)
- fc = fc.lstrip('@')
- fc_lane = "{}_{}".format(fc,lane)
- if fc not in meta['flowcell']:
- meta['flowcell'].append(fc)
- if lane not in meta['lane']:
- meta['lane'].append(lane)
- if fc_lane not in meta['flowcell_lane']:
- meta['flowcell_lane'].append(fc_lane)
+ fc = fc.lstrip("@")
+ fc_lane = "{}_{}".format(fc, lane)
+ if fc not in meta["flowcell"]:
+ meta["flowcell"].append(fc)
+ if lane not in meta["lane"]:
+ meta["lane"].append(lane)
+ if fc_lane not in meta["flowcell_lane"]:
+ meta["flowcell_lane"].append(fc_lane)
i += 1
- print("{}\t{}\t{}\t{}\t{}\t{}".format(sample, int(i/4),",".join(sorted(meta['flowcell'])),",".join(sorted(meta['lane'])),",".join(sorted(meta['flowcell_lane'])), md5))
+ print(
+ "{}\t{}\t{}\t{}\t{}\t{}".format(
+ sample,
+ int(i / 4),
+ ",".join(sorted(meta["flowcell"])),
+ ",".join(sorted(meta["lane"])),
+ ",".join(sorted(meta["flowcell_lane"])),
+ md5,
+ )
+ )
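
The core of get_flowcell_lanes.py is the header parse: Casava >= 1.8 identifiers carry the flowcell and lane in the third and fourth colon-separated fields, while SRA-style and older Casava headers need fallbacks. A condensed sketch of that branching (the real script handles a few more SRA corner cases):

def flowcell_and_lane(seq_id):
    fields = seq_id.strip().split(":")
    if len(fields) >= 7:
        # Casava >= 1.8: @instrument:run:flowcell:lane:tile:x:y ...
        return fields[2], fields[3]
    if seq_id.startswith("@SRR"):
        # SRA re-headered reads: fall back to the accession
        acc = seq_id.split()[0].lstrip("@").split(".")[0]
        return acc, acc
    # Casava < 1.8: @instrument:lane:tile:x:y#index/pair
    return fields[0], fields[1]

print(flowcell_and_lane("@J00170:88:HNYVJBBXX:8:1101:6390:1244 1:N:0:ACTTGA"))
# prints ('HNYVJBBXX', '8')
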
diff --git a/workflow/scripts/makeGraph.R b/workflow/scripts/makeGraph.R
index 9938587..ef151b2 100644
--- a/workflow/scripts/makeGraph.R
+++ b/workflow/scripts/makeGraph.R
@@ -2,79 +2,78 @@
args <- commandArgs()
-dataTable <-read.table(args[5], header=TRUE);
-
-ratio<-data.frame(dataTable)
+dataTable <- read.table(args[5], header = TRUE)
+ratio <- data.frame(dataTable)
ploidy <- type.convert(args[4])
-png(filename = paste(args[5],".log2.png",sep = ""), width = 1180, height = 1180,
- units = "px", pointsize = 20, bg = "white", res = NA)
+png(
+ filename = paste(args[5], ".log2.png", sep = ""), width = 1180, height = 1180,
+ units = "px", pointsize = 20, bg = "white", res = NA
+)
plot(1:10)
-op <- par(mfrow = c(5,5))
-
-for (i in c(1:22,'X','Y')) {
- tt <- which(ratio$Chromosome==i)
- if (length(tt)>0) {
- plot(ratio$Start[tt],log2(ratio$Ratio[tt]),xlab = paste ("position, chr",i),ylab = "normalized copy number profile (log2)",pch = ".",col = colors()[88])
- tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
- points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[136])
-
-
- tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!=-1)
- points(ratio$Start[tt],log2(ratio$Ratio[tt]),pch = ".",col = colors()[461])
- tt <- which(ratio$Chromosome==i)
-
- #UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
- #points(ratio$Start[tt],log2(ratio$CopyNumber[tt]/ploidy), pch = ".", col = colors()[24],cex=4)
- }
- tt <- which(ratio$Chromosome==i)
-
- #UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
- #points(ratio$Start[tt],log2(ratio$MedianRatio[tt]), pch = ".", col = colors()[463],cex=4)
+op <- par(mfrow = c(5, 5))
+
+for (i in c(1:22, "X", "Y")) {
+ tt <- which(ratio$Chromosome == i)
+ if (length(tt) > 0) {
+ plot(ratio$Start[tt], log2(ratio$Ratio[tt]), xlab = paste("position, chr", i), ylab = "normalized copy number profile (log2)", pch = ".", col = colors()[88])
+ tt <- which(ratio$Chromosome == i & ratio$CopyNumber > ploidy)
+ points(ratio$Start[tt], log2(ratio$Ratio[tt]), pch = ".", col = colors()[136])
+
+
+ tt <- which(ratio$Chromosome == i & ratio$CopyNumber < ploidy & ratio$CopyNumber != -1)
+ points(ratio$Start[tt], log2(ratio$Ratio[tt]), pch = ".", col = colors()[461])
+ tt <- which(ratio$Chromosome == i)
+
+ # UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
+ # points(ratio$Start[tt],log2(ratio$CopyNumber[tt]/ploidy), pch = ".", col = colors()[24],cex=4)
+ }
+ tt <- which(ratio$Chromosome == i)
+
+ # UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
+ # points(ratio$Start[tt],log2(ratio$MedianRatio[tt]), pch = ".", col = colors()[463],cex=4)
}
dev.off()
-png(filename = paste(args[5],".png",sep = ""), width = 1180, height = 1180,
- units = "px", pointsize = 20, bg = "white", res = NA)
+png(
+ filename = paste(args[5], ".png", sep = ""), width = 1180, height = 1180,
+ units = "px", pointsize = 20, bg = "white", res = NA
+)
plot(1:10)
-op <- par(mfrow = c(5,5))
+op <- par(mfrow = c(5, 5))
maxLevelToPlot <- 3
for (i in c(1:length(ratio$Ratio))) {
- if (ratio$Ratio[i]>maxLevelToPlot) {
- ratio$Ratio[i]=maxLevelToPlot;
- }
+ if (ratio$Ratio[i] > maxLevelToPlot) {
+ ratio$Ratio[i] <- maxLevelToPlot
+ }
}
-for (i in c(1:22,'X','Y')) {
- tt <- which(ratio$Chromosome==i)
- if (length(tt)>0) {
- plot(ratio$Start[tt],ratio$Ratio[tt]*ploidy,ylim = c(0,maxLevelToPlot*ploidy),xlab = paste ("position, chr",i),ylab = "normalized copy number profile",pch = ".",col = colors()[88])
- tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
- points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
-
- tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)
- points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
-
- tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!=-1)
- points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
- tt <- which(ratio$Chromosome==i)
-
- #UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
- #points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
- }
- tt <- which(ratio$Chromosome==i)
-
- #UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
- #points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
+for (i in c(1:22, "X", "Y")) {
+ tt <- which(ratio$Chromosome == i)
+ if (length(tt) > 0) {
+ plot(ratio$Start[tt], ratio$Ratio[tt] * ploidy, ylim = c(0, maxLevelToPlot * ploidy), xlab = paste("position, chr", i), ylab = "normalized copy number profile", pch = ".", col = colors()[88])
+ tt <- which(ratio$Chromosome == i & ratio$CopyNumber > ploidy)
+ points(ratio$Start[tt], ratio$Ratio[tt] * ploidy, pch = ".", col = colors()[136])
+
+ tt <- which(ratio$Chromosome == i & ratio$Ratio == maxLevelToPlot & ratio$CopyNumber > ploidy)
+ points(ratio$Start[tt], ratio$Ratio[tt] * ploidy, pch = ".", col = colors()[136], cex = 4)
+
+ tt <- which(ratio$Chromosome == i & ratio$CopyNumber < ploidy & ratio$CopyNumber != -1)
+ points(ratio$Start[tt], ratio$Ratio[tt] * ploidy, pch = ".", col = colors()[461])
+ tt <- which(ratio$Chromosome == i)
+
+ # UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
+ # points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
+ }
+ tt <- which(ratio$Chromosome == i)
+
+ # UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
+ # points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
}
dev.off()
@@ -82,53 +81,52 @@ dev.off()
-if (length(args)>5) {
- dataTable <-read.table(args[6], header=TRUE);
- BAF<-data.frame(dataTable)
-
- png(filename = paste(args[6],".png",sep = ""), width = 1180, height = 1180,
- units = "px", pointsize = 20, bg = "white", res = NA)
- plot(1:10)
- op <- par(mfrow = c(5,5))
-
- for (i in c(1:22,'X','Y')) {
- tt <- which(BAF$Chromosome==i)
- if (length(tt)>0){
- lBAF <-BAF[tt,]
- plot(lBAF$Position,lBAF$BAF,ylim = c(-0.1,1.1),xlab = paste ("position, chr",i),ylab = "BAF",pch = ".",col = colors()[1])
-
- tt <- which(lBAF$A==0.5)
- points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
- tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
- points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
- tt <- 1
- pres <- 1
-
- if (length(lBAF$A)>4) {
- for (j in c(2:(length(lBAF$A)-pres-1))) {
- if (lBAF$A[j]==lBAF$A[j+pres]) {
- tt[length(tt)+1] <- j
- }
- }
- points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
- points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)
- }
-
- tt <- 1
- pres <- 1
- if (length(lBAF$FittedA)>4) {
- for (j in c(2:(length(lBAF$FittedA)-pres-1))) {
- if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) {
- tt[length(tt)+1] <- j
- }
- }
- points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4)
- points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4)
- }
-
- }
-
- }
- dev.off()
-
+if (length(args) > 5) {
+ dataTable <- read.table(args[6], header = TRUE)
+ BAF <- data.frame(dataTable)
+
+ png(
+ filename = paste(args[6], ".png", sep = ""), width = 1180, height = 1180,
+ units = "px", pointsize = 20, bg = "white", res = NA
+ )
+ plot(1:10)
+ op <- par(mfrow = c(5, 5))
+
+ for (i in c(1:22, "X", "Y")) {
+ tt <- which(BAF$Chromosome == i)
+ if (length(tt) > 0) {
+ lBAF <- BAF[tt, ]
+ plot(lBAF$Position, lBAF$BAF, ylim = c(-0.1, 1.1), xlab = paste("position, chr", i), ylab = "BAF", pch = ".", col = colors()[1])
+
+ tt <- which(lBAF$A == 0.5)
+ points(lBAF$Position[tt], lBAF$BAF[tt], pch = ".", col = colors()[92])
+ tt <- which(lBAF$A != 0.5 & lBAF$A >= 0)
+ points(lBAF$Position[tt], lBAF$BAF[tt], pch = ".", col = colors()[62])
+ tt <- 1
+ present <- 1
+
+ if (length(lBAF$A) > 4) {
+ for (j in c(2:(length(lBAF$A) - present - 1))) {
+ if (lBAF$A[j] == lBAF$A[j + present]) {
+ tt[length(tt) + 1] <- j
+ }
+ }
+ points(lBAF$Position[tt], lBAF$A[tt], pch = ".", col = colors()[24], cex = 4)
+ points(lBAF$Position[tt], lBAF$B[tt], pch = ".", col = colors()[24], cex = 4)
+ }
+
+ tt <- 1
+ present <- 1
+ if (length(lBAF$FittedA) > 4) {
+ for (j in c(2:(length(lBAF$FittedA) - present - 1))) {
+ if (lBAF$FittedA[j] == lBAF$FittedA[j + present]) {
+ tt[length(tt) + 1] <- j
+ }
+ }
+ points(lBAF$Position[tt], lBAF$FittedA[tt], pch = ".", col = colors()[463], cex = 4)
+ points(lBAF$Position[tt], lBAF$FittedB[tt], pch = ".", col = colors()[463], cex = 4)
+ }
+ }
+ }
+ dev.off()
}
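
makeGraph.R draws one panel per chromosome and colors bins above the sample ploidy as gains and bins below it as losses. The toy matplotlib sketch below reproduces that idea for a single chromosome with synthetic data; it is an illustration, not the pipeline's plotting code:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
start = np.arange(0, 1_000_000, 10_000)
ratio = np.clip(rng.normal(1.0, 0.15, start.size), 0.05, None)
ratio[30:50] *= 1.8                                   # a synthetic gain
ploidy = 2
copy_number = np.where(ratio > 1.3, 3, np.where(ratio < 0.7, 1, 2))

colors = np.where(copy_number > ploidy, "tab:red",
                  np.where(copy_number < ploidy, "tab:blue", "0.6"))
plt.scatter(start, np.log2(ratio), s=4, c=colors)
plt.xlabel("position, chr1")
plt.ylabel("normalized copy number profile (log2)")
plt.savefig("chr1_ratio.png")
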
diff --git a/workflow/scripts/make_freec_pass1_exome_tn_config.pl b/workflow/scripts/make_freec_pass1_exome_tn_config.pl
index e989c1c..afc2219 100644
--- a/workflow/scripts/make_freec_pass1_exome_tn_config.pl
+++ b/workflow/scripts/make_freec_pass1_exome_tn_config.pl
@@ -27,17 +27,17 @@
print C "chrFiles = $chrFiles\n";
print C "minimalSubclonePresence = 30\nprintNA = FALSE\ncontaminationAdjustment = TRUE\nmaxThreads = 24\nnumberOfProcesses = 24\n";
print C "outputDir = $ARGV[0]\n\n";
-
+
print C '[sample]' . "\n\n";
-
+
print C "mateFile = $tumormateFile\n";
print C "inputFormat = BAM\nmateOrientation = FR\n\n";
-
+
print C '[control]' . "\n\n";
-
+
print C "mateFile = $controlmateFile\n";
print C "inputFormat = BAM\nmateOrientation = FR\n\n";
-
+
print C '[target]' . "\n\n";
print C "captureRegions = $targets\n\n";
diff --git a/workflow/scripts/make_freec_pass2_exome_tn_config.pl b/workflow/scripts/make_freec_pass2_exome_tn_config.pl
index fa332e0..6cb16b2 100644
--- a/workflow/scripts/make_freec_pass2_exome_tn_config.pl
+++ b/workflow/scripts/make_freec_pass2_exome_tn_config.pl
@@ -45,17 +45,17 @@
print C "chrFiles = $chrFiles\n";
print C "minimalSubclonePresence = 30\nprintNA = FALSE\ncontaminationAdjustment = TRUE\nmaxThreads = 24\nnumberOfProcesses = 24\n";
print C "outputDir = $ARGV[0]\n\n";
-
+
print C '[sample]' . "\n\n";
-
+
print C "mateFile = $tumormateFile\n";
print C "inputFormat = BAM\nmateOrientation = FR\n\n";
-
+
print C '[control]' . "\n\n";
-
+
print C "mateFile = $controlmateFile\n";
print C "inputFormat = BAM\nmateOrientation = FR\n\n";
-
+
print C '[target]' . "\n\n";
print C "captureRegions = $targets\n\n";
diff --git a/workflow/scripts/parse_tn_mode.py b/workflow/scripts/parse_tn_mode.py
index 8ea8d24..329bc4f 100644
--- a/workflow/scripts/parse_tn_mode.py
+++ b/workflow/scripts/parse_tn_mode.py
@@ -11,91 +11,115 @@
# Example
# $ python get_flowcell_lanes.py input.R1.fastq.gz input > flowcell_lanes.txt
-def usage(message = '', exitcode = 0):
+
+def usage(message="", exitcode=0):
"""Displays help and usage information. If provided invalid usage
returns non-zero exit-code. Additional message can be displayed with
the 'message' parameter.
"""
- print('Usage: python {} paired [/path/to/pairs] [ sample_names ] > sampleName.flowcell_lanes.txt'.format(sys.argv[0]))
+ print(
+ "Usage: python {} paired [/path/to/pairs] [ sample_names ] > sampleName.flowcell_lanes.txt".format(
+ sys.argv[0]
+ )
+ )
if message:
print(message)
sys.exit(exitcode)
-def read_pairsfile(tn_mode="auto", pairs_filepath="", sample_names=[]):
+def read_pairsfile(tn_mode="auto", pairs_filepath="", sample_names=[]):
## Make sure tn_mode is valid
- if not tn_mode in ["auto","paired","tumor_only"]:
- raise NameError("""\n\tFatal: tn_mode must be one of 'auto', 'paired', or 'tumor_only'
+ if not tn_mode in ["auto", "paired", "tumor_only"]:
+ raise NameError(
+ """\n\tFatal: tn_mode must be one of 'auto', 'paired', or 'tumor_only'
Argument received: {}
- """.format(tn_mode, sys.argv[0])
+ """.format(
+ tn_mode, sys.argv[0]
+ )
)
-
+
## Initialize some empty variables
tumor_ids = []
normal_ids = []
- paired_ids={}
-
+ paired_ids = {}
+
## If pairs file exists, try to use it
if os.path.isfile(pairs_filepath):
## Read pairs file as data frame
- df = pd.read_csv(pairs_filepath, header=0, sep='\t')
- df.columns = df.columns.str.lower() ## Make column names case-insensitive
-
+ df = pd.read_csv(pairs_filepath, header=0, sep="\t")
+ df.columns = df.columns.str.lower() ## Make column names case-insensitive
+
## Make sure it contains a "tumor" column
if not "tumor" in df:
- raise NameError("""\n\tFatal: Pairs file must contain at least a 'tumor' column
+ raise NameError(
+ """\n\tFatal: Pairs file must contain at least a 'tumor' column
Columns found: {}
- """.format(df.columns.tolist(), sys.argv[0])
+ """.format(
+ df.columns.tolist(), sys.argv[0]
+ )
)
-
- df = df[pd.notna(df["tumor"])] ## Remove rows where tumor id is empty/na
+
+ df = df[pd.notna(df["tumor"])] ## Remove rows where tumor id is empty/na
tumor_ids = df["tumor"]
-
+
if "normal" in df:
normal_ids = df["normal"]
-
+
## Make sure normal ids are not empty/na
if any(pd.notna(normal_ids)):
- t_pair=tumor_ids[pd.notna(normal_ids)]
- n_pair=normal_ids[pd.notna(normal_ids)]
- paired_ids=dict(zip(t_pair.tolist(), n_pair.tolist()))
-
+ t_pair = tumor_ids[pd.notna(normal_ids)]
+ n_pair = normal_ids[pd.notna(normal_ids)]
+ paired_ids = dict(zip(t_pair.tolist(), n_pair.tolist()))
+
## If pairs file not found, try to use provided sample names as tumor-only IDs
else:
if tn_mode == "paired":
print("WARNING: Paired mode selected without a valid pairs file!!!")
-
+
if not sample_names:
- raise NameError("""\n\tFatal: Either a valid pairs file or sample names must be provided.
+ raise NameError(
+ """\n\tFatal: Either a valid pairs file or sample names must be provided.
Pairs file path provided: {}
Sample names provided: {}
- """.format(pairs_filepath, sample_names, sys.argv[0])
+ """.format(
+ pairs_filepath, sample_names, sys.argv[0]
+ )
)
else:
- tumor_ids=sample_names
-
+ tumor_ids = sample_names
+
## Overlap with given sample names
if sample_names:
overlapped_pairs = {k: paired_ids[k] for k in sample_names if k in paired_ids}
overlapped_tumors = list(set(tumor_ids) & set(sample_names))
-
- print(str(len(overlapped_pairs)) + " of " + str(len(paired_ids)) + " pairs in pairs file matched given sample names")
- print(str(len(overlapped_tumors)) + " of " + str(len(tumor_ids)) + " tumors in pairs file matched given sample names")
-
- paired_ids=overlapped_pairs
- tumor_ids=overlapped_tumors
-
- out_dict={"pairs":paired_ids, "tumors": set(tumor_ids)}
-
- if tn_mode=="paired":
- out_dict["tumors"]=[]
- elif tn_mode=="tumor_only":
- out_dict["pairs"]=[]
-
- return(out_dict)
-
-
-def md5sum(filename, blocksize = 65536):
+
+ print(
+ str(len(overlapped_pairs))
+ + " of "
+ + str(len(paired_ids))
+ + " pairs in pairs file matched given sample names"
+ )
+ print(
+ str(len(overlapped_tumors))
+ + " of "
+ + str(len(tumor_ids))
+ + " tumors in pairs file matched given sample names"
+ )
+
+ paired_ids = overlapped_pairs
+ tumor_ids = overlapped_tumors
+
+ out_dict = {"pairs": paired_ids, "tumors": set(tumor_ids)}
+
+ if tn_mode == "paired":
+ out_dict["tumors"] = []
+ elif tn_mode == "tumor_only":
+ out_dict["pairs"] = []
+
+ return out_dict
+
+
+def md5sum(filename, blocksize=65536):
"""Gets md5checksum of a file in memory-safe manner.
The file is read in blocks defined by the blocksize parameter. This is a safer
option to reading the entire file into memory if the file is very large.
@@ -109,7 +133,7 @@ def md5sum(filename, blocksize = 65536):
import hashlib
hasher = hashlib.md5()
- with open(filename, 'rb') as fh:
+ with open(filename, "rb") as fh:
buf = fh.read(blocksize)
while len(buf) > 0:
hasher.update(buf)
@@ -118,13 +142,15 @@ def md5sum(filename, blocksize = 65536):
return hasher.hexdigest()
-if __name__ == '__main__':
-
+if __name__ == "__main__":
# Check Usage
- if '-h' in sys.argv or '--help' in sys.argv or '-help' in sys.argv:
- usage(exitcode = 0)
+ if "-h" in sys.argv or "--help" in sys.argv or "-help" in sys.argv:
+ usage(exitcode=0)
elif len(sys.argv) != 3:
- usage(message = 'Error: failed to provide all required positional arguments!', exitcode = 1)
+ usage(
+ message="Error: failed to provide all required positional arguments!",
+ exitcode=1,
+ )
# Get file name and sample name prefix
filename = sys.argv[1]
@@ -134,22 +160,33 @@ def md5sum(filename, blocksize = 65536):
# Get Flowcell and Lane information
handle = reader(filename)
- meta = {'flowcell': [], 'lane': [], 'flowcell_lane': []}
+ meta = {"flowcell": [], "lane": [], "flowcell_lane": []}
i = 0 # keeps track of line number
- with handle(filename, 'r') as file:
- print('sample_name\ttotal_read_pairs\tflowcell_ids\tlanes\tflowcell_lanes\tmd5_checksum')
+ with handle(filename, "r") as file:
+ print(
+ "sample_name\ttotal_read_pairs\tflowcell_ids\tlanes\tflowcell_lanes\tmd5_checksum"
+ )
for line in file:
line = line.strip()
- if i%4 == 0: # read id or sequence identifer
+ if i % 4 == 0: # read id or sequence identifier
fc, lane = get_flowcell_lane(line)
- fc = fc.lstrip('@')
- fc_lane = "{}_{}".format(fc,lane)
- if fc not in meta['flowcell']:
- meta['flowcell'].append(fc)
- if lane not in meta['lane']:
- meta['lane'].append(lane)
- if fc_lane not in meta['flowcell_lane']:
- meta['flowcell_lane'].append(fc_lane)
+ fc = fc.lstrip("@")
+ fc_lane = "{}_{}".format(fc, lane)
+ if fc not in meta["flowcell"]:
+ meta["flowcell"].append(fc)
+ if lane not in meta["lane"]:
+ meta["lane"].append(lane)
+ if fc_lane not in meta["flowcell_lane"]:
+ meta["flowcell_lane"].append(fc_lane)
i += 1
- print("{}\t{}\t{}\t{}\t{}\t{}".format(sample, int(i/4),",".join(sorted(meta['flowcell'])),",".join(sorted(meta['lane'])),",".join(sorted(meta['flowcell_lane'])), md5))
+ print(
+ "{}\t{}\t{}\t{}\t{}\t{}".format(
+ sample,
+ int(i / 4),
+ ",".join(sorted(meta["flowcell"])),
+ ",".join(sorted(meta["lane"])),
+ ",".join(sorted(meta["flowcell_lane"])),
+ md5,
+ )
+ )
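
read_pairsfile() above boils down to: read the pairs TSV, keep rows that have a tumor id, and build a tumor-to-normal map from rows that also have a normal id. A condensed pandas sketch of that core (the tn_mode handling and the sample-name overlap are omitted):

import pandas as pd

def load_pairs(pairs_tsv):
    df = pd.read_csv(pairs_tsv, sep="\t")
    df.columns = df.columns.str.lower()       # case-insensitive column names
    df = df[df["tumor"].notna()]              # drop rows without a tumor id
    paired = {}
    if "normal" in df.columns:
        with_normal = df[df["normal"].notna()]
        paired = dict(zip(with_normal["tumor"], with_normal["normal"]))
    return {"tumors": set(df["tumor"]), "pairs": paired}
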
diff --git a/workflow/scripts/reformat_bed.py b/workflow/scripts/reformat_bed.py
index 8b033c2..69a4320 100644
--- a/workflow/scripts/reformat_bed.py
+++ b/workflow/scripts/reformat_bed.py
@@ -10,58 +10,76 @@
import sys
parser = argparse.ArgumentParser()
-parser.add_argument("-i","--input_bed", help="Input BED file to be reformatted")
+parser.add_argument("-i", "--input_bed", help="Input BED file to be reformatted")
# parser.add_argument("-g","--genome", help="Only used if equals 'hg19'; otherwise ignored", default="hg38")
-parser.add_argument("-o","--output_bed", help="Reformatted output BED file", default="exome_targets.bed")
-parser.add_argument("-f","--output_fields", help="Number of fields in output BED ('3' or '6')", default="6")
+parser.add_argument(
+ "-o",
+ "--output_bed",
+ help="Reformatted output BED file",
+ default="exome_targets.bed",
+)
+parser.add_argument(
+ "-f",
+ "--output_fields",
+ help="Number of fields in output BED ('3' or '6')",
+ default="6",
+)
args = parser.parse_args()
-infile=args.input_bed
-outfile=args.output_bed
-nfield=str(args.output_fields)
+infile = args.input_bed
+outfile = args.output_bed
+nfield = str(args.output_fields)
-last_start="-1" ## Position of the last start site
+last_start = "-1" ## Position of the last start site
### Open input bed file for reading
-with open(infile, 'r') as inputFile:
+with open(infile, "r") as inputFile:
### Open output exome targets bed file for writing
- with open(outfile, 'w') as exome_bed:
+ with open(outfile, "w") as exome_bed:
### Step through each line of input
for line in inputFile:
### Skip comments
- if not line.startswith("#") and not line.startswith("track") and not line.startswith("browser"):
- curr_cols=line.strip().split("\t")
- if (len(curr_cols) < 3):
- sys.exit("Targets BED file must contain at least three columns: chr, start, end")
-
- if (len(curr_cols) < 4):
+ if (
+ not line.startswith("#")
+ and not line.startswith("track")
+ and not line.startswith("browser")
+ ):
+ curr_cols = line.strip().split("\t")
+ if len(curr_cols) < 3:
+ sys.exit(
+ "Targets BED file must contain at least three columns: chr, start, end"
+ )
+
+ if len(curr_cols) < 4:
curr_cols.append(".")
-
- min_output=curr_cols[0] + "\t" + curr_cols[1] + "\t" + curr_cols[2]
-
- extra_fields=""
- if (not(nfield == "3")):
- extra_fields="\t" + curr_cols[3] + "\t0\t.\n"
-
- bed_output=min_output + extra_fields
-
+
+ min_output = curr_cols[0] + "\t" + curr_cols[1] + "\t" + curr_cols[2]
+
+ extra_fields = ""
+ if not (nfield == "3"):
+ extra_fields = "\t" + curr_cols[3] + "\t0\t.\n"
+
+ bed_output = min_output + extra_fields
+
### Add 'chr' prefix if genome is hg19
# if (args.genome=="hg19"):
# freec_bed_output="chr" + curr_cols[0].lstrip("chr") + "\t" + curr_cols[1] + "\t" + curr_cols[2] + "\n"
# else:
# freec_bed_output=curr_cols[0] + "\t" + curr_cols[1] + "\t" + curr_cols[2] + "\n"
-
+
### If current start location is same as previous, output empty string
- if (curr_cols[1] == last_start):
- print(curr_cols[1] + " IS equal to " + last_start + " so skipping it...")
- bed_output=""
+ if curr_cols[1] == last_start:
+ print(
+ curr_cols[1]
+ + " IS equal to "
+ + last_start
+ + " so skipping it..."
+ )
+ bed_output = ""
### Write to both files
exome_bed.write(bed_output)
# freec_bed.write(freec_bed_output)
-
- ### Update loop variables
- last_start=curr_cols[1]
-
-
+ ### Update loop variables
+ last_start = curr_cols[1]
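
reformat_bed.py pads each target to six BED columns and drops any line whose start coordinate repeats the previous line's start. A compact sketch of that rule (the optional 3-column output mode is omitted):

def reformat_bed_lines(lines):
    last_start = None
    for line in lines:
        # skip headers and track/browser lines, as the script does
        if line.startswith(("#", "track", "browser")):
            continue
        cols = line.rstrip("\n").split("\t")
        if len(cols) < 3:
            raise SystemExit("Targets BED file must contain at least three columns: chr, start, end")
        name = cols[3] if len(cols) > 3 else "."
        if cols[1] != last_start:
            yield "\t".join([cols[0], cols[1], cols[2], name, "0", "."]) + "\n"
        last_start = cols[1]
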
diff --git a/workflow/scripts/run_sequenza.R b/workflow/scripts/run_sequenza.R
index e193474..cb836c3 100644
--- a/workflow/scripts/run_sequenza.R
+++ b/workflow/scripts/run_sequenza.R
@@ -1,44 +1,43 @@
-
-args = commandArgs(trailingOnly=TRUE)
+args <- commandArgs(trailingOnly = TRUE)
library(sequenza)
-if (length(args)==0) {
- stop("Must provide a seqz file")
+if (length(args) == 0) {
+ stop("Must provide a seqz file")
} else {
- seqz_file = args[1]
- if (! file.exists(seqz_file)) {
+ seqz_file <- args[1]
+ if (!file.exists(seqz_file)) {
stop(paste0("Can't find this SEQZ output file: ", seqz_file))
- }
+ }
}
if (length(args) > 1) {
- out_dir = args[2]
+ out_dir <- args[2]
} else {
- out_dir = dirname(seqz_file)
+ out_dir <- dirname(seqz_file)
}
if (length(args) > 2) {
- sampleid = args[3]
+ sampleid <- args[3]
} else {
- sampleid = gsub(".seqz.gz","",basename(seqz_file))
+ sampleid <- gsub(".seqz.gz", "", basename(seqz_file))
}
if (length(args) > 3) {
- n_cores = as.numeric(args[4])
+ n_cores <- as.numeric(args[4])
} else {
- n_cores = as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK"))
+ n_cores <- as.numeric(Sys.getenv("SLURM_CPUS_PER_TASK"))
}
if (is.na(n_cores)) {
- n_cores = 1
+ n_cores <- 1
}
-print(paste0("Using ",n_cores," cores..."))
+print(paste0("Using ", n_cores, " cores..."))
date()
print("Extracting seqz data...")
-seqzdata <- sequenza.extract(seqz_file, min.reads = 30, min.reads.normal= 10, parallel=n_cores)
+seqzdata <- sequenza.extract(seqz_file, min.reads = 30, min.reads.normal = 10, parallel = n_cores)
date()
print("Fitting model...")
@@ -48,16 +47,16 @@ CP.example <- sequenza.fit(seqzdata, mc.cores = n_cores)
num_mutations <- unlist(lapply(seqzdata$mutations, nrow))
chrom_list <- names(num_mutations)[num_mutations > 3]
## But it might actually be segments, idk?
-#num_segments <- unlist(lapply(seqzdata$segments, nrow))
-#chrom_list <- names(num_mutations)[num_segments > 1]
+# num_segments <- unlist(lapply(seqzdata$segments, nrow))
+# chrom_list <- names(num_mutations)[num_segments > 1]
not_included <- setdiff(names(num_mutations), chrom_list)
print("Printing results...")
if (length(not_included) > 0) {
- print("Excluding these chromosomes because of too few mutations...")
- print(not_included)
+ print("Excluding these chromosomes because of too few mutations...")
+ print(not_included)
}
-sequenza.results(sequenza.extract = seqzdata,cp.table = CP.example, sample.id = sampleid, out.dir=out_dir, chromosome.list=chrom_list)
+sequenza.results(sequenza.extract = seqzdata, cp.table = CP.example, sample.id = sampleid, out.dir = out_dir, chromosome.list = chrom_list)
date()
print("Done")
diff --git a/workflow/scripts/vcf2maf_wrapper.bash b/workflow/scripts/vcf2maf_wrapper.bash
index 81cd4a0..7a25d81 100644
--- a/workflow/scripts/vcf2maf_wrapper.bash
+++ b/workflow/scripts/vcf2maf_wrapper.bash
@@ -22,29 +22,29 @@ parser.add_argument('--vepresourcebundlepath', required=False, default='/data/CC
parser.add_argument('--genomefasta', required=False, help='Fasta location')
EOF
-# Set Genome aliases, vaild choices = hg19/hg38/mm10
+# Set Genome aliases, valid choices = hg19/hg38/mm10
ncbi_build=""
species=""
-if [ $GENOMEBUILD == "hg38" ]; then
- ncbi_build="GRCh38"
+if [ $GENOMEBUILD == "hg38" ]; then
+ ncbi_build="GRCh38"
species="homo_sapiens"
-elif [ $GENOMEBUILD == "hg19" ]; then
+elif [ $GENOMEBUILD == "hg19" ]; then
ncbi_build="GRCh37"
species="homo_sapiens"
elif [ $GENOMEBUILD == "mm10" ] || [ $GENOMEBUILD == "GRCm38" ]; then
ncbi_build="GRCm38"
species="mus_musculus"
else
- echo "Unsupport value to option: --genomebuild"
+ echo "Unsupported value to option: --genomebuild"
echo "Please select from: hg19/hg38/mm10"
exit 1
fi
-# Set paths to VEP resources based on paths
+# Set paths to VEP resources based on paths
dotvep="${VEPRESOURCEBUNDLEPATH}"
# to species and ncbi build name
-#if [ $GENOME == "hg38" ]; then
+#if [ $GENOME == "hg38" ]; then
#fa="${VEPRESOURCEBUNDLEPATH}/${ncbi_build}.fa"
#elif [ $GENOME == "mm10" ]; then
# fa="${VEPRESOURCEBUNDLEPATH}/mouse.fa"
@@ -65,7 +65,7 @@ INPUT_DIR=$(dirname $VCF)
# Add chr prefix if requested and missing
chr_text=""
-if [ $GENOMEBUILD == "hg38" ] || [ $GENOMEBUILD == "mm10" ] || [ $GENOMEBUILD == "GRCm38" ]; then
+if [ $GENOMEBUILD == "hg38" ] || [ $GENOMEBUILD == "mm10" ] || [ $GENOMEBUILD == "GRCm38" ]; then
chr_text="chr"
fi
echo "Adding text '$chr_text' to chrom names..."
@@ -84,15 +84,15 @@ VCF_NID=""
NORM_VCF_ID_ARG=""
NSAMPLES=${#VCF_SAMPLE_IDS[@]}
if [ $NSAMPLES -gt 1 ]; then
- # Assign tumor, normal IDs
- # Look through column names and
+ # Assign tumor, normal IDs
+ # Look through column names and
# see if they match provided IDs
for (( i = 0; i < $NSAMPLES; i++ )); do
echo "${VCF_SAMPLE_IDS[$i]}"
if [ "${VCF_SAMPLE_IDS[$i]}" == "$TID" ]; then
TID_IDX=$i
fi
-
+
if [ "${VCF_SAMPLE_IDS[$i]}" == "$NID" ]; then
NID_IDX=$i
fi
@@ -126,7 +126,7 @@ if [ ! "$MAF_NID" == "" ]; then
NORM_MAF_ID_ARG="--normal-id $MAF_NID"
fi
-# Set option for multiple threads
+# Set option for multiple threads
if [ $THREADS==0 ]; then
THREADS=1
if [ ! -z $SLURM_CPUS_PER_TASK ]; then
@@ -150,5 +150,3 @@ vcf2maf.pl \
--ref-fasta "$GENOMEFASTA" \
--vep-overwrite
#--filter-vcf "$filtervcf" \
-
-
diff --git a/xavier b/xavier
index 29107be..9cb603f 100755
--- a/xavier
+++ b/xavier
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
-"""XAVIER: eXome Analysis and Variant explorER:
+"""XAVIER: eXome Analysis and Variant explorER:
A highly reproducible and portable Whole Exome-seq data analysis pipeline
ABOUT: This is the main entry for the XAVIER pipeline.
@@ -15,16 +15,16 @@ DISCLAIMER:
National Cancer Institute (NCI)
This software/database is a "United States Government Work" under
-the terms of the United States Copyright Act. It was written as
+the terms of the United States Copyright Act. It was written as
part of the author's official duties as a United States Government
employee and thus cannot be copyrighted. This software is freely
available to the public for use.
Although all reasonable efforts have been taken to ensure the
accuracy and reliability of the software and data, CCBR do not and
-cannot warrant the performance or results that may be obtained by
+cannot warrant the performance or results that may be obtained by
using this software or data. CCBR and NCI disclaim all warranties,
-express or implied, including warranties of performance,
+express or implied, including warranties of performance,
merchantability or fitness for any particular purpose.
Please cite the author and the "NIH Biowulf Cluster" in any work or
@@ -48,16 +48,12 @@ from src import version
from src.run import init, setup, bind, dryrun, runner
from src.shells import bash
from src.options import genome_options
-from src.utils import (err,
- exists,
- fatal,
- permissions,
- check_cache,
- require)
+from src.utils import err, exists, fatal, permissions, check_cache, require
__version__ = version
-__email__ = 'kuhnsa@nih.gov'
-__home__ = os.path.dirname(os.path.abspath(__file__))
+__email__ = "kuhnsa@nih.gov"
+__home__ = os.path.dirname(os.path.abspath(__file__))
+
def run(sub_args):
"""Initialize, setup, and run the XAVIER pipeline.
@@ -69,22 +65,20 @@ def run(sub_args):
"""
# Step 0. Check for required dependencies
# The pipelines has only two requirements:
- # snakemake and singularity
- require(['snakemake', 'singularity'], ['snakemake', 'singularity'])
+ # snakemake and singularity
+ require(["snakemake", "singularity"], ["snakemake", "singularity"])
# Optional Step. Initialize working directory,
- # copy over required resources to run
+ # copy over required resources to run
# the pipeline
git_repo = __home__
if sub_args.runmode == "init":
print("--Initializing")
input_files = init(
- repo_path = git_repo,
- output_path = sub_args.output,
- links = sub_args.input
+ repo_path=git_repo, output_path=sub_args.output, links=sub_args.input
)
- # Required Step. Setup pipeline for execution,
+ # Required Step. Setup pipeline for execution,
# dynamically create config.json config
# file from user inputs and base config
# determine "nidap folder"
@@ -93,81 +87,92 @@ def run(sub_args):
create_nidap_folder_YN = "yes"
# templates
- config = setup(sub_args,
- repo_path = git_repo,
- output_path = sub_args.output,
- create_nidap_folder_YN = create_nidap_folder_YN,
- links = sub_args.input
+ config = setup(
+ sub_args,
+ repo_path=git_repo,
+ output_path=sub_args.output,
+ create_nidap_folder_YN=create_nidap_folder_YN,
+ links=sub_args.input,
)
# Required Step. Resolve docker/singularity bind
# paths from the config file.
- bindpaths = bind(
- sub_args,
- config = config
- )
+ bindpaths = bind(sub_args, config=config)
# Optional Step: Dry-run pipeline
# if sub_args.dry_run:
if sub_args.runmode == "dryrun" or sub_args.runmode == "run":
print("--Dry-Run")
# Dryrun pipeline
- dryrun_output = dryrun(outdir = sub_args.output) # python3 returns byte-string representation
- print("\nDry-running XAVIER pipeline:\n{}".format(dryrun_output.decode("utf-8")))
+ dryrun_output = dryrun(
+ outdir=sub_args.output
+ ) # python3 returns byte-string representation
+ print(
+ "\nDry-running XAVIER pipeline:\n{}".format(dryrun_output.decode("utf-8"))
+ )
# Optional Step. Orchestrate pipeline execution,
# run pipeline in locally on a compute node
# for debugging purposes or submit the master
- # job to the job scheduler, SLURM, and create
+ # job to the job scheduler, SLURM, and create
# logging file
if sub_args.runmode == "run":
print("--Run full pipeline")
- if not exists(os.path.join(sub_args.output, 'logfiles')):
+ if not exists(os.path.join(sub_args.output, "logfiles")):
# Create directory for logfiles
- os.makedirs(os.path.join(sub_args.output, 'logfiles'))
- if sub_args.mode == 'local':
- log = os.path.join(sub_args.output, 'logfiles', 'snakemake.log')
- else:
- log = os.path.join(sub_args.output, 'logfiles', 'master.log')
- logfh = open(log, 'w')
- wait = ''
- if sub_args.wait: wait = '--wait'
- mjob = runner(mode = sub_args.mode,
- outdir = sub_args.output,
+ os.makedirs(os.path.join(sub_args.output, "logfiles"))
+ if sub_args.mode == "local":
+ log = os.path.join(sub_args.output, "logfiles", "snakemake.log")
+ else:
+ log = os.path.join(sub_args.output, "logfiles", "master.log")
+ logfh = open(log, "w")
+ wait = ""
+ if sub_args.wait:
+ wait = "--wait"
+ mjob = runner(
+ mode=sub_args.mode,
+ outdir=sub_args.output,
# additional_bind_paths = all_bind_paths,
- alt_cache = sub_args.singularity_cache,
- threads = int(sub_args.threads),
- jobname = sub_args.job_name,
- submission_script='runner',
- logger = logfh,
- additional_bind_paths = ",".join(bindpaths),
- tmp_dir = sub_args.tmp_dir,
- wait = wait
+ alt_cache=sub_args.singularity_cache,
+ threads=int(sub_args.threads),
+ jobname=sub_args.job_name,
+ submission_script="runner",
+ logger=logfh,
+ additional_bind_paths=",".join(bindpaths),
+ tmp_dir=sub_args.tmp_dir,
+ wait=wait,
)
-
+
# Step 5. Wait for subprocess to complete,
# this is blocking and not asynchronous
if not sub_args.silent:
- print("\nRunning XAVIER pipeline in '{}' mode...".format(sub_args.mode))
+ print("\nRunning XAVIER pipeline in '{}' mode...".format(sub_args.mode))
mjob.wait()
logfh.close()
# Step 6. Relay information about submission
- # of the master job or the exit code of the
+ # of the master job or the exit code of the
# pipeline that ran in local mode
- if sub_args.mode == 'local':
+ if sub_args.mode == "local":
if int(mjob.returncode) == 0:
- print('XAVIER has successfully completed')
+ print("XAVIER has successfully completed")
else:
- fatal('XAVIER failed. Please see {} for more information.'.format(
- os.path.join(sub_args.output, 'logfiles', 'snakemake.log')))
- elif sub_args.mode == 'slurm':
- jobid = open(os.path.join(sub_args.output, 'logfiles', 'mjobid.log')).read().strip()
+ fatal(
+ "XAVIER failed. Please see {} for more information.".format(
+ os.path.join(sub_args.output, "logfiles", "snakemake.log")
+ )
+ )
+ elif sub_args.mode == "slurm":
+ jobid = (
+ open(os.path.join(sub_args.output, "logfiles", "mjobid.log"))
+ .read()
+ .strip()
+ )
if not sub_args.silent:
if int(mjob.returncode) == 0:
- print('Successfully submitted master job: ', end="")
+ print("Successfully submitted master job: ", end="")
else:
- fatal('Error occurred when submitting the master job.')
+ fatal("Error occurred when submitting the master job.")
print(jobid)
@@ -184,12 +189,11 @@ def unlock(sub_args):
outdir = sub_args.output
try:
- unlock_output = subprocess.check_output([
- 'snakemake', '--unlock',
- '--cores', '1',
- '--configfile=config.json'
- ], cwd = outdir,
- stderr=subprocess.STDOUT)
+ unlock_output = subprocess.check_output(
+ ["snakemake", "--unlock", "--cores", "1", "--configfile=config.json"],
+ cwd=outdir,
+ stderr=subprocess.STDOUT,
+ )
except subprocess.CalledProcessError as e:
# Unlocking process returned a non-zero exit code
sys.exit("{}\n{}".format(e, e.output))
@@ -205,12 +209,11 @@ def cache(sub_args):
Parsed arguments for unlock sub-command
"""
print(sub_args)
- #fatal('NotImplementedError... Comming Soon!')
sif_cache = sub_args.sif_cache
# Get absolute PATH to templates in XAVIER git repo
repo_path = os.path.dirname(os.path.abspath(__file__))
- images = os.path.join(repo_path, 'config','containers', 'images.json')
+ images = os.path.join(repo_path, "config", "containers", "images.json")
# Create image cache
if not exists(sif_cache):
@@ -218,19 +221,24 @@ def cache(sub_args):
os.makedirs(sif_cache)
elif exists(sif_cache) and os.path.isfile(sif_cache):
# Provided Path for pipeline output directory exists as file
- raise OSError("""\n\tFatal: Failed to create provided sif cache directory!
+ raise OSError(
+ """\n\tFatal: Failed to create provided sif cache directory!
User provided --sif-cache PATH already exists on the filesystem as a file.
Please {} cache again with a different --sif-cache PATH.
- """.format(sys.argv[0])
+ """.format(
+ sys.argv[0]
+ )
)
# Check if local SIFs already exist on the filesystem
- with open(images, 'r') as fh:
+ with open(images, "r") as fh:
data = json.load(fh)
pull = []
- for image, uri in data['images'].items():
- sif = os.path.join(sif_cache, '{}.sif'.format(os.path.basename(uri).replace(':', '_')))
+ for image, uri in data["images"].items():
+ sif = os.path.join(
+ sif_cache, "{}.sif".format(os.path.basename(uri).replace(":", "_"))
+ )
if not exists(sif):
# If local sif does not exist on in cache, print warning
# and default to pulling from URI in config/containers/images.json
@@ -239,22 +247,31 @@ def cache(sub_args):
if not pull:
# Nothing to do!
- print('Singularity image cache is already up to update!')
+ print("Singularity image cache is already up to update!")
else:
- # There are image(s) that need to be pulled
+ # There are image(s) that need to be pulled
if not sub_args.dry_run:
# submission_script for XAVIER cache is /path/to/output/resources/cacher
# Quote user provided values to avoid shell injections
masterjob = subprocess.Popen(
- 'sbatch --parsable -J pl:cache --gres=lscratch:200 --time=10:00:00 --mail-type=BEGIN,END,FAIL ' +
- str(os.path.join(repo_path, 'resources', 'cacher')) + ' slurm ' +
- " -s '{}' ".format(sif_cache) +
- " -i '{}' ".format(','.join(pull)) +
- " -t '/lscratch/${SLURM_JOB_ID}/.singularity/' ",
- cwd = sif_cache, shell=True, stderr = subprocess.STDOUT, stdout = subprocess.PIPE)
+ "sbatch --parsable -J pl:cache --gres=lscratch:200 --time=10:00:00 --mail-type=BEGIN,END,FAIL "
+ + str(os.path.join(repo_path, "resources", "cacher"))
+ + " slurm "
+ + " -s '{}' ".format(sif_cache)
+ + " -i '{}' ".format(",".join(pull))
+ + " -t '/lscratch/${SLURM_JOB_ID}/.singularity/' ",
+ cwd=sif_cache,
+ shell=True,
+ stderr=subprocess.STDOUT,
+ stdout=subprocess.PIPE,
+ )
masterjob.communicate()
- print('XAVIER reference cacher submitted master job with exit-code: {}'.format(masterjob.returncode))
+ print(
+ "XAVIER reference cacher submitted master job with exit-code: {}".format(
+ masterjob.returncode
+ )
+ )
def parsed_arguments():
@@ -269,13 +286,17 @@ def parsed_arguments():
"""
# Create a top-level parser
- parser = argparse.ArgumentParser(description = 'XAVIER: eXome Analysis and Variant explorER:')
+ parser = argparse.ArgumentParser(
+ description="XAVIER: eXome Analysis and Variant explorER:"
+ )
- # Adding Verison information
- parser.add_argument('--version', action = 'version', version='%(prog)s {}'.format(__version__))
+ # Adding Version information
+ parser.add_argument(
+ "--version", action="version", version="%(prog)s {}".format(__version__)
+ )
# Create sub-command parser
- subparsers = parser.add_subparsers(help='List of available sub-commands')
+ subparsers = parser.add_subparsers(help="List of available sub-commands")
# Sub-parser for the "run" sub-command
# Grouped sub-parser arguments are currently not supported by argparse.
@@ -283,7 +304,8 @@ def parsed_arguments():
# Here is a work around to create more useful help message for named
# options that are required! Please note: if a required arg is added the
# description below should be updated (i.e. update usage and add new option)
- required_run_options = textwrap.dedent("""\
+ required_run_options = textwrap.dedent(
+ """\
usage: xavier run [--help] \\
[--mode {local, slurm}] \\
[--job-name JOB_NAME] \\
@@ -302,7 +324,7 @@ def parsed_arguments():
--input INPUT [INPUT ...] \\
--output OUTPUT \\
--genome {hg38, mm10, ...} \\
- --targets TARGETS
+ --targets TARGETS
required arguments:
--runmode RUNMODE [init, dryrun, run ...]
@@ -312,7 +334,7 @@ def parsed_arguments():
3) run. Run the pipeline
Example: --runmode init
--input INPUT [INPUT ...]
- Input FastQ or BAM file(s) to process. One or more input
+ Input FastQ or BAM file(s) to process. One or more input
files can be provided. The pipeline does NOT support
single-end WES data. Please provide either a set of
FastQ files or a set of BAM files. The pipeline does
@@ -336,10 +358,12 @@ def parsed_arguments():
obtained from the manufacturer of the target capture
kit that was used.
- """)
+ """
+ )
# Display example usage in epilog
- run_epilog = textwrap.dedent("""\
+ run_epilog = textwrap.dedent(
+ """\
example:
# Step 1.) Grab an interactive node (do not run on head node)
sinteractive --mem=8g --cpus-per-task=4
@@ -353,7 +377,7 @@ def parsed_arguments():
--output /data/$USER/xavier_hg38 \\
--genome hg38 \\
--targets .tests/Agilent_SSv7_allExons_hg38.bed
-
+
# Step 2B.) Dry-run the pipeline
xavier run \\
--runmode dryrun \\
@@ -376,68 +400,80 @@ def parsed_arguments():
version:
{}
- """.format(__version__))
+ """.format(
+ __version__
+ )
+ )
- # Supressing help message of required args to overcome no sub-parser named groups
- subparser_run = subparsers.add_parser('run',
- help = 'Run the XAVIER pipeline with input files.',
- usage = argparse.SUPPRESS,
+ # Suppressing help message of required args to overcome no sub-parser named groups
+ subparser_run = subparsers.add_parser(
+ "run",
+ help="Run the XAVIER pipeline with input files.",
+ usage=argparse.SUPPRESS,
formatter_class=argparse.RawDescriptionHelpFormatter,
- description = required_run_options,
- epilog = run_epilog
+ description=required_run_options,
+ epilog=run_epilog,
)
# Required Arguments
# Input FastQ files
- subparser_run.add_argument('--runmode',
+ subparser_run.add_argument(
+ "--runmode",
# Determines how to run the pipeline: init, run, or dry-run
- required = True,
- choices = ['init','run', 'dryrun'],
- type = str,
- help = argparse.SUPPRESS
+ required=True,
+ choices=["init", "run", "dryrun"],
+ type=str,
+ help=argparse.SUPPRESS,
)
-
+
# Input FastQ files
- subparser_run.add_argument('--input',
+ subparser_run.add_argument(
+ "--input",
# Check if the file exists and if it is readable
- type = lambda file: permissions(parser, file, os.R_OK),
- required = True,
- nargs = '+',
- help = argparse.SUPPRESS
+ type=lambda file: permissions(parser, file, os.R_OK),
+ required=True,
+ nargs="+",
+ help=argparse.SUPPRESS,
)
# Output Directory (analysis working directory)
- subparser_run.add_argument('--output',
- type = lambda option: os.path.abspath(os.path.expanduser(option)),
- required = True,
- help = argparse.SUPPRESS
+ subparser_run.add_argument(
+ "--output",
+ type=lambda option: os.path.abspath(os.path.expanduser(option)),
+ required=True,
+ help=argparse.SUPPRESS,
)
-
+
# Reference Genome (to dynamically select reference files)
- subparser_run.add_argument('--genome',
- required = True,
- #choices = ['hg38', 'mm10'],
- type = lambda option: str(genome_options(subparser_run, option, ['hg38','mm10'])),
- help = argparse.SUPPRESS
+ subparser_run.add_argument(
+ "--genome",
+ required=True,
+ # choices = ['hg38', 'mm10'],
+ type=lambda option: str(
+ genome_options(subparser_run, option, ["hg38", "mm10"])
+ ),
+ help=argparse.SUPPRESS,
)
-
+
# Exome TARGET BED file
- subparser_run.add_argument('--targets',
+ subparser_run.add_argument(
+ "--targets",
# Check if the file exists and if it is readable
- type = lambda file: permissions(parser, file, os.R_OK),
- required = True,
- help = argparse.SUPPRESS
+ type=lambda file: permissions(parser, file, os.R_OK),
+ required=True,
+ help=argparse.SUPPRESS,
)
# Optional Arguments
# Execution Method (run locally on a compute node, submit to SLURM job scheduler, etc.)
- subparser_run.add_argument('--mode',
- type = str,
- required = False,
- default = "slurm",
- choices = ['slurm', 'local'],
- help = 'Execution Method [Default: slurm]. Defines the mode or method of execution. \
- Vaild mode options include: local or slurm. \
+ subparser_run.add_argument(
+ "--mode",
+ type=str,
+ required=False,
+ default="slurm",
+ choices=["slurm", "local"],
+ help="Execution Method [Default: slurm]. Defines the mode or method of execution. \
+ Valid mode options include: local or slurm. \
local: uses local method of execution. local executions will run serially on \
compute instance. This is useful for testing, debugging, or when a users does \
not have access to a high performance computing environment. If this option is \
@@ -445,124 +481,137 @@ def parsed_arguments():
slurm: uses slurm and singularity backend. The slurm execution method will submit \
jobs to a cluster. It is recommended running xavier in this mode as execution \
will be significantly faster in a distributed environment. \
- Example: --mode slurm'
+ Example: --mode slurm",
)
# Name of master job
- subparser_run.add_argument('--job-name',
- type = str,
- required = False,
- default = 'pl:xavier',
- help = 'Set the name of the pipeline\'s master job. \
+ subparser_run.add_argument(
+ "--job-name",
+ type=str,
+ required=False,
+ default="pl:xavier",
+ help="Set the name of the pipeline's master job. \
When submitting the pipeline to a job scheduler, like SLURM, \
- this option always you to set the name of the pipeline\'s master \
- job. By default, the name of the pipeline\'s master job \
- is set to "pl:xavier". \
- Example: --job-name xavier_hg38_main'
+ this option allows you to set the name of the pipeline's master \
+ job. By default, the name of the pipeline's master job \
+ is set to \"pl:xavier\". \
+ Example: --job-name xavier_hg38_main",
)
# Variant Callers
- subparser_run.add_argument('--callers',
- type = str,
- required = False,
- nargs= '+',
- metavar = 'CALLERS',
- default = ['mutect2','mutect','strelka','vardict','varscan'],
- choices = ['mutect2','mutect','strelka','vardict','varscan'],
- help = 'Variant Callers. List of variant callers to call mutations. Please select from one or \
+ subparser_run.add_argument(
+ "--callers",
+ type=str,
+ required=False,
+ nargs="+",
+ metavar="CALLERS",
+ default=["mutect2", "mutect", "strelka", "vardict", "varscan"],
+ choices=["mutect2", "mutect", "strelka", "vardict", "varscan"],
+ help="Variant Callers. List of variant callers to call mutations. Please select from one or \
more of the following options: [mutect2, mutect, strelka, vardict, varscan]. Defaults to using all \
- variant callers. Example: --callers mutect2 strelka varscan'
+ variant callers. Example: --callers mutect2 strelka varscan",
)
# Tumor normal pairs file
- subparser_run.add_argument('--pairs',
+ subparser_run.add_argument(
+ "--pairs",
# Check if the file exists and if it is readable
- type = lambda file: permissions(parser, file, os.R_OK),
- required = False,
- help = 'Tumor normal pairs file. This tab delimited file contains two columns with the names \
+ type=lambda file: permissions(parser, file, os.R_OK),
+ required=False,
+ help='Tumor normal pairs file. This tab delimited file contains two columns with the names \
of tumor and normal pairs, one per line. The header of the file needs to be "Tumor" for the \
- tumor column and "Normal" for the normal column.'
+ tumor column and "Normal" for the normal column.',
)
- # Correction for FFPE samples
- subparser_run.add_argument('--ffpe',
- action = 'store_true',
- required = False,
- default = False,
- help = 'FFPE correction. Runs an additional filtering step for Formalin-Fixed Paraffin-Embedded \
- (FFPE) samples. Do NOT use this option with non-FFPE samples.'
+ # Correction for FFPE samples
+ subparser_run.add_argument(
+ "--ffpe",
+ action="store_true",
+ required=False,
+ default=False,
+ help="FFPE correction. Runs an additional filtering step for Formalin-Fixed Paraffin-Embedded \
+ (FFPE) samples. Do NOT use this option with non-FFPE samples.",
)
# Call CNVs
- subparser_run.add_argument('--cnv',
- action = 'store_true',
- required = False,
- default = False,
- help = 'Call copy number variations or CNVs. CNVs will only be called from tumor-normal pairs. \
- If this option is provided without providing a --pairs file, CNVs will NOT be called.'
- )
-
- # wait until master job finishes ... required for HPC API execution
- subparser_run.add_argument('--wait',
- action = 'store_true',
- required = False,
- default = False,
- help = 'Wait until master job completes. This is required if \
+ subparser_run.add_argument(
+ "--cnv",
+ action="store_true",
+ required=False,
+ default=False,
+ help="Call copy number variations or CNVs. CNVs will only be called from tumor-normal pairs. \
+ If this option is used without providing a --pairs file, CNVs will NOT be called.",
+ )
+
+ # wait until master job finishes ... required for HPC API execution
+ subparser_run.add_argument(
+ "--wait",
+ action="store_true",
+ required=False,
+ default=False,
+ help="Wait until master job completes. This is required if \
the job is submitted using HPC API. If not provided \
the API may interpret submission of master job as \
- completion of the pipeline!'
+ completion of the pipeline!",
)
- # create-nidap-folder create a folder called "NIDAP" to be moved back to NIDAP
- subparser_run.add_argument('--create-nidap-folder',
- action = 'store_true',
- required = False,
- default = False,
- help = 'Create folder called "NIDAP" with file to-be-moved back to NIDAP \
- This makes it convinient to move only this folder (called NIDAP) and its content back \
- to NIDAP, rather than the entire pipeline output folder'
+ # create-nidap-folder create a folder called "NIDAP" to be moved back to NIDAP
+ subparser_run.add_argument(
+ "--create-nidap-folder",
+ action="store_true",
+ required=False,
+ default=False,
+ help='Create a folder called "NIDAP" with files to be moved back to NIDAP. \
+ This makes it convenient to move only this folder (called NIDAP) and its contents back \
+ to NIDAP, rather than the entire pipeline output folder',
)
# Silent output mode
- subparser_run.add_argument('--silent',
- action = 'store_true',
- required = False,
- default = False,
- help = 'Silence standard output. Reduces the amount of information directed \
+ subparser_run.add_argument(
+ "--silent",
+ action="store_true",
+ required=False,
+ default=False,
+ help="Silence standard output. Reduces the amount of information directed \
to standard output when submitting master job to the job scheduler. Only the \
- job id of the master job is returned.'
+ job id of the master job is returned.",
)
-
+
# Singularity cache directory (default uses output directory)
- subparser_run.add_argument('--singularity-cache',
- type = lambda option: check_cache(parser, os.path.abspath(os.path.expanduser(option))),
- required = False,
- help = 'Overrides the $SINGULARITY_CACHEDIR environment variable. Singularity will cache \
+ subparser_run.add_argument(
+ "--singularity-cache",
+ type=lambda option: check_cache(
+ parser, os.path.abspath(os.path.expanduser(option))
+ ),
+ required=False,
+ help="Overrides the $SINGULARITY_CACHEDIR environment variable. Singularity will cache \
image layers pulled from remote registries. By default, the cache is set to \
- \'/path/to/output/directory/.singularity/\'. \
- Please note that this cache cannot be shared across users.'
- )
-
- # Local SIF cache directory (default pull from Dockerhub)
- subparser_run.add_argument('--sif-cache',
- type = lambda option: os.path.abspath(os.path.expanduser(option)),
- required = False,
- help = 'Path where a local cache of SIFs are stored. \
+ '/path/to/output/directory/.singularity/'. \
+ Please note that this cache cannot be shared across users.",
+ )
+
+ # Local SIF cache directory (default pull from Dockerhub)
+ subparser_run.add_argument(
+ "--sif-cache",
+ type=lambda option: os.path.abspath(os.path.expanduser(option)),
+ required=False,
+ help="Path where a local cache of SIFs are stored. \
This cache can be shared across users if permissions are \
set correctly. If a SIF does not exist in the SIF cache, \
the image will be pulled from Dockerhub. The xavier cache \
subcommand can be used to create a local SIF cache. Please see \
- xavier cache for more information.'
+ xavier cache for more information.",
)
- # Base directory to write temporary files
- subparser_run.add_argument('--tmp-dir',
- type = str,
- required = False,
- default = '/lscratch/$SLURM_JOBID/',
- help = 'Path on the filesystem for writing intermediate, temporary output \
- files. By default, this variable is set to \'/lscratch/$SLURM_JOBID\' \
- for backwards compatibility with the NIH\'s Biowulf cluster; however, \
+ # Base directory to write temporary files
+ subparser_run.add_argument(
+ "--tmp-dir",
+ type=str,
+ required=False,
+ default="/lscratch/$SLURM_JOBID/",
+ help="Path on the filesystem for writing intermediate, temporary output \
+ files. By default, this variable is set to '/lscratch/$SLURM_JOBID' \
+ for backwards compatibility with the NIH's Biowulf cluster; however, \
if you are running the pipeline on another cluster, this option will \
need to be specified. Ideally, this path should point to a dedicated \
location on the filesystem for writing tmp files. On many systems, this \
@@ -570,18 +619,19 @@ def parsed_arguments():
into this string that should NOT be expanded, please quote this options \
value in single quotes. As an example, on the NCI/NIH FRCE cluster the \
value of this option would be set to \
- --tmp-dir \'/scratch/cluster_scratch/$USER/\', \
- default: \'/lscratch/$SLURM_JOBID/\''
+ --tmp-dir '/scratch/cluster_scratch/$USER/', \
+ default: '/lscratch/$SLURM_JOBID/'",
)
# Number of threads for the xavier pipeline's main proceess
- subparser_run.add_argument('--threads',
- type = int,
- required = False,
- default = 2,
- help = 'Max number of threads for local processes. It is recommended \
- setting this vaule to the maximum number of CPUs available on the host \
- machine, default: 2.'
+ subparser_run.add_argument(
+ "--threads",
+ type=int,
+ required=False,
+ default=2,
+ help="Max number of threads for local processes. It is recommended \
+ setting this value to the maximum number of CPUs available on the host \
+ machine, default: 2.",
)
# Sub-parser for the "unlock" sub-command
@@ -590,7 +640,8 @@ def parsed_arguments():
# Here is a work around to create more useful help message for named
# options that are required! Please note: if a required arg is added the
# description below should be updated (i.e. update usage and add new option)
- required_unlock_options = textwrap.dedent("""\
+ required_unlock_options = textwrap.dedent(
+ """\
usage: xavier unlock [-h] --output OUTPUT
If the pipeline fails ungracefully, it maybe required to unlock the working
@@ -602,38 +653,42 @@ def parsed_arguments():
required arguments:
--output OUTPUT
Path to a previous run's output directory to
- unlock. This will remove a lock on the working
- directory. Please verify that the pipeline is
+ unlock. This will remove a lock on the working
+ directory. Please verify that the pipeline is
not running before running this command.
Example: --output /data/$USER/xavier_hg38
- """)
+ """
+ )
# Display example usage in epilog
- unlock_epilog = textwrap.dedent("""\
+ unlock_epilog = textwrap.dedent(
+ """\
example:
# Unlock xavier output directory
xavier unlock --output /scratch/$USER/xavier_hg38
version:
{}
- """.format(__version__))
+ """.format(
+ __version__
+ )
+ )
- # Supressing help message of required args to overcome no sub-parser named groups
- subparser_unlock = subparsers.add_parser('unlock',
- help = 'Unlocks a previous runs output directory.',
- usage = argparse.SUPPRESS,
+ # Suppressing help message of required args to overcome no sub-parser named groups
+ subparser_unlock = subparsers.add_parser(
+ "unlock",
+ help="Unlocks a previous runs output directory.",
+ usage=argparse.SUPPRESS,
formatter_class=argparse.RawDescriptionHelpFormatter,
- description = required_unlock_options,
- epilog = unlock_epilog
+ description=required_unlock_options,
+ epilog=unlock_epilog,
)
# Required Arguments
# Output Directory (analysis working directory)
- subparser_unlock.add_argument('--output',
- type = str,
- required = True,
- help = argparse.SUPPRESS
+ subparser_unlock.add_argument(
+ "--output", type=str, required=True, help=argparse.SUPPRESS
)
# Sub-parser for the "cache" sub-command
@@ -642,7 +697,8 @@ def parsed_arguments():
# Here is a work around to create more useful help message for named
# options that are required! Please note: if a required arg is added the
# description below should be updated (i.e. update usage and add new option)
- required_cache_options = textwrap.dedent("""\
+ required_cache_options = textwrap.dedent(
+ """\
usage: xavier cache [-h] [-n] --sif-cache SIF_CACHE
Creates a local cache resources hosted on DockerHub or AWS S3.
@@ -667,48 +723,56 @@ def parsed_arguments():
Example: --sif-cache /scratch/$USER/cache
- """)
+ """
+ )
# Display example usage in epilog
- cache_epilog = textwrap.dedent("""\
+ cache_epilog = textwrap.dedent(
+ """\
example:
# Cache xavier resources
xavier cache --sif-cache /scratch/$USER/cache
version:
{}
- """.format(__version__))
+ """.format(
+ __version__
+ )
+ )
- # Supressing help message of required args to overcome no sub-parser named groups
- subparser_cache = subparsers.add_parser('cache',
- help = 'Cache remote resources locally.',
- usage = argparse.SUPPRESS,
+ # Suppressing help message of required args to overcome no sub-parser named groups
+ subparser_cache = subparsers.add_parser(
+ "cache",
+ help="Cache remote resources locally.",
+ usage=argparse.SUPPRESS,
formatter_class=argparse.RawDescriptionHelpFormatter,
- description = required_cache_options,
- epilog = cache_epilog
+ description=required_cache_options,
+ epilog=cache_epilog,
)
# Required Arguments
# Output Directory (analysis working directory)
- subparser_cache.add_argument('--sif-cache',
- type = lambda option: os.path.abspath(os.path.expanduser(option)),
- required = True,
- help = argparse.SUPPRESS
+ subparser_cache.add_argument(
+ "--sif-cache",
+ type=lambda option: os.path.abspath(os.path.expanduser(option)),
+ required=True,
+ help=argparse.SUPPRESS,
)
# Optional Arguments
# Dry-run xavier cache (do not pull any remote resources)
- subparser_cache.add_argument('--dry-run',
- action = 'store_true',
- required = False,
- default = False,
- help = 'Only display what remote resources would be pulled.'
+ subparser_cache.add_argument(
+ "--dry-run",
+ action="store_true",
+ required=False,
+ default=False,
+ help="Only display what remote resources would be pulled.",
)
# Define handlers for each sub-parser
- subparser_run.set_defaults(func = run)
- subparser_unlock.set_defaults(func = unlock)
- subparser_cache.set_defaults(func = cache)
+ subparser_run.set_defaults(func=run)
+ subparser_unlock.set_defaults(func=unlock)
+ subparser_cache.set_defaults(func=cache)
# Parse command-line args
args = parser.parse_args()
@@ -716,16 +780,15 @@ def parsed_arguments():
def main():
-
# Collect args for sub-command
args = parsed_arguments()
# Display version information
- err('xavier ({})'.format(__version__))
+ err("xavier ({})".format(__version__))
# Mediator method to call sub-command's set handler function
args.func(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
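
The --pairs option described earlier expects a tab-delimited file with two columns, one tumor/normal pair per line, with the headers "Tumor" and "Normal". A minimal sketch of that layout (the sample names here are hypothetical placeholders):

    Tumor	Normal
    Sample_T1	Sample_N1
    Sample_T2	Sample_N2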