diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index c349a9e..38dfaca 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -23,35 +23,31 @@ jobs: run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tn_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tn_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun - name: Tumor-only FastQ Dry Run run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - 
/opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tonly_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tonly_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode dryrun @@ -59,35 +55,31 @@ jobs: run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tn_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - 
/opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tn_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun - name: Tumor-only BAM Dry Run run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tonly_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tonly_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode dryrun diff --git a/.tests/README.md b/.tests/README.md deleted file mode 100644 index be56f9a..0000000 --- a/.tests/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# About - -These input files are used for continuous integration purposes, specifically to dry run the pipeline whenever commits have been made to the main, master, or unified branches. - -**Please Note:** Each of the provided FastQ files and BAM files are empty and are not suitable input to the CCBR GATK4 pipeline! 
diff --git a/.tests/Sample10_ARK1_S37.R1.fastq.gz b/.tests/Sample10_ARK1_S37.R1.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample10_ARK1_S37.R2.fastq.gz b/.tests/Sample10_ARK1_S37.R2.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample10_ARK1_S37.recal.bam b/.tests/Sample10_ARK1_S37.recal.bam deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample11_ACI_158_S38.R1.fastq.gz b/.tests/Sample11_ACI_158_S38.R1.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample11_ACI_158_S38.R2.fastq.gz b/.tests/Sample11_ACI_158_S38.R2.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample11_ACI_158_S38.recal.bam b/.tests/Sample11_ACI_158_S38.recal.bam deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample4_CRL1622_S31.R1.fastq.gz b/.tests/Sample4_CRL1622_S31.R1.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample4_CRL1622_S31.R2.fastq.gz b/.tests/Sample4_CRL1622_S31.R2.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample4_CRL1622_S31.recal.bam b/.tests/Sample4_CRL1622_S31.recal.bam deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/pairs.tsv b/.tests/pairs.tsv deleted file mode 100644 index 84a2995..0000000 --- a/.tests/pairs.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Normal Tumor -Sample4_CRL1622_S31 Sample10_ARK1_S37 -Sample4_CRL1622_S31 Sample11_ACI_158_S38 diff --git a/CHANGELOG.md b/CHANGELOG.md index 946855e..622017b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - Provide default exome targets for hg38 and mm10, which can be overridden by the optional `--targets` argument. (#102, @kelly-sovacool) - Previously, the `--targets` argument was required with no defaults. - Increased memory for rules: BWA mem, qualimap, kraken. gatk_contamination is not localrule. 
(#89, @samarth8392) +- Added new human test dataset for github workflow (#27, @samarth8392) ## XAVIER 3.0.3 diff --git a/docs/usage/run.md b/docs/usage/run.md index 3486355..7676684 100644 --- a/docs/usage/run.md +++ b/docs/usage/run.md @@ -46,7 +46,9 @@ Each of the following arguments are required. Failure to provide a required argu > > One or more FastQ files can be provided. The pipeline does NOT support single-end WES data. Please provide either a set of FastQ files or a set of BAM files. The pipeline does NOT support processing a mixture of FastQ files and BAM files. From the command-line, each input file should separated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should be gzipp-ed. > -> **_Example:_** `--input .tests/*.R?.fastq.gz` +> **_Example:_** `--input tests/data/*.R?.fastq.gz` +> +> **_Example:_** `--input /data/CCBR_Pipeliner/testdata/XAVIER/human_subset/*.R?.fastq.gz` --- @@ -251,7 +253,7 @@ module purge module load ccbrpipeliner # Step 2A.) Initialize the all resources to the output folder -xavier run --input .tests/*.R?.fastq.gz \ +xavier run --input tests/data/*.R?.fastq.gz \ --output /data/$USER/xavier_hg38 \ --genome hg38 \ --targets Agilent_SSv7_allExons_hg38.bed \ @@ -259,7 +261,7 @@ xavier run --input .tests/*.R?.fastq.gz \ --runmode init # Step 2B.) Dry-run the pipeline -xavier run --input .tests/*.R?.fastq.gz \ +xavier run --input tests/data/*.R?.fastq.gz \ --output /data/$USER/xavier_hg38 \ --genome hg38 \ --targets Agilent_SSv7_allExons_hg38.bed \ @@ -269,7 +271,7 @@ xavier run --input .tests/*.R?.fastq.gz \ # Step 2C.) Run the XAVIER pipeline # The slurm mode will submit jobs to the cluster. # It is recommended running xavier in this mode. 
-xavier run --input .tests/*.R?.fastq.gz \ +xavier run --input tests/data/*.R?.fastq.gz \ --output /data/$USER/xavier_hg38 \ --genome hg38 \ --targets Agilent_SSv7_allExons_hg38.bed \ @@ -277,3 +279,10 @@ xavier run --input .tests/*.R?.fastq.gz \ --runmode run ``` + +The example dataset in `tests/data` in this repository is a very small +subsampled dataset, and some steps of the pipeline fail due to the small size +(CNV calling, somalier, etc). +We have a larger subsample (25% of a full human dataset) available on Biowulf if +you would like to test the full functionality of the pipeline: +`/data/CCBR_Pipeliner/testdata/XAVIER/human_subset/*.R?.fastq.gz` diff --git a/pyproject.toml b/pyproject.toml index 13bc863..ff2b98f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ Repository = "https://github.com/CCBR/XAVIER" xavier = "." [tool.setuptools.package-data] -"*" = ["CITATION.cff", "LICENSE", "VERSION", "docker/**", "resources/**", "bin/**", "config/**", "resources/**", "workflow/**", "tests/**", ".tests/**"] +"*" = ["CITATION.cff", "LICENSE", "VERSION", "docker/**", "resources/**", "bin/**", "config/**", "resources/**", "workflow/**", "tests/**"] [tool.setuptools.dynamic] version = {file = "VERSION"} diff --git a/src/xavier/__main__.py b/src/xavier/__main__.py index 8442c2b..7f607e0 100755 --- a/src/xavier/__main__.py +++ b/src/xavier/__main__.py @@ -220,7 +220,7 @@ def parsed_arguments(): FastQ files or a set of BAM files. The pipeline does NOT support processing a mixture of FastQ files and BAM files. - Example: --input .tests/*.R?.fastq.gz + Example: --input tests/data/*.R?.fastq.gz --output OUTPUT Path to an output directory. This location is where the pipeline will create all of its output files, also @@ -256,7 +256,7 @@ def parsed_arguments(): # Step 2A.) 
Initialize the pipeline xavier run \\ --runmode init \\ - --input .tests/*.R?.fastq.gz \\ + --input tests/data/*.R?.fastq.gz \\ --output /data/$USER/xavier_hg38 \\ --genome hg38 \\ --targets resources/Agilent_SSv7_allExons_hg38.bed @@ -264,7 +264,7 @@ def parsed_arguments(): # Step 2B.) Dry-run the pipeline xavier run \\ --runmode dryrun \\ - --input .tests/*.R?.fastq.gz \\ + --input tests/data/*.R?.fastq.gz \\ --output /data/$USER/xavier_hg38 \\ --genome hg38 \\ --targets resources/Agilent_SSv7_allExons_hg38.bed \\ @@ -275,7 +275,7 @@ def parsed_arguments(): # It is recommended running xavier in this mode. xavier run \\ --runmode run \\ - --input .tests/*.R?.fastq.gz \\ + --input tests/data/*.R?.fastq.gz \\ --output /data/$USER/xavier_hg38 \\ --genome hg38 \\ --targets resources/Agilent_SSv7_allExons_hg38.bed \\ diff --git a/tests/data/README.md b/tests/data/README.md new file mode 100644 index 0000000..b0ced84 --- /dev/null +++ b/tests/data/README.md @@ -0,0 +1,20 @@ +# About + +These input files are used for continuous integration purposes, specifically to dry run the pipeline whenever commits have been made to the main, master, or unified branches. + +Human whole exome sequence reads from the Sequencing Quality Control Phase 2 (SEQC2) Consortium have been subsampled and added. + +The tumor-normal paired reads, sequenced by the NCI (WES_NC_T_1 vs. WES_NC_N_1), were downloaded from the [seqc2](https://sites.google.com/view/seqc2/home/sequencing) server; they correspond to NCBI SRA accession nos. [SRX4728524](https://www.ncbi.nlm.nih.gov/sra/SRX4728524) and [SRX4728523](https://www.ncbi.nlm.nih.gov/sra/SRX4728523), respectively. 
+ +Next, the reads were subsampled to 0.1% using `seqtk` and gzipped as follows: + +```bash +seqtk sample -s100 {input}.R[1/2].fastq.gz 0.001 > {input}_sub.R[1/2].fastq +gzip *.fastq +``` + +Similarly, the BAM files were created by first mapping the reads to the hg38 genome and then subsampling the alignments using `samtools`: + +```bash +samtools view -s 0.00125 -b WES_NC_[T/N]_1.bam -o WES_NC_[T/N]_1_sub.bam +``` \ No newline at end of file diff --git a/tests/data/WES_NC_N_1_sub.R1.fastq.gz b/tests/data/WES_NC_N_1_sub.R1.fastq.gz new file mode 100644 index 0000000..7468c27 Binary files /dev/null and b/tests/data/WES_NC_N_1_sub.R1.fastq.gz differ diff --git a/tests/data/WES_NC_N_1_sub.R2.fastq.gz b/tests/data/WES_NC_N_1_sub.R2.fastq.gz new file mode 100644 index 0000000..5c55a61 Binary files /dev/null and b/tests/data/WES_NC_N_1_sub.R2.fastq.gz differ diff --git a/tests/data/WES_NC_N_1_sub.bam b/tests/data/WES_NC_N_1_sub.bam new file mode 100644 index 0000000..e8ab407 Binary files /dev/null and b/tests/data/WES_NC_N_1_sub.bam differ diff --git a/tests/data/WES_NC_T_1_sub.R1.fastq.gz b/tests/data/WES_NC_T_1_sub.R1.fastq.gz new file mode 100644 index 0000000..f82aac4 Binary files /dev/null and b/tests/data/WES_NC_T_1_sub.R1.fastq.gz differ diff --git a/tests/data/WES_NC_T_1_sub.R2.fastq.gz b/tests/data/WES_NC_T_1_sub.R2.fastq.gz new file mode 100644 index 0000000..c72b9aa Binary files /dev/null and b/tests/data/WES_NC_T_1_sub.R2.fastq.gz differ diff --git a/tests/data/WES_NC_T_1_sub.bam b/tests/data/WES_NC_T_1_sub.bam new file mode 100644 index 0000000..efecae5 Binary files /dev/null and b/tests/data/WES_NC_T_1_sub.bam differ diff --git a/tests/data/pairs.tsv b/tests/data/pairs.tsv new file mode 100644 index 0000000..00d7a2b --- /dev/null +++ b/tests/data/pairs.tsv @@ -0,0 +1,2 @@ +Normal Tumor +WES_NC_N_1_sub WES_NC_T_1_sub diff --git a/tests/test_cli.py b/tests/test_cli.py index c5a642e..2196e3f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -8,8 +8,8 @@ xavier_run = ( 
"xavier run " - "--input .tests/*.fastq.gz " - "--pairs .tests/pairs.tsv " + "--input tests/data/*.fastq.gz " + "--pairs tests/data/pairs.tsv " "--mode local " ) diff --git a/tests/test_run.py b/tests/test_run.py index c705137..054beac 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -15,14 +15,14 @@ def test_dryrun(): with tempfile.TemporaryDirectory() as tmp_dir: run_args = argparse.Namespace( runmode="init", - input=list(glob.glob(f"{xavier_base('.tests')}/*.fastq.gz")), + input=list(glob.glob(f"{xavier_base('tests/data')}/*.fastq.gz")), output=tmp_dir, genome="hg38", targets=xavier_base("resources/Agilent_SSv7_allExons_hg38.bed"), mode="local", job_name="pl:xavier", callers=["mutect2", "mutect", "strelka", "vardict", "varscan"], - pairs=xavier_base(".tests/pairs.tsv"), + pairs=xavier_base("tests/data/pairs.tsv"), ffpe=False, cnv=False, wait=False,