diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index c349a9e..38dfaca 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -23,35 +23,31 @@ jobs: run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tn_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tn_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun - name: Tumor-only FastQ Dry Run run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - 
/opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tonly_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \ - /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \ - /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \ + /opt2/tests/data/WES_NC_N_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_N_1_sub.R2.fastq.gz \ + /opt2/tests/data/WES_NC_T_1_sub.R1.fastq.gz /opt2/tests/data/WES_NC_T_1_sub.R2.fastq.gz \ --output /opt2/output_tonly_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode dryrun @@ -59,35 +55,31 @@ jobs: run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tn_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - 
/opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tn_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ - --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun + --pairs /opt2/tests/data/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun - name: Tumor-only BAM Dry Run run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tonly_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode init docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \ /opt2/bin/xavier run --input \ - /opt2/.tests/Sample10_ARK1_S37.recal.bam \ - /opt2/.tests/Sample11_ACI_158_S38.recal.bam \ - /opt2/.tests/Sample4_CRL1622_S31.recal.bam \ + /opt2/tests/data/WES_NC_N_1_sub.bam \ + /opt2/tests/data/WES_NC_T_1_sub.bam \ --output /opt2/output_tonly_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \ --genome hg38 --mode local --ffpe --runmode dryrun diff --git a/.tests/README.md b/.tests/README.md deleted file mode 100644 index be56f9a..0000000 --- a/.tests/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# About - -These input files are used for continuous integration purposes, specifically to dry run the pipeline whenever commits have been made to the main, master, or unified branches. - -**Please Note:** Each of the provided FastQ files and BAM files are empty and are not suitable input to the CCBR GATK4 pipeline! 
diff --git a/.tests/Sample10_ARK1_S37.R1.fastq.gz b/.tests/Sample10_ARK1_S37.R1.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample10_ARK1_S37.R2.fastq.gz b/.tests/Sample10_ARK1_S37.R2.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample10_ARK1_S37.recal.bam b/.tests/Sample10_ARK1_S37.recal.bam deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample11_ACI_158_S38.R1.fastq.gz b/.tests/Sample11_ACI_158_S38.R1.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample11_ACI_158_S38.R2.fastq.gz b/.tests/Sample11_ACI_158_S38.R2.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample11_ACI_158_S38.recal.bam b/.tests/Sample11_ACI_158_S38.recal.bam deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample4_CRL1622_S31.R1.fastq.gz b/.tests/Sample4_CRL1622_S31.R1.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample4_CRL1622_S31.R2.fastq.gz b/.tests/Sample4_CRL1622_S31.R2.fastq.gz deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/Sample4_CRL1622_S31.recal.bam b/.tests/Sample4_CRL1622_S31.recal.bam deleted file mode 100644 index e69de29..0000000 diff --git a/.tests/pairs.tsv b/.tests/pairs.tsv deleted file mode 100644 index 84a2995..0000000 --- a/.tests/pairs.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Normal Tumor -Sample4_CRL1622_S31 Sample10_ARK1_S37 -Sample4_CRL1622_S31 Sample11_ACI_158_S38 diff --git a/CHANGELOG.md b/CHANGELOG.md index aae5dfd..622017b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ - Previously, `xavier_gui` (with an underscore) was a command in the `ccbrpipeliner` module. - Provide default exome targets for hg38 and mm10, which can be overridden by the optional `--targets` argument. (#102, @kelly-sovacool) - Previously, the `--targets` argument was required with no defaults. +- Increased memory for rules: BWA mem, qualimap, kraken. 
gatk_contamination is not localrule. (#89, @samarth8392) +- Added new human test dataset for github workflow (#27, @samarth8392) ## XAVIER 3.0.3 diff --git a/bin/redirect b/bin/redirect index 99a1086..7b36ca3 100755 --- a/bin/redirect +++ b/bin/redirect @@ -56,13 +56,11 @@ fi # - snakemake # are in PATH if [[ $ISBIOWULF == true ]];then - # module purge load_module_if_needed singularity - load_module_if_needed snakemake + load_module_if_needed snakemake/7 elif [[ $ISFRCE == true ]];then # snakemake module on FRCE does not work as expected # use the conda installed version of snakemake instead - # module purge load_module_if_needed load singularity export PATH="/mnt/projects/CCBR-Pipelines/bin:$PATH" fi diff --git a/config/cluster.biowulf.json b/config/cluster.biowulf.json index 184b3fa..a647265 100644 --- a/config/cluster.biowulf.json +++ b/config/cluster.biowulf.json @@ -22,13 +22,17 @@ "threads": "2", "time": "4:00:00" }, - + "kraken": { + "mem": "64G" + }, "strelka": { "threads": "16", "time": "16:00:00", "mem": "32G" }, - + "qualimap_bamqc": { + "mem": "32G" + }, "strelka_filter": { "threads": "4", "time": "8:00:00", @@ -57,7 +61,7 @@ "mem": "32G" }, - "merge_somatic_callers": { + "somatic_merge_callers": { "threads": "16", "time": "18:00:00", "mem": "32G" @@ -116,7 +120,7 @@ }, "bwa_mem": { "threads": "24", - "mem": "32G" + "mem": "64G" }, "picard_headers": { "threads": "2", diff --git a/config/cluster.frce.json b/config/cluster.frce.json index 3ddeb16..6c7773d 100644 --- a/config/cluster.frce.json +++ b/config/cluster.frce.json @@ -21,13 +21,17 @@ "threads": "2", "time": "4:00:00" }, - + "kraken": { + "mem": "64G" + }, "strelka": { "threads": "16", "time": "16:00:00", "mem": "32G" }, - + "qualimap_bamqc": { + "mem": "32G" + }, "strelka_filter": { "threads": "4", "time": "8:00:00", @@ -56,7 +60,7 @@ "mem": "32G" }, - "merge_somatic_callers": { + "somatic_merge_callers": { "threads": "16", "time": "18:00:00", "mem": "32G" @@ -115,7 +119,7 @@ }, "bwa_mem": { 
"threads": "24", - "mem": "32G" + "mem": "64G" }, "picard_headers": { "threads": "2", diff --git a/docs/usage/run.md b/docs/usage/run.md index 3486355..7676684 100644 --- a/docs/usage/run.md +++ b/docs/usage/run.md @@ -46,7 +46,9 @@ Each of the following arguments are required. Failure to provide a required argu > > One or more FastQ files can be provided. The pipeline does NOT support single-end WES data. Please provide either a set of FastQ files or a set of BAM files. The pipeline does NOT support processing a mixture of FastQ files and BAM files. From the command-line, each input file should separated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should be gzipp-ed. > -> **_Example:_** `--input .tests/*.R?.fastq.gz` +> **_Example:_** `--input tests/data/*.R?.fastq.gz` +> +> **_Example:_** `--input /data/CCBR_Pipeliner/testdata/XAVIER/human_subset/*.R?.fastq.gz` --- @@ -251,7 +253,7 @@ module purge module load ccbrpipeliner # Step 2A.) Initialize the all resources to the output folder -xavier run --input .tests/*.R?.fastq.gz \ +xavier run --input tests/data/*.R?.fastq.gz \ --output /data/$USER/xavier_hg38 \ --genome hg38 \ --targets Agilent_SSv7_allExons_hg38.bed \ @@ -259,7 +261,7 @@ xavier run --input .tests/*.R?.fastq.gz \ --runmode init # Step 2B.) Dry-run the pipeline -xavier run --input .tests/*.R?.fastq.gz \ +xavier run --input tests/data/*.R?.fastq.gz \ --output /data/$USER/xavier_hg38 \ --genome hg38 \ --targets Agilent_SSv7_allExons_hg38.bed \ @@ -269,7 +271,7 @@ xavier run --input .tests/*.R?.fastq.gz \ # Step 2C.) Run the XAVIER pipeline # The slurm mode will submit jobs to the cluster. # It is recommended running xavier in this mode. 
-xavier run --input .tests/*.R?.fastq.gz \ +xavier run --input tests/data/*.R?.fastq.gz \ --output /data/$USER/xavier_hg38 \ --genome hg38 \ --targets Agilent_SSv7_allExons_hg38.bed \ @@ -277,3 +279,10 @@ --runmode run ``` + +The example dataset in `tests/data` in this repository is a very small +subsampled dataset, and some steps of the pipeline fail due to the small size +(CNV calling, somalier, etc). +We have a larger subsample (25% of a full human dataset) available on Biowulf if +you would like to test the full functionality of the pipeline: +`/data/CCBR_Pipeliner/testdata/XAVIER/human_subset/*.R?.fastq.gz` diff --git a/pyproject.toml b/pyproject.toml index 5a0c99c..ff2b98f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ classifiers = [ requires-python = ">=3.11" dependencies = [ "argparse", + "ccbr_tools@git+https://github.com/CCBR/Tools", "Click >= 8.1.3", "PySimpleGui < 5", "snakemake >= 7, < 8", @@ -63,7 +64,7 @@ Repository = "https://github.com/CCBR/XAVIER" xavier = "." 
[tool.setuptools.package-data] -"*" = ["CITATION.cff", "LICENSE", "VERSION", "docker/**", "resources/**", "bin/**", "config/**", "resources/**", "workflow/**", "tests/**", ".tests/**"] +"*" = ["CITATION.cff", "LICENSE", "VERSION", "docker/**", "resources/**", "bin/**", "config/**", "resources/**", "workflow/**", "tests/**"] [tool.setuptools.dynamic] version = {file = "VERSION"} diff --git a/src/xavier/__main__.py b/src/xavier/__main__.py index 5df5326..7f607e0 100755 --- a/src/xavier/__main__.py +++ b/src/xavier/__main__.py @@ -36,28 +36,20 @@ """ # Python standard library -from __future__ import print_function import sys, os, subprocess, re, json, textwrap # 3rd party imports from pypi import argparse # potential python3 3rd party package, added in python/3.5 +from ccbr_tools.pipeline.util import err, exists, fatal, permissions, require +from ccbr_tools.pipeline.cache import check_cache # Local imports from .run import init, setup, bind, dryrun, runner, run from .shells import bash from .options import genome_options -from .util import ( - err, - exists, - fatal, - permissions, - check_cache, - require, - get_version, - get_genomes_list, -) from .gui import launch_gui +from .util import xavier_base, get_version __version__ = get_version() __email__ = "ccbr@mail.nih.gov" @@ -228,7 +220,7 @@ def parsed_arguments(): FastQ files or a set of BAM files. The pipeline does NOT support processing a mixture of FastQ files and BAM files. - Example: --input .tests/*.R?.fastq.gz + Example: --input tests/data/*.R?.fastq.gz --output OUTPUT Path to an output directory. This location is where the pipeline will create all of its output files, also @@ -264,7 +256,7 @@ def parsed_arguments(): # Step 2A.) 
Initialize the pipeline xavier run \\ --runmode init \\ - --input .tests/*.R?.fastq.gz \\ + --input tests/data/*.R?.fastq.gz \\ --output /data/$USER/xavier_hg38 \\ --genome hg38 \\ --targets resources/Agilent_SSv7_allExons_hg38.bed @@ -272,7 +264,7 @@ def parsed_arguments(): # Step 2B.) Dry-run the pipeline xavier run \\ --runmode dryrun \\ - --input .tests/*.R?.fastq.gz \\ + --input tests/data/*.R?.fastq.gz \\ --output /data/$USER/xavier_hg38 \\ --genome hg38 \\ --targets resources/Agilent_SSv7_allExons_hg38.bed \\ @@ -283,7 +275,7 @@ def parsed_arguments(): # It is recommended running xavier in this mode. xavier run \\ --runmode run \\ - --input .tests/*.R?.fastq.gz \\ + --input tests/data/*.R?.fastq.gz \\ --output /data/$USER/xavier_hg38 \\ --genome hg38 \\ --targets resources/Agilent_SSv7_allExons_hg38.bed \\ diff --git a/src/xavier/gui.py b/src/xavier/gui.py index 72ee41e..588edcf 100644 --- a/src/xavier/gui.py +++ b/src/xavier/gui.py @@ -5,21 +5,23 @@ import glob import PySimpleGUI as sg -from .util import ( +from ccbr_tools.pipeline.util import ( get_genomes_dict, get_tmp_dir, - xavier_base, - get_version, get_hpcname, check_python_version, ) -from .run import run_in_context -from .cache import get_sif_cache_dir +from ccbr_tools.pipeline.cache import get_sif_cache_dir +from ccbr_tools.shell import exec_in_context + +from .util import xavier_base, get_version +from .run import run + def launch_gui(DEBUG=True): check_python_version() # get drop down genome options - jsons = get_genomes_dict() + jsons = get_genomes_dict(repo_base=xavier_base) genome_annotation_combinations = list(jsons.keys()) genome_annotation_combinations.sort() if DEBUG: @@ -165,7 +167,9 @@ def launch_gui(DEBUG=True): if DEBUG: print("layout is ready!") - window = sg.Window(f"XAVIER {get_version()}", layout, location=(0, 500), finalize=True) + window = sg.Window( + f"XAVIER {get_version()}", layout, location=(0, 500), finalize=True + ) if DEBUG: print("window created!") @@ -277,7 +281,11 @@ 
def launch_gui(DEBUG=True): input=list(glob.glob(os.path.join(values["-INDIR-"], "*.fastq.gz"))), output=output_dir, genome=genome, - targets=values["-TARGETS-"] if values["-TARGETS-"] else xavier_base('resources', 'Agilent_SSv7_allExons_hg38.bed'), # TODO should this be part of the genome config file? + targets=values["-TARGETS-"] + if values["-TARGETS-"] + else xavier_base( + "resources", "Agilent_SSv7_allExons_hg38.bed" + ), # TODO should this be part of the genome config file? mode="slurm", job_name="pl:xavier", callers=["mutect2", "mutect", "strelka", "vardict", "varscan"], @@ -292,9 +300,9 @@ def launch_gui(DEBUG=True): tmp_dir=get_tmp_dir(None, output_dir), threads=2, ) - allout_init = run_in_context(run_args) + allout_init = exec_in_context(run, run_args) run_args.runmode = "dryrun" - allout_dryrun = run_in_context(run_args) + allout_dryrun = exec_in_context(run, run_args) allout = "\n".join([allout_init, allout_dryrun]) if DEBUG: print(allout) @@ -308,7 +316,7 @@ def launch_gui(DEBUG=True): ) if ch == "Yes": run_args.runmode = "run" - allout = run_in_context(run_args) + allout = exec_in_context(run, run_args) sg.popup_scrolled( allout, title="Slurmrun:STDOUT/STDERR", @@ -344,6 +352,7 @@ def launch_gui(DEBUG=True): continue window.close() + def copy_to_clipboard(string): r = Tk() r.withdraw() diff --git a/src/xavier/options.py b/src/xavier/options.py index 5676adf..5f73c0f 100644 --- a/src/xavier/options.py +++ b/src/xavier/options.py @@ -1,13 +1,8 @@ #!/usr/bin/env python3 # -*- coding: UTF-8 -*- -# Python standard library -from __future__ import print_function - -# Local imports -from .util import permissions - import os +from ccbr_tools.pipeline.util import permissions def genome_options(parser, user_option, prebuilt): @@ -45,7 +40,3 @@ def genome_options(parser, user_option, prebuilt): ) return user_option - - -if __name__ == "__main__": - pass diff --git a/src/xavier/run.py b/src/xavier/run.py index 516c0bf..8f9fbbf 100644 --- a/src/xavier/run.py +++ 
b/src/xavier/run.py @@ -11,20 +11,20 @@ import shutil import sys import subprocess - -# Local imports -from .util import ( +from ccbr_tools.pipeline.util import ( git_commit_hash, join_jsons, fatal, which, exists, err, - get_version, - xavier_base, require, get_hpcname, ) +from ccbr_tools.pipeline.cache import image_cache + +# Local imports +from .util import get_version, xavier_base def run(sub_args): @@ -38,7 +38,7 @@ def run(sub_args): # Step 0. Check for required dependencies # The pipelines has only two requirements: # snakemake and singularity - require(["snakemake", "singularity"], ["snakemake", "singularity"]) + require(["snakemake", "singularity"], ["snakemake/7", "singularity"]) # Optional Step. Initialize working directory, # copy over required resources to run @@ -360,7 +360,7 @@ def setup(sub_args, repo_path, output_path, create_nidap_folder_YN="no", links=[ # Resolves if an image needs to be pulled from an OCI registry or # a local SIF generated from the rna-seek cache subcommand exists - config = image_cache(sub_args, config, repo_path) + config = image_cache(sub_args, config) # Add other cli collected info config["project"]["annotation"] = sub_args.genome @@ -630,49 +630,6 @@ def add_rawdata_information(sub_args, config, ifiles): return config -def image_cache(sub_args, config, repo_path): - """Adds Docker Image URIs, or SIF paths to config if singularity cache option is provided. - If singularity cache option is provided and a local SIF does not exist, a warning is - displayed and the image will be pulled from URI in 'config/containers/images.json'. 
- @param sub_args : - Parsed arguments for run sub-command - @params config : - Docker Image config file - @param repo_path : - Path to RNA-seek source code and its templates - @return config : - Updated config dictionary containing user information (username and home directory) - """ - images = os.path.join(repo_path, "config", "containers", "images.json") - - # Read in config for docker image uris - with open(images, "r") as fh: - data = json.load(fh) - # Check if local sif exists - for image, uri in data["images"].items(): - if sub_args.sif_cache: - sif = os.path.join( - sub_args.sif_cache, - "{}.sif".format(os.path.basename(uri).replace(":", "_")), - ) - if not exists(sif): - # If local sif does not exist on in cache, print warning - # and default to pulling from URI in config/containers/images.json - print( - 'Warning: Local image "{}" does not exist in singularity cache'.format( - sif - ), - file=sys.stderr, - ) - else: - # Change pointer to image from Registry URI to local SIF - data["images"][image] = sif - - config.update(data) - - return config - - def get_nends(ifiles): """Determines whether the dataset is paired-end or single-end. If paired-end data, checks to see if both mates (R1 and R2) are present for each sample. 
@@ -969,13 +926,3 @@ def runner( ) return masterjob - - -def run_in_context(args): - """Execute the run function in a context manager to capture stdout/stderr""" - with contextlib.redirect_stdout(io.StringIO()) as out_f, contextlib.redirect_stderr( - io.StringIO() - ) as err_f: - run(args) - allout = out_f.getvalue() + "\n" + err_f.getvalue() - return allout diff --git a/src/xavier/shells.py b/src/xavier/shells.py index e9ded44..6480c1f 100644 --- a/src/xavier/shells.py +++ b/src/xavier/shells.py @@ -8,7 +8,7 @@ import subprocess # Local imports -from .util import err +from ccbr_tools.pipeline.util import err def set_options(strict): @@ -65,9 +65,3 @@ def bash( ) return exitcode - - -if __name__ == "__main__": - # Tests - bash("ls -la /home/") - bash("ls -la /fake/path") diff --git a/src/xavier/util.py b/src/xavier/util.py index 3cf87b8..4f04bfa 100644 --- a/src/xavier/util.py +++ b/src/xavier/util.py @@ -10,6 +10,7 @@ import json import glob import os +import pathlib import warnings @@ -17,10 +18,8 @@ def xavier_base(*paths): """Get the absolute path to a file in the repository @return abs_path """ - basedir = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - ) - return os.path.join(basedir, *paths) + basedir = pathlib.Path(__file__).absolute().parent.parent.parent + return str(basedir.joinpath(*paths)) def get_version(): @@ -30,398 +29,3 @@ def get_version(): with open(xavier_base("VERSION"), "r") as vfile: version = f"v{vfile.read().strip()}" return version - - -def scontrol_show(): - """Run scontrol show config and parse the output as a dictionary - @return scontrol_dict : - """ - scontrol_dict = dict() - scontrol_out = subprocess.run( - "scontrol show config", shell=True, capture_output=True, text=True - ).stdout - if len(scontrol_out) > 0: - for line in scontrol_out.split("\n"): - line_split = line.split("=") - if len(line_split) > 1: - scontrol_dict[line_split[0].strip()] = line_split[1].strip() - return scontrol_dict - - -def 
get_hpcname(): - """Get the HPC name (biowulf, frce, or an empty string) - @return hpcname - """ - scontrol_out = scontrol_show() - hpc = scontrol_out["ClusterName"] if "ClusterName" in scontrol_out.keys() else "" - if hpc == "fnlcr": - hpc = "frce" - return hpc - - -def get_tmp_dir(tmp_dir, outdir, hpc=get_hpcname()): - """Get default temporary directory for biowulf and frce. Allow user override.""" - if not tmp_dir: - if hpc == "biowulf": - tmp_dir = "/lscratch/$SLURM_JOBID" - elif hpc == "frce": - tmp_dir = outdir - else: - tmp_dir = None - return tmp_dir - - -def get_genomes_list(hpcname=get_hpcname(), error_on_warnings=False): - """Get list of genome annotations available for the current platform - @return genomes_list - """ - return sorted( - list( - get_genomes_dict( - hpcname=hpcname, error_on_warnings=error_on_warnings - ).keys() - ) - ) - - -def get_genomes_dict(hpcname=get_hpcname(), error_on_warnings=False): - """Get dictionary of genome annotation versions and the paths to the corresponding JSON files - @return genomes_dict { genome_name: json_file_path } - """ - if error_on_warnings: - warnings.filterwarnings("error") - genomes_dir = xavier_base(os.path.join("config", "genomes", hpcname)) - if not os.path.exists(genomes_dir): - warnings.warn(f"Folder does not exist: {genomes_dir}") - search_term = genomes_dir + "/*.json" - json_files = glob.glob(search_term) - if len(json_files) == 0: - warnings.warn( - f"No Genome+Annotation JSONs found in {genomes_dir}. Please specify a custom genome json file with `--genome`" - ) - genomes_dict = { - os.path.basename(json_file).replace(".json", ""): json_file - for json_file in json_files - } - warnings.resetwarnings() - return genomes_dict - - -def md5sum(filename, first_block_only=False, blocksize=65536): - """Gets md5checksum of a file in memory-safe manner. - The file is read in blocks/chunks defined by the blocksize parameter. 
This is - a safer option to reading the entire file into memory if the file is very large. - @param filename : - Input file on local filesystem to find md5 checksum - @param first_block_only : - Calculate md5 checksum of the first block/chunk only - @param blocksize : - Blocksize of reading N chunks of data to reduce memory profile - @return hasher.hexdigest() : - MD5 checksum of the file's contents - """ - hasher = hashlib.md5() - with open(filename, "rb") as fh: - buf = fh.read(blocksize) - if first_block_only: - # Calculate MD5 of first block or chunk of file. - # This is a useful heuristic for when potentially - # calculating an MD5 checksum of thousand or - # millions of file. - hasher.update(buf) - return hasher.hexdigest() - while len(buf) > 0: - # Calculate MD5 checksum of entire file - hasher.update(buf) - buf = fh.read(blocksize) - - return hasher.hexdigest() - - -## copied directly from rna-seek -def check_cache(parser, cache, *args, **kwargs): - """Check if provided SINGULARITY_CACHE is valid. Singularity caches cannot be - shared across users (and must be owned by the user). Singularity strictly enforces - 0700 user permission on on the cache directory and will return a non-zero exitcode. - @param parser : - Argparse parser object - @param cache : - Singularity cache directory - @return cache : - If singularity cache dir is valid - """ - if not exists(cache): - # Cache directory does not exist on filesystem - os.makedirs(cache) - elif os.path.isfile(cache): - # Cache directory exists as file, raise error - parser.error( - """\n\t\x1b[6;37;41mFatal: Failed to provided a valid singularity cache!\x1b[0m - The provided --singularity-cache already exists on the filesystem as a file. - Please run {} again with a different --singularity-cache location. 
- """.format( - sys.argv[0] - ) - ) - elif os.path.isdir(cache): - # Provide cache exists as directory - # Check that the user owns the child cache directory - # May revert to os.getuid() if user id is not sufficient - if ( - exists(os.path.join(cache, "cache")) - and os.stat(os.path.join(cache, "cache")).st_uid != os.getuid() - ): - # User does NOT own the cache directory, raise error - parser.error( - """\n\t\x1b[6;37;41mFatal: Failed to provided a valid singularity cache!\x1b[0m - The provided --singularity-cache already exists on the filesystem with a different owner. - Singularity strictly enforces that the cache directory is not shared across users. - Please run {} again with a different --singularity-cache location. - """.format( - sys.argv[0] - ) - ) - - return cache - - -def permissions(parser, path, *args, **kwargs): - """Checks permissions using os.access() to see the user is authorized to access - a file/directory. Checks for existence, readability, writability and executability via: - os.F_OK (tests existence), os.R_OK (tests read), os.W_OK (tests write), os.X_OK (tests exec). - @param parser : - Argparse parser object - @param path : - Name of path to check - @return path : - Returns abs path if it exists and permissions are correct - """ - if not exists(path): - parser.error( - "Path '{}' does not exists! Failed to provide valid input.".format(path) - ) - if not os.access(path, *args, **kwargs): - parser.error( - "Path '{}' exists, but cannot read path due to permissions!".format(path) - ) - - return os.path.abspath(path) - - -def standard_input(parser, path, *args, **kwargs): - """Checks for standard input when provided or permissions using permissions(). 
- @param parser : - Argparse parser object - @param path : - Name of path to check - @return path : - If path exists and user can read from location - """ - # Checks for standard input - if not sys.stdin.isatty(): - # Standard input provided, set path as an - # empty string to prevent searching of '-' - path = "" - return path - - # Checks for positional arguments as paths - path = permissions(parser, path, *args, **kwargs) - - return path - - -def exists(testpath): - """Checks if file exists on the local filesystem. - @param parser : - argparse parser object - @param testpath : - Name of file/directory to check - @return does_exist : - True when file/directory exists, False when file/directory does not exist - """ - does_exist = True - if not os.path.exists(testpath): - does_exist = False # File or directory does not exist on the filesystem - - return does_exist - - -def ln(files, outdir): - """Creates symlinks for files to an output directory. - @param files list[]: - List of filenames - @param outdir : - Destination or output directory to create symlinks - """ - # Create symlinks for each file in the output directory - for file in files: - ln = os.path.join(outdir, os.path.basename(file)) - if not exists(ln): - os.symlink(os.path.abspath(os.path.realpath(file)), ln) - - -def which(cmd, path=None): - """Checks if an executable is in $PATH - @param cmd : - Name of executable to check - @param path : - Optional list of PATHs to check [default: $PATH] - @return : - True if exe in PATH, False if not in PATH - """ - if path is None: - path = os.environ["PATH"].split(os.pathsep) - - for prefix in path: - filename = os.path.join(prefix, cmd) - executable = os.access(filename, os.X_OK) - is_not_directory = os.path.isfile(filename) - if executable and is_not_directory: - return True - return False - - -def err(*message, **kwargs): - """Prints any provided args to standard error. - kwargs can be provided to modify print functions - behavior. 
- @param message : - Values printed to standard error - @params kwargs - Key words to modify print function behavior - """ - print(*message, file=sys.stderr, **kwargs) - - -def fatal(*message, **kwargs): - """Prints any provided args to standard error - and exits with an exit code of 1. - @param message : - Values printed to standard error - @params kwargs - Key words to modify print function behavior - """ - err(*message, **kwargs) - sys.exit(1) - - -def require(cmds, suggestions, path=None): - """Enforces an executable is in $PATH - @param cmds list[]: - List of executable names to check - @param suggestions list[]: - Name of module to suggest loading for a given index - in param cmd. - @param path list[]]: - Optional list of PATHs to check [default: $PATH] - """ - error = False - for i in range(len(cmds)): - available = which(cmds[i]) - if not available: - error = True - err( - """\x1b[6;37;41m\n\tFatal: {} is not in $PATH and is required during runtime! - └── Solution: please 'module load {}' and run again!\x1b[0m""".format( - cmds[i], suggestions[i] - ) - ) - - if error: - fatal() - - return - - -def safe_copy(source, target, resources=[]): - """Private function: Given a list paths it will recursively copy each to the - target location. If a target path already exists, it will NOT over-write the - existing paths data. - @param resources : - List of paths to copy over to target location - @params source : - Add a prefix PATH to each resource - @param target : - Target path to copy templates and required resources - """ - - for resource in resources: - destination = os.path.join(target, resource) - if not exists(destination): - # Required resources do not exist - copytree(os.path.join(source, resource), destination) - - -def git_commit_hash(repo_path): - """Gets the git commit hash of the RNA-seek repo. 
- @param repo_path : - Path to RNA-seek git repo - @return githash : - Latest git commit hash - """ - try: - githash = ( - subprocess.check_output( - ["git", "rev-parse", "HEAD"], stderr=subprocess.STDOUT, cwd=repo_path - ) - .strip() - .decode("utf-8") - ) - # Typecast to fix python3 TypeError (Object of type bytes is not JSON serializable) - # subprocess.check_output() returns a byte string - githash = str(githash) - except Exception as e: - # Github releases are missing the .git directory, - # meaning you cannot get a commit hash, set the - # commit hash to indicate its from a GH release - githash = "github_release" - return githash - - -def join_jsons(templates): - """Joins multiple JSON files to into one data structure - Used to join multiple template JSON files to create a global config dictionary. - @params templates : - List of template JSON files to join together - @return aggregated : - Dictionary containing the contents of all the input JSON files - """ - # Get absolute PATH to templates in rna-seek git repo - repo_path = os.path.dirname(os.path.abspath(__file__)) - aggregated = {} - - for file in templates: - with open(os.path.join(repo_path, file), "r") as fh: - aggregated.update(json.load(fh)) - - return aggregated - - -def check_python_version(): - # version check - # glob.iglob requires 3.11 for using "include_hidden=True" - MIN_PYTHON = (3, 11) - try: - assert sys.version_info >= MIN_PYTHON - print( - "Python version: {0}.{1}.{2}".format( - sys.version_info.major, sys.version_info.minor, sys.version_info.micro - ) - ) - except AssertionError: - exit( - f"{sys.argv[0]} requires Python {'.'.join([str(n) for n in MIN_PYTHON])} or newer" - ) - - -if __name__ == "__main__": - # Calculate MD5 checksum of entire file - print("{} {}".format(md5sum(sys.argv[0]), sys.argv[0])) - # Calculate MD5 checksum of 512 byte chunk of file, - # which is similar to following unix command: - # dd if=utils.py bs=512 count=1 2>/dev/null | md5sum - print( - "{} {}".format( 
- md5sum(sys.argv[0], first_block_only=True, blocksize=512), sys.argv[0] - ) - ) diff --git a/tests/data/README.md b/tests/data/README.md new file mode 100644 index 0000000..b0ced84 --- /dev/null +++ b/tests/data/README.md @@ -0,0 +1,20 @@ +# About + +These input files are used for continuous integration purposes, specifically to dry run the pipeline whenever commits have been made to the main, master, or unified branches. + +Human whole exome sequence reads from the Sequencing Quality Control Phase 2 (SEQC2) Consortium has been subsampled and added. + +The tumor-normal paired reads were downloaded from the [seqc2](https://sites.google.com/view/seqc2/home/sequencing) server that were sequenced by the NCI (WES_NC_T_1 vs. WES_NC_N_1) which corresponds to NCBI SRA accession no. [SRX4728524](https://www.ncbi.nlm.nih.gov/sra/SRX4728524) and [SRX4728523](https://www.ncbi.nlm.nih.gov/sra/SRX4728523) respectively. + +Next, the reads were subsampled to 0.1% using `seqtk` and gzipped as follows: + +```bash +seqtk sample -s100 {input}.R[1/2].fastq.gz 0.001 > {input}.R[1/2]_sub.R2.fastq +gzip *.fastq +``` + +Similarly, the BAM files were created by first mapping to the hg38 genome and then subsampled using `samtools`: + +```bash +samtools view -s 0.00125 -b WES_NC_[T/N]_1.bam -o WES_NC_[T/N]_1_sub.bam +``` \ No newline at end of file diff --git a/tests/data/WES_NC_N_1_sub.R1.fastq.gz b/tests/data/WES_NC_N_1_sub.R1.fastq.gz new file mode 100644 index 0000000..7468c27 Binary files /dev/null and b/tests/data/WES_NC_N_1_sub.R1.fastq.gz differ diff --git a/tests/data/WES_NC_N_1_sub.R2.fastq.gz b/tests/data/WES_NC_N_1_sub.R2.fastq.gz new file mode 100644 index 0000000..5c55a61 Binary files /dev/null and b/tests/data/WES_NC_N_1_sub.R2.fastq.gz differ diff --git a/tests/data/WES_NC_N_1_sub.bam b/tests/data/WES_NC_N_1_sub.bam new file mode 100644 index 0000000..e8ab407 Binary files /dev/null and b/tests/data/WES_NC_N_1_sub.bam differ diff --git a/tests/data/WES_NC_T_1_sub.R1.fastq.gz 
b/tests/data/WES_NC_T_1_sub.R1.fastq.gz new file mode 100644 index 0000000..f82aac4 Binary files /dev/null and b/tests/data/WES_NC_T_1_sub.R1.fastq.gz differ diff --git a/tests/data/WES_NC_T_1_sub.R2.fastq.gz b/tests/data/WES_NC_T_1_sub.R2.fastq.gz new file mode 100644 index 0000000..c72b9aa Binary files /dev/null and b/tests/data/WES_NC_T_1_sub.R2.fastq.gz differ diff --git a/tests/data/WES_NC_T_1_sub.bam b/tests/data/WES_NC_T_1_sub.bam new file mode 100644 index 0000000..efecae5 Binary files /dev/null and b/tests/data/WES_NC_T_1_sub.bam differ diff --git a/tests/data/pairs.tsv b/tests/data/pairs.tsv new file mode 100644 index 0000000..00d7a2b --- /dev/null +++ b/tests/data/pairs.tsv @@ -0,0 +1,2 @@ +Normal Tumor +WES_NC_N_1_sub WES_NC_T_1_sub diff --git a/tests/test_cli.py b/tests/test_cli.py index 878b6dc..2196e3f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,12 +3,13 @@ import subprocess import tempfile from xavier.src.xavier.__main__ import main -from xavier.src.xavier.util import get_hpcname +from ccbr_tools.pipeline.util import get_hpcname +from ccbr_tools.shell import exec_in_context, shell_run xavier_run = ( "xavier run " - "--input .tests/*.fastq.gz " - "--pairs .tests/pairs.tsv " + "--input tests/data/*.fastq.gz " + "--pairs tests/data/pairs.tsv " "--mode local " ) @@ -17,11 +18,8 @@ def run_in_temp(command_str): with tempfile.TemporaryDirectory() as tmp_dir: outdir = os.path.join(tmp_dir, "testout") run_command = f"{command_str} --output {outdir}" - output = subprocess.run( + output = shell_run( f"{run_command} --runmode init && {run_command} --runmode dryrun", - capture_output=True, - shell=True, - text=True, ) if os.path.exists(os.path.join(outdir, "config.json")): with open(os.path.join(outdir, "config.json"), "r") as infile: @@ -32,12 +30,7 @@ def run_in_temp(command_str): def test_help(): - assert ( - "XAVIER" - in subprocess.run( - "./bin/xavier --help", capture_output=True, shell=True, text=True - ).stdout - ) + assert "XAVIER" in 
shell_run("./bin/xavier --help") def test_dryrun_targets(): @@ -53,13 +46,13 @@ def test_dryrun_targets(): assert all( [ "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution." - in output_human.stdout, + in output_human, "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution." - in output_mouse.stdout, + in output_mouse, "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution." - in output_custom.stdout, + in output_custom, "error: Path 'not/a/file.txt' does not exists! Failed to provide valid input." - in output_invalid.stderr, + in output_invalid, config_human["input_params"]["EXOME_TARGETS"].endswith( "resources/Agilent_SSv7_allExons_hg38.bed" ), diff --git a/tests/test_run.py b/tests/test_run.py index 8fd28d6..054beac 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -2,10 +2,12 @@ import glob import os import tempfile +from ccbr_tools.pipeline.util import get_tmp_dir, get_hpcname +from ccbr_tools.pipeline.cache import get_sif_cache_dir +from ccbr_tools.shell import exec_in_context -from xavier.src.xavier.util import get_tmp_dir, xavier_base, get_hpcname -from xavier.src.xavier.cache import get_sif_cache_dir -from xavier.src.xavier.run import run, run_in_context +from xavier.src.xavier.util import xavier_base +from xavier.src.xavier.run import run def test_dryrun(): @@ -13,14 +15,14 @@ def test_dryrun(): with tempfile.TemporaryDirectory() as tmp_dir: run_args = argparse.Namespace( runmode="init", - input=list(glob.glob(xavier_base(".tests/*.fastq.gz"))), + input=list(glob.glob(f"{xavier_base('tests/data')}/*.fastq.gz")), output=tmp_dir, genome="hg38", targets=xavier_base("resources/Agilent_SSv7_allExons_hg38.bed"), mode="local", job_name="pl:xavier", callers=["mutect2", "mutect", "strelka", "vardict", "varscan"], - pairs=xavier_base(".tests/pairs.tsv"), + pairs=xavier_base("tests/data/pairs.tsv"), ffpe=False, cnv=False, wait=False, @@ -32,11 +34,14 @@ 
def test_dryrun(): threads=2, ) # init - allout_1 = run_in_context(run_args) + allout_1 = exec_in_context(run, run_args) run_args.runmode = "dryrun" # dryrun - allout_2 = run_in_context(run_args) - assert (all([ - "--Initializing" in allout_1, - "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution." in allout_2 - ])) + allout_2 = exec_in_context(run, run_args) + assert all( + [ + "--Initializing" in allout_1, + "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution." + in allout_2, + ] + ) diff --git a/tests/test_util.py b/tests/test_util.py index 4cccd51..ad5df41 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,10 +1,7 @@ import os import warnings -from xavier.src.xavier.util import ( - xavier_base -) +from xavier.src.xavier.util import xavier_base + def test_xavier_base(): - test_base = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - xavier_base() - assert xavier_base("a","b","c").endswith('/a/b/c') + assert str(xavier_base("a", "b", "c")).endswith("/a/b/c") diff --git a/workflow/rules/somatic_snps.paired.smk b/workflow/rules/somatic_snps.paired.smk index ce8071f..1f5427b 100644 --- a/workflow/rules/somatic_snps.paired.smk +++ b/workflow/rules/somatic_snps.paired.smk @@ -73,7 +73,6 @@ rule pileup_paired: wait """ -localrules: contamination_paired rule contamination_paired: input: tumor = os.path.join(output_somatic_snpindels, "mutect2_out", "pileup_summaries", "{samples}_tumor.pileup.table"), diff --git a/workflow/rules/somatic_snps.tumor_only.smk b/workflow/rules/somatic_snps.tumor_only.smk index 650e3b9..826b07f 100644 --- a/workflow/rules/somatic_snps.tumor_only.smk +++ b/workflow/rules/somatic_snps.tumor_only.smk @@ -67,7 +67,6 @@ rule pileup_single: -O {output.pileup} """ -localrules: contamination_single rule contamination_single: input: pileup = os.path.join(output_somatic_snpindels, "mutect2_out", "pileup_summaries", "{samples}.pileup.table")