Merge pull request #102 from CCBR/iss-100

Set default exome targets file based on genome
CCBR · Aug 12, 2024 · 0c1a3ce · 0c1a3ce
2 parents cbf9842 + 86b7bfc
commit 0c1a3ce
Show file tree

Hide file tree

Showing 15 changed files with 106 additions and 21 deletions.
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -26,15 +26,15 @@ jobs:
           /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
           /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
           /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
-          --output /opt2/output_tn_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tn_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init
 
           docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \
           /opt2/bin/xavier run --input \
           /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
           /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
           /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
-          --output /opt2/output_tn_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tn_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun
 
       - name: Tumor-only FastQ Dry Run
@@ -44,15 +44,15 @@ jobs:
           /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
           /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
           /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
-          --output /opt2/output_tonly_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tonly_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --genome hg38 --mode local --ffpe --runmode init
 
           docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \
           /opt2/bin/xavier run --input \
           /opt2/.tests/Sample10_ARK1_S37.R1.fastq.gz /opt2/.tests/Sample10_ARK1_S37.R2.fastq.gz \
           /opt2/.tests/Sample11_ACI_158_S38.R1.fastq.gz /opt2/.tests/Sample11_ACI_158_S38.R2.fastq.gz \
           /opt2/.tests/Sample4_CRL1622_S31.R1.fastq.gz /opt2/.tests/Sample4_CRL1622_S31.R2.fastq.gz \
-          --output /opt2/output_tonly_fqs --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tonly_fqs --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --genome hg38 --mode local --ffpe --runmode dryrun
 
       - name: Tumor-normal BAM Dry Run
@@ -62,15 +62,15 @@ jobs:
           /opt2/.tests/Sample10_ARK1_S37.recal.bam \
           /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
           /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
-          --output /opt2/output_tn_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tn_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode init
 
           docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \
           /opt2/bin/xavier run --input \
           /opt2/.tests/Sample10_ARK1_S37.recal.bam \
           /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
           /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
-          --output /opt2/output_tn_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tn_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --pairs /opt2/.tests/pairs.tsv --genome hg38 --mode local --ffpe --cnv --runmode dryrun
 
       - name: Tumor-only BAM Dry Run
@@ -80,15 +80,15 @@ jobs:
           /opt2/.tests/Sample10_ARK1_S37.recal.bam \
           /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
           /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
-          --output /opt2/output_tonly_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tonly_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --genome hg38 --mode local --ffpe --runmode init
 
           docker run -v $PWD:/opt2 snakemake/snakemake:v7.32.4 \
           /opt2/bin/xavier run --input \
           /opt2/.tests/Sample10_ARK1_S37.recal.bam \
           /opt2/.tests/Sample11_ACI_158_S38.recal.bam \
           /opt2/.tests/Sample4_CRL1622_S31.recal.bam \
-          --output /opt2/output_tonly_bams --targets /opt2/.tests/Agilent_SSv7_allExons_hg38.bed \
+          --output /opt2/output_tonly_bams --targets /opt2/resources/Agilent_SSv7_allExons_hg38.bed \
           --genome hg38 --mode local --ffpe --runmode dryrun
 
       - name: Lint

diff --git a/.tests/Agilent_SSv7_allExons_hg38.bed b/.tests/Agilent_SSv7_allExons_hg38.bed
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,8 @@
 - The docs website now has a dropdown menu to select which version to view. The latest release is shown by default. (#150, @kelly-sovacool)
 - Add `xavier gui` subcommand to launch the graphical user interface. (#99, @kelly-sovacool)
   - Previously, `xavier_gui` (with an underscore) was a command in the `ccbrpipeliner` module.
+- Provide default exome targets for hg38 and mm10, which can be overridden by the optional `--targets` argument. (#102, @kelly-sovacool)
+  - Previously, the `--targets` argument was required with no defaults.
 
 ## XAVIER 3.0.3
 

diff --git a/README.md b/README.md
@@ -61,7 +61,7 @@ xavier run \
 --output /data/$USER/xavier_hg38 \
 --genome hg38 \
 --pairs pairs.txt \
---targets Targets_hg38.bed \
+--targets resources/Agilent_SSv7_allExons_hg38.bed \
 --mode slurm \
 --runmode init
 
@@ -71,7 +71,7 @@ xavier run \
 --output /data/$USER/xavier_hg38 \
 --genome hg38 \
 --pairs pairs.txt \
---targets Targets_hg38.bed \
+--targets resources/Agilent_SSv7_allExons_hg38.bed \
 --mode slurm \
 --runmode dryrun
 
@@ -81,7 +81,7 @@ xavier run \
 --output /data/$USER/xavier_hg38 \
 --genome hg38 \
 --pairs pairs.txt \
---targets Targets_hg38.bed \
+--targets resources/Agilent_SSv7_allExons_hg38.bed \
 --mode slurm \
 --runmode run
 ```
@@ -109,7 +109,7 @@ xavier run \
 --sif-cache $SIFCACHE \
 --tmp-dir $TMPDIR \
 --pairs pairs.txt \
---targets Targets_hg38.bed \
+--targets resources/Agilent_SSv7_allExons_hg38.bed \
 --mode slurm \
 --runmode init # run
 

diff --git a/config/genomes/biowulf/hg38.json b/config/genomes/biowulf/hg38.json
@@ -1,6 +1,7 @@
 {
     "references": {
         "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.biowulf.conf",
+        "exome_targets": "resources/Agilent_SSv7_allExons_hg38.bed",
         "KRAKENBACDB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
         "trimmomatic.adapters": "resources/adapters.fa",
         "SNPEFF_GENOME": "GRCh38.86",

diff --git a/config/genomes/biowulf/mm10.json b/config/genomes/biowulf/mm10.json
@@ -1,6 +1,7 @@
 {
     "references": {
         "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.biowulf.conf",
+        "exome_targets": "resources/SureSelect_mm10_sorted.bed",
         "KRAKENBACDB": "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
         "trimmomatic.adapters": "resources/adapters.fa",
         "SNPEFF_GENOME": "GRCm38.86",

diff --git a/config/genomes/frce/hg38.json b/config/genomes/frce/hg38.json
@@ -1,6 +1,7 @@
 {
     "references": {
         "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.frce.conf",
+        "exome_targets": "resources/Agilent_SSv7_allExons_hg38.bed",
         "KRAKENBACDB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
         "trimmomatic.adapters": "resources/adapters.fa",
         "SNPEFF_GENOME": "GRCh38.86",

diff --git a/config/genomes/frce/mm10.json b/config/genomes/frce/mm10.json
@@ -1,6 +1,7 @@
 {
     "references": {
         "FASTQ_SCREEN_CONFIG": "resources/fastq_screen.frce.conf",
+        "exome_targets": "resources/SureSelect_mm10_sorted.bed",
         "KRAKENBACDB": "/mnt/projects/CCBR-Pipelines/pipelines/XAVIER/resources/hg38/kraken/20180907_standard_kraken2",
         "trimmomatic.adapters": "resources/adapters.fa",
         "SNPEFF_GENOME": "GRCm38.86",

diff --git a/docs/usage/run.md b/docs/usage/run.md
@@ -82,7 +82,7 @@ Each of the following arguments are required. Failure to provide a required argu
 > This option defines the reference genome for your set of samples. On Biowulf, xavier comes bundled with pre built reference files for human samples; however, it is worth noting that the pipeline does accept a pre-built resource bundle pulled with the cache sub command (coming soon). Currently, the pipeline only supports the human reference hg38; however, support for mouse reference mm10 will be added soon.
 >
 > **_Pre built Option_**  
-> Here is a list of available pre built genomes on Biowulf: hg38.
+> Here is a list of available pre built genomes on Biowulf: hg38, mm10.
 >
 > **_Custom Option_**  
 > For users running the pipeline outside of Biowulf, a pre-built resource bundle can be pulled with the cache sub command (coming soon). Please supply the custom reference JSON file that was generated by the cache sub command.
@@ -98,7 +98,9 @@ Each of the following arguments are required. Failure to provide a required argu
 >
 > This file can be obtained from the manufacturer of the target capture kit that was used.
 >
-> **_Example:_** `--targets /data/$USER/Agilent_SSv7_allExons_hg38.bed`
+> If not provided, the default targets file from the genome config is used.
+>
+> **_Examples:_** `--targets resources/Agilent_SSv7_allExons_hg38.bed` or `--targets resources/SureSelect_mm10_sorted.bed`
 
 ### 2.2 Options
 

diff --git a/resources/Agilent_SSv7_allExons_hg38.bed b/resources/Agilent_SSv7_allExons_hg38.bed
diff --git a/resources/fastq_screen.frce.conf b/resources/fastq_screen.frce.conf
diff --git a/src/xavier/__main__.py b/src/xavier/__main__.py
@@ -245,6 +245,9 @@ def parsed_arguments():
                                 Path to exome targets BED file. This file can be
                                 obtained from the manufacturer of the target capture
                                 kit that was used.
+                                If not provided, the default targets file is used from the genome config file.
+                                Example: --targets resources/Agilent_SSv7_allExons_hg38.bed
+                                Example: --targets resources/SureSelect_mm10_sorted.bed
 
         """
     )
@@ -264,15 +267,15 @@ def parsed_arguments():
                         --input .tests/*.R?.fastq.gz \\
                         --output /data/$USER/xavier_hg38 \\
                         --genome hg38 \\
-                        --targets .tests/Agilent_SSv7_allExons_hg38.bed
+                        --targets resources/Agilent_SSv7_allExons_hg38.bed
 
           # Step 2B.) Dry-run the pipeline
           xavier run \\
                         --runmode dryrun \\
                         --input .tests/*.R?.fastq.gz \\
                         --output /data/$USER/xavier_hg38 \\
                         --genome hg38 \\
-                        --targets Agilent_SSv7_allExons_hg38.bed \\
+                        --targets resources/Agilent_SSv7_allExons_hg38.bed \\
                         --mode slurm \\
 
           # Step 2C.) Run the XAVIER pipeline
@@ -283,7 +286,7 @@ def parsed_arguments():
                         --input .tests/*.R?.fastq.gz \\
                         --output /data/$USER/xavier_hg38 \\
                         --genome hg38 \\
-                        --targets .tests/Agilent_SSv7_allExons_hg38.bed \\
+                        --targets resources/Agilent_SSv7_allExons_hg38.bed \\
                         --mode slurm
 
         version:
@@ -354,8 +357,9 @@ def parsed_arguments():
         "--targets",
         # Check if the file exists and if it is readable
         type=lambda file: permissions(parser, file, os.R_OK),
-        required=True,
+        required=False,
         help=argparse.SUPPRESS,
+        default=None,
     )
 
     # Optional Arguments

diff --git a/src/xavier/run.py b/src/xavier/run.py
@@ -322,7 +322,7 @@ def setup(sub_args, repo_path, output_path, create_nidap_folder_YN="no", links=[
             f"{shorthostname} unknown host. Configuration files for references may not be correct. Defaulting to Biowulf config"
         )
     else:
-        print(f"Thank you for running XAVIER on {shorthostname.upper()}")   
+        print(f"Thank you for running XAVIER on {shorthostname.upper()}")
 
     genome_config = os.path.join(
         repo_path, "config", "genomes", get_hpcname(), sub_args.genome + ".json"
@@ -370,7 +370,17 @@ def setup(sub_args, repo_path, output_path, create_nidap_folder_YN="no", links=[
     # Add optional cli workflow steps
     config["input_params"]["CNV_CALLING"] = str(sub_args.cnv).lower()
     config["input_params"]["FFPE_FILTER"] = str(sub_args.ffpe).lower()
-    config["input_params"]["EXOME_TARGETS"] = str(sub_args.targets)
+    config["input_params"]["EXOME_TARGETS"] = (
+        str(sub_args.targets)
+        if sub_args.targets
+        else os.path.join(
+            config["project"]["workpath"], config["references"]["exome_targets"]
+        )
+    )
+    if not os.path.exists(config["input_params"]["EXOME_TARGETS"]):
+        raise FileNotFoundError(
+            f"Exome targets file does not exist: {config['input_params']['EXOME_TARGETS']}"
+        )
     config["input_params"]["VARIANT_CALLERS"] = sub_args.callers
     config["input_params"]["PAIRS_FILE"] = str(sub_args.pairs)
     config["input_params"]["BASE_OUTDIR"] = str(sub_args.output)

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,5 +1,34 @@
+import json
+import os
 import subprocess
+import tempfile
 from xavier.src.xavier.__main__ import main
+from xavier.src.xavier.util import get_hpcname
+
+xavier_run = (  # shared CLI prefix; run_in_temp appends --output/--runmode, tests append --genome/--targets
+    "xavier run "
+    "--input .tests/*.fastq.gz "
+    "--pairs .tests/pairs.tsv "
+    "--mode local "
+)
+
+
+def run_in_temp(command_str):  # run `init` then `dryrun` of command_str with --output inside a temp dir
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        outdir = os.path.join(tmp_dir, "testout")
+        run_command = f"{command_str} --output {outdir}"
+        output = subprocess.run(
+            f"{run_command} --runmode init && {run_command} --runmode dryrun",  # `&&`: dryrun only runs if init succeeds
+            capture_output=True,
+            shell=True,
+            text=True,
+        )
+        if os.path.exists(os.path.join(outdir, "config.json")):  # read the config before the temp dir is removed
+            with open(os.path.join(outdir, "config.json"), "r") as infile:
+                config = json.load(infile)
+        else:
+            config = None  # no config.json was found in outdir
+    return output, config  # (CompletedProcess, parsed config.json or None)
 
 
 def test_help():
@@ -9,3 +38,37 @@ def test_help():
             "./bin/xavier --help", capture_output=True, shell=True, text=True
         ).stdout
     )
+
+
+def test_dryrun_targets():  # checks default and custom --targets resolution for hg38/mm10, plus an invalid path
+    if get_hpcname() == "biowulf":  # the assertions below are only exercised when the host is biowulf
+        output_human, config_human = run_in_temp(f"{xavier_run} --genome hg38")
+        output_mouse, config_mouse = run_in_temp(f"{xavier_run} --genome mm10")
+        output_custom, config_custom = run_in_temp(
+            f"{xavier_run} --genome mm10 --targets resources/Agilent_SSv7_allExons_hg38.bed"
+        )
+        output_invalid, config_invalid = run_in_temp(
+            f"{xavier_run} --genome hg38 --targets not/a/file.txt"  # full flag name; do not rely on argparse prefix abbreviation
+        )
+        assert all(
+            [
+                "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution."
+                in output_human.stdout,
+                "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution."
+                in output_mouse.stdout,
+                "This was a dry-run (flag -n). The order of jobs does not reflect the order of execution."
+                in output_custom.stdout,
+                "error: Path 'not/a/file.txt' does not exists! Failed to provide valid input."
+                in output_invalid.stderr,
+                config_human["input_params"]["EXOME_TARGETS"].endswith(
+                    "resources/Agilent_SSv7_allExons_hg38.bed"
+                ),
+                config_mouse["input_params"]["EXOME_TARGETS"].endswith(
+                    "resources/SureSelect_mm10_sorted.bed"
+                ),
+                config_custom["input_params"]["EXOME_TARGETS"].endswith(
+                    "resources/Agilent_SSv7_allExons_hg38.bed"  # an explicit --targets overrides the mm10 default
+                ),
+                not config_invalid,  # run_in_temp returns None when no config.json was written
+            ]
+        )
diff --git a/tests/test_run.py b/tests/test_run.py
@@ -16,7 +16,7 @@ def test_dryrun():
                 input=list(glob.glob(xavier_base(".tests/*.fastq.gz"))),
                 output=tmp_dir,
                 genome="hg38",
-                targets=xavier_base(".tests/Agilent_SSv7_allExons_hg38.bed"),
+                targets=xavier_base("resources/Agilent_SSv7_allExons_hg38.bed"),
                 mode="local",
                 job_name="pl:xavier",
                 callers=["mutect2", "mutect", "strelka", "vardict", "varscan"],