From a7712da2401e21d24f36c707b5b51216f9f099f6 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 5 Jul 2024 15:43:40 +0200 Subject: [PATCH] docs: Check & update of configuration defaults (first part) --- snappy_pipeline/apps/tpls/project_config.yaml | 40 +++++++++++--- .../workflows/ngs_mapping/model.py | 52 ++++++++++++++++--- .../somatic_variant_calling/model.py | 38 ++++++++++++-- 3 files changed, 113 insertions(+), 17 deletions(-) diff --git a/snappy_pipeline/apps/tpls/project_config.yaml b/snappy_pipeline/apps/tpls/project_config.yaml index 0e73a72b2..1bbe91a06 100644 --- a/snappy_pipeline/apps/tpls/project_config.yaml +++ b/snappy_pipeline/apps/tpls/project_config.yaml @@ -4,18 +4,46 @@ # Step Configuration ============================================================================== # -# Configuration for paths with static data. This has been preconfigured for the paths on the BIH -# cluster. +# Configuration for paths with static data (GRCh37/hs37d5 genome release, no CHR prefix in contig names). +# This has been preconfigured for the paths on the BIH cluster. 
# static_data_config: cosmic: - path: /fast/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz dbnsfp: - path: /fast/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz dbsnp: - path: /fast/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz + path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz reference: - path: /fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa + path: /data/cephfs-1/work/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa + features: + path: /data/cephfs-1/work/projects/cubit/current/static_data/annotation/GENCODE/19/GRCh37/gencode.v19.annotation.gtf + +# Step Configuration ============================================================================== +# +# Configuration for paths with static data (GRCh38.d1.vd1 genome release, includes decoys & viral sequences). +# This has been preconfigured for the paths on the BIH cluster (CUBI group members only). +# +# Notes: +# - GRCh38.d1.vd1 is the genome release used by the GDC consortium +# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/#alignment-workflow) +# - GENCODE release 36 is the feature annotation used by the GDC consortium for its RNA pipeline +# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#rna-seq-alignment-workflow) +# - The GENCODE release 36 corresponds to ENSEMBL release 102 +# (https://www.gencodegenes.org/human/releases.html) +# - Some files have not yet been moved to Tier 1. 
+# +# static_data_config: +# cosmic: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v90/GRCh38/CosmicAll.vcf.gz +# dbnsfp: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/3.5/GRCh38/dbNSFP.txt.gz +# dbsnp: +# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh38/common_all_20160407.vcf.gz +# reference: +# path: /fast/work/groups/cubi/projects/biotools/static_data/reference/GRCh38.d1.vd1/GRCh38.d1.vd1.fa +# features: +# path: /fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotation/GENCODE/36/gencode.v36.primary_assembly.annotation.gtf # Step Configuration ============================================================================== # diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py index 30968a324..5ff25fd50 100644 --- a/snappy_pipeline/workflows/ngs_mapping/model.py +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -43,6 +43,9 @@ class TargetCoverageReportEntry(SnappyModel): - name: IDT_xGen_V1_0 pattern: "xGen Exome Research Panel V1\\.0*" path: "path/to/targets.bed" + + BED files for many Agilent exome panels can be found in + /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent """ name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] @@ -72,7 +75,14 @@ class BwaMode(Enum): class BwaMapper(SnappyModel): - path_index: str + path_index: str = Field( + examples=[ + "/data/cephfs-1/work/projects/cubit/current/static_data/precomputed/BWA/0.7.17/GRCh37/hs37d5/hs37d5.fa", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA/0.7.17/GRCh38.d1.vd1.fa", + "/data/cephfs-1/work/groups/cubi/projects/biotools/bwa-mem2/GRCh37/hs37d5/hs37d5.fa", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA_MEM2/2.2.1/GRCh38.d1.vd1.fa", + ] + ) """Required if listed in ngs_mapping.tools.dna; otherwise, can be removed."""
num_threads_align: int = 16 num_threads_trimming: int = 8 @@ -145,8 +155,13 @@ class Somatic(SnappyModel): class Bqsr(SnappyModel): - common_variants: str - """Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)""" + common_variants: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz", + ] + ) + """Common germline variants (see https://console.cloud.google.com/storage/browser/gatk-best-practices)""" class AgentLibPrepType(Enum): @@ -158,7 +173,11 @@ class AgentLibPrepType(Enum): class AgentPrepare(SnappyModel): - path: str + path: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/trimmer-3.0.5.jar" + ] + ) lib_prep_type: AgentLibPrepType = None """One of "halo" (HaloPlex), "hs" (HaloPlexHS), "xt" (SureSelect XT, XT2, XT HS), "v2" (SureSelect XT HS2) & "qxt" (SureSelect QXT)""" @@ -174,8 +193,17 @@ class AgentMarkDuplicatesConsensusMode(Enum): class AgentMarkDuplicates(SnappyModel): - path: str + path: str = Field( + examples=["/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/creak-1.0.5.jar"] + ) path_baits: str + """ + Different exome panels cannot be accommodated here, because the selection method used for coverage is not used. + The absolute path of the exome panel must be input.
+ + BED files for many Agilent exome panels can be found in + /fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent + """ consensus_mode: AgentMarkDuplicatesConsensusMode = None """One of "SINGLE", "HYBRID", "DUPLEX" """ @@ -194,7 +222,12 @@ class Agent(SnappyModel): class Star(SnappyModel): - path_index: str + path_index: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/precomputed/STAR/2.7.10a/100", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/precomputed/STAR/STAR_2.7.10a_100", + ] + ) """Required if listed in ngs_mapping.tools.rna; otherwise, can be removed.""" num_threads_align: int = 16 num_threads_trimming: int = 8 @@ -245,7 +278,12 @@ class Strand(enum.IntEnum): class Strandedness(SnappyModel): - path_exon_bed: str + path_exon_bed: str = Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/gencode.v19.chr_scaff.annotation.cds.collapse_annotation.bed", + "/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/gencode.v36.primary_assembly.annotation.cds.collapse_annotation.bed", + ] + ) """Location of usually highly expressed genes.
Known protein coding genes is a good choice""" strand: Strand = Strand.UNKNOWN diff --git a/snappy_pipeline/workflows/somatic_variant_calling/model.py b/snappy_pipeline/workflows/somatic_variant_calling/model.py index 5e017785a..82b973e9a 100644 --- a/snappy_pipeline/workflows/somatic_variant_calling/model.py +++ b/snappy_pipeline/workflows/somatic_variant_calling/model.py @@ -98,13 +98,38 @@ class Mutect2(Parallel): # Sadly a type of # `FilePath | None = None` # still applies `FilePath` validation on `None`, which errors - panel_of_normals: str | None = "" + panel_of_normals: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-exome-panel.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-WGS-panel-b37.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/1000g_pon.hg38.vcf.gz", + ] + ), + ] = None """Set path to panel of normals vcf if required""" - germline_resource: str | None = "" + germline_resource: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/af-only-gnomad.raw.sites.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/af-only-gnomad.hg38.vcf.gz", + ] + ), + ] = None """Germline variants resource (same as panel of normals)""" - common_variants: str | None = "" + common_variants: Annotated[ + str | None, + Field( + examples=[ + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz", + "/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz", + ] + ), + ] = None """Common germline variants for contamination estimation""" extra_arguments: Annotated[ @@ -220,7 +245,12 @@ class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin): ignore_chroms: Annotated[ list[str], - 
Field(examples=["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]), + Field( + examples=[ + ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"], + ["*_decoy", "EBV", "HPV*", "HBV", "HCV-*", "HIV-*", "HTLV-1", "CMV", "KSHV", "MCV", "SV40", "chrUn_GL000220v1"] + ] + ), ] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"] """Patterns of contig names to ignore"""