Skip to content

Commit

Permalink
docs: Check & update of configuration defaults (first part)
Browse files Browse the repository at this point in the history
  • Loading branch information
ericblanc20 committed Jul 5, 2024
1 parent 17c1a87 commit a7712da
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 17 deletions.
40 changes: 34 additions & 6 deletions snappy_pipeline/apps/tpls/project_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,46 @@

# Step Configuration ==============================================================================
#
# Configuration for paths with static data. This has been preconfigured for the paths on the BIH
# cluster.
# Configuration for paths with static data (GRCh37/hs37d5 genome release, no CHR prefix in contig names).
# This has been preconfigured for the paths on the BIH cluster.
#
static_data_config:
cosmic:
path: /fast/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz
path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v72/GRCh37/CosmicAll.vcf.gz
dbnsfp:
path: /fast/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz
path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/2.9/dbNSFP2.9.txt.gz
dbsnp:
path: /fast/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz
path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh37/All_20160408.vcf.gz
reference:
path: /fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa
path: /data/cephfs-1/work/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa
features:
path: /data/cephfs-1/work/projects/cubit/current/static_data/annotation/GENCODE/19/GRCh37/gencode.v19.annotation.gtf

# Step Configuration ==============================================================================
#
# Configuration for paths with static data (GRCh38.d1.vd1 genome release, includes decoys & viral sequences).
# This has been preconfigured for the paths on the BIH cluster (CUBI group members only).
#
# Notes:
# - GRCh38.d1.vd1 is the genome release used by the GDC consortium
# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/#alignment-workflow)
# - GENCODE release 36 is the feature annotation used by the GDC consortium for its RNA pipeline
# (https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline/#rna-seq-alignment-workflow)
# - The GENCODE release 36 corresponds to ENSEMBL release 102
# (https://www.gencodegenes.org/human/releases.html)
# - Some files have not yet been moved to Tier 1.
#
# static_data_config:
# cosmic:
# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/COSMIC/v90/GRCh38/CosmicAll.vcf.gz
# dbnsfp:
# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbNSFP/3.5/GRCh38/dbNSFP.txt.gz
# dbsnp:
# path: /data/cephfs-1/work/projects/cubit/current/static_data/db/dbSNP/b147/GRCh38/common_all_20160407.vcf.gz
# reference:
# path: /fast/work/groups/cubi/projects/biotools/static_data/reference/GRCh38.d1.vd1/GRCh38.d1.vd1.fa
# features:
# path: /fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotation/GENCODE/36/gencode.v36.primary_assembly.annotation.gtf

# Step Configuration ==============================================================================
#
Expand Down
52 changes: 45 additions & 7 deletions snappy_pipeline/workflows/ngs_mapping/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class TargetCoverageReportEntry(SnappyModel):
- name: IDT_xGen_V1_0
pattern: "xGen Exome Research Panel V1\\.0*"
path: "path/to/targets.bed"
BED files for many Agilent exome panels can be found in
/fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent
"""

name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])]
Expand Down Expand Up @@ -72,7 +75,14 @@ class BwaMode(Enum):


class BwaMapper(SnappyModel):
path_index: str
path_index: str = Field(
examples=[
"/data/cephfs-1/work/projects/cubit/current/static_data/precomputed/BWA/0.7.17/GRCh37/hs37d5/hs37d5.fa",
"/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA/0.7.17/GRCh38.d1.vd1.fa",
"/data/cephfs-1/work/groups/cubi/projects/biotools/bwa-mem2/GRCh37/hs37d5/hs37d5.fa",
"/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38.d1.vd1/precomputed/BWA_MEM2/2.2.1/GRCh38.d1.vd1.fa",
]
)
"""Required if listed in ngs_mapping.tools.dna; otherwise, can be removed."""
num_threads_align: int = 16
num_threads_trimming: int = 8
Expand Down Expand Up @@ -145,8 +155,13 @@ class Somatic(SnappyModel):


class Bqsr(SnappyModel):
common_variants: str
"""Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)"""
common_variants: str = Field(
examples=[
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz",
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz",
]
)
"""Common germline variants (see https://console.cloud.google.com/storage/browser/gatk-best-practices)"""


class AgentLibPrepType(Enum):
Expand All @@ -158,7 +173,11 @@ class AgentLibPrepType(Enum):


class AgentPrepare(SnappyModel):
path: str
path: str = Field(
examples=[
"/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/trimmer-3.0.5.jar"
]
)

lib_prep_type: AgentLibPrepType = None
"""One of "halo" (HaloPlex), "hs" (HaloPlexHS), "xt" (SureSelect XT, XT2, XT HS), "v2" (SureSelect XT HS2) & "qxt" (SureSelect QXT)"""
Expand All @@ -174,8 +193,17 @@ class AgentMarkDuplicatesConsensusMode(Enum):


class AgentMarkDuplicates(SnappyModel):
path: str
path: str = Field(
examples=["/fast/work/groups/cubi/projects/biotools/AGeNT_3.0.6/agent/lib/creak-1.0.5.jar"]
)
path_baits: str
"""
Different exome panels cannot be accommodated here, because the selection method used for coverage is not used.
The absolute path of the exome panel must be provided.
BED files for many Agilent exome panels can be found in
/fast/work/groups/cubi/projects/biotools/static_data/exome_panel/Agilent
"""
consensus_mode: AgentMarkDuplicatesConsensusMode = None
"""One of "SINGLE", "HYBRID", "DUPLEX" """

Expand All @@ -194,7 +222,12 @@ class Agent(SnappyModel):


class Star(SnappyModel):
path_index: str
path_index: str = Field(
examples=[
"/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/precomputed/STAR/2.7.10a/100",
"/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/precomputed/STAR/STAR_2.7.10a_100",
]
)
"""Required if listed in ngs_mapping.tools.rna; otherwise, can be removed."""
num_threads_align: int = 16
num_threads_trimming: int = 8
Expand Down Expand Up @@ -245,7 +278,12 @@ class Strand(enum.IntEnum):


class Strandedness(SnappyModel):
path_exon_bed: str
path_exon_bed: str = Field(
examples=[
"/fast/work/groups/cubi/projects/biotools/static_data_by_ref/hs37d5/annotation/GENCODE/19/gencode.v19.chr_scaff.annotation.cds.collapse_annotation.bed",
"/fast/work/groups/cubi/projects/biotools/static_data_by_ref/GRCh38/annotations/GENCODE/36/gencode.v36.primary_assembly.annotation.cds.collapse_annotation.bed",
]
)
"""Location of usually highly expressed genes. Known protein coding genes is a good choice"""

strand: Strand = Strand.UNKNOWN
Expand Down
38 changes: 34 additions & 4 deletions snappy_pipeline/workflows/somatic_variant_calling/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,38 @@ class Mutect2(Parallel):
# Sadly a type of
# `FilePath | None = None`
# still applies `FilePath` validation on `None`, which errors
panel_of_normals: str | None = ""
panel_of_normals: Annotated[
str | None,
Field(
examples=[
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-exome-panel.vcf.gz",
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/Mutect2-WGS-panel-b37.vcf.gz",
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/1000g_pon.hg38.vcf.gz",
]
),
] = None
"""Set path to panel of normals vcf if required"""

germline_resource: str | None = ""
germline_resource: Annotated[
str | None,
Field(
examples=[
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/af-only-gnomad.raw.sites.vcf.gz",
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/af-only-gnomad.hg38.vcf.gz",
]
),
] = None
"""Germline variants resource (same as panel of normals)"""

common_variants: str | None = ""
common_variants: Annotated[
str | None,
Field(
examples=[
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/b37/small_exac_common_3.vcf.gz",
"/fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK/hg38/small_exac_common_3.hg38.vcf.gz",
]
),
] = None
"""Common germline variants for contamination estimation"""

extra_arguments: Annotated[
Expand Down Expand Up @@ -220,7 +245,12 @@ class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin):

ignore_chroms: Annotated[
list[str],
Field(examples=["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]),
Field(
examples=[
["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"],
["*_decoy", "EBV", "HPV*", "HBV", "HCV-*", "HIV-*", "HTLV-1", "CMV", "KSHV", "MCV", "SV40", "chrUn_GL000220v1"]
]
),
] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]
"""Patterns of contig names to ignore"""

Expand Down

0 comments on commit a7712da

Please sign in to comment.