diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f182086fb..797b81a81 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: - name: Install Python uses: actions/setup-python@v2 with: - python-version: "3.10" + python-version: "3.12" - name: Install dependencies run: | @@ -42,10 +42,9 @@ jobs: uses: marocchino/sticky-pull-request-comment@v1.1.0 with: message: | - - Please format your Python code with [black](https://black.readthedocs.io): `make black` + - Please format your Python code with [ruff](https://docs.astral.sh/ruff/): `make fmt` + - Please check your Python code with [ruff](https://docs.astral.sh/ruff/): `make check` - Please format your Snakemake code with [snakefmt](https://github.com/snakemake/snakefmt): `make snakefmt` - - Please organize your imports [isorts](https://isort.readthedocs.io): `make isort` - - Please ensure that your code passes [flake8](https://flake8.pycqa.org/en/latest/): `make flake8` You can trigger all lints locally by running `make lint` GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -58,11 +57,12 @@ jobs: - name: Install Python uses: actions/setup-python@v2 with: - python-version: "3.10" + python-version: "3.12" - name: Install dependencies run: | - pip install -r requirements/test.txt + pip install -r requirements/base.txt -r requirements/test.txt + pip install -e . pip freeze - name: Build documentation @@ -75,7 +75,7 @@ jobs: strategy: matrix: python-version: - - "3.10" + - "3.12" # - "3.11" # no compatible pysam yet # - "3.12" # no compatible pysam yet needs: linting @@ -95,7 +95,7 @@ jobs: - name: Prepare environment.yml file run: > cp environment.yml /tmp/environment.yml && sed -i -e - 's/- python/- python=${{ matrix.python-version }}/' + 's/- python=.*/- python=${{ matrix.python-version }}/' /tmp/environment.yml - name: Update environment using mamba run: mamba env update --name root --file /tmp/environment.yml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..fc3b9db20 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: + - repo: https://github.com/charliermarsh/ruff-pre-commit + # Ruff version. 
+ rev: v0.4.8 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format diff --git a/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 b/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 index 2622bc0e8..76be4c686 100644 --- a/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 +++ b/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 @@ -23,9 +23,9 @@ step_config: somatic_variant_calling: + path_ngs_mapping: ../ngs_mapping tools: [mutect2] mutect2: - ngs_mapping: ../ngs_mapping extra_arguments: [] window_length: 300000000 keep_tmpdir: onerror diff --git a/.tests/test-workflow/workflow/envs/snappy.yaml b/.tests/test-workflow/workflow/envs/snappy.yaml index d095ae02c..21ed2f92c 100644 --- a/.tests/test-workflow/workflow/envs/snappy.yaml +++ b/.tests/test-workflow/workflow/envs/snappy.yaml @@ -4,7 +4,7 @@ channels: dependencies: # Fundamentals - - python >=3.11.0 + - python >=3.12.0 - pip =24 - git-lfs =3.5 - gcc_linux-64 =13.2 diff --git a/.tests/test-workflow/workflow/rules/common.smk b/.tests/test-workflow/workflow/rules/common.smk index 3edea7a00..329a3d765 100644 --- a/.tests/test-workflow/workflow/rules/common.smk +++ b/.tests/test-workflow/workflow/rules/common.smk @@ -23,7 +23,7 @@ def folder_names(pipeline: str) -> list[str]: def reference_path() -> str: if chrom := config["reference"].get("chromosome", None): - return f"resources/refs/{chrom}.fa.gz" + return "resources/refs/" + chrom + ".fa.gz" else: return "resources/refs/genome.fa.gz" @@ -33,8 +33,8 @@ def reference_faidx_region_string(wildcards) -> str: region = config["reference"].get("region", None) match chrom, region: case None, None: - return f"" + return "" case chrom, None: - return f"{chrom}" + return chrom case chrom, region: - return f"{chrom}:{region}" + return chrom + ":" + region diff --git a/Makefile b/Makefile index 7f7df1b0b..889e73089 100644 --- a/Makefile +++ b/Makefile @@ -4,49 +4,42 @@ default: help .PHONY: help help: @echo help -- display this help - @echo black -- apply black code formatter + @echo fmt -- apply code formatter @echo snakefmt -- apply snakefmt code formatter @echo srcfmt -- apply black and snakefmt formatters @echo lint -- run linters @echo test -- run tests through pytest - @echo isort -- run isort + @echo check -- run code checker -.PHONY: black -black: - black -l 100 . +.PHONY: fmt +fmt: + ruff format . .PHONY: snakefmt snakefmt: snakefmt -l 100 . --include '(\.smk$$|\.rules$$|^Snakefile)' .PHONY: srcfmt -srcfmt: black snakefmt +srcfmt: fmt snakefmt .PHONY: lint -lint: flake8 lint-black lint-snakefmt lint-isort +lint: check lint-fmt lint-snakefmt -.PHONY: isort -isort: - isort --force-sort-within-sections --profile=black . +.PHONY: check +check: + ruff check . -.PHONY: flake8 -flake8: - flake8 - -.PHONY: lint-black -lint-black: - black -l 100 --check . +.PHONY: lint-fmt +lint-fmt: + ruff format --check . .PHONY: lint-snakefmt lint-snakefmt: snakefmt -l 100 . --include '(\.smk$$|\.rules$$|^Snakefile)' --check -.PHONY: lint-isort -lint-isort: - isort --force-sort-within-sections --profile=black --check . test: - py.test + pytest coverage: coverage report diff --git a/docs/conf.py b/docs/conf.py old mode 100755 new mode 100644 index dcc373f77..c349356e5 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,30 +1,17 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +# Configuration file for the Sphinx documentation builder. # -# snappy_pipeline documentation build configuration file, created by -# sphinx-quickstart on Tue Jul 9 22:26:36 2013. 
-# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html -# isort:skip_file -import os -import sys -import textwrap +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -# If extensions (or modules to document with autodoc) are in another -# directory, add these directories to sys.path here. If the directory is -# relative to the documentation root, use os.path.abspath to make it -# absolute, like shown here. -# sys.path.insert(0, os.path.abspath('.')) +project = 'snappy-pipeline' +copyright = "2015-2024, CUBI, Berlin Institute of Health" +author = 'CUBI, Berlin Institute of Health' # Get the project root dir, which is the parent dir of this +import os, sys cwd = os.getcwd() project_root = os.path.dirname(cwd) @@ -34,183 +21,30 @@ sys.path.insert(0, project_root) import snappy_pipeline +# The short X.Y version. +version = snappy_pipeline.__version__ +# The full version, including alpha/beta/rc tags. +release = snappy_pipeline.__version__ -# -- General configuration --------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx_mdinclude", ] -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix of source filenames. +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', "step/DEFAULT_CONFIG_*.rst"] source_suffix = ".rst" - -# The encoding of source files. -# source_encoding = 'utf-8-sig' - -# The master toctree document. master_doc = "index" -# General information about the project. -project = "SNAPPY Pipeline" -copyright = "2015-2021, CUBI, Berlin Institute of Health" - -# The version info for the project you're documenting, acts as replacement -# for |version| and |release|, also used in various other places throughout -# the built documents. -# -# The short X.Y version. -version = snappy_pipeline.__version__ -# The full version, including alpha/beta/rc tags. -release = snappy_pipeline.__version__ - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# language = None - -# There are two options for replacing |today|: either, you set today to -# some non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. -# today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ["_build", "step/DEFAULT_CONFIG_*.rst"] - -# The reST default role (used for this markup: `text`) to use for all -# documents. 
-# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built -# documents. -# keep_warnings = False - - -# -- Options for HTML output ------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a -# theme further. For a list of options available for each theme, see the -# documentation. -# html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -# html_title = None - -# A shorter title for the navigation bar. Default is the same as -# html_title. -# html_short_title = None - -# The name of an image file (relative to this directory) to place at the -# top of the sidebar. -# html_logo = None - -# The name of an image file (within the static path) to use as favicon -# of the docs. This file should be a Windows icon file (.ico) being -# 16x16 or 32x32 pixels large. -# html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) -# here, relative to this directory. They are copied after the builtin -# static files, so a file named "default.css" will overwrite the builtin -# "default.css". -html_static_path = ["_static"] - -# If not '', a 'Last updated on:' timestamp is inserted at every page -# bottom, using the given strftime format. -html_last_updated_fmt = "%b %d, %Y" - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names -# to template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_domain_indices = True - -# If false, no index is generated. -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. -# Default is True. -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. -# Default is True. -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages -# will contain a tag referring to it. The value of this option -# must be the base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None - -# Output file base name for HTML help builder. 
-htmlhelp_basename = "snappy_pipelinedoc" +man_pages = [("index", "snappy_pipeline", "SNAPPY Pipeline Documentation", ["Manuel Holtgrewe"], 1)] # -- Options for LaTeX output ------------------------------------------ -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - # 'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto/manual]). latex_documents = [ ( "index", @@ -221,42 +55,6 @@ ) ] -# The name of an image file (relative to this directory) to place at -# the top of the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings -# are parts, not chapters. -# latex_use_parts = False - -# If true, show page references after internal links. -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. -# latex_domain_indices = True - - -# -- Options for manual page output ------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [("index", "snappy_pipeline", "SNAPPY Pipeline Documentation", ["Manuel Holtgrewe"], 1)] - -# If true, show URL addresses after external links. -# man_show_urls = False - - -# -- Options for Texinfo output ---------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) texinfo_documents = [ ( "index", @@ -269,26 +67,24 @@ ) ] -# Documents to append as an appendix to all manuals. -# texinfo_appendices = [] - -# If false, no module index is generated. -# texinfo_domain_indices = True -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -# If true, do not generate a @detailmenu in the "Top" node's menu. -# texinfo_no_detailmenu = False - -# -- Write out RST for DEFAULT_CONFIG automatically ------------------- +# html_theme = 'alabaster' +pygments_style = "sphinx" +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] +# Output file base name for HTML help builder. +htmlhelp_basename = "snappy_pipelinedoc" +html_last_updated_fmt = "%b %d, %Y" -# Taken from https://stackoverflow.com/a/45826052/84349 import importlib import pkgutil import snappy_pipeline.workflows +import textwrap for _, name, is_pkg in pkgutil.iter_modules(snappy_pipeline.workflows.__path__): if is_pkg: diff --git a/docs/index.rst b/docs/index.rst index 8090bc94d..4c3d82022 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,7 +2,7 @@ CUBI Pipeline Documentation =========================== -This is the documentatation for the CUBI Pipeline. +This is the documentation for the CUBI Pipeline. 
This documentation is split into four parts: Pipeline User Docs diff --git a/docs/installation.rst b/docs/installation.rst index 19e014729..595d1535d 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -12,7 +12,7 @@ Installation Prerequisites ------------- -The CUBI pipeline requires Python >=3.7 (e.g., from a Miniconda3 installation). +The CUBI pipeline requires Python >=3.12 (e.g., from a Miniconda3 installation). More recent versions also work but other requirements as Snakemake might make it depend on a more recent Python version. diff --git a/environment.yml b/environment.yml index 38f954f5d..81f3fa259 100644 --- a/environment.yml +++ b/environment.yml @@ -8,31 +8,64 @@ name: snappy_env dependencies: # Fundamentals - - python + - python=3.12 - pip - - git-lfs + - git-lfs ~=3.5.1 + + # for compiling packages from pip + - gcc_linux-64 ~=13.2.0 + - gxx_linux-64 ~=13.2.0 + + # basics some snappy wrappers rely on + - coreutils ~=9.5 + - gawk ~=5.3.0 + - bash ~=5.2.21 + - gzip ~=1.13 + + # pydantic is used to validate configuration files + - pydantic =2.7 # Snakemake is used for providing the actual wrapper calling functionality - - snakemake >=7.0.2 + - snakemake =7.32 # Additional libraries used by snappy - - ruamel.yaml # Nice, round-trip enabled YAML parsing - - fasteners # File-based locks - - termcolor # Helpful for CLIs - - matplotlib # Required for plotting - - jinja2 # Jinja 2 template rendering + - ruamel.yaml ==0.18.6 # Nice, round-trip enabled YAML parsing + - fasteners ==0.17.3 # File-based locks + - termcolor ==1.1.0 # Helpful for CLIs + - matplotlib ==3.8.4 # Required for plotting + - jinja2 ==3.1.4 # Jinja 2 template rendering # Bioinformatics-related libraries used by snappy - - htslib >=1.15 - - bcftools >=1.15 - - samtools >=1.15 - - vcfpy # Library for working with VCF files - - pysam # Support for vcfpy + - htslib ==1.20 + - bcftools ==1.20 + - samtools ==1.20 + + # packages for testing + - pytest ~=8.2.2 + - coverage ~=7.5.3 + - pytest-cov ~=5.0.0 + - pytest-mock ~=3.14.0 + - pytest-subprocess ~=1.5.0 + - pyfakefs ~=5.5.0 + - ruff ~=0.4.8 + - snakefmt ~=0.8.0 + - coveralls ~=4.0.1 + - sphinx ~=7.3.7 + - sphinx_rtd_theme ~=2.0.0 + - sphinx-mdinclude ~=0.6.0 + + # misc packages + - pytest-sugar ~=0.9.6 - # Parsing of ISA-tab - - altamisa - # CUBI libraries required by snappy (installed through pip) - pip: - - varfish-cli - - biomedsheets >=0.11.7 + # build varfish-cli from pypi + - varfish-cli ~=0.6.3 + # specific compatible biomedsheets revision + - git+https://github.com/bihealth/biomedsheets.git@4e0a8484850c39d1511036c3fe29ec0b4f9271f8 + # specific compatible altamisa revision + - git+https://github.com/bihealth/altamisa.git@817dc491ff819e4c80686082bf3e5f602f1ac14c + + # build pysam and vcfpy via pip (to avoid python version + conda packaging issues) + - pysam ~=0.22 + - vcfpy ~=0.13.8 diff --git a/requirements/base.txt b/requirements/base.txt index 695023439..f38ca2cc1 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,38 +1,38 @@ -# Simplified class builder -attrs +setuptools ~=68.1.2 # Nice, round-trip enabled YAML parsing -ruamel.yaml>=0.15.35 +ruamel.yaml ~=0.18.6 # File-based locks -fasteners==0.17.3 +fasteners ~=0.17.3 # We're trying to keep the PyPi package up to date, you might have to install # from source, though. 
-biomedsheets >=0.11.7 +biomedsheets @ git+https://github.com/bihealth/biomedsheets.git@4e0a8484850c39d1511036c3fe29ec0b4f9271f8 # Helpful for CLIs -termcolor==1.1.0 +termcolor ~=1.1.0 # Snakemake is used for providing the actual wrapper calling functionality -snakemake==7.26.0 -# Snakemake needs manual install of PyYAML to make YAML configuration loading work -PyYAML>=6.0 +snakemake ~=7.32.0 # Required for plotting -matplotlib>=2.1.2 +matplotlib ~=3.8.4 # Library for working with VCF files. -vcfpy >=0.13.2 +vcfpy ~=0.13.8 # Support for vcfpy -pysam >=0.19.1 -pytabix >=0.1 +pysam ~=0.22.1 +pytabix ~=0.1 # Jinja 2 template rendering -jinja2 >=3.1.2 +jinja2 ~=3.1.4 # Parsing of ISA-tab. -altamisa >=0.2.6 +altamisa @ git+https://github.com/bihealth/altamisa.git@817dc491ff819e4c80686082bf3e5f602f1ac14c # REST API client for VarFish Server -varfish-cli >=0.2.0 +varfish-cli ~=0.6.3 + +# Validation for models, mainly used for configuration validation +pydantic ~=2.7.0 diff --git a/requirements/dev.txt b/requirements/dev.txt index 5cccc8a87..ad3df8fdb 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,7 +1,4 @@ -r test.txt -# iPython is nice -ipython - # Prettier CLI for py.test -pytest-sugar +pytest-sugar ~=0.9.6 diff --git a/requirements/test.txt b/requirements/test.txt index 78e7bae71..0343a7581 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,30 +1,24 @@ -r base.txt # Wonderful py.test library, style checker etc. -pytest -coverage -pytest-cov -pytest-mock -pytest-subprocess +pytest ~=8.2.2 +coverage ~=7.5.3 +pytest-cov ~=5.0.0 +pytest-mock ~=3.14.0 +pytest-subprocess ~=1.5.0 # Fake file system for testing -pyfakefs +pyfakefs ~=5.5.0 -# "Black" code formatter and checker. -black ==22.8.0 +# ruff code linter + formatter +ruff ~=0.4.8 # "snakefmt" code formatter and checker. -snakefmt >=0.8.0,<0.9.0 -# isort code formatter - import order -isort ==5.10.1 - -flake8 -flake8-import-order -pytest-flake8 +snakefmt ~=0.8.0 # coveralls.io tooling -coveralls +coveralls ~=4.0.1 # Sphinx -sphinx -sphinx_rtd_theme -sphinx-mdinclude +sphinx ~=7.3.7 +sphinx_rtd_theme ~=2.0.0 +sphinx-mdinclude ~=0.6.0 diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 000000000..724dbc320 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,94 @@ +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", + "docs", + "tests", + ".*.py", + ".snakemake.*.wrapper.py", + "splitMNPsAndComplex.py", + "wrapper.py", + "snappy_pipeline/__init__.py", + "versioneer.py", + ".tests", +] + + +line-length = 100 +indent-width = 4 + +# Assume Python 3.8 +target-version = "py312" + + +[lint] +select = ["E", "F", "W", "B9"] # enable "B", "C" later +ignore = ["E203", "E266", "E501", "B904", "B905", "E713", "E721", "E741"] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[lint.per-file-ignores] +"tests/**/*.py" = ["E501"] +"docs/conf.py" = ["ALL"] + +[lint.flake8-quotes] +docstring-quotes = "double" + + +[format] +# Like Black, use double quotes for strings. 
+quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" + + diff --git a/setup.cfg b/setup.cfg index dc165af49..82eda8355 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,6 +15,7 @@ force_grid_wrap = 0 use_parentheses = True ensure_newline_before_comments = True line_length = 100 +skip_glob = .tests [flake8] exclude = @@ -26,6 +27,7 @@ exclude = splitMNPsAndComplex.py wrapper.py snappy_pipeline/__init__.py + .tests max-complexity = 18 select = B,C,E,F,W,T4,B9 ignore = E203, E266, E501, W503 diff --git a/setup.py b/setup.py index 42dc0f570..e6fa48f43 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,10 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -"""Installation driver (and development utility entry point) for snappy-pipeline -""" +"""Installation driver (and development utility entry point) for snappy-pipeline""" -from itertools import chain import os import sys +from itertools import chain from setuptools import find_packages, setup @@ -27,9 +26,9 @@ def parse_requirements(path): return requirements -# Enforce python version >=3.7 -if sys.version_info < (3, 7): - print("At least Python 3.7 is required.\n", file=sys.stderr) +# Enforce python version >=3.12 +if sys.version_info < (3, 12): + print("At least Python 3.12 is required.\n", file=sys.stderr) sys.exit(1) with open("README.md") as readme_file: @@ -128,11 +127,12 @@ def bash_scripts(names): "License :: OSI Approved :: MIT License", "Natural Language :: English", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.12", # We are missing bioconda pysam packages for 3.11 and 3.12, cf. 
# https://github.com/bioconda/bioconda-recipes/issues/37805 # "Programming Language :: Python :: 3.11", # "Programming Language :: Python :: 3.12", + # … but we can build pysam from pip instead ], test_suite="tests", tests_require=test_requirements, diff --git a/snappy_pipeline/__init__.py b/snappy_pipeline/__init__.py index 2cbe22a7d..42c9f5ffc 100644 --- a/snappy_pipeline/__init__.py +++ b/snappy_pipeline/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from .base import expand_ref, merge_dicts, print_config, print_sample_sheets +from .base import expand_ref, merge_dictlikes, print_config, print_sample_sheets __author__ = """Manuel Holtgrewe""" __email__ = "manuel.holtgrewe@bih-charite.de" diff --git a/snappy_pipeline/apps/impl/yaml_utils.py b/snappy_pipeline/apps/impl/yaml_utils.py index dfb2199f8..0f86027ce 100644 --- a/snappy_pipeline/apps/impl/yaml_utils.py +++ b/snappy_pipeline/apps/impl/yaml_utils.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Utilities for processing YAML configuration -""" +"""Utilities for processing YAML configuration""" from collections.abc import MutableMapping, MutableSequence import re diff --git a/snappy_pipeline/base.py b/snappy_pipeline/base.py index c2d32890e..f8d43e385 100644 --- a/snappy_pipeline/base.py +++ b/snappy_pipeline/base.py @@ -1,16 +1,18 @@ # -*- coding: utf-8 -*- -"""Basic utility code for snappy_pipeline -""" +"""Basic utility code for snappy_pipeline""" from collections import OrderedDict from collections.abc import MutableMapping from copy import deepcopy import os import sys +from typing import TYPE_CHECKING, Any, AnyStr, Dict import warnings import ruamel.yaml as ruamel_yaml +from .models import SnappyModel, SnappyStepModel + # TODO: This has to go away once biomedsheets is a proper, halfway-stable module try: from biomedsheets.ref_resolver import RefResolver @@ -40,7 +42,12 @@ class UnknownFiltrationSourceException(Exception): """Raised when user try to request an unknown filtration source.""" -def expand_ref(config_path, dict_data, lookup_paths=None, dict_class=OrderedDict): +def expand_ref( + config_path: str, + dict_data: dict | list, + lookup_paths: list[str] = None, + dict_class=OrderedDict, +) -> tuple[Any, tuple[AnyStr, ...], tuple[AnyStr, ...]]: """Expand "$ref" in JSON-like data ``dict_data`` Returns triple: @@ -68,7 +75,14 @@ def expand_ref(config_path, dict_data, lookup_paths=None, dict_class=OrderedDict return resolved, tuple(lookup_paths), tuple(config_files) -def print_config(config, file=sys.stderr): +def validate_config[C: SnappyStepModel]( + config: dict[Any, Any], + model: type[C], +) -> C: + return model(**config) + + +def print_config(config: dict[str, Any], file=sys.stderr): """Print human-readable version of configuration to ``file``""" print("\nConfiguration", file=file) print("-------------\n", file=file) @@ -76,7 +90,11 @@ def print_config(config, file=sys.stderr): return yaml.dump(config, stream=file) -def print_sample_sheets(step, file=sys.stderr): +if TYPE_CHECKING: + from snappy_pipeline.workflows.abstract import BaseStep + + +def print_sample_sheets(step: "BaseStep", file=sys.stderr): """Print loaded sample sheets from ``BaseStep`` in human-readable format""" for info in step.data_set_infos: print("\nSample Sheet {}".format(info.sheet_path), file=file) @@ -85,7 +103,9 @@ def print_sample_sheets(step, file=sys.stderr): return yaml.dump(info.sheet.json_data, stream=file) -def merge_kwargs(first_kwargs, second_kwargs): +def merge_kwargs( + first_kwargs: dict[str, Any] | None, second_kwargs: dict[str, 
Any] | None +) -> dict[str, Any] | None: """Merge two keyword arguments. :param first_kwargs: First keyword arguments dictionary. @@ -112,30 +132,32 @@ def merge_kwargs(first_kwargs, second_kwargs): return None -def merge_dicts(dict1, dict2, dict_class=OrderedDict): - """Merge dictionary ``dict2`` into ``dict1``""" +type DictLike = Dict | MutableMapping | SnappyModel + + +def merge_dictlikes[D](dict1: DictLike, dict2: DictLike, dict_class: D = OrderedDict) -> D: + """Merge dictionary/model ``dict2`` into ``dict1``""" - def _merge_inner(dict1, dict2): - for k in set(dict1.keys()).union(dict2.keys()): - if k in dict1 and k in dict2: - if isinstance(dict1[k], (dict, MutableMapping)) and isinstance( - dict2[k], (dict, MutableMapping) - ): - yield k, dict_class(_merge_inner(dict1[k], dict2[k])) + def _merge_inner(d1: DictLike, d2: DictLike) -> D: + DICT_LIKE = DictLike.__value__ + for k in d1.keys() | d2.keys(): + if k in d1 and k in d2: + if isinstance(d1[k], DICT_LIKE) and isinstance(d2[k], DICT_LIKE): + yield k, dict_class(_merge_inner(d1[k], d2[k])) else: # If one of the values is not a dict, you can't continue # merging it. Value from second dict overrides one in # first and we move on. - yield k, dict2[k] - elif k in dict1: - yield k, dict1[k] + yield k, d2[k] + elif k in d1: + yield k, d1[k] else: - yield k, dict2[k] + yield k, d2[k] return dict_class(_merge_inner(dict1, dict2)) -def snakefile_path(step_name): +def snakefile_path(step_name: str) -> AnyStr: """Return absolute path to Snakefile for the given step name""" return os.path.abspath( os.path.join(os.path.dirname(__file__), "workflows", step_name, "Snakefile") diff --git a/snappy_pipeline/find_file.py b/snappy_pipeline/find_file.py index 52ff49a9a..9fbf8091e 100644 --- a/snappy_pipeline/find_file.py +++ b/snappy_pipeline/find_file.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Code for crawling the file system and caching the results -""" +"""Code for crawling the file system and caching the results""" from collections import OrderedDict from fnmatch import fnmatch diff --git a/snappy_pipeline/guess_genome.py b/snappy_pipeline/guess_genome.py index 24c29ec67..cb8ff2c73 100644 --- a/snappy_pipeline/guess_genome.py +++ b/snappy_pipeline/guess_genome.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Helper code for guessing UCSC genome IDs from reference paths -""" +"""Helper code for guessing UCSC genome IDs from reference paths""" from collections import OrderedDict diff --git a/snappy_pipeline/models/__init__.py b/snappy_pipeline/models/__init__.py new file mode 100644 index 000000000..f547f056c --- /dev/null +++ b/snappy_pipeline/models/__init__.py @@ -0,0 +1,369 @@ +import enum +from enum import Enum +from inspect import isclass +from io import StringIO +import json +import re +import types +import typing +from typing import Annotated + +from annotated_types import Predicate +from pydantic import BaseModel, ConfigDict, Field +from pydantic_core import PydanticUndefined +import ruamel +from ruamel.yaml import YAML +import typing_extensions + + +def enum_options(enum: Enum) -> list[tuple[str, typing.Any]]: + """Returns a list of tuples containing the name and value of each enum member.""" + return [(e.name, e.value) for e in enum] + + +def EnumField(enum: type[Enum], default: typing.Any = PydanticUndefined, *args, **kwargs): + """ + An extension of pydantic's `Field` that adds 'options' to the json_schema_extra field, + containing the available options of the specified enum. 
+ """ + extra = kwargs.get("json_schema_extra", {}) + extra.update(dict(options=enum_options(enum))) + kwargs["json_schema_extra"] = extra + return Field(default, *args, **kwargs) + + +size_string_regexp = re.compile(r"[. 0-9]+([KMGTP])") +SizeString = Annotated[str, Predicate(lambda s: size_string_regexp.match(s) is not None)] +"""A string representing a size, e.g. '1G' for 1 gigabyte.""" + + +class KeepTmpdir(enum.StrEnum): + """Whether to keep the temporary directory after the job has finished.""" + + always = "always" + never = "never" + onerror = "onerror" + + +class SnappyModel(BaseModel): + """ + Base class for all snappy models. + By default, extra fields are forbidden, attribute docstrings are used for field descriptions, + enum member values instead of names are used, and default values are validated (because + validation can potentially modify the values of fields with default values) + """ + + model_config = ConfigDict( + extra="forbid", + use_attribute_docstrings=True, + use_enum_values=True, + validate_default=True, + ) + + def get(self, key: str, default: typing.Any = None) -> typing.Any: + """ + Return the value of the field with the given key, or the default value if it doesn't exist. + Simply delegates to getattr. + """ + return getattr(self, key, default) + + def __getitem__(self, item: str) -> typing.Any: + """ + Return the value of the field with the given key. + Raise an AttributeError if the field doesn't exist. + """ + return getattr(self, item) + + def keys(self): + """Return a list of field names.""" + return self.model_fields.keys() + + +# This exists to distinguish workflow step_config models from other snappy specific models +# It also provides a default_config_yaml_string method that includes the step_config section +# by default. +class SnappyStepModel(SnappyModel, object): + """ + A base class for all workflow step configuration models. + """ + + @classmethod + def default_config_yaml_string( + cls, comment_optional: bool = True, with_step_config: bool = True + ): + config_str = default_config_yaml_string(cls, comment_optional) + if with_step_config: + config_str = ( + (" " * INDENTATION * 2 + line) for line in config_str.splitlines(keepends=True) + ) + + def camel_to_snake(name: str) -> str: + name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + + name = camel_to_snake(cls.__name__) + config_str = "step_config:\n" f"{' ' * INDENTATION}{name}:\n" f"{''.join(config_str)}" + + return config_str + + def model_dump_yaml(self, **kwargs) -> str: + cfg, _ = _model_to_commented_yaml(self, **kwargs) + return _dump_yaml(cfg) + + +# Classes and functions for generating an annotated default configuration YAML +# from a snappy pydantic model +# list all optional fields commented out, list enum options, mark required fields, show descriptions + +INDENTATION = 2 # Indentation used for YAML + + +def default_config_yaml_string(model: type[SnappyStepModel], comment_optional: bool = False) -> str: + return _dump_commented_yaml(model, comment_optional) + + +def _check_model_class(annotation, clazz: type = BaseModel) -> bool: + """Checks whether the given annotation is a class and is a subclass of `clazz`""" + try: + is_model_class = isclass(annotation) and issubclass(annotation, clazz) + except TypeError: + # for some reason, types.GenericAlias doesn't count as a class? 
+ is_model_class = hasattr(annotation, "model_fields") + return is_model_class + + +def _annotate_model( + config_model: type[BaseModel], + comment_map: ruamel.yaml.CommentedMap, + max_column: int = 80, + level: int = 0, + indent: int = INDENTATION, + path: list[str] = [], +): + """ + Annotates a given pydantic model with comments + based on whether a field is required or not, as well as listing options for enum valued fields + """ + for key, field in config_model.model_fields.items(): + annotation = field.annotation + is_union = _is_union_type(annotation) + allows_none = types.NoneType in typing_extensions.get_args(annotation) + is_optional = (is_union and allows_none) or not field.is_required() + + if is_optional: + tags = [] + else: + tags = ["REQUIRED"] + + comment = tags + + if field.examples: + comment.append(f"Examples: {', '.join(map(str, field.examples))}") + + options = (getattr(field, "json_schema_extra") or {}).get("options", []) + if _check_model_class(annotation, enum.Enum): + options = enum_options(annotation) + + if options: + + def option_str(option: tuple[str, typing.Any]) -> str: + name, value = option + if name.upper() != str(value).upper().replace("-", "_"): + return f"{repr(value)} ({name})" + else: + return f"{repr(value)}" + + comment.append(f"Options: " f"{', '.join(map(option_str, options))}") + + comment = "; ".join(comment) + + if comment_map is not None: + if comment: + comment_map.yaml_add_eol_comment(comment, key, column=max_column) + + if field.description: + ( + comment_map.yaml_set_comment_before_after_key( + key, + indent=indent * level, + before="\n" + field.description, + after=None, + ) + ) + + if _check_model_class(annotation): + _annotate_model( + annotation, + comment_map[key], + level=level + 1, + max_column=max_column, + path=path + [key], + ) + elif _is_union_type(annotation): + if len(args := typing.get_args(annotation)) == 2 and types.NoneType in args: + sub_model = next(filter(lambda s: s is not types.NoneType, args)) + if _check_model_class(sub_model): + # when the default is set to None but the annotation inherits from basemodel, + # make sure to generate those entries as well + if not comment_map[key]: + sub_model_yaml, m = _model_to_commented_yaml( + _placeholder_model_instance(sub_model) + ) + max_column = max(m + 2, max_column) + comment_map[key] = sub_model_yaml + _annotate_model( + sub_model, + comment_map[key], + level=level + 1, + max_column=max_column, + path=path + [key], + ) + elif _check_model_class(annotation, typing.Collection): + for s in filter(lambda c: issubclass(c, BaseModel), typing.get_args(annotation)): + _annotate_model( + s, comment_map[key], level=level + 1, max_column=max_column, path=path + [key] + ) + + +def _is_union_type(typ_) -> bool: + return typing.get_origin(typ_) in (typing.Union, types.UnionType) + + +def _placeholder_model_instance(model: type[BaseModel], placeholder=None): + """ + Constructs a model instance where the values of required fields are replaced by a placeholder + """ + placeholders = {} + # recurse into (optional) fields which are themselves pydantic models + for name, field in model.model_fields.items(): + annotation = field.annotation + + # optional fields, i.e. 
`Union[Model, None]` or `Model | None` + if _is_union_type(annotation): + if len(args := typing.get_args(annotation)) == 2 and types.NoneType in args: + if field.default in (PydanticUndefined, None): + sub_model = next(filter(lambda s: s is not types.NoneType, args)) + if _check_model_class(sub_model): + placeholders[name] = _placeholder_model_instance(sub_model, placeholder) + + # required fields, i.e. `Model` + if _check_model_class(annotation): + placeholders[name] = _placeholder_model_instance(annotation) + + # replace values of undefined required fields with `placeholder` + required_field_placeholders = { + name: placeholder + for name, field in model.model_fields.items() + if field.is_required() and field.default is PydanticUndefined + } + required_field_placeholders.update(placeholders) + + # construct a model instance with invalid data, skipping validation! + # this is only to be used to generate example configuration files + invalid_model_instance = model.model_construct(_fields_set=None, **required_field_placeholders) + return invalid_model_instance + + +def _yaml_instance(): + yaml = YAML(typ="rt") + yaml.indent(mapping=INDENTATION, sequence=INDENTATION * 2, offset=INDENTATION) + return yaml + + +def _load_yaml(yaml_str: str) -> ruamel.yaml.CommentedMap: + return _yaml_instance().load(yaml_str) + + +def _dump_yaml(comment_map: ruamel.yaml.CommentedMap) -> str: + yaml = _yaml_instance() + + with StringIO() as out: + yaml.dump(comment_map, stream=out) + return out.getvalue() + + +def _dump_commented_yaml(model: type[BaseModel], comment_optional: bool = True) -> str: + invalid_model_instance = _placeholder_model_instance(model) + + cfg, max_column = _model_to_commented_yaml(invalid_model_instance) + max_column = max(50, max_column) + _annotate_model(model, cfg, max_column=max_column) + key_paths = _optional_key_paths(model, cfg) + cfg_yaml = _dump_yaml(cfg) + if comment_optional: + return _comment_key_paths_naive(cfg_yaml, key_paths) + else: + return cfg_yaml + + +def _model_to_commented_yaml(model_instance: BaseModel, **kwargs): + yaml = _yaml_instance() + with StringIO() as s: + yaml.dump(json.loads(model_instance.model_dump_json(**kwargs)), stream=s) + s.flush() + yaml_config_string = s.getvalue() + max_column = max(map(len, yaml_config_string.splitlines())) + 2 + cfg = yaml.load(stream=yaml_config_string) + return cfg, max_column + + +def _comment_key_paths_naive( + yaml_str: str, key_paths: list[list[str]], comment_prefix: str = "#" +) -> str: + comment_lines: set[int] = set() + + for key_path in key_paths: + same_block = False + key_line_indent = 0 + key = key_path[0] + for i, line in enumerate(yaml_str.splitlines()): + line_indent = len(line) - len(line.lstrip()) + key_match = line.lstrip().startswith(key + ":") + if key_match: + key_path.pop(0) + if key_path: + key = key_path[0] + continue + else: + key_line_indent = line_indent + same_block = True + comment_lines.add(i) + continue + if same_block: + if line_indent > key_line_indent or len(line) == 0: + comment_lines.add(i) + else: + break + else: + if not key_path: + break + + return "\n".join( + (comment_prefix + line) if i in comment_lines else line + for i, line in enumerate(yaml_str.splitlines()) + ) + + +def _optional_key_paths( + config_model: type[BaseModel], + comment_map: ruamel.yaml.CommentedMap, + path: list[str] = [], +): + optional_keys = [] + for key, field in config_model.model_fields.items(): + path_ = path + [key] + annotation = field.annotation + is_union = _is_union_type(annotation) + allows_none = 
types.NoneType in typing_extensions.get_args(annotation) + is_optional = (is_union and allows_none) or not field.is_required() + if is_optional: + optional_keys.append(path_) + + if _check_model_class(annotation): + optional_keys.extend(_optional_key_paths(annotation, comment_map[key], path_)) + + return optional_keys + + +class ToggleModel(SnappyModel): + enabled: bool = False diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py new file mode 100644 index 000000000..468f0bf96 --- /dev/null +++ b/snappy_pipeline/models/annotation.py @@ -0,0 +1,38 @@ +import enum + +from snappy_pipeline.models import SnappyModel + + +class VepTxFlag(enum.StrEnum): + gencode_basic = "gencode_basic" + refseq = "refseq" + merged = "merged" + + +class Vep(SnappyModel): + cache_dir: str = "" + """Defaults to $HOME/.vep Not a good idea on the cluster""" + + species: str = "homo_sapiens" + + assembly: str = "GRCh38" + + cache_version: str = "102" + """WARNING- this must match the wrapper's vep version!""" + + tx_flag: VepTxFlag = VepTxFlag.gencode_basic + """The flag selecting the transcripts. One of "gencode_basic", "refseq", and "merged".""" + + pick_order: list[str] = [ + "biotype", + "mane", + "appris", + "tsl", + "ccds", + "canonical", + "rank", + "length", + ] + num_threads: int = 8 + buffer_size: int = 1000 + output_options: list[str] = ["everything"] diff --git a/snappy_pipeline/models/cnvkit.py b/snappy_pipeline/models/cnvkit.py new file mode 100644 index 000000000..e80ae634f --- /dev/null +++ b/snappy_pipeline/models/cnvkit.py @@ -0,0 +1,164 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import SnappyModel + + +class SegmentationMethod(enum.StrEnum): + cbs = "cbs" + flasso = "flasso" + haar = "haar" + hmm = "hmm" + hmm_tumor = "hmm-tumor" + hmm_germline = "hmm-germline" + none = "none" + + +class CenterMode(enum.StrEnum): + mean = "mean" + median = "median" + mode = "mode" + biweight = "biweight" + + +class FilterMode(enum.StrEnum): + ampdel = "ampdel" + cn = "cn" + ci = "ci" + sem = "sem" + + +class CallingMethod(enum.StrEnum): + threshold = "threshold" + clonal = "clonal" + none = "" + + +class Gender(enum.StrEnum): + male = "male" + female = "female" + guess = "" + + +class Cnvkit(SnappyModel): + path_target: Annotated[ + str, Field(examples=["../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed"]) + ] + """Path to target regions""" + + path_antitarget: Annotated[ + str, + Field(examples=["../panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed"]), + ] + """Path to antitarget regions""" + + path_panel_of_normals: Annotated[ + str, + Field( + examples=[ + "../panel_of_normals/output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn" + ] + ), + ] + """Path to panel of normals (reference)""" + + plot: bool = True + """Generate plots (very slow)""" + + min_mapq: int = 0 + """[coverage] Mininum mapping quality score to count a read for coverage depth""" + + count: bool = False + """[coverage] Alternative counting algorithm""" + + gc_correction: bool = True + """[fix] Use GC correction""" + + edge_correction: bool = True + """[fix] Use edge correction""" + + rmask_correction: bool = True + """[fix] Use rmask correction""" + # BCBIO uses + # seg_method: haar + # seg_threshold: 0.0001 + # -- OR + # seg_method: cbs + # seg_threshold: 0.000001 + segmentation_method: SegmentationMethod = SegmentationMethod.cbs + """[segment] One of cbs, flasso, haar, hmm, hmm-tumor, 
hmm-germline, none""" + + segmentation_threshold: float = 0.000001 + """[segment] Significance threshold (hmm methods: smoothing window size)""" + + drop_low_coverage: bool = False + """[segment, call, genemetrics] Drop very low coverage bins""" + + drop_outliers: int = 10 + """[segment] Drop outlier bins (0 for no outlier filtering)""" + + smooth_cbs: bool = True + """[segment] Additional smoothing of CBS segmentation (WARNING- not the default value)""" + + center: CenterMode | float | None = None + """[call] Either one of mean, median, mode, biweight, or a constant log2 ratio value.""" + + filter: FilterMode | str = FilterMode.ampdel + """ + [call] One of ampdel, cn, ci, sem (merging segments flagged with the specified filter), + "" for no filtering + """ + + calling_method: CallingMethod = CallingMethod.threshold + """[call] One of threshold, clonal, none""" + + call_thresholds: str = "-1.1,-0.25,0.2,0.7" + """[call] Thresholds for calling integer copy number""" + + ploidy: int = 2 + """[call] Ploidy of sample cells""" + purity: Annotated[float, Field(0, ge=0, le=1)] + """[call] Estimated tumor cell fraction (0 for discarding tumor cell purity)""" + + gender: Gender = Gender.guess + """ + [call, diagram] Specify the chromosomal sex of all given samples as male or female. + Guess when missing + """ + + male_reference: bool = False + """[call, diagram] Create male reference""" + diagram_threshold: float = 0.5 + """[diagram] Copy number change threshold to label genes""" + + diagram_min_probes: int = 3 + """[diagram] Min number of covered probes to label genes""" + + shift_xy: bool = True + """[diagram] Shift X & Y chromosomes according to sample sex""" + + breaks_min_probes: int = 1 + """[breaks] Min number of covered probes for a break inside the gene""" + + genemetrics_min_probes: int = 3 + """[genemetrics] Min number of covered probes to consider a gene""" + + genemetrics_threshold: float = 0.2 + """[genemetrics] Min abs log2 change to consider a gene""" + + genemetrics_alpha: float = 0.05 + """[genemetrics] Significance cutoff""" + + genemetrics_bootstrap: int = 100 + """[genemetrics] Number of bootstraps""" + + segmetrics_alpha: float = 0.05 + """[segmetrics] Significance cutoff""" + + segmetrics_bootstrap: int = 100 + """[segmetrics] Number of bootstraps""" + + smooth_bootstrap: bool = False + """[segmetrics] Smooth bootstrap results""" diff --git a/snappy_pipeline/models/gcnv.py b/snappy_pipeline/models/gcnv.py new file mode 100644 index 000000000..66855e84b --- /dev/null +++ b/snappy_pipeline/models/gcnv.py @@ -0,0 +1,35 @@ +from typing import Annotated + +from pydantic import ConfigDict, Field + +from snappy_pipeline.models import SnappyModel + + +class TargetIntervalEntry(SnappyModel): + """ + The following will match both the stock IDT library kit and the ones + with spike-ins seen fromr Yale genomics. The path above would be + mapped to the name "default". 
+ - name: IDT_xGen_V1_0 + pattern: "xGen Exome Research Panel V1\\.0*" + path: "path/to/targets.bed" + """ + + name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] + + pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])] + + path: Annotated[str, Field(examples=["path/to/targets.bed"])] + + +class PrecomputedModelEntry(SnappyModel): + model_config = ConfigDict(protected_namespaces=()) + + library: Annotated[str, Field(examples=["Agilent SureSelect Human All Exon V6"])] + """Kit name, match in path_target_interval_list_mapping""" + + contig_ploidy: Annotated[str, Field(examples=["/path/to/ploidy-model"])] + """Output from `DetermineGermlineContigPloidy`""" + + model_pattern: Annotated[str, Field(examples=["/path/to/model_*"])] + """Output from `GermlineCNVCaller`""" diff --git a/snappy_pipeline/models/validators.py b/snappy_pipeline/models/validators.py new file mode 100644 index 000000000..052fc37d2 --- /dev/null +++ b/snappy_pipeline/models/validators.py @@ -0,0 +1,38 @@ +import pydantic +from pydantic import BaseModel + + +def validate_tools(): + def ensure_tools_are_configured(instance): + for tool in instance.tools: + if not getattr(instance, str(tool)): + raise ValueError(f"Tool {tool} not configured") + return instance + + return pydantic.model_validator(mode="after")(ensure_tools_are_configured) + + +class ToolsMixin(BaseModel): + """ + A mixin for validating that all defined tools in `self.tools` + have an accompanying configuration field in the model. + """ + + _validate_tools = validate_tools() + + +def validate_ngs_mapping_or_link(): + def path_ngs_mapping_or_path_link_in(instance): + if not instance.path_ngs_mapping and not instance.path_link_in: + raise ValueError("Either path_ngs_mapping or path_link_in must be set") + return instance + + return pydantic.model_validator(mode="after")(path_ngs_mapping_or_path_link_in) + + +class NgsMappingMixin(BaseModel): + """ + A mixin for validating that not both `path_ngs_mapping` and `path_link_in` are set. 
+ """ + + _validate_ngs_mapping_or_link = validate_ngs_mapping_or_link() diff --git a/snappy_pipeline/utils.py b/snappy_pipeline/utils.py index 90ef76b45..8478fea67 100644 --- a/snappy_pipeline/utils.py +++ b/snappy_pipeline/utils.py @@ -26,7 +26,7 @@ def patched(*args, **kwargs): return patched -def dictify(gen): +def dictify[**P](gen) -> typing.Callable[P, dict]: """Decorator that converts a generator into a function which returns a dict Use it in the case where a generator is easier to write but you want diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py new file mode 100644 index 000000000..aeda7682e --- /dev/null +++ b/snappy_pipeline/workflow_model.py @@ -0,0 +1,149 @@ +import enum +from typing import TypedDict + +from pydantic import ConfigDict + +from snappy_pipeline.models import SnappyModel, SnappyStepModel +from snappy_pipeline.workflows.adapter_trimming.model import AdapterTrimming +from snappy_pipeline.workflows.cbioportal_export.model import CbioportalExport +from snappy_pipeline.workflows.gene_expression_quantification.model import ( + GeneExpressionQuantification, +) +from snappy_pipeline.workflows.gene_expression_report.model import GeneExpressionReport +from snappy_pipeline.workflows.helper_gcnv_model_targeted.model import HelperGcnvModelTargeted +from snappy_pipeline.workflows.helper_gcnv_model_wgs.model import HelperGcnvModelWgs +from snappy_pipeline.workflows.hla_typing.model import HlaTyping +from snappy_pipeline.workflows.homologous_recombination_deficiency.model import ( + HomologousRecombinationDeficiency, +) +from snappy_pipeline.workflows.igv_session_generation.model import IgvSessionGeneration +from snappy_pipeline.workflows.ngs_data_qc.model import NgsDataQc +from snappy_pipeline.workflows.ngs_mapping.model import NgsMapping +from snappy_pipeline.workflows.panel_of_normals.model import PanelOfNormals +from snappy_pipeline.workflows.repeat_expansion.model import RepeatExpansion +from snappy_pipeline.workflows.somatic_cnv_checking.model import SomaticCnvChecking +from snappy_pipeline.workflows.somatic_gene_fusion_calling.model import SomaticGeneFusionCalling +from snappy_pipeline.workflows.somatic_hla_loh_calling.model import SomaticHlaLohCalling +from snappy_pipeline.workflows.somatic_msi_calling.model import SomaticMsiCalling +from snappy_pipeline.workflows.somatic_purity_ploidy_estimate.model import ( + SomaticPurityPloidyEstimate, +) +from snappy_pipeline.workflows.somatic_targeted_seq_cnv_calling.model import ( + SomaticTargetedSeqCnvCalling, +) +from snappy_pipeline.workflows.somatic_variant_annotation.model import SomaticVariantAnnotation +from snappy_pipeline.workflows.somatic_variant_calling.model import SomaticVariantCalling +from snappy_pipeline.workflows.somatic_variant_filtration.model import SomaticVariantFiltration +from snappy_pipeline.workflows.somatic_variant_signatures.model import SomaticVariantSignatures +from snappy_pipeline.workflows.somatic_wgs_cnv_calling.model import SomaticWgsCnvCalling +from snappy_pipeline.workflows.somatic_wgs_sv_calling.model import SomaticWgsSvCalling +from snappy_pipeline.workflows.sv_calling_targeted.model import SvCallingTargeted +from snappy_pipeline.workflows.sv_calling_wgs.model import SvCallingWgs +from snappy_pipeline.workflows.targeted_seq_mei_calling.model import TargetedSeqMeiCalling +from snappy_pipeline.workflows.tumor_mutational_burden.model import TumorMutationalBurden +from snappy_pipeline.workflows.varfish_export.model import VarfishExport +from 
snappy_pipeline.workflows.variant_annotation.model import VariantAnnotation +from snappy_pipeline.workflows.variant_calling.model import VariantCalling +from snappy_pipeline.workflows.variant_checking.model import VariantChecking +from snappy_pipeline.workflows.variant_denovo_filtration.model import VariantDenovoFiltration +from snappy_pipeline.workflows.variant_export_external.model import VariantExportExternal +from snappy_pipeline.workflows.variant_filtration.model import VariantFiltration +from snappy_pipeline.workflows.variant_phasing.model import VariantPhasing +from snappy_pipeline.workflows.wgs_cnv_export_external.model import WgsCnvExportExternal +from snappy_pipeline.workflows.wgs_sv_export_external.model import WgsSvExportExternal + + +class PathModel(SnappyModel): + path: str = "" + + +class StaticDataConfig(SnappyModel): + reference: PathModel + cosmic: PathModel | None = None + dbsnp: PathModel | None = None + dbnsfp: PathModel | None = None + features: PathModel | None = None + + +class SearchPattern(TypedDict): + left: str + right: str | None + + +class DataSetType(enum.StrEnum): + MATCHED_CANCER = "matched_cancer" + GERMLINE_VARIANTS = "germline_variants" + + +class NamingScheme(enum.StrEnum): + ONLY_SECONDARY_ID = "only_secondary_id" + SECONDARY_ID_PK = "secondary_id_pk" + + +class DataSet(SnappyModel): + file: str = "" + search_patterns: list[SearchPattern] = [ + SearchPattern(left="*.R1.fastq.gz", right="*.R2.fastq.gz") + ] + search_paths: list[str] = ["../raw"] + type: DataSetType = DataSetType.MATCHED_CANCER + naming_scheme: NamingScheme = NamingScheme.SECONDARY_ID_PK + is_background: bool = False + mixed_se_pe: bool = False + sodar_uuid: str | None = None + sodar_title: str | None = None + pedigree_field: str | None = None + + +class StepConfig(TypedDict, total=False): + adapter_trimming: AdapterTrimming + cbioportal_export: CbioportalExport + gene_expression_quantification: GeneExpressionQuantification + gene_expression_report: GeneExpressionReport + helper_gcnv_model_targeted: HelperGcnvModelTargeted + helper_gcnv_model_wgs: HelperGcnvModelWgs + hla_typing: HlaTyping + homologous_recombination_deficiency: HomologousRecombinationDeficiency + igv_session_generation: IgvSessionGeneration + ngs_data_qc: NgsDataQc + ngs_mapping: NgsMapping + panel_of_normals: PanelOfNormals + repeat_expansion: RepeatExpansion + somatic_cnv_checking: SomaticCnvChecking + somatic_gene_fusion_calling: SomaticGeneFusionCalling + somatic_hla_loh_calling: SomaticHlaLohCalling + somatic_msi_calling: SomaticMsiCalling + somatic_purity_ploidy_estimate: SomaticPurityPloidyEstimate + somatic_targeted_seq_cnv_calling: SomaticTargetedSeqCnvCalling + somatic_variant_annotation: SomaticVariantAnnotation + somatic_variant_calling: SomaticVariantCalling + somatic_variant_filtration: SomaticVariantFiltration + somatic_variant_signatures: SomaticVariantSignatures + somatic_wgs_cnv_calling: SomaticWgsCnvCalling + somatic_wgs_sv_calling: SomaticWgsSvCalling + sv_calling_targeted: SvCallingTargeted + sv_calling_wgs: SvCallingWgs + targeted_seq_mei_calling: TargetedSeqMeiCalling + tumor_mutational_burden: TumorMutationalBurden + varfish_export: VarfishExport + variant_annotation: VariantAnnotation + variant_calling: VariantCalling + variant_checking: VariantChecking + variant_denovo_filtration: VariantDenovoFiltration + variant_export_external: VariantExportExternal + variant_filtration: VariantFiltration + variant_phasing: VariantPhasing + wgs_cnv_export_external: WgsCnvExportExternal + 
wgs_sv_export_external: WgsSvExportExternal + + +class ConfigModel(SnappyStepModel): + model_config = ConfigDict( + extra="allow", + use_attribute_docstrings=True, + use_enum_values=True, + ) + + static_data_config: StaticDataConfig + step_config: StepConfig + data_sets: dict[str, DataSet] diff --git a/snappy_pipeline/workflows/__init__.py b/snappy_pipeline/workflows/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/snappy_pipeline/workflows/abstract/__init__.py b/snappy_pipeline/workflows/abstract/__init__.py index eb3c193cc..9aa20e058 100644 --- a/snappy_pipeline/workflows/abstract/__init__.py +++ b/snappy_pipeline/workflows/abstract/__init__.py @@ -8,37 +8,41 @@ from fnmatch import fnmatch from functools import lru_cache from io import StringIO -import itertools +import logging import os import os.path import sys import tempfile import typing +from typing import Any, Callable import attr +import pydantic +import ruamel.yaml as ruamel_yaml +import snakemake +from snakemake.io import InputFiles, OutputFiles, Wildcards, touch + from biomedsheets import io_tsv from biomedsheets.io import SheetBuilder, json_loads_ordered from biomedsheets.models import SecondaryIDNotFoundException -from biomedsheets.naming import NAMING_SCHEMES, NAMING_SECONDARY_ID_PK, name_generator_for_scheme +from biomedsheets.naming import NAMING_SCHEMES, name_generator_for_scheme from biomedsheets.ref_resolver import RefResolver from biomedsheets.shortcuts import ( + ShortcutSampleSheet, donor_has_dna_ngs_library, write_pedigree_to_ped, write_pedigrees_to_ped, ) -import ruamel.yaml as ruamel_yaml -from snakemake.io import touch - from snappy_pipeline.base import ( MissingConfiguration, UnsupportedActionException, - merge_dicts, merge_kwargs, print_config, print_sample_sheets, snakefile_path, ) from snappy_pipeline.find_file import FileSystemCrawler, PatternSet +from snappy_pipeline.models import SnappyStepModel from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract.pedigree import append_pedigree_to_ped from snappy_wrappers.resource_usage import ResourceUsage @@ -95,30 +99,36 @@ class ImplementationUnavailableError(NotImplementedError): """ +Inputs: typing.TypeAlias = InputFiles | dict[str, Any] +Outputs: typing.TypeAlias = OutputFiles | dict[str, Any] + + class BaseStepPart: """Base class for a part of a pipeline step""" - name = "" + name: str #: The actions available in the class. - actions: typing.Tuple[str] = None + actions: tuple[str, ...] #: Default resource usage for actions that are not given in ``resource_usage``. default_resource_usage: ResourceUsage = ResourceUsage( - threads=1, time="01:00:00", memory="2G" # 1h + threads=1, + time="01:00:00", + memory="2G", # 1h ) #: Configure resource usage here that should not use the default resource usage from #: ``default_resource_usage``. - resource_usage: typing.Dict[str, ResourceUsage] = {} + resource_usage: dict[str, ResourceUsage] = {} - def __init__(self, parent): + def __init__[P: BaseStep](self, parent: P): self.name = self.__class__.name - self.parent = parent + self.parent: P = parent self.config = parent.config self.w_config = parent.w_config - def _validate_action(self, action): + def _validate_action(self, action: str): """Validate provided action Checks that the provided ``action`` is listed in the valid class actions list. @@ -133,18 +143,20 @@ def _validate_action(self, action): error_message = f"Action '{action}' is not supported. 
Valid options: {actions_str}" raise UnsupportedActionException(error_message) - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Return the resource usage for the given action.""" if action not in self.actions: raise ValueError(f"Invalid {action} not in {self.actions}") return self.resource_usage.get(action, self.default_resource_usage) @staticmethod - def get_default_partition() -> str: + def get_default_partition() -> str | None: """Helper that returns the default partition.""" return os.getenv("SNAPPY_PIPELINE_PARTITION") - def get_resource(self, action: str, resource_name: str): + def get_resource( + self, action: str, resource_name: str + ) -> Callable[[Wildcards, InputFiles], Any]: """Return the amount of resources to be allocated for the given action. :param action: The action to return the resource requirement for. @@ -152,27 +164,31 @@ def get_resource(self, action: str, resource_name: str): """ if resource_name not in ("threads", "time", "memory", "partition", "tmpdir"): raise ValueError(f"Invalid resource name: {resource_name}") - resource_usage = self.get_resource_usage(action) - if resource_name == "tmpdir" and not resource_usage.tmpdir: - return self.parent.get_tmpdir() - if resource_name == "partition" and not resource_usage.partition: - return self.get_default_partition() - else: - return getattr(resource_usage, resource_name) - def get_args(self, action): + def _get_resource(wildcards: Wildcards = None, input: InputFiles = None) -> Any: + resource_usage = self.get_resource_usage(action, wildcards=wildcards, input=input) + if resource_name == "tmpdir" and not resource_usage.tmpdir: + return self.parent.get_tmpdir() + if resource_name == "partition" and not resource_usage.partition: + return self.get_default_partition() + else: + return getattr(resource_usage, resource_name) + + return _get_resource + + def get_args(self, action: str) -> Inputs | Callable[[Wildcards], Inputs]: """Return args for the given action of the sub step""" raise NotImplementedError("Called abstract method. Override me!") # pragma: no cover - def get_input_files(self, action): + def get_input_files(self, action: str) -> Inputs | Callable[[Wildcards], Inputs]: """Return input files for the given action of the sub step""" raise NotImplementedError("Called abstract method. Override me!") # pragma: no cover - def get_output_files(self, action): + def get_output_files(self, action: str) -> Outputs: """Return output files for the given action of the sub step and""" raise NotImplementedError("Called abstract method. Override me!") # pragma: no cover - def get_log_file(self, action): + def get_log_file(self, action: str) -> Outputs: """Return path to log file The default implementation tries to call ``self._get_log_files()`` and in the case of @@ -194,13 +210,13 @@ def get_log_file(self, action): "Log file name generation not implemented!" ) # pragma: no cover - def get_shell_cmd(self, action, wildcards): # NOSONAR + def get_shell_cmd(self, action: str, wildcards: Wildcards) -> str: # NOSONAR """Return shell command for the given action of the sub step and the given wildcards""" raise ImplementationUnavailableError( "Override this method before calling it!" 
) # pragma: no cover - def run(self, action, wildcards): # NOSONAR + def run(self, action: str, wildcards: Wildcards): # NOSONAR """Run the sub steps action action's code with the given wildcards""" raise ImplementationUnavailableError( "Override this method before calling it!" @@ -224,7 +240,9 @@ class WritePedigreeStepPart(BaseStepPart): #: Class available actions actions = ("run",) - def __init__(self, parent, require_dna_ngs_library=False, only_trios=False): + def __init__[P: BaseStep]( + self, parent: P, require_dna_ngs_library: bool = False, only_trios: bool = False + ): super().__init__(parent) #: Whether to prevent writing out of samples with out NGS library. self.require_dna_ngs_library = require_dna_ngs_library @@ -238,7 +256,11 @@ def __init__(self, parent, require_dna_ngs_library=False, only_trios=False): in_trio = set() for donor in pedigree.donors: if donor.father and donor.mother: - in_trio |= {donor.name, donor.father.name, donor.mother.name} + in_trio |= { + donor.name, + donor.father.name, + donor.mother.name, + } if not any((donor.name in in_trio for donor in pedigree.donors)): continue # ignore empty pedigree post filtration pedigree = pedigree.with_filtered_donors( @@ -270,13 +292,16 @@ def get_input_files(wildcards): donor_names = list(sorted(d.name for d in pedigree.donors)) print(msg.format(donor_names), file=sys.stderr) # pragma: no cover return - mappers = self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + mappers = self.w_config.step_config["ngs_mapping"].tools.dna tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}{ext}" for donor in filter(lambda d: d.dna_ngs_library, pedigree.donors): library_name = donor.dna_ngs_library.name for mapper in mappers: path = tpl.format( - library_name=library_name, mapper=mapper, ext=".bam", **wildcards + library_name=library_name, + mapper=mapper, + ext=".bam", + **wildcards, ) yield ngs_mapping(path) @@ -294,7 +319,7 @@ def get_result_files(self): # yield tpl.format(index_ngs_library=index_ngs_library) return [] - def run(self, wildcards, output): + def run(self, wildcards: Wildcards, output: OutputFiles): """Write out the pedigree information :param wildcards: Snakemake wildcards associated with rule (unused). 
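The ``get_resource`` refactor above changes its return type from a plain value to a closure, so resource amounts are resolved lazily with the rule's wildcards and input files. A minimal sketch of the resulting call pattern, assuming ``wf`` is the step object built in a step Snakefile and using illustrative sub-step/action names:

    # Hedged sketch: get_resource() now returns a callable rather than the value itself.
    memory_fn = wf.get_resource("write_pedigree", "run", "memory")
    memory = memory_fn()  # e.g. "2G" from default_resource_usage, unless the step part overrides it

In the step Snakefiles the returned callable can typically be handed to a rule's ``resources:`` unchanged, since Snakemake evaluates callables with the rule's ``wildcards`` and ``input``.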
@@ -307,7 +332,8 @@ def run(self, wildcards, output): write_pedigrees_to_ped(self.index_ngs_library_to_pedigree.values(), str(output)) else: write_pedigree_to_ped( - self.index_ngs_library_to_pedigree[wildcards.index_ngs_library], str(output) + self.index_ngs_library_to_pedigree[wildcards.index_ngs_library], + str(output), ) @@ -558,7 +584,10 @@ def _load_sheet(self): def _set_is_background(cls, sheet, flag): """Override "is_background" flag""" # TODO: check whether is already there and fail if not compatible - sheet.json_data["extraInfoDefs"]["is_background"] = {"type": "boolean", "default": False} + sheet.json_data["extraInfoDefs"]["is_background"] = { + "type": "boolean", + "default": False, + } sheet.extra_infos["is_background"] = flag return sheet @@ -581,10 +610,10 @@ class BaseStep: """ #: Override with step name - name = None + name: str #: Override with the sheet shortcut class to use - sheet_shortcut_class = None + sheet_shortcut_class: type[ShortcutSampleSheet] #: Override with arguments to pass into sheet shortcut class constructor sheet_shortcut_args = None @@ -619,18 +648,22 @@ def default_config_yaml(self): """ return "" # pragma: no cover - def __init__( + def __init__[C: SnappyStepModel]( self, - workflow, - config, - config_lookup_paths, - config_paths, - work_dir, - previous_steps=None, + workflow: snakemake.Workflow, + config: MutableMapping[str, Any], + config_lookup_paths: tuple[str, ...], + config_paths: tuple[str, ...], + work_dir: str, + *, + config_model_class: type[C], + previous_steps: tuple[type[typing.Self], ...] | None = None, ): self.name = self.__class__.name #: Tuple with absolute paths to configuration files read self.config_paths = config_paths + #: Pydantic model class for configuration validation + self.config_model_class = config_model_class #: Absolute path to directory of where to perform work self.work_dir = work_dir #: Classes of previously executed steps, used for merging their default configuration as @@ -638,22 +671,47 @@ def __init__( self.previous_steps = tuple(previous_steps or []) #: Snakefile "workflow" object self.workflow = workflow + #: Setup logger for the step + self.logger = logging.getLogger(self.name) #: Merge default configuration with true configuration - self.w_config = config - self.w_config.update(self._update_config(config)) - self.config = self.w_config["step_config"].get(self.name, OrderedDict()) + workflow_config = config + local_config = workflow_config["step_config"].get(self.name, OrderedDict()) + self.logger.info(local_config) + + # #: Validate workflow step configuration using its accompanying pydantic model + # #: available through self.config_model_class (mandatory keyword arg for BaseStep) + # try: + # self.config: C = validate_config(local_config, self.config_model_class) + # # Also update the workflow config, just in case + # workflow_config["step_config"][self.name] = self.config.model_dump(by_alias=True) + # except pydantic.ValidationError as ve: + # self.logger.error(f"{self.name} failed validation:\n{local_config}") + # raise ve + + #: Validate complete workflow configuration using SnappyPipeline's ConfigModel + #: This includes static_data_config, step_config and data_sets + try: + # local import of ConfigModel to avoid circular import + from snappy_pipeline.workflow_model import ConfigModel + + self.w_config: ConfigModel = ConfigModel(**workflow_config) + self.config: C = self.w_config.step_config[self.name] + except pydantic.ValidationError as ve: + self.logger.error(f"Workflow configuration failed 
validation:\n{workflow_config}") + raise ve + #: Paths with configuration paths, important for later retrieving sample sheet files - self.config_lookup_paths = config_lookup_paths - self.sub_steps = {} + self.config_lookup_paths = list(config_lookup_paths) + self.sub_steps: dict[str, BaseStepPart] = {} self.data_set_infos = list(self._load_data_set_infos()) - # Check configuration - self._check_config() + #: Shortcut to the BioMed SampleSheet objects self.sheets = [info.sheet for info in self.data_set_infos] #: Shortcut BioMed SampleSheet keyword arguments sheet_kwargs_list = [ merge_kwargs( - first_kwargs=self.sheet_shortcut_kwargs, second_kwargs=info.pedigree_field_kwargs + first_kwargs=self.sheet_shortcut_kwargs, + second_kwargs=info.pedigree_field_kwargs, ) for info in self.data_set_infos ] @@ -669,13 +727,29 @@ def __init__( # Setup onstart/onerror/onsuccess hooks self._setup_hooks() #: Functions from sub workflows, can be used to generate output paths into these workflows - self.sub_workflows = {} + self.sub_workflows: dict[str, snakemake.Workflow] = {} + + # Even though we already validated via pydantic, we still call check_config here, as + # some of the checks done in substep check_config are not covered by the pydantic models yet + # and some of the checks actually influence program logic/flow + self._check_config() + + config_string = self.config.model_dump_yaml(by_alias=True) + self.logger.info(f"Configuration for step {self.name}\n{config_string}") + + config_string = self.w_config.model_dump_yaml(by_alias=True) + self.logger.info(f"Configuration for workflow\n{config_string}") - if workflow.verbose: - self._write_step_config() + # Update snakemake.config (which `config` is a reference to) + # with the validated configuration. + # All fields with default values are explicitly defined. + _config = _cached_yaml_round_trip_load_str(config_string) + config.update(_config) + self.logger.info(f"Snakemake config\n{config}") def _setup_hooks(self): """Setup Snakemake workflow hooks for start/end/error""" + # In the following, the "log" parameter to the handler functions is set to "_" as we # don't use them def on_start(_): @@ -706,17 +780,6 @@ def on_success(_): self.workflow.onerror(on_error) self.workflow.onsuccess(on_success) - def _update_config(self, config): - """Update configuration config with the configuration returned by subclass' - ``default_config_yaml()`` and return - """ - result = OrderedDict() - for cls in itertools.chain([self.__class__], self.previous_steps): - result = merge_dicts( - result, _cached_yaml_round_trip_load_str(cls.default_config_yaml()) - ) - return merge_dicts(result, config) - def _check_config(self): """Internal method, checks step and sub step configurations""" self.check_config() @@ -762,8 +825,7 @@ def ensure_w_config(self, config_keys, msg, e_class=MissingConfiguration): # Iterate over required configuration keys for entry in config_keys: # Check if keys are present in config dictionary - if entry in handle: - handle = handle[entry] + if (handle := (getattr(handle, entry, None) or handle.get(entry, None))) is not None: so_far.append(entry) else: tpl = 'Missing configuration ("{full_path}", got up to "{so_far}"): {msg}'.format( @@ -771,7 +833,9 @@ def ensure_w_config(self, config_keys, msg, e_class=MissingConfiguration): ) raise e_class(tpl) - def register_sub_step_classes(self, classes): + def register_sub_step_classes( + self, classes: tuple[type[BaseStepPart] | tuple[type[BaseStepPart], Any], ...] 
+ ): """Register an iterable of sub step classes Initializes objects in ``self.sub_steps`` dict @@ -783,10 +847,12 @@ def register_sub_step_classes(self, classes): klass = pair_or_class args = () obj = klass(self, *args) - obj.check_config() + # obj.check_config() self.sub_steps[klass.name] = obj - def register_sub_workflow(self, step_name, workdir, sub_workflow_name=None): + def register_sub_workflow( + self, step_name: str, workdir: str, sub_workflow_name: str | None = None + ): """Register workflow with given pipeline ``step_name`` and in the given ``workdir``. Optionally, the sub workflow name can be given separate from ``step_name`` (the default) @@ -807,42 +873,42 @@ def register_sub_workflow(self, step_name, workdir, sub_workflow_name=None): ) self.sub_workflows[sub_workflow_name] = self.workflow.globals[sub_workflow_name] - def get_args(self, sub_step, action): + def get_args(self, sub_step: str, action: str) -> Inputs | Callable[[Wildcards], Inputs]: """Return arguments for action of substep with given wildcards - Delegates to the sub step object's get_input_files function + Delegates to the sub step object's get_args function """ return self._get_sub_step(sub_step).get_args(action) - def get_input_files(self, sub_step, action): + def get_input_files(self, sub_step: str, action: str) -> Inputs | Callable[[Wildcards], Inputs]: """Return input files for action of substep with given wildcards Delegates to the sub step object's get_input_files function """ return self._get_sub_step(sub_step).get_input_files(action) - def get_output_files(self, sub_step, action): + def get_output_files(self, sub_step: str, action: str) -> Outputs: """Return list of strings with output files/patterns Delegates to the sub step object's get_output_files function """ return self._get_sub_step(sub_step).get_output_files(action) - def get_params(self, sub_step, action): + def get_params(self, sub_step: str, action: str) -> Any: """Return parameters Delegates to the sub step object's get_params function """ return self.substep_dispatch(sub_step, "get_params", action) - def get_resource(self, sub_step, action, resource_name): + def get_resource(self, sub_step: str, action: str, resource_name: str) -> Any: """Get resource Delegates to the sub step object's get_resource function """ return self.substep_dispatch(sub_step, "get_resource", action, resource_name) - def get_tmpdir(self): + def get_tmpdir(self) -> str: """Return temporary directory. To be used directly or via get_resource("step", "action", "tmpdir") @@ -852,7 +918,7 @@ def get_tmpdir(self): 2. If this fails, try to use environment variable TMPDIR. 3. If this fails, use tempfile.gettempdir(), same as Snakemake default. 
""" - tmpdir = self.w_config.get("global_config", {}).get("tmpdir", None) + tmpdir = getattr(self.w_config, "global_config", {}).get("tmpdir", None) if tmpdir: with modified_environ(TODAY=datetime.date.today().strftime("%Y%m%d")): tmpdir = os.path.expandvars(tmpdir) @@ -864,40 +930,40 @@ def get_tmpdir(self): os.makedirs(tmpdir, exist_ok=True) return tmpdir - def get_log_file(self, sub_step, action): + def get_log_file(self, sub_step: str, action: str) -> Outputs: """Return path to the log file Delegates to the sub step object's get_log_file function """ return self.substep_dispatch(sub_step, "get_log_file", action) - def get_shell_cmd(self, sub_step, action, wildcards): + def get_shell_cmd(self, sub_step: str, action: str, wildcards: Wildcards) -> str: """Return shell command for the pipeline sub step Delegates to the sub step object's get_shell_cmd function """ return self.substep_dispatch(sub_step, "get_shell_cmd", action, wildcards) - def run(self, sub_step, action, wildcards): + def run(self, sub_step: str, action: str, wildcards: Wildcards) -> str: """Run command for the given action of the given sub step with the given wildcards Delegates to the sub step object's run function """ - return self._get_sub_step(sub_step).get_shell_cmd(action, wildcards) + return self._get_sub_step(sub_step).run(action, wildcards) - def get_result_files(self): + def get_result_files(self) -> OutputFiles: """Return actual list of file names to build""" raise NotImplementedError("Implement me!") # pragma: no cover - def substep_getattr(self, step, name): + def substep_getattr(self, step: str, name: str) -> Any: """Return attribute from substep""" return getattr(self._get_sub_step(step), name) - def substep_dispatch(self, step, function, *args, **kwargs): + def substep_dispatch(self, step: str, function: str, *args, **kwargs): """Dispatch call to function of sub step implementation""" return self.substep_getattr(step, function)(*args, **kwargs) - def _get_sub_step(self, sub_step): + def _get_sub_step(self, sub_step: str) -> BaseStepPart: if sub_step in self.sub_steps: return self.sub_steps[sub_step] else: @@ -905,49 +971,47 @@ def _get_sub_step(self, sub_step): 'Could not find sub step "{}" in workflow step "{}"'.format(sub_step, self.name) ) # pragma: no cover - def _load_data_set_infos(self): + def _load_data_set_infos(self) -> typing.Generator[DataSetInfo, None, None]: """Load BioMed Sample Sheets as given by configuration and yield them""" - for name, data_set in self.w_config["data_sets"].items(): + for name, data_set in self.w_config.data_sets.items(): yield DataSetInfo( name, - data_set["file"], + data_set.file, self.config_lookup_paths, - data_set["search_paths"], - data_set["search_patterns"], - data_set["type"], - data_set.get("is_background", False), - data_set.get("naming_scheme", NAMING_SECONDARY_ID_PK), - data_set.get("mixed_se_pe", False), - data_set.get("sodar_uuid", None), - data_set.get("sodar_title", None), - data_set.get("pedigree_field", None), + data_set.search_paths, + data_set.search_patterns, + data_set.type, + data_set.is_background, + data_set.naming_scheme, + data_set.mixed_se_pe, + data_set.sodar_uuid, + data_set.sodar_title, + data_set.pedigree_field, ) - def _load_data_search_infos(self): + def _load_data_search_infos(self) -> typing.Generator[DataSearchInfo, None, None]: """Use workflow and step configuration to yield ``DataSearchInfo`` objects""" - for _, data_set in self.w_config["data_sets"].items(): + for _, data_set in self.w_config.data_sets.items(): yield 
DataSearchInfo( - sheet_path=data_set["file"], + sheet_path=data_set.file, base_paths=self.config_lookup_paths, - search_paths=self.config["search_paths"], - search_patterns=self.config["search_patterns"], + search_paths=self.config.search_paths, + search_patterns=self.config.search_patterns, mixed_se_pe=False, ) - def _write_step_config(self, f=sys.stdout): - print(f"\n\n----- Configuration for step {self.name}:\n", file=f) - yaml = ruamel_yaml.YAML() - yaml.preserve_quotes = True - yaml.indent(sequence=4, mapping=4, offset=4) - yaml.dump(self.config, f) - print(f"\n------ Configuration for {self.name} ends here\n", file=f) - @classmethod - def wrapper_path(cls, path): + def wrapper_path(cls, path: str) -> str: """Generate path to wrapper""" return "file://" + os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "..", "..", "snappy_wrappers", "wrappers", path + os.path.dirname(__file__), + "..", + "..", + "..", + "snappy_wrappers", + "wrappers", + path, ) ) @@ -983,7 +1047,11 @@ def __init__( os.path.join(self.work_dir, self.cache_file_name), invalidate_paths_list ) - def run(self, folder_name, pattern_set_keys=("left", "right", "left_md5", "right_md5", "bam")): + def run( + self, + folder_name, + pattern_set_keys=("left", "right", "left_md5", "right_md5", "bam"), + ): """Yield (src_path, path_infix, filename) one-by-one Cache is saved after the last iteration @@ -1049,7 +1117,8 @@ def _get_shell_cmd_root_paths(cls, info): for search_path in info.search_paths: yield os.path.abspath( os.path.join( - os.path.dirname(os.path.join(base_path, info.sheet_path)), search_path + os.path.dirname(os.path.join(base_path, info.sheet_path)), + search_path, ) ) @@ -1067,7 +1136,6 @@ def _merge_cache_invalidate_paths(cls, data_set_infos): out_list = [] # Iterate over DataSetInfo objects for info in data_set_infos: - # Search paths - expects a list already out_list.extend(getattr(info, "search_paths")) @@ -1148,8 +1216,8 @@ def __init__(self, parent): # FASTQ files. That doesn't make sense for pipelines that are using externally generated # data already. try: - preprocessed_path = self.config["path_link_in"] - except KeyError: + preprocessed_path = self.config.path_link_in + except AttributeError: preprocessed_path = "" # Path generator. 
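The switch from ``except KeyError`` to ``except AttributeError`` above follows from the configuration now being a pydantic model rather than a dict: optional settings are read as attributes instead of with ``[]`` or ``.get()``. A small illustrative sketch with a simplified stand-in model (not the real SnappyModel classes):

    from pydantic import BaseModel

    class LinkInConfigSketch(BaseModel):
        path_link_in: str = ""  # optional override, empty by default

    cfg = LinkInConfigSketch()
    print(cfg.path_link_in)   # attribute access replaces cfg["path_link_in"]
    # cfg["path_link_in"]   -> TypeError: model instances are not subscriptable
    # cfg.no_such_field     -> AttributeError, hence the new ``except AttributeError``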
@@ -1180,7 +1248,7 @@ def get_shell_cmd(self, action, wildcards): out_path = os.path.dirname(self.base_pattern_out.format(**wildcards)) # Get folder name of first library candidate folder_name = get_ngs_library_folder_name(self.parent.sheets, wildcards.library_name) - if self.config["path_link_in"]: + if self.config.path_link_in: folder_name = wildcards.library_name # Perform the command generation lines = [] @@ -1200,7 +1268,10 @@ def get_shell_cmd(self, action, wildcards): filenames[new_path] = src_path lines.append( tpl.format( - src_path=src_path, out_path=out_path, path_infix=path_infix, filename=filename + src_path=src_path, + out_path=out_path, + path_infix=path_infix, + filename=filename, ) ) if not lines: @@ -1239,7 +1310,9 @@ def get_shell_cmd(self, action, wildcards): self._validate_action(action) # Define path generator path_gen = LinkInPathGenerator( - self.parent.work_dir, self.parent.data_search_infos, self.parent.config_lookup_paths + self.parent.work_dir, + self.parent.data_search_infos, + self.parent.config_lookup_paths, ) # Get base out path out_path = os.path.dirname(self.base_pattern_out.format(**wildcards)) @@ -1263,7 +1336,10 @@ def get_shell_cmd(self, action, wildcards): filenames[new_path] = src_path lines.append( tpl.format( - src_path=src_path, out_path=out_path, path_infix=path_infix, filename=filename + src_path=src_path, + out_path=out_path, + path_infix=path_infix, + filename=filename, ) ) if not lines: @@ -1326,9 +1402,12 @@ def get_input_files(self, action): @dictify def input_function(wildcards): if self.include_ped_file: - yield "ped", os.path.realpath( - "work/write_pedigree.{index_library}/out/{index_library}.ped" - ).format(**wildcards) + yield ( + "ped", + os.path.realpath( + "work/write_pedigree.{index_library}/out/{index_library}.ped" + ).format(**wildcards), + ) name_pattern = self.prev_class.name_pattern.replace(r",[^\.]+", "") tpl_path_out = os.path.join("work", name_pattern, "out", name_pattern) for key, ext in zip(self.ext_names, self.ext_values): diff --git a/snappy_pipeline/workflows/abstract/common.py b/snappy_pipeline/workflows/abstract/common.py index 15c69f853..4ac10a886 100644 --- a/snappy_pipeline/workflows/abstract/common.py +++ b/snappy_pipeline/workflows/abstract/common.py @@ -57,7 +57,7 @@ class ForwardResourceUsageMixin: #: Resource usage definitions resource_usage_dict: typing.Optional[typing.Dict[str, ResourceUsage]] = None - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) assert self.resource_usage_dict is not None, "resource_usage_dict not set!" 
assert action in self.resource_usage_dict, f"No resource usage entry for {action}" diff --git a/snappy_pipeline/workflows/abstract/pedigree.py b/snappy_pipeline/workflows/abstract/pedigree.py index 38dc4a195..e7b12423d 100644 --- a/snappy_pipeline/workflows/abstract/pedigree.py +++ b/snappy_pipeline/workflows/abstract/pedigree.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """Pedigree manipulation related methods""" + from biomedsheets.shortcuts import KEY_IS_AFFECTED, KEY_SEX diff --git a/snappy_pipeline/workflows/adapter_trimming/__init__.py b/snappy_pipeline/workflows/adapter_trimming/__init__.py index a8eedb1d1..465ac3fe0 100644 --- a/snappy_pipeline/workflows/adapter_trimming/__init__.py +++ b/snappy_pipeline/workflows/adapter_trimming/__init__.py @@ -139,9 +139,9 @@ from collections import OrderedDict import os -from biomedsheets.shortcuts import GenericSampleSheet from snakemake.io import expand +from biomedsheets.shortcuts import GenericSampleSheet from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -152,275 +152,13 @@ get_ngs_library_folder_name, ) +from .model import AdapterTrimming as AdapterTrimmingConfigModel + #: Adatper trimming tools TRIMMERS = ("bbduk", "fastp") #: Default configuration for the hla_typing schema -DEFAULT_CONFIG = r""" -# Default configuration adapter_trimming -step_config: - adapter_trimming: - path_link_in: "" # OPTIONAL Override data set configuration search paths for FASTQ files - tools: [bbduk, fastp] # REQUIRED, available: 'bbduk' and 'fastp'. - bbduk: - adapter_sequences: [] # REQUIRED - # - /fast/work/groups/cubi/projects/biotools/static_data/app_support/bbtools/39.01/resources/adapters.fa - # - /fast/work/groups/cubi/projects/biotools/static_data/app_support/bbtools/39.01/resources/phix174_ill.ref.fa.gz - # Note: The author recommends setting tpe=t & tbo=t when adapter trimming paired reads. - num_threads: 8 - - # Non-default parameters from https://www.biostars.org/p/268221/ - # & https://github.com/ewels/MultiQC/issues/1146#issuecomment-607980076 - - # Input parameters: - interleaved: auto # (int) t/f overrides interleaved autodetection. - qin: auto # Input quality offset: 33 (Sanger), 64, or auto. - copyundefined: f # (cu) Process non-AGCT IUPAC reference bases by making all - # possible unambiguous copies. Intended for short motifs - # or adapter barcodes, as time/memory use is exponential. - - # Output parameters: - nzo: t # Only write statistics about ref sequences with nonzero hits. - qout: auto # Output quality offset: 33 (Sanger), 64, or auto. - statscolumns: 3 # (cols) Number of columns for stats output, 3 or 5. - # 5 includes base counts. - rename: f # Rename reads to indicate which sequences they matched. - refnames: f # Use names of reference files rather than scaffold IDs. - trd: f # Truncate read and ref names at the first whitespace. - ordered: f # Set to true to output reads in same order as input. - - # Histogram output parameters: - gcbins: auto # Number gchist bins. Set to 'auto' to use read length. - maxhistlen: 6000 # Set an upper bound for histogram lengths; higher uses - # more memory. The default is 6000 for some histograms - # and 80000 for others. - - # Histograms for mapped sam/bam files only: - histbefore: t # Calculate histograms from reads before processing. - idbins: 100 # Number idhist bins. Set to 'auto' to use read length. - - # Processing parameters: - k: 21 # Kmer length used for finding contaminants. Contaminants - # shorter than k will not be found. 
k must be at least 1. - # bbduk default: 27 - rcomp: t # Look for reverse-complements of kmers in addition to - # forward kmers. - maskmiddle: t # (mm) Treat the middle base of a kmer as a wildcard, to - # increase sensitivity in the presence of errors. - minkmerhits: 1 # (mkh) Reads need at least this many matching kmers - # to be considered as matching the reference. - minkmerfraction: 0.0 # (mkf) A reads needs at least this fraction of its total - # kmers to hit a ref, in order to be considered a match. - # If this and minkmerhits are set, the greater is used. - mincovfraction: 0.0 # (mcf) A reads needs at least this fraction of its total - # bases to be covered by ref kmers to be considered a match. - # If specified, mcf overrides mkh and mkf. - hammingdistance: 1 # (hdist) Maximum Hamming distance for ref kmers (subs only). - # Memory use is proportional to (3*K)^hdist. - # bbduk default: 0 - qhdist: 0 # Hamming distance for query kmers; impacts speed, not memory. - editdistance: 0 # (edist) Maximum edit distance from ref kmers (subs - # and indels). Memory use is proportional to (8*K)^edist. - hammingdistance2: 0 # (hdist2) Sets hdist for short kmers, when using mink. - qhdist2: 0 # Sets qhdist for short kmers, when using mink. - editdistance2: 0 # (edist2) Sets edist for short kmers, when using mink. - forbidn: f # (fn) Forbids matching of read kmers containing N. - # By default, these will match a reference 'A' if - # hdist>0 or edist>0, to increase sensitivity. - removeifeitherbad: t # (rieb) Paired reads get sent to 'outmatch' if either is - # match (or either is trimmed shorter than minlen). - # Set to false to require both. - trimfailures: f # Instead of discarding failed reads, trim them to 1bp. - # This makes the statistics a bit odd. - findbestmatch: f # (fbm) If multiple matches, associate read with sequence - # sharing most kmers. Reduces speed. - skipr1: f # Don't do kmer-based operations on read 1. - skipr2: f # Don't do kmer-based operations on read 2. - ecco: f # For overlapping paired reads only. Performs error- - # correction with BBMerge prior to kmer operations. - - # Trimming/Filtering/Masking parameters: - # Note - if ktrim, kmask, and ksplit are unset, the default behavior is kfilter. - # All kmer processing modes are mutually exclusive. - # Reads only get sent to 'outm' purely based on kmer matches in kfilter mode. - - ktrim: r # Trim reads to remove bases matching reference kmers. - # Values: - # f (don't trim), [bbduk default] - # r (trim to the right), - # l (trim to the left) - kmask: "" # Replace bases matching ref kmers with another symbol. - # Allows any non-whitespace character, and processes short - # kmers on both ends if mink is set. 'kmask: lc' will - # convert masked bases to lowercase. - maskfullycovered: f # (mfc) Only mask bases that are fully covered by kmers. - ksplit: f # For single-ended reads only. Reads will be split into - # pairs around the kmer. If the kmer is at the end of the - # read, it will be trimmed instead. Singletons will go to - # out, and pairs will go to outm. Do not use ksplit with - # other operations such as quality-trimming or filtering. - mink: 11 # Look for shorter kmers at read tips down to this length, - # when k-trimming or masking. 0 means disabled. Enabling - # this will disable maskmiddle. - # bbduk default: 0 (disabled) - qtrim: rl # Trim read ends to remove bases with quality below trimq. - # Performed AFTER looking for kmers. 
Values: - # rl (trim both ends), - # f (neither end), [bbduk default] - # r (right end only), - # l (left end only), - # w (sliding window). - trimq: 25 # Regions with average quality BELOW this will be trimmed, - # if qtrim is set to something other than f. Can be a - # floating-point number like 7.3. - # Very strict quality threshold, bbduk default: 6 - minlength: 35 # (ml) Reads shorter than this after trimming will be - # discarded. Pairs will be discarded if both are shorter. - # bbduk default: 10 - mlf: 0 # (minlengthfraction) Reads shorter than this fraction of - # original length after trimming will be discarded. - minavgquality: 0 # (maq) Reads with average quality (after trimming) below - # this will be discarded. - maqb: 0 # If positive, calculate maq from this many initial bases. - minbasequality: 0 # (mbq) Reads with any base below this quality (after - # trimming) will be discarded. - maxns: -1 # If non-negative, reads with more Ns than this - # (after trimming) will be discarded. - mcb: 0 # (minconsecutivebases) Discard reads without at least - # this many consecutive called bases. - ottm: f # (outputtrimmedtomatch) Output reads trimmed to shorter - # than minlength to outm rather than discarding. - tp: 0 # (trimpad) Trim this much extra around matching kmers. - tbo: f # (trimbyoverlap) Trim adapters based on where paired - # reads overlap. - strictoverlap: t # Adjust sensitivity for trimbyoverlap mode. - minoverlap: 14 # Require this many bases of overlap for detection. - mininsert: 40 # Require insert size of at least this for overlap. - # Should be reduced to 16 for small RNA sequencing. - tpe: f # (trimpairsevenly) When kmer right-trimming, trim both - # reads to the minimum length of either. - forcetrimleft: 0 # (ftl) If positive, trim bases to the left of this position - # (exclusive, 0-based). - forcetrimright: 0 # (ftr) If positive, trim bases to the right of this position - # (exclusive, 0-based). - forcetrimright2: 0 # (ftr2) If positive, trim this many bases on the right end. - forcetrimmod: 5 # (ftm) If positive, right-trim length to be equal to zero, - # modulo this number. - # bbduk default: 0 - restrictleft: 0 # If positive, only look for kmer matches in the - # leftmost X bases. - restrictright: 0 # If positive, only look for kmer matches in the - # rightmost X bases. - mingc: 0 # Discard reads with GC content below this. - maxgc: 1 # Discard reads with GC content above this. - # gcpairs: t # Use average GC of paired reads. Deprecated option? - # # Also affects gchist. - tossjunk: f # Discard reads with invalid characters as bases. - swift: f # Trim Swift sequences: Trailing C/T/N R1, leading G/A/N R2. - - # Header-parsing parameters - these require Illumina headers: - chastityfilter: f # (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'. - barcodefilter: f # Remove reads with unexpected barcodes if barcodes is set, - # or barcodes containing 'N' otherwise. A barcode must be - # the last part of the read header. Values: - # t: Remove reads with bad barcodes. - # f: Ignore barcodes. - # crash: Crash upon encountering bad barcodes. - barcodes: "" # File of barcodes. - xmin: -1 # If positive, discard reads with a lesser X coordinate. - ymin: -1 # If positive, discard reads with a lesser Y coordinate. - xmax: -1 # If positive, discard reads with a greater X coordinate. - ymax: -1 # If positive, discard reads with a greater Y coordinate. 
- - # Polymer trimming: - trimpolya: 0 # If greater than 0, trim poly-A or poly-T tails of - # at least this length on either end of reads. - trimpolygleft: 0 # If greater than 0, trim poly-G prefixes of at least this - # length on the left end of reads. Does not trim poly-C. - trimpolygright: 8 # If greater than 0, trim poly-G tails of at least this - # length on the right end of reads. Does not trim poly-C. - # bbduk default: don't trim polyG (trimpolyg=0) - trimpolyg: 0 # This sets both left and right at once. - filterpolyg: 8 # If greater than 0, remove reads with a poly-G prefix of - # at least this length (on the left). - # Note: there are also equivalent poly-C flags. - - # Entropy/Complexity parameters: - entropy: -1 # Set between 0 and 1 to filter reads with entropy below - # that value. Higher is more stringent. - entropywindow: 50 # Calculate entropy using a sliding window of this length. - entropyk: 5 # Calculate entropy using kmers of this length. - minbasefrequency: 0 # Discard reads with a minimum base frequency below this. - entropytrim: f # Values: - # f: (false) Do not entropy-trim. - # r: (right) Trim low entropy on the right end only. - # l: (left) Trim low entropy on the left end only. - # rl: (both) Trim low entropy on both ends. - entropymask: f # Values: - # f: (filter) Discard low-entropy sequences. - # t: (true) Mask low-entropy parts of sequences with N. - # lc: Change low-entropy parts of sequences to lowercase. - entropymark: f # Mark each base with its entropy value. This is on a scale - # of 0-41 and is reported as quality scores, so the output - # should be fastq or fasta+qual. - # NOTE: If set, entropytrim overrides entropymask. - - # Cardinality estimation: - cardinality: f # (loglog) Count unique kmers using the LogLog algorithm. - cardinalityout: f # (loglogout) Count unique kmers in output reads. - loglogk: 31 # Use this kmer length for counting. - loglogbuckets: 2048 # Use this many buckets for counting. - - fastp: - num_threads: 4 - - trim_front1: 0 # trimming how many bases in front for read1, default is 0 (int [=0]) - trim_tail1: 0 # trimming how many bases in tail for read1, default is 0 (int [=0]) - max_len1: 0 # if read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation (int [=0]) - trim_front2: 0 # trimming how many bases in front for read2. If it's not specified, it will follow read1's settings (int [=0]) - trim_tail2: 0 # trimming how many bases in tail for read2. If it's not specified, it will follow read1's settings (int [=0]) - max_len2: 0 # if read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. If it's not specified, it will follow read1's settings (int [=0]) - dedup: False # enable deduplication to drop the duplicated reads/pairs - dup_calc_accuracy: 0 # accuracy level to calculate duplication (1~6), higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. (int [=0]) - dont_eval_duplication: True # don't evaluate duplication rate to save time and use less memory. - trim_poly_g: True # force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data - poly_g_min_len: 8 # the minimum length to detect polyG in the read tail. 10 by default. (int [=10]) - trim_poly_x: False # enable polyX trimming in 3' ends. - poly_x_min_len: 10 # the minimum length to detect polyX in the read tail. 10 by default. 
(int [=10]) - cut_front: False # move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. - cut_tail: False # move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. - cut_right: False # move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. - cut_front_window_size: 4 # the window size option of cut_front, default to cut_window_size if not specified (int [=4]) - cut_front_mean_quality: 20 # the mean quality requirement option for cut_front, default to cut_mean_quality if not specified (int [=20]) - cut_tail_window_size: 4 # the window size option of cut_tail, default to cut_window_size if not specified (int [=4]) - cut_tail_mean_quality: 20 # the mean quality requirement option for cut_tail, default to cut_mean_quality if not specified (int [=20]) - cut_right_window_size: 4 # the window size option of cut_right, default to cut_window_size if not specified (int [=4]) - cut_right_mean_quality: 20 # the mean quality requirement option for cut_right, default to cut_mean_quality if not specified (int [=20]) - disable_quality_filtering: False # quality filtering is enabled by default. If this option is specified, quality filtering is disabled - qualified_quality_phred: 15 # the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. (int [=15]) - unqualified_percent_limit: 40 # how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40% (int [=40]) - n_base_limit: 5 # if one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5 (int [=5]) - average_qual: 0 # if one read's average quality score ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -538,7 +277,7 @@ def get_resource_usage(self, action): # Validate action self._validate_action(action) return ResourceUsage( - threads=self.config["bbduk"]["num_threads"], + threads=self.config.bbduk.num_threads, time="12:00:00", # 40 hours memory="24000M", ) @@ -550,7 +289,7 @@ class FastpStepPart(AdapterTrimmingStepPart): #: Step name name = "fastp" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
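The long commented YAML default blocks removed above are superseded by the pydantic models added in ``adapter_trimming/model.py`` below; defaults can be rendered from the model instead of being maintained as a string literal. A toy sketch of that idea with simplified models (the real ``default_config_yaml_string()`` helper in ``snappy_pipeline.models`` may be implemented differently):

    import sys

    import ruamel.yaml
    from pydantic import BaseModel


    class BbdukSketch(BaseModel):
        num_threads: int = 8
        trimq: float = 25.0
        minlength: int = 35


    class AdapterTrimmingSketch(BaseModel):
        tools: list[str] = ["bbduk", "fastp"]
        bbduk: BbdukSketch = BbdukSketch()


    # Render the model defaults as the step's default YAML configuration.
    yaml = ruamel.yaml.YAML()
    yaml.dump(
        {"step_config": {"adapter_trimming": AdapterTrimmingSketch().model_dump()}},
        sys.stdout,
    )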
@@ -561,7 +300,7 @@ def get_resource_usage(self, action): # Validate action self._validate_action(action) return ResourceUsage( - threads=self.config["fastp"]["num_threads"], + threads=self.config.fastp.num_threads, time="12:00:00", # 60 hours memory="24000M", ) @@ -626,7 +365,7 @@ class AdapterTrimmingWorkflow(BaseStep): sheet_shortcut_class = GenericSampleSheet def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(*args, **kwargs, config_model_class=AdapterTrimmingConfigModel) self.register_sub_step_classes( (BbdukStepPart, FastpStepPart, LinkInStep, LinkOutFastqStepPart) ) @@ -652,6 +391,6 @@ def get_result_files(self): ) for sheet in self.shortcut_sheets: for ngs_library in sheet.all_ngs_libraries: - for tool in self.config["tools"]: + for tool in self.config.tools: for tpl in tpls: yield tpl.format(trimmer=tool, ngs_library_name=ngs_library.name) diff --git a/snappy_pipeline/workflows/adapter_trimming/model.py b/snappy_pipeline/workflows/adapter_trimming/model.py new file mode 100644 index 000000000..50b2e3a1f --- /dev/null +++ b/snappy_pipeline/workflows/adapter_trimming/model.py @@ -0,0 +1,883 @@ +from enum import Enum + +from pydantic import Field +from typing_extensions import Annotated + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel +from snappy_pipeline.models.validators import ToolsMixin + + +class Tool(Enum): + BBDUK = "bbduk" + FASTP = "fastp" + + +class Interleaved(Enum): + AUTO = "auto" + + +class Qin(Enum): + AUTO = "auto" + FIELD_33 = "33" + FIELD_64 = "64" + + +class Qout(Enum): + AUTO = "auto" + FIELD_33 = "33" + FIELD_64 = "64" + + +class Statscolumns(Enum): + INTEGER_3 = 3 + INTEGER_5 = 5 + + +class Gcbins(Enum): + AUTO = "auto" + + +class Maxhistlen(Enum): + AUTO = "auto" + + +class Idbins(Enum): + AUTO = "auto" + + +class Ktrim(Enum): + F = "f" + R = "r" + L = "l" + + +class Qtrim(Enum): + RL = "rl" + F = "f" + R = "r" + L = "l" + W = "w" + + +class Barcodefilter(Enum): + T = "t" + F = "f" + CRASH = "crash" + + +class Entropytrim(Enum): + F = "f" + """Do not entropy-trim""" + + R = "r" + """Trim low entropy on the right end only.""" + + L = "l" + """Trim low entropy on the left end only.""" + + RL = "rl" + """Trim low entropy on both ends.""" + + +class Entropymask(Enum): + F = "f" + T = "t" + LC = "lc" + + +class UmiLoc(Enum): + INDEX1 = "index1" + INDEX2 = "index2" + READ1 = "read1" + READ2 = "read2" + PER_INDEX = "per_index" + PER_READ = "per_read" + NONE = "" + + +class Fastp(SnappyModel): + num_threads: int = 0 + trim_front1: int = 0 + """ + trimming how many bases in front for read1, default is 0 (int [=0]) + """ + + trim_tail1: int = 0 + """ + trimming how many bases in tail for read1, default is 0 (int [=0]) + """ + + max_len1: int = 0 + """ + if read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. + Default 0 means no limitation (int [=0]) + """ + + trim_front2: int = 0 + """ + trimming how many bases in front for read2. + If it's not specified, it will follow read1's settings (int [=0]) + """ + + trim_tail2: int = 0 + """ + trimming how many bases in tail for read2. + If it's not specified, it will follow read1's settings (int [=0]) + """ + + max_len2: int = 0 + """ + if read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. + Default 0 means no limitation. 
If it's not specified, it will follow read1's settings (int [=0]) + """ + + dedup: bool = False + """ + enable deduplication to drop the duplicated reads/pairs + """ + + dup_calc_accuracy: Annotated[int, Field(0, ge=0, le=6)] + """ + accuracy level to calculate duplication (1~6), + higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). + Default 1 for no-dedup mode, and 3 for dedup mode. (int [=0]) + """ + + dont_eval_duplication: bool = True + """ + don't evaluate duplication rate to save time and use less memory. + """ + + trim_poly_g: bool = True + """ + force polyG tail trimming, + by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + """ + + poly_g_min_len: int = 8 + """ + the minimum length to detect polyG in the read tail. 10 by default. (int [=10]) + """ + + trim_poly_x: bool = False + """ + enable polyX trimming in 3' ends. + """ + + poly_x_min_len: int = 10 + """ + the minimum length to detect polyX in the read tail. 10 by default. (int [=10]) + """ + + cut_front: bool = False + """ + move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. + """ + + cut_tail: bool = False + """ + move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. + """ + + cut_right: bool = False + """ + move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. + """ + + cut_front_window_size: int = 4 + """ + the window size option of cut_front, default to cut_window_size if not specified (int [=4]) + """ + + cut_front_mean_quality: int = 20 + """ + the mean quality requirement option for cut_front, default to cut_mean_quality if not specified (int [=20]) + """ + + cut_tail_window_size: int = 4 + """ + the window size option of cut_tail, default to cut_window_size if not specified (int [=4]) + """ + + cut_tail_mean_quality: int = 20 + """ + the mean quality requirement option for cut_tail, default to cut_mean_quality if not specified (int [=20]) + """ + + cut_right_window_size: int = 4 + """ + the window size option of cut_right, default to cut_window_size if not specified (int [=4]) + """ + + cut_right_mean_quality: int = 20 + """ + the mean quality requirement option for cut_right, + default to cut_mean_quality if not specified (int [=20]) + """ + + disable_quality_filtering: bool = False + """ + quality filtering is enabled by default. + If this option is specified, quality filtering is disabled + """ + + qualified_quality_phred: int = 15 + """ + the quality value that a base is qualified. + Default 15 means phred quality >=Q15 is qualified. (int [=15]) + """ + + unqualified_percent_limit: int = 40 + """ + how many percents of bases are allowed to be unqualified (0~100). + Default 40 means 40% (int [=40]) + """ + + n_base_limit: int = 5 + """ + if one read's number of N base is >n_base_limit, then this read/pair is discarded. + Default is 5 (int [=5]) + """ + + average_qual: int = 0 + """ + if one read's average quality score 0 or edist>0, to increase sensitivity. + """ + + removeifeitherbad: bool = True + """ + (rieb) Paired reads get sent to 'outmatch' if either is match + (or either is trimmed shorter than minlen). + Set to false to require both. + """ + + trimfailures: bool = False + """ + Instead of discarding failed reads, trim them to 1bp. + This makes the statistics a bit odd. 
+ """ + + findbestmatch: bool = False + """ + (fbm) If multiple matches, associate read with sequence sharing most kmers. Reduces speed. + """ + + skipr1: bool = False + """ + Don't do kmer-based operations on read 1. + """ + + skipr2: bool = False + """ + Don't do kmer-based operations on read 2. + """ + + ecco: bool = False + """ + For overlapping paired reads only. + Performs error- correction with BBMerge prior to kmer operations. + """ + + ktrim: Ktrim = "r" + """ + Trim reads to remove bases matching reference kmers. Values: + f (don't trim), [bbduk default] + r (trim to the right), + l (trim to the left) + """ + + kmask: str = "" + """ + Replace bases matching ref kmers with another symbol. + Allows any non-whitespace character, and processes short kmers on both ends if mink is set. + 'kmask: lc' will convert masked bases to lowercase. + """ + + maskfullycovered: bool = False + """ + (mfc) Only mask bases that are fully covered by kmers. + """ + + ksplit: bool = False + """ + For single-ended reads only. + Reads will be split into pairs around the kmer. + If the kmer is at the end of the read, it will be trimmed instead. + Singletons will go to out, and pairs will go to outm. + Do not use ksplit with other operations such as quality-trimming or filtering. + """ + + mink: int = 11 + """ + Look for shorter kmers at read tips down to this length, when k-trimming or masking. + 0 means disabled. + Enabling this will disable maskmiddle. bbduk default: 0 (disabled) + """ + + qtrim: Qtrim = "rl" + """ + Trim read ends to remove bases with quality below trimq. + Performed AFTER looking for kmers. Values: + rl (trim both ends), + f (neither end), [bbduk default] + r (right end only), + l (left end only), + w (sliding window). + """ + + trimq: float = 25 + """ + Regions with average quality BELOW this will be trimmed, + if qtrim is set to something other than f. + Can be a floating-point number like 7.3. + Very strict quality threshold, bbduk default: 6 + """ + + minlength: int = 35 + """ + (ml) Reads shorter than this after trimming will be discarded. + Pairs will be discarded if both are shorter. + bbduk default: 10 + """ + + mlf: int = 0 + """ + (minlengthfraction) Reads shorter than this fraction of original length after trimming + will be discarded. + """ + + minavgquality: int = 0 + """ + (maq) Reads with average quality (after trimming) below this will be discarded. + """ + + maqb: int = 0 + """ + If positive, calculate maq from this many initial bases. + """ + + minbasequality: int = 0 + """ + (mbq) Reads with any base below this quality (after trimming) will be discarded. + """ + + maxns: int = -1 + """ + If non-negative, reads with more Ns than this (after trimming) will be discarded. + """ + + mcb: int = 0 + """ + (minconsecutivebases) Discard reads without at least this many consecutive called bases. + """ + + ottm: bool = False + """ + (outputtrimmedtomatch) Output reads trimmed to shorter than minlength to outm rather than discarding. + """ + + tp: int = 0 + """ + (trimpad) Trim this much extra around matching kmers. + """ + + tbo: bool = False + """ + (trimbyoverlap) Trim adapters based on where paired reads overlap. + Note: The author recommends setting tpe=t & tbo=t when adapter trimming paired reads. + """ + + strictoverlap: bool = True + """ + Adjust sensitivity for trimbyoverlap mode. + """ + + minoverlap: int = 14 + """ + Require this many bases of overlap for detection. + """ + + mininsert: int = 40 + """ + Require insert size of at least this for overlap. 
+ Should be reduced to 16 for small RNA sequencing. + """ + + tpe: bool = False + """ + (trimpairsevenly) When kmer right-trimming, trim both reads to the minimum length of either. + Note: The author recommends setting tpe=t & tbo=t when adapter trimming paired reads. + """ + + forcetrimleft: int = 0 + """ + (ftl) If positive, trim bases to the left of this position (exclusive, 0-based). + """ + + forcetrimright: int = 0 + """ + (ftr) If positive, trim bases to the right of this position (exclusive, 0-based). + + """ + forcetrimright2: int = 0 + """ + (ftr2) If positive, trim this many bases on the right end. + """ + + forcetrimmod: int = 5 + """ + (ftm) If positive, right-trim length to be equal to zero, modulo this number. bbduk default: 0 + """ + + restrictleft: int = 0 + """ + If positive, only look for kmer matches in the leftmost X bases. + """ + + restrictright: int = 0 + """ + If positive, only look for kmer matches in the rightmost X bases. + """ + + mingc: float = 0 + """ + Discard reads with GC content below this. + """ + + maxgc: float = 1 + """ + Discard reads with GC content above this. + """ + + gcpairs: bool = True + """ + Use average GC of paired reads. Deprecated option? Also affects gchist. + """ + + tossjunk: bool = False + """ + Discard reads with invalid characters as bases. + """ + + swift: bool = False + """ + Trim Swift sequences: Trailing C/T/N R1, leading G/A/N R2. + """ + + chastityfilter: bool = False + """ + (cf) Discard reads with id containing ' 1:Y:' or ' 2:Y:'. + """ + + barcodefilter: Barcodefilter = "f" + """ + Remove reads with unexpected barcodes if barcodes is set, + or barcodes containing 'N' otherwise. + A barcode must be the last part of the read header. + Values: + t: Remove reads with bad barcodes. + f: Ignore barcodes. + crash: Crash upon encountering bad barcodes. + """ + + barcodes: str = "" + """ + File of barcodes. + """ + + xmin: int = -1 + """ + If positive, discard reads with a lesser X coordinate. + """ + + ymin: int = -1 + """ + If positive, discard reads with a lesser Y coordinate. + """ + + xmax: int = -1 + """ + If positive, discard reads with a greater X coordinate. + """ + + ymax: int = -1 + """ + If positive, discard reads with a greater Y coordinate. + """ + + trimpolya: int = 0 + """ + If greater than 0, trim poly-A or poly-T tails of at least this length on either end of reads. + """ + + trimpolygleft: int = 0 + """ + If greater than 0, trim poly-G prefixes of at least this length on the left end of reads. + Does not trim poly-C. + """ + + trimpolygright: int = 8 + """ + If greater than 0, trim poly-G tails of at least this length on the right end of reads. + Does not trim poly-C. bbduk default: don't trim polyG (trimpolyg=0) + """ + + trimpolyg: int = 0 + """ + This sets both left and right at once. + """ + + filterpolyg: int = 8 + """ + If greater than 0, remove reads with a poly-G prefix of at least this length (on the left). + Note: there are also equivalent poly-C flags. + """ + + entropy: float = -1 + """ + Set between 0 and 1 to filter reads with entropy below that value. + Higher is more stringent. + """ + + entropywindow: int = 50 + """ + Calculate entropy using a sliding window of this length. + """ + + entropyk: int = 5 + """ + Calculate entropy using kmers of this length. + """ + + minbasefrequency: float = 0 + """ + Discard reads with a minimum base frequency below this. + """ + + entropytrim: Entropytrim = "f" + """ + Values: + f: (false) Do not entropy-trim. + r: (right) Trim low entropy on the right end only. 
+ l: (left) Trim low entropy on the left end only. + rl: (both) Trim low entropy on both ends. + NOTE: If set, entropytrim overrides entropymask. + """ + + entropymask: Entropymask = "f" + """ + Values: + f: (filter) Discard low-entropy sequences. + t: (true) Mask low-entropy parts of sequences with N. + lc: Change low-entropy parts of sequences to lowercase. + """ + + entropymark: bool = False + """ + Mark each base with its entropy value. + This is on a scale of 0-41 and is reported as quality scores, + so the output should be fastq or fasta+qual. NOTE: If set, entropytrim overrides entropymask. + """ + + cardinality: bool = False + """ + (loglog) Count unique kmers using the LogLog algorithm. + """ + + cardinalityout: bool = False + """ + (loglogout) Count unique kmers in output reads. + """ + + loglogk: Annotated[int, Field(31, gt=0)] + """ + Use this kmer length for counting. + """ + + loglogbuckets: Annotated[int, Field(2048, gt=0)] + """ + Use this many buckets for counting. + """ + + +class AdapterTrimming(SnappyStepModel, ToolsMixin): + path_link_in: str | None = None + """Override data set configuration search paths for FASTQ files""" + + tools: Annotated[list[Tool], EnumField(Tool, min_length=1, default=["bbduk", "fastp"])] + bbduk: Bbduk | None = None + fastp: Fastp | None = None diff --git a/snappy_pipeline/workflows/cbioportal_export/Snakefile b/snappy_pipeline/workflows/cbioportal_export/Snakefile index 66878c620..93fca10c2 100644 --- a/snappy_pipeline/workflows/cbioportal_export/Snakefile +++ b/snappy_pipeline/workflows/cbioportal_export/Snakefile @@ -23,7 +23,7 @@ config, lookup_paths, config_paths = expand_ref("config.yaml", config) wf = cbioportalExportWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) -exclude_flag = wf.w_config["step_config"]["cbioportal_export"]["exclude_variant_with_flag"] +exclude_flag = wf.w_config.step_config["cbioportal_export"].exclude_variant_with_flag # Rules ======================================================================= diff --git a/snappy_pipeline/workflows/cbioportal_export/__init__.py b/snappy_pipeline/workflows/cbioportal_export/__init__.py index 7762e3f4b..cccbca3b5 100644 --- a/snappy_pipeline/workflows/cbioportal_export/__init__.py +++ b/snappy_pipeline/workflows/cbioportal_export/__init__.py @@ -12,11 +12,12 @@ import sys from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background - from snappy_pipeline.base import MissingConfiguration from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, ResourceUsage +from .model import CbioportalExport as CbioportalExportConfigModel + # cbioportal meta data files META_FILES = { "always_present": ["meta_study.txt", "meta_clinical_patient.txt", "meta_clinical_sample.txt"], @@ -72,49 +73,7 @@ } -DEFAULT_CONFIG = r""" -step_config: - cbioportal_export: - # Required for RNA expression - path_ngs_mapping: "" # When missing, no expression data is uploaded to cBioPortal - mapping_tool: "bwa" - expression_tool: "star" - # Required for somatic variants - path_somatic_variant: ../somatic_variant_filtration # REQUIRED (before or after filtration) - somatic_variant_calling_tool: "mutect2" # mutect/scalpel combo unsupported - somatic_variant_annotation_tool: "vep" - filter_set: "" # Set it to an empty value when using annotated variants without filtration. - # When using filters, there are two possibilities: - # - the old implementation, using filter_sets. 
- # In that case, choose one of the filters: - # * no_filter - # * dkfz_only - # * dkfz_and_ebfilter - # * dkfz_and_ebfilter_and_oxog (that should be reserved for datasets with oxo-G artifacts) - # - the new implementation, using filter_list. - # In that case, the value must be "filter_list" - exon_list: "genome_wide" # Works together with filter_set, ignored when "filter_list" is selected - exclude_variant_with_flag: "" - # Required for Copy Number Alterations - path_copy_number: "" # When missing, no CNV data uploaded to portal. Access WES & WGS steps - copy_number_tool: cnvkit # Control_FREEC is currently unsupported, CopywriteR is not maintained - # Required for MAF &/or cBioPortal - path_gene_id_mappings: REQUIRED # Mapping from pipeline gene ids to cBioPortal ids (HGNC symbols from GeneNexus) - vcf2maf: - Center: BIH - ncbi_build: GRCh37 - # Description of dataset in cBioPortal - study: - type_of_cancer: REQUIRED # see http://oncotree.mskcc.org/#/home - cancer_study_id: REQUIRED # Usually: __ - study_description: REQUIRED # REQUIRED - study_name: REQUIRED # REQUIRED - study_name_short: REQUIRED # REQUIRED - patient_info: {} # Unimplemented - sample_info: {} # Each additional sample column must have a name and a (possibly empty) config attached. - # tumor_mutational_burden: - # path: ../tumor_mutational_burden -""" +DEFAULT_CONFIG = CbioportalExportConfigModel.default_config_yaml_string() # ================================================================================================ @@ -220,8 +179,8 @@ class cbioportalVcf2MafStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) self.name_pattern = None - if self.config["filter_set"]: - if self.config["filter_set"] == "filter_list": + if self.config.filter_set: + if self.config.filter_set == "filter_list": self.name_pattern = "{mapper}.{caller}.{annotator}.filtered.{tumor_library}" else: self.name_pattern = ( @@ -302,7 +261,7 @@ def _get_normal_bio_sample(self, wildcards): pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library] return pair.normal_sample.dna_ngs_library.test_sample.bio_sample.name - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -336,8 +295,8 @@ class cbioportalMutationsStepPart(cbioportalExportStepPart): def __init__(self, parent): super().__init__(parent) - if self.config["filter_set"]: - if self.config["filter_set"] == "filter_list": + if self.config.filter_set: + if self.config.filter_set == "filter_list": name_pattern = "{mapper}.{caller}.{annotator}.filtered.{{library_name}}" else: name_pattern = ( @@ -349,11 +308,11 @@ def __init__(self, parent): name_pattern = "{mapper}.{caller}.{annotator}.{{library_name}}" tpl = os.path.join("work/maf", name_pattern, "out", name_pattern + "{ext}") self.input_tpl = tpl.format( - mapper=self.config["mapping_tool"], - caller=self.config["somatic_variant_calling_tool"], - annotator=self.config["somatic_variant_annotation_tool"], - filter_set=self.config["filter_set"], - exon_list=self.config["exon_list"], + mapper=self.config.mapping_tool, + caller=self.config.somatic_variant_calling_tool, + annotator=self.config.somatic_variant_annotation_tool, + filter_set=self.config.filter_set, + exon_list=self.config.exon_list, ext=".maf", ) @@ -373,12 +332,15 @@ def get_input_files(self, action): # Validate action self._validate_action(action) name_pattern = "{mapper}.{caller}.{tumor_library}" - yield "DNAcopy", os.path.join( - self.config["path_copy_number"], - "output", - name_pattern, - "out", - name_pattern + "_dnacopy.seg", + yield ( + "DNAcopy", + os.path.join( + self.config.path_copy_number, + "output", + name_pattern, + "out", + name_pattern + "_dnacopy.seg", + ), ) @dictify @@ -410,10 +372,10 @@ def get_args(self, action): self._validate_action(action) return { "pipeline_id": "ENSEMBL", - "features": self.parent.w_config["static_data_config"]["features"]["path"], + "features": self.parent.w_config.static_data_config.features.path, } - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -445,7 +407,7 @@ class cbioportalCnaFilesStepPart(cbioportalExportStepPart): def __init__(self, parent): super().__init__(parent) name_pattern = ( - self.config["mapping_tool"] + "." + self.config["copy_number_tool"] + ".{library_name}" + self.config.mapping_tool + "." + self.config.copy_number_tool + ".{library_name}" ) self.input_tpl = os.path.join("work/cna", name_pattern, "out", name_pattern + ".cna") @@ -468,7 +430,7 @@ def get_output_files(self, action): self._validate_action(action) return "work/upload/data_cna_{action}.txt".format(action=action) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -486,7 +448,6 @@ def get_resource_usage(self, action): class cbioportalSegmentStepPart(cbioportalExportStepPart): - #: Step name name = "cbioportal_segment" @@ -502,17 +463,17 @@ class cbioportalSegmentStepPart(cbioportalExportStepPart): def __init__(self, parent): super().__init__(parent) name_pattern = ( - self.config["mapping_tool"] + "." + self.config["copy_number_tool"] + ".{library_name}" + self.config.mapping_tool + "." 
+ self.config.copy_number_tool + ".{library_name}" ) self.input_tpl = os.path.join( - self.config["path_copy_number"], + self.config.path_copy_number, "output", name_pattern, "out", name_pattern + "_dnacopy.seg", ) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -546,9 +507,9 @@ class cbioportalExpressionStepPart(cbioportalExportStepPart): def __init__(self, parent): super().__init__(parent) - name_pattern = self.config["expression_tool"] + ".{library_name}" + name_pattern = self.config.expression_tool + ".{library_name}" self.input_tpl = os.path.join( - self.config["path_ngs_mapping"], + self.config.path_ngs_mapping, "output", name_pattern, "out", @@ -562,11 +523,11 @@ def get_args(self, action): "action_type": "expression", "extra_args": { "pipeline_id": "ENSEMBL", - "tx_obj": self.parent.w_config["static_data_config"]["features"]["path"], + "tx_obj": self.parent.w_config.static_data_config.features.path, }, } - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -596,13 +557,13 @@ def get_output_files(self, action): # Validate action self._validate_action(action) yield from [os.path.join("work/upload", f) for f in META_FILES["always_present"]] - if self.config["path_somatic_variant"]: + if self.config.path_somatic_variant: yield from [os.path.join("work/upload", f) for f in META_FILES["sequenced"]] - if self.config["path_copy_number"]: + if self.config.path_copy_number: yield from [os.path.join("work/upload", f) for f in META_FILES["cna"]] - if self.config["study"]["reference_genome"] == "hg19": + if self.config.study.reference_genome == "hg19": yield from [os.path.join("work/upload", f) for f in META_FILES["segment"]] - if self.config["path_ngs_mapping"]: + if self.config.path_ngs_mapping: yield from [os.path.join("work/upload", f) for f in META_FILES["rna_seq_mrna"]] @@ -622,11 +583,11 @@ def get_args(self, action): for extraction_type in ("DNA", "RNA"): if ( extraction_type == "DNA" - and self.config["path_somatic_variant"] == "" - and self.config["path_copy_number"] == "" + and self.config.path_somatic_variant == "" + and self.config.path_copy_number == "" ): continue - if extraction_type == "RNA" and self.config["path_ngs_mapping"] == "": + if extraction_type == "RNA" and self.config.path_ngs_mapping == "": continue self.extraction_type = extraction_type for lib in self._yield_libraries(): @@ -666,13 +627,13 @@ def get_args(self, action): self.extraction_type = "DNA" for lib in self._yield_libraries(): sample_name = lib.test_sample.bio_sample.name - if self.config["path_somatic_variant"]: + if self.config.path_somatic_variant: samples["sequenced"] += [sample_name] - if self.config["path_copy_number"]: + if self.config.path_copy_number: samples["cna"] += [sample_name] - if self.config["path_somatic_variant"] and self.config["path_copy_number"]: + if self.config.path_somatic_variant and self.config.path_copy_number: samples["cnaseq"] += [sample_name] - if self.config["path_ngs_mapping"]: + if self.config.path_ngs_mapping: self.extraction_type = "RNA" for lib in self._yield_libraries(): sample_name = lib.test_sample.bio_sample.name @@ -683,9 +644,9 @@ def get_args(self, action): args[k] = CASE_LIST_FILES[k] args[k]["samples"] = v if ( - self.config["path_somatic_variant"] - and 
self.config["path_copy_number"] - and self.config["path_ngs_mapping"] + self.config.path_somatic_variant + and self.config.path_copy_number + and self.config.path_ngs_mapping ): args["3way_complete"] = CASE_LIST_FILES["3way_complete"] args["3way_complete"]["samples"] = [] @@ -699,16 +660,16 @@ def get_output_files(self, action): # Validate action self._validate_action(action) case_lists = {} - if self.config["path_somatic_variant"]: + if self.config.path_somatic_variant: case_lists["sequenced"] = CASE_LIST_FILES["sequenced"]["filename"] - if self.config["path_copy_number"]: + if self.config.path_copy_number: case_lists["cna"] = CASE_LIST_FILES["cna"]["filename"] - if self.config["path_ngs_mapping"]: + if self.config.path_ngs_mapping: case_lists["rna_seq_mrna"] = CASE_LIST_FILES["rna_seq_mrna"]["filename"] - if self.config["path_somatic_variant"]: - if self.config["path_copy_number"]: + if self.config.path_somatic_variant: + if self.config.path_copy_number: case_lists["cnaseq"] = CASE_LIST_FILES["cnaseq"]["filename"] - if self.config["path_ngs_mapping"]: + if self.config.path_ngs_mapping: case_lists["3way_complete"] = CASE_LIST_FILES["3way_complete"]["filename"] for case, filename in case_lists.items(): yield case, os.path.join("work/upload/case_lists", filename) @@ -746,6 +707,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, + config_model_class=CbioportalExportConfigModel, ) # cBioPortal requires the genome release as GRC[hm]3[78] in the MAF file @@ -754,13 +716,13 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) # For the user's convenience, the configuration is augmented automatically, # before the sub steps are registered, so they all have the updated config. 
translated = "unknown" - if self.config["vcf2maf"]["ncbi_build"] == "GRCh37": + if self.config.vcf2maf.ncbi_build == "GRCh37": translated = "hg19" - if self.config["vcf2maf"]["ncbi_build"] == "GRCh38": + if self.config.vcf2maf.ncbi_build == "GRCh38": translated = "hg38" - if self.config["vcf2maf"]["ncbi_build"] in ("mm9", "mm10", "GRCm37", "GRCm38", "GRCm39"): + if self.config.vcf2maf.ncbi_build in ("mm9", "mm10", "GRCm37", "GRCm38", "GRCm39"): translated = "mouse" - self.config["study"]["reference_genome"] = translated + self.config.study.reference_genome = translated # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -777,21 +739,21 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Initialize sub-workflows - if self.config["path_somatic_variant"]: - if self.config["filter_set"]: + if self.config.path_somatic_variant: + if self.config.filter_set: self.register_sub_workflow( "somatic_variant_filtration", - self.config["path_somatic_variant"], + self.config.path_somatic_variant, sub_workflow_name="somatic_variant", ) else: self.register_sub_workflow( "somatic_variant_annotation", - self.config["path_somatic_variant"], + self.config.path_somatic_variant, sub_workflow_name="somatic_variant", ) - if self.config["path_copy_number"]: - if self.config["copy_number_tool"] in [ + if self.config.path_copy_number: + if self.config.copy_number_tool in [ "cnvetti_on_target_postprocess", "cnvkit", "copywriter", @@ -800,19 +762,19 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ]: self.register_sub_workflow( "somatic_targeted_seq_cnv_calling", - workdir=self.config["path_copy_number"], + workdir=self.config.path_copy_number, sub_workflow_name="copy_number_step", ) else: self.register_sub_workflow( "somatic_wgs_cnv_calling", - workdir=self.config["path_copy_number"], + workdir=self.config.path_copy_number, sub_workflow_name="copy_number_step", ) - if self.config["path_ngs_mapping"]: + if self.config.path_ngs_mapping: self.register_sub_workflow( "ngs_mapping", - workdir=self.config["path_ngs_mapping"], + workdir=self.config.path_ngs_mapping, sub_workflow_name="ngs_mapping", ) @@ -823,14 +785,14 @@ def get_result_files(self): yield from self.sub_steps["cbioportal_case_lists"].get_output_files("run").values() result_files = [] - if self.config["path_somatic_variant"]: + if self.config.path_somatic_variant: result_files += [self.sub_steps["cbioportal_mutations"].get_output_files("run")] - if self.config["path_copy_number"]: + if self.config.path_copy_number: result_files += [self.sub_steps["cbioportal_cna"].get_output_files("log2")] result_files += [self.sub_steps["cbioportal_cna"].get_output_files("gistic")] - if self.config["study"]["reference_genome"] == "hg19": + if self.config.study.reference_genome == "hg19": result_files += [self.sub_steps["cbioportal_segment"].get_output_files("run")] - if self.config["path_ngs_mapping"]: + if self.config.path_ngs_mapping: result_files += [self.sub_steps["cbioportal_expression"].get_output_files("run")] yield from result_files @@ -838,21 +800,21 @@ def get_result_files(self): def check_config(self): """Check config attributes for presence""" msg = [] - if self.config["path_somatic_variant"]: - if not self.config["mapping_tool"]: + if self.config.path_somatic_variant: + if not self.config.mapping_tool: msg += ["DNA mapping tool must be defined"] if ( - not self.config["somatic_variant_calling_tool"] - or not 
self.config["somatic_variant_annotation_tool"] + not self.config.somatic_variant_calling_tool + or not self.config.somatic_variant_annotation_tool ): msg += [ "Somatic variant calling tool and somatic variant annotation tool must be defined" ] - if self.config["path_copy_number"] and not self.config["copy_number_tool"]: + if self.config.path_copy_number and not self.config.copy_number_tool: msg += [ "Somatic copy number calling tool must be defined when CNV results are available" ] - if self.config["path_ngs_mapping"] and not self.config["expression_tool"]: + if self.config.path_ngs_mapping and not self.config.expression_tool: msg += ["Gene count tools must be defined when RNA expression is available"] if len(msg) > 0: raise MissingConfiguration() # (msg="Please select a supported tool for the CNV calls") diff --git a/snappy_pipeline/workflows/cbioportal_export/model.py b/snappy_pipeline/workflows/cbioportal_export/model.py new file mode 100644 index 000000000..8bcfe102b --- /dev/null +++ b/snappy_pipeline/workflows/cbioportal_export/model.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import enum +from typing import Annotated, TypedDict + +from pydantic import ConfigDict, Field, model_validator + +from snappy_pipeline.models import SnappyModel, SnappyStepModel + + +class MappingTool(enum.StrEnum): + BWA = "bwa" + + +class ExpressionTool(enum.StrEnum): + STAR = "star" + + +class SomaticVariantCallingTool(enum.StrEnum): + MUTECT2 = "mutect2" + + +class SomaticVariantAnnotationTool(enum.StrEnum): + VEP = "vep" + + +class FilterSet(enum.StrEnum): + NO_FILTER = "no_filter" + DKFZ_ONLY = "dkfz_only" + DKFZ_AND_EBFILTER = "dkfz_and_ebfilter" + DKFZ_AND_EBFILTER_AND_OXOG = "dkfz_and_ebfilter_and_oxog" + + +class Filter(SnappyModel): + pass + + +class DkfzFilter(Filter): + pass + + +class EbFilter(Filter): + ebfilter_threshold: float = 2.4 + shuffle_seed: int = 1 + panel_of_normals_size: int = 25 + min_mapq: int = 20 + min_baseq: int = 15 + + +class Bcftools(Filter): + include: str = "" + exclude: str = "" + + +class Regions(Filter): + path_bed: str = "" + + +class Protected(Filter): + path_bed: str = "" + + +class CopyNumberTool(enum.StrEnum): + CNVKIT = "cnvkit" + + CONTROL_FREEC = "Control_FREEC" + """unsupported""" + + COPYWRITER = "CopywriteR" + """unmaintained""" + + +class NcbiBuild(enum.StrEnum): + GRCh37 = "GRCh37" + GRCh38 = "GRCh38" + + +class Vcf2Maf(SnappyModel): + Center: str + ncbi_build: NcbiBuild + + +class GenomeName(enum.StrEnum): + grch37 = "grch37" + grch38 = "grch38" + hg19 = "hg19" + mouse = "mouse" + + +class Study(SnappyModel): + type_of_cancer: str + """ + see http://oncotree.mskcc.org/#/home + see also `curl https://oncotree.mskcc.org:443/api/tumorTypes | jq ".[].code"` + """ + cancer_study_id: str + """Usually: __""" + study_description: str + study_name: str + study_name_short: str + reference_genome: GenomeName + + +class ExtraInfos(TypedDict): + name: str + description: str + datatype: str + priority: str + column: str + + +class CbioportalExport(SnappyStepModel): + model_config = ConfigDict( + extra="forbid", + ) + + path_ngs_mapping: str | None = None + """When missing, no expression data is uploaded to cBioPortal""" + + mapping_tool: MappingTool = MappingTool.BWA + + expression_tool: ExpressionTool = ExpressionTool.STAR + + path_somatic_variant: str + """REQUIRED (before or after filtration)""" + + somatic_variant_calling_tool: SomaticVariantCallingTool = SomaticVariantCallingTool.MUTECT2 + """mutect/scalpel combo unsupported""" + + 
somatic_variant_annotation_tool: SomaticVariantAnnotationTool = SomaticVariantAnnotationTool.VEP
+
+    filter_set: Annotated[FilterSet | None, Field(None, deprecated="use `filter_list` instead")]
+    """
+    DEPRECATED: use `filter_list` instead.
+    Set it to an empty value when using annotated variants without filtration.
+    """
+
+    exon_list: Annotated[
+        str | None,
+        Field(
+            "genome_wide",
+            deprecated="Works together with filter_set, ignored when `filter_list` is selected",
+        ),
+    ]
+    """
+    DEPRECATED.
+    Works together with filter_set, ignored when "filter_list" is selected
+    """
+
+    filter_list: list[Filter] = []
+
+    exclude_variant_with_flag: str | None = None
+
+    path_copy_number: str | None = None
+    """
+    Required for Copy Number Alterations.
+    When missing, no CNV data uploaded to portal. Access WES & WGS steps
+    """
+
+    copy_number_tool: CopyNumberTool = CopyNumberTool.CNVKIT
+
+    path_gene_id_mappings: str
+    """Mapping from pipeline gene ids to cBioPortal ids (HGNC symbols from GeneNexus)"""
+
+    vcf2maf: Vcf2Maf
+
+    study: Study
+
+    patient_info: None = None
+    """unimplemented"""
+
+    sample_info: dict[str, ExtraInfos] = Field(
+        {},
+        examples=[
+            {
+                "tumor_mutational_burden": dict(
+                    name="TMB",
+                    description="Tumor mutational burden computed on CDS regions",
+                    datatype="NUMBER",
+                    priority="2",
+                    column="TMB",
+                )
+            }
+        ],
+    )
+    """Each additional sample column must have a name and a (possibly empty) config attached."""
+
+    @model_validator(mode="after")
+    def ensure_tools_are_configured(self):
+        if self.path_somatic_variant and not self.mapping_tool:
+            raise ValueError("Mapping tool must be set when path_somatic_variant is set")
+        if not self.somatic_variant_calling_tool or not self.somatic_variant_annotation_tool:
+            raise ValueError("Somatic variant calling or annotation tools must be set")
+        if self.path_copy_number and not self.copy_number_tool:
+            raise ValueError("Copy number tool must be set when path_copy_number is set")
+        if self.path_ngs_mapping and not self.expression_tool:
+            raise ValueError("Expression tool must be set when path_ngs_mapping is set")
+        return self
diff --git a/snappy_pipeline/workflows/common/gcnv/gcnv_build_model.py b/snappy_pipeline/workflows/common/gcnv/gcnv_build_model.py
index 65d08e1ef..b383672b4 100644
--- a/snappy_pipeline/workflows/common/gcnv/gcnv_build_model.py
+++ b/snappy_pipeline/workflows/common/gcnv/gcnv_build_model.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Implementation of the gCNV COHORT mode methods - used to build models.
-"""
+"""Implementation of the gCNV COHORT mode methods - used to build models."""
 
 from snakemake.io import expand, touch
 
@@ -201,6 +200,6 @@ def get_result_files(self):
         for path_tpl in result_path_tpls:
             yield from expand(
                 path_tpl,
-                mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
+                mapper=self.w_config.step_config["ngs_mapping"].tools.dna,
                 library_name=library_names,
             )
diff --git a/snappy_pipeline/workflows/common/gcnv/gcnv_common.py b/snappy_pipeline/workflows/common/gcnv/gcnv_common.py
index 7c957bcff..225ac2855 100644
--- a/snappy_pipeline/workflows/common/gcnv/gcnv_common.py
+++ b/snappy_pipeline/workflows/common/gcnv/gcnv_common.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Implementation of the gCNV common methods.
-""" +"""Implementation of the gCNV common methods.""" from collections import OrderedDict @@ -181,7 +180,7 @@ def get_log_file(self, action): self._validate_action(action) return getattr(self, f"_get_log_file_{action}")() - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. diff --git a/snappy_pipeline/workflows/common/gcnv/gcnv_run.py b/snappy_pipeline/workflows/common/gcnv/gcnv_run.py index 6c6654a10..ab2c1d594 100644 --- a/snappy_pipeline/workflows/common/gcnv/gcnv_run.py +++ b/snappy_pipeline/workflows/common/gcnv/gcnv_run.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Implementation of the gCNV CASE mode run methods. -""" +"""Implementation of the gCNV CASE mode run methods.""" from glob import glob from itertools import chain @@ -94,20 +93,17 @@ def validate_request(self): :raises InvalidConfiguration: if information provided in configuration isn't enough to run the analysis. """ - if "gcnv" not in self.config["tools"]: + if "gcnv" not in self.config.tools: return # Get precomputed models from configurations - path_to_models = self.config["gcnv"]["precomputed_model_paths"] + path_to_models = self.config.gcnv.precomputed_model_paths # No model provided if not path_to_models: msg_tpl = "Precomputed model paths must be configured (key: 'precomputed_model_paths')." raise InvalidConfiguration(msg_tpl) else: - # Validate configuration - check if only expected keys are present - self.validate_precomputed_model_paths_config(config=path_to_models) - # Check model directories content for model in path_to_models: # Validate ploidy-model @@ -133,46 +129,6 @@ def validate_request(self): ) raise InvalidConfiguration(msg_tpl.format(str(model))) - def validate_precomputed_model_paths_config(self, config): - """Validate precomputed model config. - - Evaluates if provided configuration has the following format: - - precomputed_model_paths: - - library: "Agilent SureSelect Human All Exon V6" - contig_ploidy": /path/to/ploidy-model - model_pattern: /path/to/model_* - - :param config: List of precomputed model configuration dictionary. - :type config: list - - :raises InvalidConfiguration: if configuration not as expected for - ``precomputed_model_paths`` list. - """ - # Initialise variables - expected_keys = ("library", "model_pattern", "contig_ploidy") - expected_format = ( - '{\n "library": "Agilent SureSelect Human All Exon V6"\n' - ' "contig_ploidy": /path/to/ploidy-model\n' - ' "model_pattern": "/path/to/model_*"\n}' - ) - # Test - for model in config: - # Test keys - n_keys_pass = len(model) == 3 - keys_pass = all(key in expected_keys for key in model) - # Test values - values_pass = all(isinstance(value, str) for value in model.values()) - # Validate - if not (n_keys_pass and keys_pass and values_pass): - pretty_model = self._pretty_print_config(config=model) - msg = ( - "Provided configuration not as expected...\n" - f"\nn_keys_pass={n_keys_pass}, keys_pass={keys_pass}, values_pass={values_pass}\n" - f"Expected:\n{expected_format}\nObserved:\n{pretty_model}\n" - ) - raise InvalidConfiguration(msg) - def _pretty_print_config(self, config): """Pretty format configuration. @@ -284,7 +240,7 @@ def _get_params_contig_ploidy(self, wildcards: Wildcards): with `DetermineGermlineContigPloidy`. 
""" path = "__no_ploidy_model_for_library_in_config__" - for model in self.config["gcnv"]["precomputed_model_paths"]: + for model in self.config.gcnv.precomputed_model_paths: # Adjust library kit name from config to wildcard library_to_wildcard = model.get("library").strip().replace(" ", "_") if library_to_wildcard == wildcards.library_kit: @@ -340,7 +296,7 @@ def _get_params_call_cnvs(self, wildcards): `GermlineCNVCaller`. """ path = "__no_model_for_library_in_config__" - for model in self.config["gcnv"]["precomputed_model_paths"]: + for model in self.config.gcnv.precomputed_model_paths: # Adjust library kit name from config to wildcard library_to_wildcard = model.get("library").strip().replace(" ", "_") if library_to_wildcard == wildcards.library_kit: @@ -377,7 +333,7 @@ def _get_input_files_post_germline_calls(self, wildcards): # Get shards - based on scattered step model_dir_dict = None - for model in self.config["gcnv"]["precomputed_model_paths"]: + for model in self.config.gcnv.precomputed_model_paths: # Adjust library name to wildcard library_to_wildcard = model.get("library").strip().replace(" ", "_") if library_to_wildcard == library_kit: @@ -392,10 +348,13 @@ def _get_input_files_post_germline_calls(self, wildcards): # Yield cnv calls output name_pattern = f"{wildcards.mapper}.gcnv_call_cnvs.{library_kit}" - yield "calls", [ - f"work/{name_pattern}.{shard}/out/{name_pattern}.{shard}/.done" - for shard in model_dir_dict - ] + yield ( + "calls", + [ + f"work/{name_pattern}.{shard}/out/{name_pattern}.{shard}/.done" + for shard in model_dir_dict + ], + ) # Yield contig-ploidy output ext = "ploidy" @@ -415,7 +374,7 @@ def _get_params_post_germline_calls(self, wildcards): `PostprocessGermlineCNVCalls `. """ paths = ["__no_model_available_for_library__"] - for model in self.config["gcnv"]["precomputed_model_paths"]: + for model in self.config.gcnv.precomputed_model_paths: # Adjust library kit name from config to wildcard library_to_wildcard = model.get("library").strip().replace(" ", "_") # Get library kit associated with library name @@ -482,12 +441,15 @@ def _get_output_files_merge_multikit_families(self): for key, suffix in RESULT_EXTENSIONS.items(): work_files[key] = f"work/{name_pattern}/out/{name_pattern}{suffix}" yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain( - work_files.values(), self.get_log_file("merge_multikit_families").values() - ) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain( + work_files.values(), self.get_log_file("merge_multikit_families").values() + ) + ], + ) @dictify def _get_log_file_merge_multikit_families(self): @@ -568,8 +530,8 @@ def get_result_files(self): The function will skip pedigrees where samples have inconsistent library kits and print a warning. 
""" - if "gcnv" not in self.config["tools"] and not ( - hasattr(self.config["tools"], "get") and "gcnv" in self.config["tools"].get("dna", {}) + if "gcnv" not in self.config.tools and not ( + hasattr(self.config.tools, "dna") and "gcnv" in self.config.tools.dna ): return @@ -600,6 +562,6 @@ def get_result_files(self): for path_tpl in result_path_tpls: yield from expand( path_tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, library_name=[index_library_name], ) diff --git a/snappy_pipeline/workflows/common/manta.py b/snappy_pipeline/workflows/common/manta.py index df5c7cfb2..62240e012 100644 --- a/snappy_pipeline/workflows/common/manta.py +++ b/snappy_pipeline/workflows/common/manta.py @@ -30,7 +30,7 @@ class MantaStepPart( name = "manta" actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -44,7 +44,7 @@ def get_resource_usage(self, action): actions_str = ", ".join(self.actions) error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" raise UnsupportedActionException(error_message) - num_threads = self.config["manta"]["num_threads"] + num_threads = self.config.manta.num_threads return ResourceUsage( threads=num_threads, time="7-00:00:00", # 3 days diff --git a/snappy_pipeline/workflows/common/melt.py b/snappy_pipeline/workflows/common/melt.py index 242336b61..66ff833b2 100644 --- a/snappy_pipeline/workflows/common/melt.py +++ b/snappy_pipeline/workflows/common/melt.py @@ -2,9 +2,9 @@ import re import typing -from biomedsheets.shortcuts import is_not_background from snakemake.io import touch +from biomedsheets.shortcuts import is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStepPart, ResourceUsage from snappy_pipeline.workflows.abstract.common import ( @@ -202,7 +202,7 @@ def _get_output_files_make_vcf(self): @dictify def _get_input_files_merge_vcf(self, wildcards): vcfs = [] - for me_type in self.config["melt"]["me_types"]: + for me_type in self.config.melt.me_types: infix = f"{wildcards.mapper}.melt_make_vcf.{wildcards.library_name}.{me_type}" vcfs.append(f"work/{infix}/out/{infix}.final_comp.vcf.gz") yield "vcf", vcfs @@ -217,10 +217,13 @@ def _get_output_files_merge_vcf(self): "vcf_tbi_md5": f"work/{infix}/out/{infix}.vcf.gz.tbi.md5", } yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("merge_vcf").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_files.values(), self.get_log_file("merge_vcf").values()) + ], + ) @dictify def _get_log_file_merge_vcf(self): diff --git a/snappy_pipeline/workflows/common/sv_calling.py b/snappy_pipeline/workflows/common/sv_calling.py index 6ff51b2b7..22314a404 100644 --- a/snappy_pipeline/workflows/common/sv_calling.py +++ b/snappy_pipeline/workflows/common/sv_calling.py @@ -4,7 +4,7 @@ from snakemake.io import expand -from snappy_pipeline.utils import DictQuery, dictify, flatten, listify +from snappy_pipeline.utils import dictify, flatten, listify class SvCallingGetResultFilesMixin: @@ -17,14 +17,13 @@ def get_result_files(self): The implementation will return a list of all paths with prefix ``output/` that are returned by 
``self.get_output_files()`` for all actions in ``self.actions``. """ - if self.name not in self.config["tools"] and not ( - hasattr(self.config["tools"], "get") - and self.name in self.config["tools"].get("dna", {}) + if self.name not in self.config.tools and not ( + hasattr(self.config.tools, "dna") and self.name not in self.config.tools.dna ): return # tool not enabled, no result files - ngs_mapping_config = DictQuery(self.w_config).get("step_config/ngs_mapping") - for mapper in ngs_mapping_config["tools"]["dna"]: + ngs_mapping_config = self.w_config.step_config["ngs_mapping"] + for mapper in ngs_mapping_config.tools.dna: # Get list of result path templates. output_files_tmp = self.get_output_files(self.actions[-1]) if isinstance(output_files_tmp, dict): @@ -40,8 +39,9 @@ def get_result_files(self): #: Generate all concrete output paths. for path_tpl in result_paths_tpls: for library_name in self.index_ngs_library_to_pedigree.keys(): - if library_name not in self.config[self.name].get("skip_libraries", []): - yield from expand(path_tpl, mapper=[mapper], library_name=library_name) + if cfg := self.config.get(self.name): + if library_name not in cfg.skip_libraries: + yield from expand(path_tpl, mapper=[mapper], library_name=library_name) class SvCallingGetLogFileMixin: diff --git a/snappy_pipeline/workflows/gene_expression_quantification/__init__.py b/snappy_pipeline/workflows/gene_expression_quantification/__init__.py index 42fef2609..1b36d5432 100644 --- a/snappy_pipeline/workflows/gene_expression_quantification/__init__.py +++ b/snappy_pipeline/workflows/gene_expression_quantification/__init__.py @@ -49,9 +49,9 @@ import os -from biomedsheets.shortcuts import GenericSampleSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import GenericSampleSheet, is_not_background from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -65,6 +65,8 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import GeneExpressionQuantification as GeneExpressionQuantificationConfigModel + # Extensions EXTENSIONS = { "featurecounts": { @@ -112,30 +114,7 @@ "salmon": {"transcript_sf": ".transcript.sf", "transcript_sf_md5": ".transcript.sf.md5"}, } -DEFAULT_CONFIG = r""" -step_config: - gene_expression_quantification: - path_link_in: "" # OPTIONAL Override data set configuration search paths for FASTQ files - tools: [strandedness, featurecounts, duplication, dupradar, rnaseqc, stats, salmon] # REQUIRED - path_ngs_mapping: ../ngs_mapping # REQUIRED - strand: -1 # Use 0, 1 or 2 to force unstranded, forward or reverse strand - featurecounts: - path_annotation_gtf: REQUIRED # REQUIRED - strandedness: - # needs column 6 with strand info, e.g. 
CCDS/15/GRCh37/CCDS.bed - path_exon_bed: REQUIRED # REQUIRED - threshold: 0.85 - rnaseqc: - rnaseqc_path_annotation_gtf: REQUIRED # REQUIRED - dupradar: - dupradar_path_annotation_gtf: REQUIRED # REQUIRED - num_threads: 8 - salmon: - path_transcript_to_gene: REQUIRED # REQUIRED - path_index: REQUIRED # REQUIRED - salmon_params: " --gcBias --validateMappings" - num_threads: 16 -""".lstrip() +DEFAULT_CONFIG = GeneExpressionQuantificationConfigModel.default_config_yaml_string() class SalmonStepPart(BaseStepPart): @@ -153,8 +132,8 @@ def __init__(self, parent): self.base_path_out = "work/salmon.{{library_name}}/out/salmon.{{library_name}}{ext}" self.extensions = EXTENSIONS["salmon"] if ( - self.config["salmon"]["path_transcript_to_gene"] is not None - and self.config["salmon"]["path_transcript_to_gene"] != "" + self.config.salmon.path_transcript_to_gene is not None + and self.config.salmon.path_transcript_to_gene != "" ): self.extensions["gene_sf"] = ".gene.sf" self.extensions["gene_sf_md5"] = ".gene.sf.md5" @@ -162,7 +141,7 @@ def __init__(self, parent): self.parent.work_dir, self.parent.data_set_infos, self.parent.config_lookup_paths, - preprocessed_path=self.config["path_link_in"], + preprocessed_path=self.config.path_link_in, ) @classmethod @@ -220,13 +199,13 @@ def _collect_reads(self, wildcards, library_name, prefix): Yields paths to right reads if prefix=='right-' """ folder_name = get_ngs_library_folder_name(self.parent.sheets, wildcards.library_name) - if self.config["path_link_in"]: + if self.config.path_link_in: folder_name = library_name pattern_set_keys = ("right",) if prefix.startswith("right-") else ("left",) for _, path_infix, filename in self.path_gen.run(folder_name, pattern_set_keys): yield os.path.join(self.base_path_in, path_infix, filename).format(**wildcards) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -308,7 +287,7 @@ class FeatureCountsStepPart(GeneExpressionQuantificationStepPart): #: Class available actions actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -338,7 +317,7 @@ class StrandednessStepPart(GeneExpressionQuantificationStepPart): #: Class available actions actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -364,14 +343,13 @@ def get_strandedness_file(self, action): class QCStepPartDuplication(GeneExpressionQuantificationStepPart): - #: Step name name = "duplication" #: Class available actions actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -389,14 +367,13 @@ def get_resource_usage(self, action): class QCStepPartDupradar(GeneExpressionQuantificationStepPart): - #: Step name name = "dupradar" #: Class available actions actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -414,14 +391,13 @@ def get_resource_usage(self, action): class QCStepPartRnaseqc(GeneExpressionQuantificationStepPart): - #: Step name name = "rnaseqc" #: Class available actions actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -439,14 +415,13 @@ def get_resource_usage(self, action): class QCStepPartStats(GeneExpressionQuantificationStepPart): - #: Step name name = "stats" #: Class available actions actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -484,7 +459,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=GeneExpressionQuantificationConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -501,7 +477,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) def get_strandedness_file(self, action): _ = action @@ -518,16 +494,16 @@ def get_result_files(self): # Salmon special case salmon_name_pattern = "salmon.{ngs_library.name}" salmon_exts = EXTENSIONS["salmon"] - if self.w_config["step_config"]["gene_expression_quantification"]["salmon"][ - "path_transcript_to_gene" - ]: + if self.w_config.step_config[ + "gene_expression_quantification" + ].salmon.path_transcript_to_gene: salmon_exts["gene_sf"] = ".gene.sf" salmon_exts["gene_sf_md5"] = ".gene.sf.md5" # TODO: too many ifs, use shortcut? 
# if fixed, please do the same for somatic_gene_fusion_calling all_fns = [] - for tool in self.config["tools"]: + for tool in self.config.tools: for sheet in filter(is_not_background, self.shortcut_sheets): for ngs_library in sheet.all_ngs_libraries: extraction_type = ngs_library.test_sample.extra_infos.get( @@ -550,7 +526,7 @@ def get_result_files(self): fns = expand( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), ngs_library=ngs_library, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.rna, # tool=set(self.config['tools']), tool=tool, ext=EXTENSIONS[tool].values(), @@ -558,45 +534,3 @@ def get_result_files(self): all_fns.extend(fns) return all_fns - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "gene_expression_quantification", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for gene expression quantification", - ) - self.ensure_w_config( - ( - "step_config", - "gene_expression_quantification", - "featurecounts", - "path_annotation_gtf", - ), - "Path to gtf file with annotations required for featurecounts", - ) - self.ensure_w_config( - ("step_config", "gene_expression_quantification", "strandedness", "path_exon_bed"), - "Path to bed file with exon regions required for RSeQC", - ) - self.ensure_w_config( - ( - "step_config", - "gene_expression_quantification", - "rnaseqc", - "rnaseqc_path_annotation_gtf", - ), - "Path to gtf file with annotations required for RNA-SeQC", - ) - self.ensure_w_config( - ( - "step_config", - "gene_expression_quantification", - "dupradar", - "dupradar_path_annotation_gtf", - ), - "Path to gtf file with annotations required for dupradar", - ) - self.ensure_w_config( - ("step_config", "gene_expression_quantification", "salmon", "path_index"), - "Path to directory containing salmon index files", - ) diff --git a/snappy_pipeline/workflows/gene_expression_quantification/model.py b/snappy_pipeline/workflows/gene_expression_quantification/model.py new file mode 100644 index 000000000..cc385687c --- /dev/null +++ b/snappy_pipeline/workflows/gene_expression_quantification/model.py @@ -0,0 +1,83 @@ +import enum +from typing import Annotated + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Strand(enum.IntEnum): + unstranded = 0 + forward = 1 + reverse = 2 + + +class Featurecounts(SnappyModel): + path_annotation_gtf: str + + +class Strandedness(SnappyModel): + path_exon_bed: str + """needs column 6 with strand info, e.g. 
CCDS/15/GRCh37/CCDS.bed""" + + threshold: float = 0.85 + + +class RnaSeqC(SnappyModel): + rnaseqc_path_annotation_gtf: str + + +class DupRadar(SnappyModel): + dupradar_path_annotation_gtf: str + num_threads: int = 8 + + +class Salmon(SnappyModel): + path_transcript_to_gene: str + path_index: str + salmon_params: str = " --gcBias --validateMappings" + num_threads: int = 16 + + +class Duplication(SnappyModel): + pass + + +class Stats(SnappyModel): + pass + + +class Tool(enum.Enum): + strandedness = "strandedness" + featurecounts = "featurecounts" + dupradar = "dupradar" + duplication = "duplication" + rnaseqc = "rnaseqc" + salmon = "salmon" + stats = "stats" + + +class GeneExpressionQuantification( + SnappyStepModel, validators.NgsMappingMixin, validators.ToolsMixin +): + path_ngs_mapping: str = "../ngs_mapping" + + path_link_in: str = "" + """OPTIONAL Override data set configuration search paths for FASTQ files""" + + tools: Annotated[list[Tool], EnumField(Tool, min_length=1)] = [Tool.salmon] + + strand: Strand | int = -1 # TODO: what is this default value of -1? + """Use 0, 1 or 2 to force unstranded, forward or reverse strand. Use -1 to guess.""" + + featurecounts: Featurecounts | None = None + + strandedness: Strandedness | None = None + + rnaseqc: RnaSeqC | None = None + + dupradar: DupRadar | None = None + + duplication: Duplication | None = None + + stats: Stats | None = None + + salmon: Salmon | None = None diff --git a/snappy_pipeline/workflows/gene_expression_report/Snakefile b/snappy_pipeline/workflows/gene_expression_report/Snakefile index f3e69df17..304a949ce 100644 --- a/snappy_pipeline/workflows/gene_expression_report/Snakefile +++ b/snappy_pipeline/workflows/gene_expression_report/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.gene_expression_report import GeneExpressionReportWorkflow +from snappy_pipeline.workflows.gene_expression_report import ( + GeneExpressionReportWorkflow, +) __author__ = "Clemens Messerschmidt" diff --git a/snappy_pipeline/workflows/gene_expression_report/__init__.py b/snappy_pipeline/workflows/gene_expression_report/__init__.py index 4ef6eac04..3b284c944 100644 --- a/snappy_pipeline/workflows/gene_expression_report/__init__.py +++ b/snappy_pipeline/workflows/gene_expression_report/__init__.py @@ -1,23 +1,19 @@ # -*- coding: utf-8 -*- -"""Implementation of the ``gene_expression_report`` step - -""" +"""Implementation of the ``gene_expression_report`` step""" from collections import OrderedDict import os -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow -DEFAULT_CONFIG = r""" -step_config: - gene_expression_report: - path_gene_expression_quantification: '' # REQUIRED -""" +from .model import GeneExpressionReport as GeneExpressionReportConfigModel + +DEFAULT_CONFIG = GeneExpressionReportConfigModel.default_config_yaml_string() #: Names of the files to create for the extension (snakemake output) EXT_NAMES = ("tsv",) @@ -164,7 +160,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + 
config_model_class=GeneExpressionReportConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -177,9 +174,9 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Initialize sub-workflows - if self.config["path_gene_expression_quantification"]: + if self.config.path_gene_expression_quantification: self.register_sub_workflow( - "gene_expression_quantification", self.config["path_gene_expression_quantification"] + "gene_expression_quantification", self.config.path_gene_expression_quantification ) @listify @@ -197,11 +194,7 @@ def get_result_files(self): yield from expand( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), ngs_library=ngs_library, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["rna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.rna, tool="featurecounts", ext=exts, ) - - def check_config(self): - """Check config attributes for presence""" - # TODO: verify that `path_gene_expression_quantification` is defined in configuration. diff --git a/snappy_pipeline/workflows/gene_expression_report/model.py b/snappy_pipeline/workflows/gene_expression_report/model.py new file mode 100644 index 000000000..30cbebf31 --- /dev/null +++ b/snappy_pipeline/workflows/gene_expression_report/model.py @@ -0,0 +1,5 @@ +from snappy_pipeline.models import SnappyStepModel + + +class GeneExpressionReport(SnappyStepModel): + path_gene_expression_quantification: str diff --git a/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile b/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile index e3abeffcf..35631d2a6 100644 --- a/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile +++ b/snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile @@ -57,6 +57,8 @@ rule build_gcnv_model_preprocess_intervals: tmpdir=wf.get_resource("gcnv", "preprocess_intervals", "tmpdir"), log: wf.get_log_file("gcnv", "preprocess_intervals"), + params: + step_key="helper_gcnv_model_targeted", wrapper: wf.wrapper_path("gcnv/preprocess_intervals") @@ -125,6 +127,8 @@ rule build_gcnv_model_contig_ploidy: tmpdir=wf.get_resource("gcnv", "contig_ploidy", "tmpdir"), log: wf.get_log_file("gcnv", "contig_ploidy"), + params: + step_key="helper_gcnv_model_targeted", wrapper: wf.wrapper_path("gcnv/contig_ploidy") diff --git a/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py b/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py index ee9e0fa0c..a1e81f720 100644 --- a/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py +++ b/snappy_pipeline/workflows/helper_gcnv_model_targeted/__init__.py @@ -82,35 +82,22 @@ .. 
include:: DEFAULT_CONFIG_helper_gcnv_model_targeted.rst """ + import os import re -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import glob_wildcards -from snappy_pipeline.utils import DictQuery, dictify, listify +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background +from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import HelperGcnvModelTargeted as HelperGcnvModelTargetedConfigModel + #: Default configuration for the helper_gcnv_model_targeted schema -DEFAULT_CONFIG = r""" -# Default configuration helper_gcnv_model_targeted -step_config: - - helper_gcnv_model_targeted: - path_ngs_mapping: ../ngs_mapping # REQUIRED - - gcnv: - path_uniquely_mapable_bed: null # REQUIRED - path to BED file with uniquely mappable regions. - path_target_interval_list_mapping: [] # REQUIRED - define one or more set of target intervals. - # The following will match both the stock IDT library kit and the ones - # with spike-ins seen from Yale genomics. The path above would be - # mapped to the name "default". - # - name: IDT_xGen_V1_0 - # pattern: "xGen Exome Research Panel V1\\.0*" - # path: "path/to/targets.bed" -""" +DEFAULT_CONFIG = HelperGcnvModelTargetedConfigModel.default_config_yaml_string() class BuildGcnvTargetSeqModelStepPart(BuildGcnvModelStepPart): @@ -123,8 +110,8 @@ def __init__(self, parent): @dictify def _build_ngs_library_to_kit(self): - gcnv_config = DictQuery(self.w_config).get("step_config/helper_gcnv_model_targeted/gcnv") - if not gcnv_config["path_target_interval_list_mapping"]: + gcnv_config = self.w_config.step_config["helper_gcnv_model_targeted"].gcnv + if not gcnv_config.path_target_interval_list_mapping: # No mapping given, we will use the "default" one for all. 
for donor in self.parent.all_donors(): if donor.dna_ngs_library: @@ -132,8 +119,7 @@ def _build_ngs_library_to_kit(self): # Build mapping regexes = { - item["pattern"]: item["name"] - for item in gcnv_config["path_target_interval_list_mapping"] + item.pattern: item.name for item in gcnv_config.path_target_interval_list_mapping } result = {} for donor in self.parent.all_donors(): @@ -158,12 +144,15 @@ def _get_input_files_post_germline_calls(self, wildcards, checkpoints): name_pattern = "{mapper}.gcnv_call_cnvs.{library_kit}".format( library_kit=library_kit, **wildcards ) - yield "calls", [ - "work/{name_pattern}.{shard}/out/{name_pattern}.{shard}/.done".format( - name_pattern=name_pattern, shard=shard - ) - for shard in shards - ] + yield ( + "calls", + [ + "work/{name_pattern}.{shard}/out/{name_pattern}.{shard}/.done".format( + name_pattern=name_pattern, shard=shard + ) + for shard in shards + ], + ) ext = "ploidy" name_pattern = "{mapper}.gcnv_contig_ploidy.{library_kit}".format( library_kit=library_kit, **wildcards @@ -192,7 +181,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=HelperGcnvModelTargetedConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -202,7 +192,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Build mapping from NGS DNA library to library kit self.ngs_library_to_kit = self.sub_steps["gcnv"].ngs_library_to_kit @@ -240,10 +230,3 @@ def pick_kits_and_donors(self): if donor.dna_ngs_library and donor.dna_ngs_library.name in self.ngs_library_to_kit ] return list(sorted(set(self.ngs_library_to_kit.values()))), donors, kit_counts - - def check_config(self): - """Check that the necessary configuration is available for the step""" - self.ensure_w_config( - ("step_config", "helper_gcnv_model_targeted", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for gCNV model building.", - ) diff --git a/snappy_pipeline/workflows/helper_gcnv_model_targeted/model.py b/snappy_pipeline/workflows/helper_gcnv_model_targeted/model.py new file mode 100644 index 000000000..541c9f91c --- /dev/null +++ b/snappy_pipeline/workflows/helper_gcnv_model_targeted/model.py @@ -0,0 +1,25 @@ +from snappy_pipeline.models import SnappyModel, SnappyStepModel +from snappy_pipeline.models.gcnv import TargetIntervalEntry + + +class Gcnv(SnappyModel): + path_par_intervals: str = "" + """Path to interval block list with PAR region for contig calling.""" + + path_uniquely_mapable_bed: str + """path to BED file with uniquely mappable regions.""" + + path_target_interval_list_mapping: list[TargetIntervalEntry] + """ + The following allows to define one or more set of target intervals. This is only used by gcnv. 
+ Example: + - name: "Agilent SureSelect Human All Exon V6" + pattern: "Agilent SureSelect Human All Exon V6.*" + path: "path/to/targets.bed" + """ + + +class HelperGcnvModelTargeted(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + gcnv: Gcnv diff --git a/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile b/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile index f73edcf13..ea987ae6e 100644 --- a/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile +++ b/snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.helper_gcnv_model_wgs import HelperBuildWgsGcnvModelWorkflow +from snappy_pipeline.workflows.helper_gcnv_model_wgs import ( + HelperBuildWgsGcnvModelWorkflow, +) # Configuration =============================================================== @@ -53,6 +55,8 @@ rule build_gcnv_model_preprocess_intervals: tmpdir=wf.get_resource("gcnv", "preprocess_intervals", "tmpdir"), log: wf.get_log_file("gcnv", "preprocess_intervals"), + params: + step_key="helper_gcnv_model_wgs", wrapper: wf.wrapper_path("gcnv/preprocess_intervals_wgs") @@ -121,6 +125,8 @@ rule build_gcnv_model_contig_ploidy: tmpdir=wf.get_resource("gcnv", "contig_ploidy", "tmpdir"), log: wf.get_log_file("gcnv", "contig_ploidy"), + params: + step_key="helper_gcnv_model_wgs", wrapper: wf.wrapper_path("gcnv/contig_ploidy") diff --git a/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py b/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py index 3b6ce0d82..922ff64e3 100644 --- a/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py +++ b/snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py @@ -86,25 +86,19 @@ import os import attr -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import glob_wildcards +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from snappy_wrappers.resource_usage import ResourceUsage + +from .model import HelperGcnvModelWgs as HelperGcnvModelWgsConfigModel #: Default configuration for the helper_gcnv_model_wgs schema -DEFAULT_CONFIG = r""" -# Default configuration helper_gcnv_model_wgs -step_config: - helper_gcnv_model_wgs: - path_ngs_mapping: ../ngs_mapping # REQUIRED - - gcnv: - # Path to BED file with uniquely mappable regions. 
- path_uniquely_mapable_bed: null # REQUIRED -""" +DEFAULT_CONFIG = HelperGcnvModelWgsConfigModel.default_config_yaml_string() class BuildGcnvWgsModelStepPart(BuildGcnvModelStepPart): @@ -154,8 +148,11 @@ def _get_input_files_call_cnvs(self, wildcards): yield ext, "work/{name_pattern}/out/{name_pattern}/.done".format(name_pattern=path_pattern) key = "intervals" path_pattern = "gcnv_annotate_gc.default" - yield key, "work/{name_pattern}/out/{name_pattern}.{ext}".format( - name_pattern=path_pattern, ext="tsv" + yield ( + key, + "work/{name_pattern}/out/{name_pattern}.{ext}".format( + name_pattern=path_pattern, ext="tsv" + ), ) @dictify @@ -172,19 +169,22 @@ def _get_input_files_post_germline_calls(self, wildcards, checkpoints): name_pattern = "{mapper}.gcnv_call_cnvs.{library_kit}".format( library_kit=library_kit, **wildcards ) - yield "calls", [ - "work/{name_pattern}.{shard}/out/{name_pattern}.{shard}/.done".format( - name_pattern=name_pattern, shard=shard - ) - for shard in shards - ] + yield ( + "calls", + [ + "work/{name_pattern}.{shard}/out/{name_pattern}.{shard}/.done".format( + name_pattern=name_pattern, shard=shard + ) + for shard in shards + ], + ) ext = "ploidy" name_pattern = "{mapper}.gcnv_contig_ploidy.{library_kit}".format( library_kit=library_kit, **wildcards ) yield ext, "work/{name_pattern}/out/{name_pattern}/.done".format(name_pattern=name_pattern) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -228,7 +228,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=HelperGcnvModelWgsConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -238,7 +239,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @listify def get_result_files(self): @@ -260,10 +261,3 @@ def all_donors(self, include_background=True): for sheet in sheets: for pedigree in sheet.cohort.pedigrees: yield from pedigree.donors - - def check_config(self): - """Check that the necessary configuration is available for the step""" - self.ensure_w_config( - ("step_config", "helper_gcnv_model_wgs", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for gCNV model building.", - ) diff --git a/snappy_pipeline/workflows/helper_gcnv_model_wgs/model.py b/snappy_pipeline/workflows/helper_gcnv_model_wgs/model.py new file mode 100644 index 000000000..7c485ba52 --- /dev/null +++ b/snappy_pipeline/workflows/helper_gcnv_model_wgs/model.py @@ -0,0 +1,17 @@ +from snappy_pipeline.models import SnappyModel, SnappyStepModel + + +class Gcnv(SnappyModel): + path_par_intervals: str = "" + """Path to interval block list with PAR region for contig calling.""" + + path_uniquely_mapable_bed: str + """path to BED file with uniquely mappable regions.""" + + # NOTE: the wgs model do NOT need the path_target_interval_list_mapping + + +class HelperGcnvModelWgs(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + gcnv: Gcnv diff --git a/snappy_pipeline/workflows/hla_typing/__init__.py b/snappy_pipeline/workflows/hla_typing/__init__.py index 
a53f89500..f5ca5c070 100644 --- a/snappy_pipeline/workflows/hla_typing/__init__.py +++ b/snappy_pipeline/workflows/hla_typing/__init__.py @@ -57,9 +57,9 @@ from collections import OrderedDict import os -from biomedsheets.shortcuts import GenericSampleSheet from snakemake.io import expand +from biomedsheets.shortcuts import GenericSampleSheet from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -72,6 +72,8 @@ get_ngs_library_folder_name, ) +from .model import HlaTyping as HlaTypingConfigModel + #: Extensions of files to create as main payload EXT_VALUES = (".txt", ".txt.md5") @@ -82,19 +84,7 @@ HLA_TYPERS = ("optitype", "arcashla") #: Default configuration for the hla_typing schema -DEFAULT_CONFIG = r""" -# Default configuration ngs_mapping -step_config: - hla_typing: - path_ngs_mapping: ../ngs_mapping - path_link_in: "" # OPTIONAL Override data set configuration search paths for FASTQ files - tools: [optitype] # REQUIRED - available: 'optitype' and 'arcashla' - optitype: - max_reads: 5000 # suggestion by OptiType author - num_mapping_threads: 4 - arcashla: - mapper: star -""".lstrip() +DEFAULT_CONFIG = HlaTypingConfigModel.default_config_yaml_string() class OptiTypeStepPart(BaseStepPart): @@ -119,7 +109,7 @@ def __init__(self, parent): self.parent.work_dir, self.parent.data_set_infos, self.parent.config_lookup_paths, - preprocessed_path=self.config["path_link_in"], + preprocessed_path=self.config.path_link_in, ) @staticmethod @@ -186,7 +176,7 @@ def _collect_reads(self, wildcards, library_name, prefix): Yields paths to right reads if prefix=='right-' """ folder_name = get_ngs_library_folder_name(self.parent.sheets, wildcards.library_name) - if self.config["path_link_in"]: + if self.config.path_link_in: folder_name = library_name pattern_set_keys = ("right",) if prefix.startswith("right-") else ("left",) for _, path_infix, filename in self.path_gen.run(folder_name, pattern_set_keys): @@ -197,7 +187,7 @@ def _get_seq_type(self, wildcards): library = self.parent.ngs_library_name_to_ngs_library[wildcards.library_name] return library.test_sample.extra_infos.get("extractionType", "DNA").lower() - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -232,7 +222,7 @@ class ArcasHlaStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) - self.mapper = self.config["arcashla"]["mapper"] + self.mapper = self.config.arcashla.mapper self.base_path_out = ( "work/{mapper}.arcashla.{{library_name}}/out/{mapper}.arcashla.{{library_name}}{ext}" ) @@ -245,8 +235,11 @@ def get_input_files(self, action): def input_function(wildcards): yield "ref_done", "work/arcashla.prepare_reference/out/.done" tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam" - yield "bam", self.parent.sub_workflows["ngs_mapping"]( - tpl.format(mapper=self.mapper, **wildcards) + yield ( + "bam", + self.parent.sub_workflows["ngs_mapping"]( + tpl.format(mapper=self.mapper, **wildcards) + ), ) assert action == "run" @@ -257,10 +250,10 @@ def get_output_files(self, action): """Return output files""" assert action == "run" for name, ext in zip(EXT_NAMES, EXT_VALUES): - yield name, self.base_path_out.format(ext=ext, mapper=self.config["arcashla"]["mapper"]) + yield name, self.base_path_out.format(ext=ext, mapper=self.config.arcashla.mapper) def get_output_prefix(self): - return "%s." % self.config["arcashla"]["mapper"] + return "%s." % self.config.arcashla.mapper @staticmethod def get_log_file(action): @@ -268,7 +261,7 @@ def get_log_file(action): _ = action return "work/arcashla.{library_name}/log/snakemake.hla_typing.log" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -306,17 +299,23 @@ def default_config_yaml(cls): return DEFAULT_CONFIG def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.register_sub_step_classes( - (OptiTypeStepPart, ArcasHlaStepPart, LinkInStep, LinkOutStepPart) - ) + super().__init__(*args, **kwargs, config_model_class=HlaTypingConfigModel) + sub_steps = [LinkInStep, LinkOutStepPart] + for tool in self.config.tools: + if self.config.get(str(tool)): + match tool: + case "optitype": + sub_steps.append(OptiTypeStepPart) + case "arcashla": + sub_steps.append(ArcasHlaStepPart) + self.register_sub_step_classes(tuple(sub_steps)) #: Mapping from library name to library object self.ngs_library_name_to_ngs_library = OrderedDict() for sheet in self.shortcut_sheets: for ngs_library in sheet.all_ngs_libraries: self.ngs_library_name_to_ngs_library[ngs_library.name] = ngs_library # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @listify def get_result_files(self): @@ -340,7 +339,7 @@ def _yield_result_files(self, tpl, **kwargs): """Build output paths from path template and extension list""" for sheet in self.shortcut_sheets: for ngs_library in sheet.all_ngs_libraries: - for tool in self.config["tools"]: + for tool in self.config.tools: supported = self.sub_steps[tool].supported_extraction_types extraction_type = ngs_library.test_sample.extra_infos.get( "extractionType", "DNA" diff --git a/snappy_pipeline/workflows/hla_typing/model.py b/snappy_pipeline/workflows/hla_typing/model.py new file mode 100644 index 000000000..f4aa3b63d --- /dev/null +++ b/snappy_pipeline/workflows/hla_typing/model.py @@ -0,0 +1,33 @@ +import enum +from typing import Annotated + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + optitype = "optitype" + arcashla = "arcashla" + + 
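# For orientation only: a rough example of the user-facing hla_typing configuration this
# model is meant to validate, mirroring the defaults of the DEFAULT_CONFIG block removed
# above (values are the previous defaults, shown for illustration, not new requirements):
#
#   step_config:
#     hla_typing:
#       path_ngs_mapping: ../ngs_mapping
#       path_link_in: ""
#       tools: [optitype]
#       optitype:
#         max_reads: 5000
#         num_mapping_threads: 4
#       arcashla:
#         mapper: star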
+class Optitype(SnappyModel): + max_reads: int = 5000 + """5000 is a suggestion by OptiType author""" + + num_mapping_threads: int = 4 + + +class ArcasHla(SnappyModel): + mapper: str = "star" + + +class HlaTyping(SnappyStepModel, validators.ToolsMixin, validators.NgsMappingMixin): + path_ngs_mapping: str = "../ngs_mapping" + + path_link_in: str = "" + """Override data set configuration search paths for FASTQ files""" + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.optitype], min_length=1)] + + optitype: Optitype | None = None + + arcashla: ArcasHla | None = None diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py index 249fdae69..0fc6e3f33 100644 --- a/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py +++ b/snappy_pipeline/workflows/homologous_recombination_deficiency/__init__.py @@ -59,9 +59,9 @@ import sys -from biomedsheets.shortcuts import CancerCaseSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, is_not_background from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -74,20 +74,12 @@ SomaticTargetedSeqCnvCallingWorkflow, ) +from .model import HomologousRecombinationDeficiency as HomologousRecombinationDeficiencyConfigModel + __author__ = "Eric Blanc " #: Default configuration for the homologous recombination deficiency step -DEFAULT_CONFIG = r""" -# Default configuration homologous_recombination_deficiency -step_config: - homologous_recombination_deficiency: - tools: ['scarHRD'] # REQUIRED - available: 'scarHRD' - path_cnv_calling: ../somatic_targeted_seq_cnv_calling # REQUIRED - scarHRD: - genome_name: "grch37" # Must be either "grch37", "grch38" or "mouse" - chr_prefix: False - length: 50 # Wiggle track for GC reference file -""" +DEFAULT_CONFIG = HomologousRecombinationDeficiencyConfigModel.default_config_yaml_string() class ScarHRDStepPart(BaseStepPart): @@ -154,7 +146,7 @@ def _get_log_file(self, action): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) if action == "run": return ResourceUsage( @@ -163,7 +155,7 @@ def get_resource_usage(self, action): time="24:00:00", ) else: - return super().get_resource_usage(action) + return super().get_resource_usage(action, **kwargs) class HomologousRecombinationDeficiencyWorkflow(BaseStep): @@ -187,13 +179,14 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (SomaticTargetedSeqCnvCallingWorkflow,), + config_model_class=HomologousRecombinationDeficiencyConfigModel, + previous_steps=(SomaticTargetedSeqCnvCallingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((ScarHRDStepPart, LinkOutStepPart)) # Initialize sub-workflows self.register_sub_workflow( - "somatic_targeted_seq_cnv_calling", self.config["path_cnv_calling"], "cnv_calling" + "somatic_targeted_seq_cnv_calling", self.config.path_cnv_calling, "cnv_calling" ) @listify @@ -212,7 +205,7 @@ def get_result_files(self): ) print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) continue - for tool in self.config["tools"]: + for tool in self.config.tools: for action in tool_actions[tool]: 
try: tpls = self.sub_steps[tool].get_output_files(action).values() @@ -223,7 +216,7 @@ def get_result_files(self): for tpl in tpls: filenames = expand( tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller=["sequenza"], library_name=[sample_pair.tumor_sample.dna_ngs_library.name], ) @@ -237,6 +230,4 @@ def check_config(self): ("static_data_config", "reference", "path"), "Path to reference FASTA file not configured but required", ) - assert ( - "sequenza" in self.w_config["step_config"]["somatic_targeted_seq_cnv_calling"]["tools"] - ) + assert "sequenza" in self.w_config.step_config["somatic_targeted_seq_cnv_calling"].tools diff --git a/snappy_pipeline/workflows/homologous_recombination_deficiency/model.py b/snappy_pipeline/workflows/homologous_recombination_deficiency/model.py new file mode 100644 index 000000000..35c069033 --- /dev/null +++ b/snappy_pipeline/workflows/homologous_recombination_deficiency/model.py @@ -0,0 +1,31 @@ +import enum +from typing import Annotated + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + scarHRD = "scarHRD" + + +class GenomeName(enum.StrEnum): + grch37 = "grch37" + grch38 = "grch38" + mouse = "mouse" + + +class ScarHRD(SnappyModel): + genome_name: GenomeName = GenomeName.grch37 + + chr_prefix: bool = False + + length: int = 50 + """Wiggle track for GC reference file""" + + +class HomologousRecombinationDeficiency(SnappyStepModel, validators.ToolsMixin): + tools: Annotated[list[Tool], EnumField(Tool, [Tool.scarHRD], min_length=1)] + + path_cnv_calling: str + + scarHRD: ScarHRD | None = None diff --git a/snappy_pipeline/workflows/igv_session_generation/Snakefile b/snappy_pipeline/workflows/igv_session_generation/Snakefile index 02cdc7eed..72a58309f 100644 --- a/snappy_pipeline/workflows/igv_session_generation/Snakefile +++ b/snappy_pipeline/workflows/igv_session_generation/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.igv_session_generation import IgvSessionGenerationWorkflow +from snappy_pipeline.workflows.igv_session_generation import ( + IgvSessionGenerationWorkflow, +) __author__ = "Manuel Holtgrewe " diff --git a/snappy_pipeline/workflows/igv_session_generation/__init__.py b/snappy_pipeline/workflows/igv_session_generation/__init__.py index 1d6bb4815..78ca9432d 100644 --- a/snappy_pipeline/workflows/igv_session_generation/__init__.py +++ b/snappy_pipeline/workflows/igv_session_generation/__init__.py @@ -47,17 +47,19 @@ import os -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from jinja2 import Environment, FileSystemLoader from snakemake import shell from snakemake.io import expand +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.variant_annotation import VariantAnnotationWorkflow from snappy_pipeline.workflows.variant_phasing import VariantPhasingWorkflow +from .model import IgvSessionGeneration as IgvSessionGenerationConfigModel + #: Extensions of files to create as main payload EXT_VALUES = (".igv_session.xml", ".igv_session.xml.md5") @@ -65,18 +67,7 @@ EXT_NAMES = ("xml", "xml_md5") #: Default configuration of the wgs_sv_filtration step 
-DEFAULT_CONFIG = r""" -# Default configuration igv_session_generation -step_config: - igv_session_generation: - path_ngs_mapping: ../ngs_mapping - # One of the following must be given! - path_variant_phasing: '' - path_variant_annotation: '' - path_variant_calling: '' - tools_ngs_mapping: [] # defaults to ngs_mapping tool - tools_variant_calling: [] # defaults to variant_annotation tool -""" +DEFAULT_CONFIG = IgvSessionGenerationConfigModel.default_config_yaml_string() class WriteIgvSessionFileStepPart(BaseStepPart): @@ -114,7 +105,7 @@ def _get_path_vcf(self, wildcards, real_index): input_path = ("output/" + name_pattern + "/out/" + name_pattern).format( prev_token=self.prev_token, real_index_library=real_index.dna_ngs_library.name, - **wildcards + **wildcards, ) return prev_step(input_path + ".vcf.gz") @@ -181,9 +172,7 @@ def run(self, wildcards, output): r""" pushd $(dirname {output.xml}) md5sum $(basename {output.xml}) >$(basename {output.xml}).md5 - """.format( - output=output - ) + """.format(output=output) ) @@ -208,17 +197,18 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (VariantPhasingWorkflow, VariantAnnotationWorkflow, NgsMappingWorkflow), + config_model_class=IgvSessionGenerationConfigModel, + previous_steps=(VariantPhasingWorkflow, VariantAnnotationWorkflow, NgsMappingWorkflow), ) # Register sub workflows for prev in ("variant_phasing", "variant_annotation", "variant_calling"): - if self.config["path_%s" % prev]: + if prev_path := self.config.get(f"path_{prev}"): self.previous_step = prev - self.register_sub_workflow(prev, self.config["path_%s" % prev]) + self.register_sub_workflow(prev, prev_path) break else: raise Exception("No path to previous step given!") # pragma: no cover - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) #: Name token for input self.prev_token = { "variant_phasing": "jannovar_annotate_vcf.gatk_pbt.gatk_rbp.", @@ -228,14 +218,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) # Register sub step classes so the sub steps are available self.register_sub_step_classes((WriteIgvSessionFileStepPart, LinkOutStepPart)) # Copy over "tools" setting from variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_variant_calling"]: - self.config["tools_variant_calling"] = self.w_config["step_config"]["variant_calling"][ - "tools" - ] + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not self.config.tools_variant_calling: + self.config.tools_variant_calling = self.w_config.step_config["variant_calling"].tools @listify def get_result_files(self): @@ -244,8 +230,8 @@ def get_result_files(self): name_pattern = "{mapper}.{caller}%s.{index_library.name}" % (self.prev_token,) yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.config["tools_ngs_mapping"], - caller=self.config["tools_variant_calling"], + mapper=self.config.tools_ngs_mapping, + caller=self.config.tools_variant_calling, ext=EXT_VALUES, ) @@ -263,11 +249,3 @@ def _yield_result_files(self, tpl, **kwargs): continue else: yield from expand(tpl, index_library=[donor.dna_ngs_library], **kwargs) - 
- def check_config(self): - """Check that the path to the variant annotation step is present.""" - # TODO: Check that at least one path was provided in user provided config. - self.ensure_w_config( - ("step_config", "igv_session_generation", "path_ngs_mapping"), - "Path to ngs_mapping not configured but required for igv_session_generation", - ) diff --git a/snappy_pipeline/workflows/igv_session_generation/model.py b/snappy_pipeline/workflows/igv_session_generation/model.py new file mode 100644 index 000000000..7d062195c --- /dev/null +++ b/snappy_pipeline/workflows/igv_session_generation/model.py @@ -0,0 +1,28 @@ +from pydantic import model_validator + +from snappy_pipeline.models import SnappyStepModel + + +class IgvSessionGeneration(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + path_variant_phasing: str = "" + + path_variant_annotation: str = "" + + path_variant_calling: str = "" + + tools_ngs_mapping: list[str] = [] + """defaults to ngs_mapping tool""" + + tools_variant_calling: list[str] = [] + """defaults to variant_annotation tool""" + + @model_validator(mode="after") + def ensure_at_least_one_path_is_specified(self): + if not any( + getattr(self, path) + for path in ("path_variant_phasing", "path_variant_annotation", "path_variant_calling") + ): + raise ValueError("No path specified for variant phasing, annotation or calling") + return self diff --git a/snappy_pipeline/workflows/ngs_data_qc/__init__.py b/snappy_pipeline/workflows/ngs_data_qc/__init__.py index d7b5b9aed..4e4aa225f 100644 --- a/snappy_pipeline/workflows/ngs_data_qc/__init__.py +++ b/snappy_pipeline/workflows/ngs_data_qc/__init__.py @@ -14,9 +14,9 @@ from itertools import chain import os -from biomedsheets.shortcuts import GenericSampleSheet from snakemake.io import Namedlist, expand, touch +from biomedsheets.shortcuts import GenericSampleSheet from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -29,41 +29,10 @@ get_ngs_library_folder_name, ) +from .model import NgsDataQc as NgsDataQcConfigModel + #: Default configuration for the ngs_mapping schema -DEFAULT_CONFIG = r""" -# Default configuration ngs_mapping -step_config: - ngs_data_qc: - path_link_in: "" # OPTIONAL Override data set configuration search paths for FASTQ files - tools: [fastqc, picard] # REQUIRED - available: 'fastqc' & 'picard' (for QC on bam files) - picard: - path_ngs_mapping: ../ngs_mapping # REQUIRED - path_to_baits: "" # Required when CollectHsMetrics is among the programs - path_to_targets: "" # When missing, same as baits - bait_name: "" # Exon enrichment kit name (optional) - programs: [] # Available metrics: - # * Generic metrics [* grouped into CollectMultipleMetrics] - # - CollectAlignmentSummaryMetrics * - # - CollectBaseDistributionByCycle * - # - CollectGcBiasMetrics * - # - CollectInsertSizeMetrics * - # - CollectJumpingLibraryMetrics - # - CollectOxoGMetrics - # - CollectQualityYieldMetrics * - # - CollectSequencingArtifactMetrics * - # - EstimateLibraryComplexity - # - MeanQualityByCycle * - # - QualityScoreDistribution * - # * WGS-specific metrics - # - CollectRawWgsMetrics - # - CollectWgsMetrics - # - CollectWgsMetricsWithNonZeroCoverage - # * Other assay-specific metrics - # - CollectHsMetrics Whole Exome Sequencing - # - CollectTargetedPcrMetrics Panel sequencing - # - CollectRnaSeqMetrics mRNA sequencing, not implemented yet - # - CollectRbsMetrics bi-sulfite sequencing, not implemented yet -""" 
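# The diff itself does not show how ``default_config_yaml_string()`` works; a minimal,
# self-contained sketch of the idea (illustrative names only, assuming pydantic v2 and
# PyYAML -- not the project's actual SnappyStepModel implementation, which also has to
# handle required fields without defaults):
import enum

import yaml
from pydantic import BaseModel


class ExampleTool(enum.StrEnum):
    fastqc = "fastqc"
    picard = "picard"


class ExampleStepConfig(BaseModel):
    path_link_in: str = ""
    tools: list[ExampleTool] = [ExampleTool.fastqc, ExampleTool.picard]


def default_config_yaml_string(model_cls: type[BaseModel]) -> str:
    # Render the model's declared defaults back into the YAML block that used to be
    # hard-coded as DEFAULT_CONFIG.
    defaults = model_cls().model_dump(mode="json")
    return yaml.safe_dump(defaults, sort_keys=False)


# default_config_yaml_string(ExampleStepConfig) would yield roughly:
#   path_link_in: ''
#   tools:
#   - fastqc
#   - picard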
+DEFAULT_CONFIG = NgsDataQcConfigModel.default_config_yaml_string() MULTIPLE_METRICS = { "CollectAlignmentSummaryMetrics": ["alignment_summary_metrics"], @@ -115,7 +84,7 @@ def __init__(self, parent): self.parent.work_dir, self.parent.data_set_infos, self.parent.config_lookup_paths, - preprocessed_path=self.config["path_link_in"], + preprocessed_path=self.config.path_link_in, ) def get_args(self, action): @@ -163,7 +132,7 @@ def _collect_reads(self, wildcards, library_name, prefix): Yields paths to right reads if prefix=='right-' """ folder_name = get_ngs_library_folder_name(self.parent.sheets, wildcards.library_name) - if self.config["path_link_in"]: + if self.config.path_link_in: folder_name = library_name pattern_set_keys = ("right",) if prefix.startswith("right-") else ("left",) for _, path_infix, filename in self.path_gen.run(folder_name, pattern_set_keys): @@ -190,7 +159,7 @@ def get_input_files(self, action): @dictify def _get_input_files_metrics(self, wildcards): - if "CollectHsMetrics" in self.config["picard"]["programs"]: + if "CollectHsMetrics" in self.config.picard.programs: yield "baits", "work/static_data/picard/out/baits.interval_list" yield "targets", "work/static_data/picard/out/targets.interval_list" ngs_mapping = self.parent.sub_workflows["ngs_mapping"] @@ -204,7 +173,7 @@ def get_output_files(self, action): yield "targets", "work/static_data/picard/out/targets.interval_list" elif action == "metrics": base_out = "work/{mapper}.{library_name}/report/picard/{mapper}.{library_name}." - for pgm in self.config["picard"]["programs"]: + for pgm in self.config.picard.programs: if pgm in MULTIPLE_METRICS.keys(): first = MULTIPLE_METRICS[pgm][0] yield pgm, base_out + f"CollectMultipleMetrics.{first}.txt" @@ -250,7 +219,7 @@ def get_params(self, action): def _get_params(self, wildcards): return {"prefix": f"{wildcards.mapper}.{wildcards.library_name}"} - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -261,7 +230,7 @@ def get_resource_usage(self, action): :raises UnsupportedActionException: if action not in class defined list of valid actions. """ if action == "prepare": - return super().get_resource_usage(action) + return super().get_resource_usage(action, **kwargs) elif action == "metrics": return ResourceUsage(threads=1, time="24:00:00", memory="24G") else: @@ -285,12 +254,19 @@ def default_config_yaml(cls): return DEFAULT_CONFIG def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): - super().__init__(workflow, config, config_lookup_paths, config_paths, workdir) + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + config_model_class=NgsDataQcConfigModel, + ) self.register_sub_step_classes( (LinkInStep, LinkOutStepPart, FastQcReportStepPart, PicardStepPart) ) - if "picard" in self.config["tools"]: - self.register_sub_workflow("ngs_mapping", self.config["picard"]["path_ngs_mapping"]) + if "picard" in self.config.tools: + self.register_sub_workflow("ngs_mapping", self.config.picard.path_ngs_mapping) @listify def get_result_files(self): @@ -299,7 +275,7 @@ def get_result_files(self): We will process all NGS libraries of all test samples in all sample sheets. 
""" - if "fastqc" in self.config["tools"]: + if "fastqc" in self.config.tools: yield from self._yield_result_files( tpl="output/{ngs_library.name}/report/fastqc/.done", allowed_extraction_types=( @@ -307,12 +283,12 @@ def get_result_files(self): "RNA", ), ) - if "picard" in self.config["tools"]: + if "picard" in self.config.tools: tpl = ( "output/{mapper}.{ngs_library.name}/report/picard/{mapper}.{ngs_library.name}.{ext}" ) exts = [] - for pgm in self.config["picard"]["programs"]: + for pgm in self.config.picard.programs: if pgm in MULTIPLE_METRICS.keys(): first = MULTIPLE_METRICS[pgm][0] exts.append(f"CollectMultipleMetrics.{first}.txt") @@ -323,7 +299,7 @@ def get_result_files(self): yield from self._yield_result_files( tpl=tpl, allowed_extraction_types=("DNA",), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, ext=exts, ) @@ -334,23 +310,3 @@ def _yield_result_files(self, tpl, allowed_extraction_types, **kwargs): extraction_type = ngs_library.test_sample.extra_infos["extractionType"] if extraction_type in allowed_extraction_types: yield from expand(tpl, ngs_library=[ngs_library], **kwargs) - - def check_config(self): - if "picard" in self.config["tools"]: - self.ensure_w_config( - ("step_config", "ngs_data_qc", "picard", "path_ngs_mapping"), - "Path to ngs_mapping not configured but required for picard", - ) - programs = self.config["picard"]["programs"] - assert len(programs) > 0, "No selected programs for collecting metrics" - assert all( - pgm in MULTIPLE_METRICS.keys() - or pgm in ADDITIONAL_METRICS - or pgm in WES_METRICS - or pgm in WGS_METRICS - for pgm in programs - ), "Some requested metrics programs are not implemented" - if "CollectHsMetrics" in programs: - assert self.config["picard"][ - "path_to_baits" - ], "Path to baits must be specified when using CollectHsMetrics" diff --git a/snappy_pipeline/workflows/ngs_data_qc/model.py b/snappy_pipeline/workflows/ngs_data_qc/model.py new file mode 100644 index 000000000..a51b90586 --- /dev/null +++ b/snappy_pipeline/workflows/ngs_data_qc/model.py @@ -0,0 +1,87 @@ +import enum +from typing import Annotated + +from pydantic import Field, model_validator + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + fastqc = "fastqc" + picard = "picard" + + +class PicardProgram(enum.StrEnum): + # Generic metrics + CollectAlignmentSummaryMetrics = "CollectAlignmentSummaryMetrics" + CollectBaseDistributionByCycle = "CollectBaseDistributionByCycle" + CollectGcBiasMetrics = "CollectGcBiasMetrics" + CollectInsertSizeMetrics = "CollectInsertSizeMetrics" + CollectQualityYieldMetrics = "CollectQualityYieldMetrics" + CollectSequencingArtifactMetrics = "CollectSequencingArtifactMetrics" + MeanQualityByCycle = "MeanQualityByCycle" + QualityScoreDistribution = "QualityScoreDistribution" + + # The above are grouped into + CollectMultipleMetrics = "CollectMultipleMetrics" + + # Generic metrics not included in "CollectMultipleMetrics" + CollectJumpingLibraryMetrics = "CollectJumpingLibraryMetrics" + CollectOxoGMetrics = "CollectOxoGMetrics" + EstimateLibraryComplexity = "EstimateLibraryComplexity" + + # WGS-specific metrics + CollectRawWgsMetrics = "CollectRawWgsMetrics" + CollectWgsMetrics = "CollectWgsMetrics" + CollectWgsMetricsWithNonZeroCoverage = "CollectWgsMetricsWithNonZeroCoverage" + + # Other assay-specific metrics + CollectHsMetrics = "CollectHsMetrics" + """Whole Exome Sequencing""" + + 
CollectTargetedPcrMetrics = "CollectTargetedPcrMetrics" + """Panel sequencing""" + + CollectRnaSeqMetrics = "CollectRnaSeqMetrics" + """mRNA sequencing, not implemented yet""" + + CollectRbsMetrics = "CollectRbsMetrics" + """bi-sulfite sequencing, not implemented yet""" + + +class Picard(SnappyModel): + path_ngs_mapping: str = "../ngs_mapping" + + path_to_baits: str = "" + """Required when CollectHsMetrics is among the programs""" + + path_to_targets: str = "" + """When missing, same as baits""" + + bait_name: str = "" + """Exon enrichment kit name (optional)""" + + programs: Annotated[list[PicardProgram], Field(min_length=1)] + + @model_validator(mode="after") + def ensure_baits_when_required(self): + if PicardProgram.CollectHsMetrics in self.programs and not self.path_to_baits: + raise ValueError( + "Path to baits is required when CollectHsMetrics is among the programs" + ) + return self + + +class Fastqc(SnappyModel): + pass + + +class NgsDataQc(SnappyStepModel, validators.ToolsMixin): + path_link_in: str = "" + """Override data set configuration search paths for FASTQ files""" + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.fastqc, Tool.picard], min_length=1)] + + picard: Picard | None = None + + fastqc: Fastqc | None = None # TODO fastqc has no configuration options in the DEFAULT_CONFIG? diff --git a/snappy_pipeline/workflows/ngs_mapping/__init__.py b/snappy_pipeline/workflows/ngs_mapping/__init__.py index bf987bb2d..67a5a60fe 100644 --- a/snappy_pipeline/workflows/ngs_mapping/__init__.py +++ b/snappy_pipeline/workflows/ngs_mapping/__init__.py @@ -383,11 +383,6 @@ [...] -Genome-wide Coverage Count (.bed.gz) - If ``ngs_mapping/compute_coverage_bed`` to be set to ``true`` a report is generated - that gives the depth at each base of the genome. (note: currently this report only appears - in ``work/`` and is not yet linked out into the ``output/`` directory). - (TODO: add file name rules and example) :: @@ -437,11 +432,11 @@ import re import sys -from biomedsheets.shortcuts import GenericSampleSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import GenericSampleSheet, is_not_background from snappy_pipeline.base import InvalidConfiguration, UnsupportedActionException -from snappy_pipeline.utils import DictQuery, dictify, flatten, listify +from snappy_pipeline.utils import dictify, flatten, listify from snappy_pipeline.workflows.abstract import ( BaseStep, BaseStepPart, @@ -453,6 +448,8 @@ __author__ = "Manuel Holtgrewe " +from .model import NgsMapping as NgsMappingConfigModel + # TODO: Need something smarter still for @RG #: Extensions of files to create as main payload @@ -471,112 +468,7 @@ READ_MAPPERS_DNA_LONG = ("minimap2",) #: Default configuration -DEFAULT_CONFIG = r""" -step_config: - ngs_mapping: - # Aligners to use for the different NGS library types - tools: - dna: [] # Required if DNA analysis; otherwise, leave empty. Example: 'bwa'. - rna: [] # Required if RNA analysis; otherwise, leave empty. Example: 'star'. - dna_long: [] # Required if long-read mapper used; otherwise, leave empty. Example: 'minimap2'. - path_link_in: "" # OPTIONAL Override data set configuration search paths for FASTQ files - # Thresholds for targeted sequencing coverage QC. - target_coverage_report: - # Mapping from enrichment kit to target region BED file, for either computing per--target - # region coverage or selecting targeted exons. - # - # The following will match both the stock IDT library kit and the ones - # with spike-ins seen fromr Yale genomics. 
The path above would be - # mapped to the name "default". - # - name: IDT_xGen_V1_0 - # pattern: "xGen Exome Research Panel V1\\.0*" - # path: "path/to/targets.bed" - path_target_interval_list_mapping: [] - # Depth of coverage collection, mainly useful for genomes. - bam_collect_doc: - enabled: false - window_length: 1000 - # Compute fingerprints with ngs-chew - ngs_chew_fingerprint: - enabled: true - # Configuration for BWA - bwa: - path_index: REQUIRED # Required if listed in ngs_mapping.tools.dna; otherwise, can be removed. - num_threads_align: 16 - num_threads_trimming: 8 - num_threads_bam_view: 4 - num_threads_bam_sort: 4 - memory_bam_sort: 4G - trim_adapters: false - mask_duplicates: true - split_as_secondary: false # -M flag - extra_flags: [] # [ "-C" ] when molecular barcodes are processed with AGeNT in the somatic mode - # Configuration for BWA-MEM2 - bwa_mem2: - path_index: REQUIRED # Required if listed in ngs_mapping.tools.dna; otherwise, can be removed. - bwa_mode: auto # in ['auto', 'bwa-aln', 'bwa-mem'] - num_threads_align: 16 - num_threads_trimming: 8 - num_threads_bam_view: 4 - num_threads_bam_sort: 4 - memory_bam_sort: 4G - trim_adapters: false - mask_duplicates: true - split_as_secondary: true # -M flag - extra_flags: [] # [ "-C" ] when molecular barcodes are processed with AGeNT in the somatic mode - # Configuration for somatic ngs_calling (separate read groups, molecular barcodes & base quality recalibration) - somatic: - mapping_tool: REQUIRED # Either bwa of bwa_mem2. The indices & other parameters are taken from mapper config - barcode_tool: agent # Only agent currently implemented - use_barcodes: false - recalibrate: true - bqsr: - common_variants: REQUIRED # Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK) - agent: - prepare: - path: REQUIRED - lib_prep_type: REQUIRED # One of "halo" (HaloPlex), "hs" (HaloPlexHS), "xt" (SureSelect XT, XT2, XT HS), "v2" (SureSelect XT HS2) & "qxt" (SureSelect QXT) - extra_args: [] # Consider "-polyG 8" for NovaSeq data & "-minFractionRead 50" for 100 cycles data - mark_duplicates: - path: REQUIRED - path_baits: REQUIRED - consensus_mode: REQUIRED # One of "SINGLE", "HYBRID", "DUPLEX" - input_filter_args: [] # Consider -mm 13 (min base qual) -mr 13 (min barcode base qual) -mq 30 (min map qual) - consensus_filter_args: [] - extra_args: [] # Consider -d 1 (max nb barcode mismatch) - # Configuration for STAR - star: - path_index: REQUIRED # Required if listed in ngs_mapping.tools.rna; otherwise, can be removed. - num_threads_align: 16 - num_threads_trimming: 8 - num_threads_bam_view: 4 - num_threads_bam_sort: 4 - memory_bam_sort: 4G - genome_load: NoSharedMemory - raw_star_options: '' - align_intron_max: 1000000 # ENCODE option - align_intron_min: 20 # ENCODE option - align_mates_gap_max: 1000000 # ENCODE option - align_sjdb_overhang_min: 1 # ENCODE option - align_sj_overhang_min: 8 # ENCODE option - out_filter_mismatch_n_max: 999 # ENCODE option - out_filter_mismatch_n_over_l_max: 0.04 # ENCODE option - out_filter_multimap_n_max: 20 # ENCODE option - out_filter_type: BySJout # ENCODE option - out_filter_intron_motifs: None # or for cufflinks: RemoveNoncanonical - out_sam_strand_field: None # or for cufflinks: intronMotif - transcriptome: false # true to output transcript coordinate bam for RSEM - trim_adapters: false - mask_duplicates: false - include_unmapped: true - strandedness: - path_exon_bed: REQUIRED # Location of usually highly expressed genes. 
Known protein coding genes is a good choice - strand: -1 # -1: unknown value, use infer_, 0: unstranded, 1: forward, 2: reverse (from featurecounts) - threshold: 0.85 # Minimum proportion of reads mapped to forward/reverse direction to call the protocol - # Configuration for Minimap2 - minimap2: - mapping_threads: 16 -""" +DEFAULT_CONFIG = NgsMappingConfigModel.default_config_yaml_string() class MappingGetResultFilesMixin: @@ -596,7 +488,7 @@ def skip_result_files_for_library(self, library_name: str) -> bool: if self.tool_category not in ("__any__", library_tool_category): return True else: - return self.name not in self.config["tools"][library_tool_category] + return self.name not in self.config.tools.get(library_tool_category) @listify def get_result_files(self): @@ -609,9 +501,8 @@ def get_result_files(self): a library. """ # Skip if step part has a tool category and it is not enabled - if ( - self.tool_category != "__any__" - and self.name not in self.config["tools"][self.tool_category] + if self.tool_category != "__any__" and self.name not in getattr( + self.config.tools, self.tool_category, [] ): return @@ -705,7 +596,7 @@ def __init__(self, parent): self.parent.work_dir, self.parent.data_set_infos, self.parent.config_lookup_paths, - preprocessed_path=self.config["path_link_in"], + preprocessed_path=self.config.path_link_in, ) def get_args(self, action): @@ -747,10 +638,13 @@ def get_output_files(self, action): paths_work = self._get_output_files_run_work() yield from paths_work.items() # Return list of paths to the links that will be created in ``output/`` - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(paths_work.values(), self.get_log_file(action).values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(paths_work.values(), self.get_log_file(action).values()) + ], + ) @dictify def _get_output_files_run_work(self): @@ -795,7 +689,7 @@ def _collect_reads(self, wildcards, library_name, prefix): Yields paths to right reads if prefix=='right-' """ folder_name = get_ngs_library_folder_name(self.parent.sheets, wildcards.library_name) - if self.config["path_link_in"]: + if self.config.path_link_in: folder_name = library_name pattern_set_keys = ("right",) if prefix.startswith("right-") else ("left",) seen = [] @@ -817,7 +711,7 @@ class BwaStepPart(ReadMappingStepPart): #: Tool category tool_category = "dna" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -831,39 +725,13 @@ def get_resource_usage(self, action): actions_str = ", ".join(self.actions) error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" raise UnsupportedActionException(error_message) - mem_mb = int(4.5 * 1024 * self.config["bwa"]["num_threads_align"]) + mem_mb = int(4.5 * 1024 * self.config.bwa.num_threads_align) return ResourceUsage( - threads=self.config["bwa"]["num_threads_align"], + threads=self.config.bwa.num_threads_align, time="3-00:00:00", # 3 days memory=f"{mem_mb}M", ) - def check_config(self): - """Check parameters in configuration. - - Method checks that all parameters required to execute BWA are present in the - configuration. It further checks that the provided index has all the expected file - extensions. If invalid configuration, it raises InvalidConfiguration exception. 
- """ - # Check if tool is at all included in workflow - if self.__class__.name not in self.config["tools"]["dna"]: - return # BWA not run, don't check configuration # pragma: no cover - - # Check required configuration settings present - self.parent.ensure_w_config( - config_keys=("step_config", "ngs_mapping", "bwa", "path_index"), - msg="Path to BWA index is required", - ) - - # Check that the path to the BWA index is valid. - for ext in (".amb", ".ann", ".bwt", ".pac", ".sa"): - expected_path = self.config["bwa"]["path_index"] + ext - if not os.path.exists(expected_path): # pragma: no cover - tpl = "Expected BWA input path {expected_path} does not exist!".format( - expected_path=expected_path - ) - raise InvalidConfiguration(tpl) - class BwaMem2StepPart(ReadMappingStepPart): """Support for performing NGS alignment using BWA-MEM 2""" @@ -871,7 +739,7 @@ class BwaMem2StepPart(ReadMappingStepPart): name = "bwa_mem2" tool_category = "dna" - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -882,38 +750,13 @@ def get_resource_usage(self, action: str) -> ResourceUsage: actions_str = ", ".join(self.actions) error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" raise UnsupportedActionException(error_message) - mem_mb = int(4.5 * 1024 * self.config["bwa_mem2"]["num_threads_align"]) + mem_mb = int(4.5 * 1024 * self.config.bwa_mem2.num_threads_align) return ResourceUsage( - threads=self.config["bwa_mem2"]["num_threads_align"], + threads=self.config.bwa_mem2.num_threads_align, time="3-00:00:00", # 3 days memory=f"{mem_mb}M", ) - def check_config(self): - """Check parameters in configuration. - - Method checks that all parameters required to execute BWA-MEM2 are present in the - configuration. It further checks that the provided index has all the expected file - extensions. If invalid configuration, it raises InvalidConfiguration exception. - """ - # Check if tool is at all included in workflow - if self.__class__.name not in self.config["tools"]["dna"]: - return # BWA-MEM2 not run, don't check configuration # pragma: no cover - - # Check required configuration settings present - self.parent.ensure_w_config( - config_keys=("step_config", "ngs_mapping", "bwa_mem2", "path_index"), - msg="Path to BWA-MEM2 index is required", - ) - - # Check that the path to the BWA-MEM2 index is valid. - for ext in (".0123", ".amb", ".ann", ".bwt.2bit.64", ".pac"): - expected_path = self.config["bwa_mem2"]["path_index"] + ext - if not os.path.exists(expected_path): # pragma: no cover - raise InvalidConfiguration( - f"Expected BWA-MEM2 input path {expected_path} does not exist!" - ) - class MBCsStepPart(ReadMappingStepPart): """Support for performing NGS alignment on MBC data""" @@ -921,10 +764,7 @@ class MBCsStepPart(ReadMappingStepPart): name = "mbcs" tool_category = "dna" - LIB_PREP_TYPES = ("halo", "hs", "xt", "v2", "qxt") - CONSENSUS_MODES = ("SINGLE", "HYBRID", "DUPLEX") - - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -942,50 +782,6 @@ def get_resource_usage(self, action): partition="medium", ) - def check_config(self): - """Check parameters in configuration. 
- - Method checks that all parameters required to execute BWA-MEM2 are present in the - configuration. It further checks that the provided index has all the expected file - extensions. If invalid configuration, it raises InvalidConfiguration exception. - """ - # Check if tool is at all included in workflow - if self.__class__.name not in self.config["tools"]["dna"]: - return # mbcs not run, don't check configuration # pragma: no cover - - # Check mapper - mapper = self.config["somatic"]["mapping_tool"] - assert mapper != "mbcs" and mapper in READ_MAPPERS_DNA, f'Unknown mapper "{mapper}"' - self.parent.sub_steps[mapper].check_config() - - if self.config["somatic"]["use_barcodes"]: - assert self.config["somatic"]["barcode_tool"] == "agent" - # Check trimmer & creak paths - path = self.config["agent"]["prepare"]["path"] - if not os.path.exists(path): - raise InvalidConfiguration( - f"Expected agent's trimmer input path {path} does not exist!" - ) - path = self.config["agent"]["mark_duplicates"]["path"] - if not os.path.exists(path): - raise InvalidConfiguration( - f"Expected agent's creak input path {path} does not exist!" - ) - - # Check mandatory options - option = self.config["agent"]["prepare"]["lib_prep_type"] - if option not in self.__class__.LIB_PREP_TYPES: - options = '", "'.join(self.__class__.LIB_PREP_TYPES) - raise InvalidConfiguration( - f'Unkown library preparation type "{option}", valid options are "{options}"' - ) - option = self.config["agent"]["mark_duplicates"]["consensus_mode"] - if option not in self.__class__.CONSENSUS_MODES: - options = '", "'.join(self.__class__.CONSENSUS_MODES) - raise InvalidConfiguration( - f'Unkown consensus mode "{option}", valid options are "{options}"' - ) - class StarStepPart(ReadMappingStepPart): """Support for performing NGS alignment using STAR""" @@ -1004,34 +800,22 @@ def check_config(self): extensions. If invalid configuration, it raises InvalidConfiguration exception. 
""" # Check if tool is at all included in workflow - if self.__class__.name not in self.config["tools"]["rna"]: + if self.__class__.name not in self.config.tools.rna: return # STAR not run, don't check configuration # pragma: no cover - # Check required configuration settings present - self.parent.ensure_w_config( - config_keys=("step_config", "ngs_mapping", "star", "path_index"), - msg="Path to STAR index is required", - ) + # Check required global configuration settings present self.parent.ensure_w_config( config_keys=("static_data_config", "reference"), msg="No reference genome FASTA file given", ) - # Check validity of the STAR index - full_path = self.config["star"]["path_index"] - # a lot of files should be in this dir, justtest these - for indfile in ("Genome", "SA", "SAindex"): - expected_path = os.path.join(full_path, indfile) - if not os.path.exists(expected_path): # pragma: no cover - tpl = "Expected STAR index file {expected_path} does not exist!".format( - expected_path=expected_path - ) - raise InvalidConfiguration(tpl) - @dictify def _get_output_files_run_work(self): """Override base class' function to make Snakemake aware of extra files for STAR.""" output_files = super()._get_output_files_run_work() + if (cfg := self.config.get(self.name)) is None: + return output_files + output_files["gene_counts"] = self.base_path_out.format( mapper=self.name, ext=".GeneCounts.tab" ) @@ -1040,7 +824,7 @@ def _get_output_files_run_work(self): mapper=self.name, ext=".Junctions.tab" ) output_files["junctions_md5"] = output_files["junctions"] + ".md5" - if self.config[self.name]["transcriptome"]: + if cfg.transcriptome: output_files["transcriptome"] = self.base_path_out.format( mapper=self.name, ext=".toTranscriptome.bam" ) @@ -1055,16 +839,19 @@ def get_output_files(self, action): if key != "output_links": yield key, paths continue - yield key, list( - filter( - lambda x: not ( - x.endswith(".GeneCounts.tab") or x.endswith(".GeneCounts.tab.md5") - ), - paths, - ) + yield ( + key, + list( + filter( + lambda x: not ( + x.endswith(".GeneCounts.tab") or x.endswith(".GeneCounts.tab.md5") + ), + paths, + ) + ), ) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -1078,9 +865,9 @@ def get_resource_usage(self, action): actions_str = ", ".join(self.actions) error_message = f"Action '{action}' is not supported. 
Valid options: {actions_str}" raise UnsupportedActionException(error_message) - mem_gb = int(3.5 * self.config["star"]["num_threads_align"]) + mem_gb = int(3.5 * self.config.star.num_threads_align) return ResourceUsage( - threads=self.config["star"]["num_threads_align"], + threads=self.config.star.num_threads_align, time="2-00:00:00", # 2 days memory=f"{mem_gb}G", ) @@ -1113,27 +900,51 @@ def get_output_files(self, action): if action == "infer": for key, ext in (("tsv", ".infer.txt"), ("decision", ".decision.json")): yield key, "work/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + ext - yield key + "_md5", "work/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + ext + ".md5" + yield ( + key + "_md5", + "work/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + + ext + + ".md5", + ) key, ext = ("output", ".decision.json") yield key, "output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + ext - yield key + "_md5", "output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + ext + ".md5" + yield ( + key + "_md5", + "output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + + ext + + ".md5", + ) for key, ext in ( ("log", ".log"), ("conda_list", ".conda_list.txt"), ("conda_info", ".conda_info.txt"), ): - yield key, "output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness" + ext - yield key + "_md5", "output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness" + ext + ".md5" + yield ( + key, + "output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness" + ext, + ) + yield ( + key + "_md5", + "output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness" + + ext + + ".md5", + ) elif action == "counts": key, ext = ("counts", ".GeneCounts.tab") yield key, "work/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + ext - yield key + "_md5", "work/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + ext + ".md5" + yield ( + key + "_md5", + "work/{mapper}.{library_name}/strandedness/{mapper}.{library_name}" + ext + ".md5", + ) key, ext = ("output", ".GeneCounts.tab") yield key, "output/{mapper}.{library_name}/out/{mapper}.{library_name}" + ext - yield key + "_md5", "output/{mapper}.{library_name}/out/{mapper}.{library_name}" + ext + ".md5" + yield ( + key + "_md5", + "output/{mapper}.{library_name}/out/{mapper}.{library_name}" + ext + ".md5", + ) def get_result_files(self): - for mapper in self.config["tools"]["rna"]: + for mapper in self.config.tools.rna: tpl_out = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.GeneCounts.tab" tpl_strandedness = ( "output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}.decision.json" @@ -1149,9 +960,10 @@ def get_result_files(self): yield tpl_strandedness.format(mapper=mapper, library_name=library_name) + ".md5" for ext in ("log", "conda_info.txt", "conda_list.txt"): yield tpl_log.format(mapper=mapper, library_name=library_name, ext=ext) - yield tpl_log.format( - mapper=mapper, library_name=library_name, ext=ext - ) + ".md5" + yield ( + tpl_log.format(mapper=mapper, library_name=library_name, ext=ext) + + ".md5" + ) @dictify def get_log_file(self, action): @@ -1177,7 +989,7 @@ class Minimap2StepPart(ReadMappingStepPart): #: Tool category tool_category = "dna_long" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -1191,9 +1003,9 @@ def get_resource_usage(self, action): actions_str = ", ".join(self.actions) error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" raise UnsupportedActionException(error_message) - mem_gb = int(3.5 * self.config["minimap2"]["mapping_threads"]) + mem_gb = int(3.5 * self.config.minimap2.mapping_threads) return ResourceUsage( - threads=self.config["minimap2"]["mapping_threads"], + threads=self.config.minimap2.mapping_threads, time="2-00:00:00", # 2 days memory=f"{mem_gb}G", ) @@ -1222,7 +1034,7 @@ def check_config(self): configuration. If invalid configuration, it raises InvalidConfiguration exception. """ # Check if tool is at all included in workflow - if "external" not in self.config["tools"]["dna"]: + if "external" not in self.config.tools.dna: return # External not run, don't check configuration # pragma: no cover def get_args(self, action): @@ -1246,7 +1058,7 @@ def _collect_bams(self, wildcards, library_name): for _, path_infix, filename in self.path_gen.run(folder_name, ("bam",)): yield os.path.join(self.base_path_in, path_infix, filename).format(**wildcards) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -1313,8 +1125,14 @@ def get_output_files(self, action): @dictify def _get_output_files_run_work(self): - yield "json", "work/{mapper}.{library_name}/report/alfred_qc/{mapper}.{library_name}.alfred.json.gz" - yield "json_md5", "work/{mapper}.{library_name}/report/alfred_qc/{mapper}.{library_name}.alfred.json.gz.md5" + yield ( + "json", + "work/{mapper}.{library_name}/report/alfred_qc/{mapper}.{library_name}.alfred.json.gz", + ) + yield ( + "json_md5", + "work/{mapper}.{library_name}/report/alfred_qc/{mapper}.{library_name}.alfred.json.gz.md5", + ) @dictify def get_log_file(self, action): @@ -1343,16 +1161,16 @@ def _get_params_run(self, wildcards): library_name = wildcards.library_name path_targets_bed = "" kit_name = self.parent.ngs_library_to_kit.get(library_name, "__default__") - for item in self.config["target_coverage_report"]["path_target_interval_list_mapping"]: - if item["name"] == kit_name: - path_targets_bed = item["path"] + for item in self.config.target_coverage_report.path_target_interval_list_mapping: + if item.name == kit_name: + path_targets_bed = item.path break return { "path_targets_bed": path_targets_bed, } - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -1383,9 +1201,9 @@ class BamCollectDocStepPart(ReportGetResultFilesMixin, BaseStepPart): actions = ("run",) def skip_result_files_for_library(self, library_name: str) -> bool: - return not self.config["bam_collect_doc"][ - "enabled" - ] or super().skip_result_files_for_library(library_name) + return not self.config.bam_collect_doc.enabled or super().skip_result_files_for_library( + library_name + ) def __init__(self, parent): super().__init__(parent) @@ -1414,21 +1232,39 @@ def get_output_files(self, action): paths_work = self._get_output_files_run_work() yield from paths_work.items() # Return list of paths to the links that will be created in ``output/`` - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(paths_work.values(), self.get_log_file(action).values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(paths_work.values(), self.get_log_file(action).values()) + ], + ) @dictify def _get_output_files_run_work(self): yield "vcf", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.vcf.gz" - yield "vcf_md5", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.vcf.gz.md5" - yield "vcf_tbi", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.vcf.gz.tbi" - yield "vcf_tbi_md5", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.vcf.gz.tbi.md5" + yield ( + "vcf_md5", + "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.vcf.gz.md5", + ) + yield ( + "vcf_tbi", + "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.vcf.gz.tbi", + ) + yield ( + "vcf_tbi_md5", + "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.vcf.gz.tbi.md5", + ) yield "cov_bw", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.bw" - yield "cov_bw_md5", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.bw.md5" + yield ( + "cov_bw_md5", + "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.cov.bw.md5", + ) yield "mq_bw", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.mq.bw" - yield "mq_bw_md5", "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.mq.bw.md5" + yield ( + "mq_bw_md5", + "work/{mapper}.{library_name}/report/cov/{mapper}.{library_name}.mq.bw.md5", + ) @dictify def get_log_file(self, action): @@ -1445,7 +1281,7 @@ def get_log_file(self, action): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -1476,7 +1312,7 @@ def __init__(self, parent): super().__init__(parent) def skip_result_files_for_library(self, library_name: str) -> bool: - return not self.config["ngs_chew_fingerprint"]["enabled"] + return not self.config.ngs_chew_fingerprint.enabled def get_input_files(self, action): """Return required input files""" @@ -1501,15 +1337,21 @@ def get_output_files(self, action): paths_work = self._get_output_files_fingerprint_work() yield from paths_work.items() # Return list of paths to the links that will be created in ``output/`` - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(paths_work.values(), self.get_log_file(action).values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(paths_work.values(), self.get_log_file(action).values()) + ], + ) @dictify def _get_output_files_fingerprint_work(self): yield "npz", "work/{mapper}.{library_name}/report/fingerprint/{mapper}.{library_name}.npz" - yield "npz_md5", "work/{mapper}.{library_name}/report/fingerprint/{mapper}.{library_name}.npz.md5" + yield ( + "npz_md5", + "work/{mapper}.{library_name}/report/fingerprint/{mapper}.{library_name}.npz.md5", + ) def get_log_file(self, action): self._check_action(action) @@ -1529,7 +1371,7 @@ def _get_log_files_fingerprint(self): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -1562,7 +1404,14 @@ def default_config_yaml(cls): return DEFAULT_CONFIG def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): - super().__init__(workflow, config, config_lookup_paths, config_paths, workdir) + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + config_model_class=NgsMappingConfigModel, + ) self.register_sub_step_classes( ( BwaStepPart, @@ -1583,7 +1432,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) # Create shortcut from library to all extra infos. self.ngs_library_to_extra_infos = self._build_ngs_library_to_extra_infos() # Validate project - self.validate_project(config_dict=self.config, sample_sheets_list=self.shortcut_sheets) + self.validate_project(config=self.config, sample_sheets_list=self.shortcut_sheets) def _build_ngs_library_to_extra_infos(self): result = {} @@ -1596,15 +1445,15 @@ def _build_ngs_library_to_extra_infos(self): return result def _build_ngs_library_to_kit(self): - cov_config = DictQuery(self.w_config).get("step_config/ngs_mapping/target_coverage_report") + cov_config = self.w_config.step_config["ngs_mapping"].target_coverage_report # Build mapping. default_kit_configured = False regexes = {} - for item in cov_config["path_target_interval_list_mapping"]: - if item["name"] == "__default__": + for item in cov_config.path_target_interval_list_mapping: + if item.name == "__default__": default_kit_configured = True else: - regexes[item["pattern"]] = item["name"] + regexes[item.pattern] = item.name result = {} for donor in self._all_donors(): for bio_sample in donor.bio_samples.values(): @@ -1637,15 +1486,15 @@ def get_result_files(self): if sub_step.name not in (LinkInStep.name,): yield from sub_step.get_result_files() - def validate_project(self, config_dict, sample_sheets_list): + def validate_project(self, config, sample_sheets_list): """Validates project. 
Method compares sample information included in the sample sheet and the configuration. If sheet contains 'DNA' samples, a DNA mapper should be defined. Similarly, if it contains 'RNA', a RNA mapper should be defined. - :param config_dict: Dictionary with configurations as found in the project's yaml file. - :type config_dict: dict + :param config: SnappyStepModel with configurations as found in the project's yaml file. + :type config: SnappyStepModel :param sample_sheets_list: List with biomedical sample sheets. :type sample_sheets_list: list @@ -1655,7 +1504,7 @@ def validate_project(self, config_dict, sample_sheets_list): rna_bool_list = [] # Get tools dictionary - tools_dict = config_dict["tools"] + tools = config.tools # Iterate over sheets for sheet in sample_sheets_list: @@ -1669,14 +1518,14 @@ def validate_project(self, config_dict, sample_sheets_list): rna_analysis = any(rna_bool_list) # Validate DNA project - dna_tool_list = tools_dict.get("dna", []) + dna_tool_list = tools.dna if dna_analysis and not dna_tool_list: raise InvalidConfiguration( "Sample sheet contains DNA but configuration has no DNA " "mapper defined in tool list." ) # Validate RNA project - rna_tool_list = tools_dict.get("rna", []) + rna_tool_list = tools.rna if rna_analysis and not rna_tool_list: raise InvalidConfiguration( "Sample sheet contains RNA but configuration has no RNA " diff --git a/snappy_pipeline/workflows/ngs_mapping/model.py b/snappy_pipeline/workflows/ngs_mapping/model.py new file mode 100644 index 000000000..30968a324 --- /dev/null +++ b/snappy_pipeline/workflows/ngs_mapping/model.py @@ -0,0 +1,332 @@ +import enum +import os +from enum import Enum +from typing import Annotated + +from pydantic import Field, field_validator, model_validator + +from snappy_pipeline.models import EnumField, SizeString, SnappyModel, SnappyStepModel + + +class DnaMapper(Enum): + BWA = "bwa" + BWA_MEM2 = "bwa_mem2" + + +class LongDnaMapper(Enum): + MINIMAP2 = "minimap2" + + +class RnaMapper(Enum): + STAR = "star" + + +class Tools(SnappyModel): + dna: Annotated[list[DnaMapper], EnumField(DnaMapper, [])] + """Required if DNA analysis; otherwise, leave empty.""" + + rna: Annotated[list[RnaMapper], EnumField(RnaMapper, [])] + """Required if RNA analysis; otherwise, leave empty.""" + + dna_long: Annotated[list[LongDnaMapper], EnumField(LongDnaMapper, [])] + """Required if long-read mapper used; otherwise, leave empty.""" + + +class TargetCoverageReportEntry(SnappyModel): + """ + Mapping from enrichment kit to target region BED file, for either computing per-target + region coverage or selecting targeted exons. + + The following will match both the stock IDT library kit and the ones + with spike-ins seen from Yale genomics. The path above would be + mapped to the name "default".
+ - name: IDT_xGen_V1_0 + pattern: "xGen Exome Research Panel V1\\.0*" + path: "path/to/targets.bed" + """ + + name: Annotated[str, Field(examples=["IDT_xGen_V1_0"])] + + pattern: Annotated[str, Field(examples=["xGen Exome Research Panel V1\\.0*"])] + + path: Annotated[str, Field(examples=["path/to/targets.bed"])] + + +class TargetCoverageReport(SnappyModel): + path_target_interval_list_mapping: list[TargetCoverageReportEntry] = [] + + +class BamCollectDoc(SnappyModel): + enabled: bool = False + window_length: Annotated[int, Field(gt=0)] = 1000 + + +class NgsChewFingerprint(SnappyModel): + enabled: bool = True + + +class BwaMode(Enum): + AUTO = "auto" + BWA_ALN = "bwa-aln" + BWA_MEM = "bwa-mem" + + +class BwaMapper(SnappyModel): + path_index: str + """Required if listed in ngs_mapping.tools.dna; otherwise, can be removed.""" + num_threads_align: int = 16 + num_threads_trimming: int = 8 + num_threads_bam_view: int = 4 + num_threads_bam_sort: int = 4 + memory_bam_sort: SizeString = "4G" + trim_adapters: bool = False + mask_duplicates: bool = True + + split_as_secondary: bool = False + """-M flag""" + + extra_args: list[str] = [] + """[ "-C" ] when molecular barcodes are processed with AGeNT in the somatic mode""" + + +class Bwa(BwaMapper): + @field_validator("path_index") + @classmethod + def validate_bwa_path_index(cls, v): + import logging + + extensions = {".amb", ".ann", ".bwt", ".pac", ".sa"} + prefix, ext = os.path.splitext(v) + if ext: + if ext in {".fa", ".fasta"}: + prefix += ext + else: + if ext not in extensions: + logging.warning(f"unknown extension '{v}'") + for extension in extensions: + if not os.path.exists(prefix + extension): + logging.warning(f"{v} does not exist") + return prefix + + +class BwaMem2(BwaMapper): + @field_validator("path_index") + @classmethod + def validate_bwa_mem2_path_index(cls, v): + import logging + + extensions = {".0123", ".amb", ".ann", ".bwt.2bit.64", ".pac"} + prefix, ext = os.path.splitext(v) + if ext: + if ext in {".fa", ".fasta"}: + prefix += ext + else: + if ext not in extensions: + logging.warning(f"unknown extension '{v}'") + for extension in extensions: + if not os.path.exists(prefix + extension): + logging.warning(f"{v} does not exist") + return prefix + + +class BarcodeTool(Enum): + AGENT = "agent" + + +class Somatic(SnappyModel): + mapping_tool: DnaMapper + """Either bwa or bwa_mem2.
The indices & other parameters are taken from mapper config""" + + barcode_tool: BarcodeTool = BarcodeTool.AGENT + """Only agent currently implemented""" + + use_barcodes: bool = False + recalibrate: bool = True + + +class Bqsr(SnappyModel): + common_variants: str + """Common germline variants (see /fast/work/groups/cubi/projects/biotools/static_data/app_support/GATK)""" + + +class AgentLibPrepType(Enum): + HALO_PLEX = "halo" + HALO_PLEX_HS = "hs" + SURE_SELECT = "xt" + SURE_SELECT_HS2 = "v2" + SURE_SELECT_QXT = "qxt" + + +class AgentPrepare(SnappyModel): + path: str + + lib_prep_type: AgentLibPrepType = None + """One of "halo" (HaloPlex), "hs" (HaloPlexHS), "xt" (SureSelect XT, XT2, XT HS), "v2" (SureSelect XT HS2) & "qxt" (SureSelect QXT)""" + + extra_args: list[str] = [] + """Consider "-polyG 8" for NovaSeq data & "-minFractionRead 50" for 100 cycles data""" + + +class AgentMarkDuplicatesConsensusMode(Enum): + SINGLE = "SINGLE" + HYBRID = "HYBRID" + DUPLEX = "DUPLEX" + + +class AgentMarkDuplicates(SnappyModel): + path: str + path_baits: str + consensus_mode: AgentMarkDuplicatesConsensusMode = None + """One of "SINGLE", "HYBRID", "DUPLEX" """ + + input_filter_args: list[str] = [] + """Consider -mm 13 (min base qual) -mr 13 (min barcode base qual) -mq 30 (min map qual)""" + + consensus_filter_args: list[str] = [] + + extra_args: list[str] = [] + """Consider -d 1 (max nb barcode mismatch)""" + + +class Agent(SnappyModel): + prepare: AgentPrepare + mark_duplicates: AgentMarkDuplicates + + +class Star(SnappyModel): + path_index: str + """Required if listed in ngs_mapping.tools.rna; otherwise, can be removed.""" + num_threads_align: int = 16 + num_threads_trimming: int = 8 + num_threads_bam_view: int = 4 + num_threads_bam_sort: int = 4 + memory_bam_sort: SizeString = "4G" + genome_load: str = "NoSharedMemory" + raw_star_options: str = "" + align_intron_max: int = 1000000 # ENCODE option + align_intron_min: int = 20 # ENCODE option + align_mates_gap_max: int = 1000000 # ENCODE option + align_sjdb_overhang_min: int = 1 # ENCODE option + align_sj_overhang_min: int = 8 # ENCODE option + out_filter_mismatch_n_max: int = 999 # ENCODE option + out_filter_mismatch_n_over_l_max: float = 0.04 # ENCODE option + out_filter_multimap_n_max: int = 20 # ENCODE option + out_filter_type: str = "BySJout" # ENCODE option + out_filter_intron_motifs: str = "" + """or for cufflinks: RemoveNoncanonical""" + + out_sam_strand_field: str = "" + """or for cufflinks: intronMotif""" + + transcriptome: bool = False + """true to output transcript coordinate bam for RSEM""" + + trim_adapters: bool = False + mask_duplicates: bool = False + include_unmapped: bool = True + + @model_validator(mode="after") + def ensure_star_index_files_exist(self): + full_path = self.path_index + # a lot of files should be in this dir, just test these + for indfile in ("Genome", "SA", "SAindex"): + expected_path = os.path.join(full_path, indfile) + if not os.path.exists(expected_path): # pragma: no cover + raise ValueError(f"Expected STAR index file {expected_path} does not exist!") + return self + + +class Strand(enum.IntEnum): + UNKNOWN = -1 + INFER = 0 + UNSTRANDED = 0 + FORWARD = 1 + REVERSE = 2 + + +class Strandedness(SnappyModel): + path_exon_bed: str + """Location of usually highly expressed genes.
Known protein coding genes is a good choice""" + + strand: Strand = Strand.UNKNOWN + """-1: unknown value, use infer_, 0: unstranded, 1: forward, 2: reverse (from featurecounts)""" + + threshold: float = 0.85 + """Minimum proportion of reads mapped to forward/reverse direction to call the protocol""" + + +class Minimap2(SnappyModel): + mapping_threads: int = 16 + + +class Mbcs(SnappyModel): + mapping_tool: DnaMapper + use_barcodes: bool + recalibrate: bool + + +class NgsMapping(SnappyStepModel): + tools: Tools + """Aligners to use for the different NGS library types""" + + path_link_in: str = "" + """OPTIONAL Override data set configuration search paths for FASTQ files""" + + target_coverage_report: TargetCoverageReport | None = None + """Thresholds for targeted sequencing coverage QC.""" + + bam_collect_doc: BamCollectDoc = BamCollectDoc() + """Depth of coverage collection, mainly useful for genomes.""" + + ngs_chew_fingerprint: NgsChewFingerprint = NgsChewFingerprint() + """Compute fingerprints with ngs-chew""" + + bwa: Bwa | None = None + """Configuration for BWA""" + + bwa_mem2: BwaMem2 | None = None + """Configuration for BWA-MEM2""" + + somatic: Somatic | None = None + """ + Configuration for somatic ngs_calling + (separate read groups, molecular barcodes & base quality recalibration) + """ + + bqsr: Bqsr | None = None + + agent: Agent | None = None + + star: Star | None = None + """Configuration for STAR""" + + strandedness: Strandedness | None = None + + minimap2: Minimap2 | None = None + + mbcs: Mbcs | None = None + + @model_validator(mode="after") + def ensure_tools_are_configured(self): + for data_type in ("dna", "rna", "dna_long"): + tool_list = getattr(self.tools, data_type) + for tool in tool_list: + if not getattr(self, tool): + raise ValueError(f"Tool {tool} not configured") + return self + + @model_validator(mode="after") + def check_mbcs_prerequisites(self): + if self.mbcs: + tool = self.mbcs.mapping_tool + if not getattr(self, str(tool)): + raise ValueError(f"Tool {tool} not configured") + + if self.mbcs.use_barcodes: + if not self.agent: + raise ValueError("Agent configuration required for MBCS") + + if self.mbcs.recalibrate: + if not self.bqsr: + raise ValueError("BQSR configuration required for MBCS") + return self diff --git a/snappy_pipeline/workflows/panel_of_normals/__init__.py b/snappy_pipeline/workflows/panel_of_normals/__init__.py index 9f3f84271..95f336009 100644 --- a/snappy_pipeline/workflows/panel_of_normals/__init__.py +++ b/snappy_pipeline/workflows/panel_of_normals/__init__.py @@ -148,7 +148,6 @@ """ from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions - from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -158,73 +157,15 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import PanelOfNormals as PanelOfNormalsConfigModel + __author__ = "Manuel Holtgrewe " #: Names of the tools that might use panel of normals TOOLS = ("mutect2", "cnvkit", "access", "purecn") #: Default configuration for the somatic_variant_calling schema -DEFAULT_CONFIG = r""" -# Default configuration somatic_variant_calling -step_config: - panel_of_normals: - tools: ['mutect2'] # REQUIRED - available: 'mutect2' - path_ngs_mapping: ../ngs_mapping # REQUIRED - ignore_chroms: [] # patterns of chromosome names to ignore - # hs37d5: [NC_007605, hs37d5, chrEBV, '*_decoy', 'HLA-*', 'GL000220.*'] - # GRCh38.d1.vd1: [chrEBV, 'HPV*', CMV, HBV, 'HCV-*', 'HIV-*', KSHV, 'HTLV-1', 
MCV, '*_decoy', 'chrUn_GL00220*', SV40] - # Configuration for mutect2 - mutect2: - path_normals_list: null # Optional file listing libraries to include in panel - germline_resource: REQUIRED - # Java options - java_options: ' -Xmx16g ' - # Parallelization configuration - num_cores: 2 # number of cores to use locally - window_length: 100000000 # split input into windows of this size, each triggers a job - num_jobs: 500 # number of windows to process in parallel - use_profile: true # use Snakemake profile for parallel processing - restart_times: 5 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 2 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - debug_trunc_tokens: 0 # truncation to first N tokens (0 for none) - keep_tmpdir: never # keep temporary directory, {always, never, onerror} - job_mult_memory: 1 # memory multiplier - job_mult_time: 1 # running time multiplier - merge_mult_memory: 1 # memory multiplier for merging - merge_mult_time: 1 # running time multiplier for merging - cnvkit: - path_normals_list: "" # Optional file listing libraries to include in panel - path_target_regions: "" # Bed files of targetted regions (Missing when creating a panel of normals for WGS data) - access: "" # Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run) - annotate: "" # [target] Optional targets annotations - target_avg_size: 0 # [target] Average size of split target bins (0: use default value) - bp_per_bin: 50000 # [autobin] Expected base per bin - split: True # [target] Split large intervals into smaller ones - antitarget_avg_size: 0 # [antitarget] Average size of antitarget bins (0: use default value) - min_size: 0 # [antitarget] Min size of antitarget bins (0: use default value) - min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth - count: False # [coverage] Alternative couting algorithm - min_cluster_size: 0 # [reference] Minimum cluster size to keep in reference profiles. 0 for no clustering - gender: "" # [reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing - male_reference: False # [reference & sex] Create male reference - gc_correction: True # [reference] Use GC correction - edge_correction: True # [reference] Use edge correction - rmask_correction: True # [reference] Use rmask correction - drop_low_coverage: False # [metrics] Drop very-low-coverage bins before calculations - access: # Creates access file for cnvkit, based on genomic sequence & excluded regions (optionally) - exclude: [] # [access] Bed file of regions to exclude (mappability, blacklisted, ...) - min_gap_size: 0 # [access] Minimum gap size between accessible sequence regions (0: use default value) - purecn: - path_normals_list: "" # Optional file listing libraries to include in panel - path_bait_regions: REQUIRED # Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), recommended by PureCN author - path_genomicsDB: REQUIRED # Mutect2 genomicsDB created during panel_of_normals - genome_name: "unknown" # Must be one from hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3 - enrichment_kit_name: "unknown" # For filename only... 
- mappability: "" # GRCh38: /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw - reptiming: "" # Nothing for GRCh38 - seed: 1234567 -""" +DEFAULT_CONFIG = PanelOfNormalsConfigModel.default_config_yaml_string() class PanelOfNormalsStepPart(BaseStepPart): @@ -241,11 +182,12 @@ def __init__(self, parent): super().__init__(parent) # Build shortcut from cancer bio sample name to matched cancer sample self.normal_libraries = list(self._get_normal_libraries()) - if self.name and self.config[self.name].get("path_normals_list"): - self.normal_libraries = [] - with open(self.config[self.name]["path_normals_list"], "rt") as f: - for line in f: - self.normal_libraries.append(line.strip()) + if self.name and (cfg := self.config.get(self.name)): + if path := cfg.get("path_normals_list"): + self.normal_libraries = [] + with open(path, "rt") as f: + for line in f: + self.normal_libraries.append(line.strip()) def _get_normal_libraries(self): for sheet in self.parent.shortcut_sheets: @@ -321,9 +263,12 @@ def get_input_files(self, action): @dictify def _get_input_files_coverage(self, wildcards): yield "container", "work/containers/out/purecn.simg" - yield "intervals", "work/purecn/out/{}_{}.list".format( - self.config["purecn"]["enrichment_kit_name"], - self.config["purecn"]["genome_name"], + yield ( + "intervals", + "work/purecn/out/{}_{}.list".format( + self.config.purecn.enrichment_kit_name, + self.config.purecn.genome_name, + ), ) tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam" yield "bam", self.ngs_mapping(tpl.format(**wildcards)) @@ -332,9 +277,13 @@ def _get_input_files_coverage(self, wildcards): def _get_input_files_create(self, wildcards): yield "container", "work/containers/out/purecn.simg" tpl = "work/{mapper}.purecn/out/{mapper}.purecn.{library_name}_coverage_loess.txt.gz" - yield "normals", [ - tpl.format(mapper=wildcards.mapper, library_name=lib) for lib in self.normal_libraries - ] + yield ( + "normals", + [ + tpl.format(mapper=wildcards.mapper, library_name=lib) + for lib in self.normal_libraries + ], + ) def get_output_files(self, action): self._validate_action(action) @@ -343,8 +292,8 @@ def get_output_files(self, action): return {"container": "work/containers/out/purecn.simg"} if action == "prepare": base_out = "{}_{}".format( - self.config["purecn"]["enrichment_kit_name"], - self.config["purecn"]["genome_name"], + self.config.purecn.enrichment_kit_name, + self.config.purecn.genome_name, ) return { "intervals": "work/purecn/out/" + base_out + ".list", @@ -373,8 +322,8 @@ def get_log_file(self, action): tpls = { "install": "work/containers/log/purecn", "prepare": "work/purecn/log/{}_{}".format( - self.config["purecn"]["enrichment_kit_name"], - self.config["purecn"]["genome_name"], + self.config.purecn.enrichment_kit_name, + self.config.purecn.genome_name, ), "coverage": "work/{mapper}.purecn/log/{mapper}.purecn.{library_name,.+-DNA[0-9]+-WES[0-9]+}", "create_panel": "work/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals", @@ -382,14 +331,6 @@ def get_log_file(self, action): assert action in self.actions return self._get_log_file(tpls[action]) - def check_config(self): - if self.name not in self.config["tools"]: - return # PureCN not enabled, skip - self.parent.ensure_w_config( - ("step_config", "panel_of_normals", self.name, "path_bait_regions"), - "Path to exome panel bait regions not defined for tool {}".format(self.name), - ) - class Mutect2StepPart(PanelOfNormalsStepPart): """Somatic variant calling with MuTect 2""" @@ 
-415,7 +356,7 @@ class Mutect2StepPart(PanelOfNormalsStepPart): } def check_config(self): - if self.name not in self.config["tools"]: + if self.name not in self.config.tools: return # Mutect not enabled, skip self.parent.ensure_w_config( ("static_data_config", "reference", "path"), @@ -525,10 +466,10 @@ class CnvkitStepPart(PanelOfNormalsStepPart): def __init__(self, parent): super().__init__(parent) - self.is_wgs = self.config["cnvkit"]["path_target_regions"] == "" + self.is_wgs = self.config.cnvkit.path_target_regions == "" def check_config(self): - if self.name not in self.config["tools"]: + if self.name not in self.config.tools: return # cnvkit not enabled, skip self.parent.ensure_w_config( ("static_data_config", "reference", "path"), @@ -610,12 +551,16 @@ def _get_input_files_create_panel(self, wildcards): for ext in ("log", "conda_list.txt", "conda_info.txt") ] return { - "target": targets - if targets - else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards), - "antitarget": antitargets - if antitargets - else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format(**wildcards), + "target": ( + targets + if targets + else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.target.bed".format(**wildcards) + ), + "antitarget": ( + antitargets + if antitargets + else "work/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.bed".format(**wildcards) + ), "logs": logs if targets or antitargets else [], } @@ -714,7 +659,7 @@ class AccessStepPart(PanelOfNormalsStepPart): name = "access" actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: # Validate action self._validate_action(action) return ResourceUsage( @@ -766,10 +711,11 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=PanelOfNormalsConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Register sub step classes so the sub steps are available self.register_sub_step_classes( ( @@ -798,7 +744,7 @@ def get_result_files(self): "conda_info.txt.md5", ] - if "mutect2" in set(self.config["tools"]) & set(TOOLS): + if "mutect2" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.mutect2/out/{mapper}.mutect2.panel_of_normals.{ext}" ext_list = ("vcf.gz", "vcf.gz.md5", "vcf.gz.tbi", "vcf.gz.tbi.md5") result_files.extend(self._expand_result_files(tpl, ext_list)) @@ -808,7 +754,7 @@ def get_result_files(self): tpl = "output/{mapper}.mutect2/log/{mapper}.mutect2.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) - if "cnvkit" in set(self.config["tools"]) & set(TOOLS): + if "cnvkit" in set(self.config.tools) & set(TOOLS): tpls = [ ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.target.{ext}", ("bed", "bed.md5")), ("output/{mapper}.cnvkit/out/{mapper}.cnvkit.antitarget.{ext}", ("bed", "bed.md5")), @@ -838,13 +784,13 @@ def get_result_files(self): tpl = "output/{mapper}.cnvkit/log/{mapper}.cnvkit.merged.tar.gz{ext}" result_files.extend(self._expand_result_files(tpl, ("", ".md5"))) - if "access" in set(self.config["tools"]) & set(TOOLS): + if "access" in set(self.config.tools) & set(TOOLS): tpl = "output/cnvkit.access/out/cnvkit.access.bed" result_files.extend([tpl + md5 for md5 in ("", ".md5")]) tpl = 
"output/cnvkit.access/log/cnvkit.access.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) - if "purecn" in set(self.config["tools"]) & set(TOOLS): + if "purecn" in set(self.config.tools) & set(TOOLS): tpl = "output/{mapper}.purecn/out/{mapper}.purecn.panel_of_normals.{ext}" ext_list = ("rds", "rds.md5") result_files.extend(self._expand_result_files(tpl, ext_list)) @@ -854,27 +800,20 @@ def get_result_files(self): tpl = "output/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals.{ext}" result_files.extend(self._expand_result_files(tpl, log_ext_list)) tpl = "output/purecn/out/{}_{}.{{ext}}".format( - self.config["purecn"]["enrichment_kit_name"], - self.config["purecn"]["genome_name"], + self.config.purecn.enrichment_kit_name, + self.config.purecn.genome_name, ) ext_list = ("list", "list.md5", "bed.gz", "bed.gz.md5", "bed.gz.tbi", "bed.gz.tbi.md5") result_files.extend(self._expand_result_files(tpl, ext_list)) tpl = "output/purecn/log/{}_{}.{{ext}}".format( - self.config["purecn"]["enrichment_kit_name"], - self.config["purecn"]["genome_name"], + self.config.purecn.enrichment_kit_name, + self.config.purecn.genome_name, ) result_files.extend(self._expand_result_files(tpl, log_ext_list)) return result_files def _expand_result_files(self, tpl, ext_list): - for mapper in self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"]: + for mapper in self.w_config.step_config["ngs_mapping"].tools.dna: for ext in ext_list: yield tpl.format(mapper=mapper, ext=ext) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "panel_of_normals", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for somatic variant calling", - ) diff --git a/snappy_pipeline/workflows/panel_of_normals/model.py b/snappy_pipeline/workflows/panel_of_normals/model.py new file mode 100644 index 000000000..ec68233e3 --- /dev/null +++ b/snappy_pipeline/workflows/panel_of_normals/model.py @@ -0,0 +1,212 @@ +import enum +from typing import Annotated, Literal + +from pydantic import Field + +from snappy_pipeline.models import EnumField, KeepTmpdir, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + mutect2 = "mutect2" + cnvkit = "cnvkit" + purecn = "purecn" + access = "access" + + +class Mutect2(SnappyModel): + path_normals_list: str = "" + + germline_resource: str + + java_options: str = " -Xmx16g" + + num_cores: int = 2 + """number of cores to use locally""" + + window_length: int = 100000000 + """split input into windows of this size, each triggers a job""" + + num_jobs: int = 500 + """number of windows to process in parallel""" + + use_profile: bool = True + """use Snakemake profile for parallel processing""" + + restart_times: int = 5 + """number of times to re-launch jobs in case of failure""" + + max_jobs_per_second: int = 2 + """throttling of job creation""" + + max_status_checks_per_second: int = 10 + """throttling of status checks""" + + debug_trunc_tokens: int = 0 + """truncation to first N tokens (0 for none)""" + + keep_tmpdir: KeepTmpdir = KeepTmpdir.never + """keep temporary directory, {always, never, onerror}""" + + job_mult_memory: float = 1 + """memory multiplier""" + + job_mult_time: float = 1 + """running time multiplier""" + + merge_mult_memory: float = 1 + """memory multiplier for merging""" + + merge_mult_time: float = 1 + """running time multiplier for merging""" + + +class CnvKit(SnappyModel): + path_normals_list: str = "" + """Optional file listing libraries to include in 
panel""" + + path_target_regions: str = "" + """Bed files of targetted regions (Missing when creating a panel of normals for WGS data)""" + + access: str = "" + """Access bed file (output/cnvkit.access/out/cnvkit.access.bed when create_cnvkit_acces was run)""" + + annotate: str = "" + """[target] Optional targets annotations""" + + target_avg_size: int = 0 + """[target] Average size of split target bins (0: use default value)""" + + bp_per_bin: int = 50000 + """[autobin] Expected base per bin""" + + split: bool = True + """[target] Split large intervals into smaller ones""" + + antitarget_avg_size: int = 0 + """[antitarget] Average size of antitarget bins (0: use default value)""" + + min_size: int = 0 + """[antitarget] Min size of antitarget bins (0: use default value)""" + + min_mapq: int = 0 + """[coverage] Mininum mapping quality score to count a read for coverage depth""" + + count: bool = False + """[coverage] Alternative couting algorithm""" + + min_cluster_size: int = 0 + """[reference] Minimum cluster size to keep in reference profiles. 0 for no clustering""" + + gender: str = "" + """[reference] Specify the chromosomal sex of all given samples as male or female. Guess when missing""" + + male_reference: bool = False + """[reference & sex] Create male reference""" + + gc_correction: bool = True + """[reference] Use GC correction""" + + edge_correction: bool = True + """[reference] Use edge correction""" + + rmask_correction: bool = True + """[reference] Use rmask correction""" + + drop_low_coverage: bool = False + """[metrics] Drop very-low-coverage bins before calculations""" + + +class Access(SnappyModel): + """Creates access file for cnvkit, based on genomic sequence & excluded regions (optionally)""" + + exclude: list[str] = [] + """[access] Bed file of regions to exclude (mappability, blacklisted, ...)""" + + min_gap_size: int = 0 + """[access] Minimum gap size between accessible sequence regions (0: use default value)""" + + +class GenomeName(enum.StrEnum): + hg18 = "hg18" + hg19 = "hg19" + hg38 = "hg38" + mm9 = "mm9" + mm10 = "mm10" + rn4 = "rn4" + rn5 = "rn5" + rn6 = "rn6" + canFam3 = "canFam3" + + +class PureCn(SnappyModel): + path_normals_list: str = "" + """Optional file listing libraries to include in panel""" + + path_bait_regions: str + """ + Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), + recommended by PureCN author + """ + + path_genomicsDB: str + """Mutect2 genomicsDB created during panel_of_normals""" + + genome_name: Annotated[ + GenomeName | Literal["unknown"], + EnumField(GenomeName, json_schema_extra={"options": {"unknown"}}), + ] = "unknown" + + enrichment_kit_name: str = "unknown" + """For filename only...""" + + mappability: str = "" + """ + GRCh38: + /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw + """ + + reptiming: str = "" + """Nothing for GRCh38""" + + seed: int = 1234567 + + +class PanelOfNormals(SnappyStepModel, validators.ToolsMixin): + tools: Annotated[list[Tool], EnumField(Tool, [Tool.mutect2], min_length=1)] + + path_ngs_mapping: str = "../ngs_mapping" + + ignore_chroms: Annotated[ + list[str], + Field( + examples=[ + "NC_007605", + "hs37d5", + "chrEBV", + "*_decoy", + "HLA-*", + "GL000220.*", + "chrEBV", + "HPV*", + "CMV", + "HBV", + "HCV-*", + "HIV-*", + "KSHV", + "HTLV-1", + "MCV", + "*_decoy", + "chrUn_GL00220*", + "SV40", + ] + ), + ] = [] + """Patterns of contig names to ignore""" + + mutect2: Mutect2 | None = None + + cnvkit: CnvKit | None = None + + access: 
Access = Access() + + purecn: PureCn | None = None diff --git a/snappy_pipeline/workflows/repeat_expansion/__init__.py b/snappy_pipeline/workflows/repeat_expansion/__init__.py index 019bcd279..3145a32d9 100644 --- a/snappy_pipeline/workflows/repeat_expansion/__init__.py +++ b/snappy_pipeline/workflows/repeat_expansion/__init__.py @@ -77,12 +77,13 @@ Not available. """ + from collections import OrderedDict import os -from biomedsheets.shortcuts import KEY_SEX, GermlineCaseSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import KEY_SEX, GermlineCaseSheet, is_not_background from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart @@ -91,22 +92,14 @@ AnnotateExpansionHunter, ) +from .model import RepeatExpansion as RepeatExpansionConfigModel + #: Extensions of files to create as main payload - JSON. EXT_JSON = (".json", ".json.md5") #: Extensions of files to create as main payload - VCF. EXT_VCF = (".vcf", ".vcf.md5") #: Default configuration for the repeat_expansion step. -DEFAULT_CONFIG = r""" -# Default configuration repeat_expansion -step_config: - repeat_expansion: - # Repeat expansions definitions - used in ExpansionHunter call - repeat_catalog: REQUIRED - # Repeat expansions annotations, e.g., normality range - custom file - repeat_annotation: REQUIRED - # Path to the ngs_mapping step - path_ngs_mapping: ../ngs_mapping -""" +DEFAULT_CONFIG = RepeatExpansionConfigModel.default_config_yaml_string() class ExpansionHunterStepPart(BaseStepPart): @@ -233,8 +226,11 @@ def _get_output_files_run(): ext_dict = {"json": "json", "vcf": "vcf", "vcf_md5": "vcf.md5"} # Yield for key, ext in ext_dict.items(): - yield key, "work/{name_pattern}/out/{name_pattern}.{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + key, + "work/{name_pattern}/out/{name_pattern}.{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) @staticmethod @@ -246,8 +242,11 @@ def _get_output_files_annotate(): ext_dict = {"json": "json", "json_md5": "json.md5"} # Yield for key, ext in ext_dict.items(): - yield key, "work/{name_pattern}/out/{name_pattern}.{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + key, + "work/{name_pattern}/out/{name_pattern}.{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) @staticmethod @@ -297,7 +296,7 @@ def annotate_results(self, _wildcards, sm_input, sm_output): # Annotate AnnotateExpansionHunter( eh_json=input_path, - annotation_json=self.config["repeat_annotation"], + annotation_json=self.config.repeat_annotation, output_path=output_path, ).run() @@ -318,12 +317,13 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=RepeatExpansionConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((LinkOutStepPart, ExpansionHunterStepPart)) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @classmethod def default_config_yaml(cls): @@ -349,7 +349,7 @@ def get_result_files(self): name_pattern = "{mapper}.{tool}_annotated.{donor.dna_ngs_library.name}" yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - 
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, tool=tools, ext=EXT_JSON, ) @@ -357,7 +357,7 @@ def get_result_files(self): name_pattern = "{mapper}.{tool}.{donor.dna_ngs_library.name}" yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, tool=tools, ext=EXT_VCF, ) @@ -370,11 +370,6 @@ def _yield_result_files(self, tpl, **kwargs): def check_config(self): """Check that the necessary configuration is available for the step""" - # Requires path to ngs_mapping output, i.e., the BAM files - self.ensure_w_config( - config_keys=("step_config", "repeat_expansion", "path_ngs_mapping"), - msg="Path to NGS mapping not configured but required for repeat expansion analysis.", - ) # Requires path to reference genome FASTA self.ensure_w_config( config_keys=("static_data_config", "reference", "path"), diff --git a/snappy_pipeline/workflows/repeat_expansion/model.py b/snappy_pipeline/workflows/repeat_expansion/model.py new file mode 100644 index 000000000..be5ba5215 --- /dev/null +++ b/snappy_pipeline/workflows/repeat_expansion/model.py @@ -0,0 +1,12 @@ +from snappy_pipeline.models import SnappyStepModel + + +class RepeatExpansion(SnappyStepModel): + repeat_catalog: str + """Repeat expansions definitions - used in ExpansionHunter call""" + + repeat_annotation: str + """Repeat expansions annotations, e.g., normality range - custom file""" + + path_ngs_mapping: str = "../ngs_mapping" + """Path to the ngs_mapping step""" diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile index f670a39a4..2f6e8b7fe 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile +++ b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile @@ -92,7 +92,7 @@ rule somatic_cnv_checking_pileup_tumor: # Add CNV status at the locii ------------------------------------------------- -if config["step_config"]["somatic_cnv_checking"]["path_cnv_calling"]: +if wf.w_config.step_config["somatic_cnv_checking"].path_cnv_calling: rule somatic_cnv_checking_cnv_run: input: diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py index 521807961..e5e7624c6 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py @@ -60,9 +60,9 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.base import InvalidConfiguration from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -77,6 +77,8 @@ ) from snappy_pipeline.workflows.somatic_wgs_cnv_calling import SomaticWgsCnvCallingWorkflow +from .model import SomaticCnvChecking as SomaticCnvCheckingConfigModel + __author__ = "Eric Blanc " #: Extensions of files to create as main payload @@ -86,18 +88,7 @@ EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") #: Default configuration for the somatic_cnv_checking schema -DEFAULT_CONFIG = r""" -# Default configuration somatic_cnv_checking -step_config: - somatic_cnv_checking: - path_ngs_mapping: ../ngs_mapping # 
REQUIRED - path_cnv_calling: "" # Can use for instance ../somatic_targeted_seq_cnv_calling - cnv_assay_type: "" # Empty: no CNV, WES for somatic_targeted_seq_snv_calling step, WGS for somatic_wgs_cnv_calling step - excluded_regions: "" # Bed file of regions to be excluded - max_depth: 10000 # Max depth for pileups - min_depth: 20 # Minimum depth for reference and alternative alleles to consider variant - min_baf: 0.4 # Maximum BAF to consider variant as heterozygous (between 0 & 1/2) -""" +DEFAULT_CONFIG = SomaticCnvCheckingConfigModel.default_config_yaml_string() class SomaticCnvCheckingStepPart(BaseStepPart): @@ -177,7 +168,7 @@ def get_log_file(self, action): "work/{mapper}.{library_name}/log/{mapper}.{library_name}." + action ) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: # Validate action self._validate_action(action) return ResourceUsage( @@ -301,25 +292,26 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - ( + config_model_class=SomaticCnvCheckingConfigModel, + previous_steps=( SomaticTargetedSeqCnvCallingWorkflow, SomaticWgsCnvCallingWorkflow, NgsMappingWorkflow, ), ) - if self.config["path_cnv_calling"] and self.config["cnv_assay_type"]: - if self.config["cnv_assay_type"] == "WES": + if self.config.path_cnv_calling and self.config.cnv_assay_type: + if self.config.cnv_assay_type == "WES": cnv_calling = "somatic_targeted_seq_cnv_calling" - elif self.config["cnv_assay_type"] == "WES": + elif self.config.cnv_assay_type == "WGS": cnv_calling = "somatic_wgs_cnv_calling" else: raise InvalidConfiguration( "Illegal cnv_assay_type {}, must be either WES or WGS".format( - self.config["cnv_assay_type"] + self.config.cnv_assay_type ) ) - self.register_sub_workflow(cnv_calling, self.config["path_cnv_calling"], "cnv_calling") - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow(cnv_calling, self.config.path_cnv_calling, "cnv_calling") + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Register sub step classes so the sub steps are available self.register_sub_step_classes( ( @@ -359,24 +351,24 @@ def get_result_files(self): ext = ("log", "conda_info.txt", "conda_list.txt") yield from expand( os.path.join("output", name_pattern, "log", name_pattern + ".normal.{ext}{chksum}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, library_name=set(self.tumor_to_normal.values()), ext=ext, chksum=chksum, ) yield from expand( os.path.join("output", name_pattern, "log", name_pattern + ".tumor.{ext}{chksum}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, library_name=self.tumor_to_normal.keys(), ext=ext, chksum=chksum, ) # Main result: vcf & optionally segment table if CNV available ext = {"out": [".vcf.gz", ".vcf.gz.tbi"]} - if self.config["path_cnv_calling"]: + if self.config.path_cnv_calling: # CNV available name_pattern = "{mapper}.{caller}.{library_name}" - callers = self.w_config["step_config"]["somatic_targeted_seq_cnv_calling"]["tools"] + callers = self.w_config.step_config["somatic_targeted_seq_cnv_calling"].tools ext["out"] += [".tsv"] ext["report"] = (".cnv.pdf", ".locus.pdf", ".segment.pdf") ext["log"] = [ @@ -390,18 +382,9 @@ def get_result_files(self): for subdir, exts in ext.items(): yield from expand(
os.path.join("output", name_pattern, subdir, name_pattern + "{ext}{chksum}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller=callers, library_name=self.tumor_to_normal.keys(), ext=exts, chksum=chksum, ) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "somatic_cnv_checking", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for somatic variant calling", - ) - if self.config["path_cnv_calling"]: - assert self.config["cnv_assay_type"] in ("WES", "WGS") diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/model.py b/snappy_pipeline/workflows/somatic_cnv_checking/model.py new file mode 100644 index 000000000..f7ec574a5 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_cnv_checking/model.py @@ -0,0 +1,42 @@ +import enum +from typing import Annotated + +from pydantic import Field, model_validator + +from snappy_pipeline.models import SnappyStepModel + + +class CnvAssayType(enum.StrEnum): + WES = "WES" + WGS = "WGS" + + +class SomaticCnvChecking(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + path_cnv_calling: Annotated[str, Field(examples=["../somatic_targeted_seq_cnv_calling"])] = "" + + cnv_assay_type: CnvAssayType | None = None + """ + Empty: no CNV, + WES for somatic_targeted_seq_snv_calling step, + WGS for somatic_wgs_cnv_calling step + """ + + excluded_regions: str = "" + """Bed file of regions to be excluded""" + + max_depth: int = 10000 + """Max depth for pileups""" + + min_depth: int = 20 + """Minimum depth for reference and alternative alleles to consider variant""" + + min_baf: Annotated[float, Field(0.4, ge=0, le=0.5)] + """Maximum BAF to consider variant as heterozygous (between 0 & 1/2)""" + + @model_validator(mode="after") + def ensure_cnv_assay_type_is_specified(self): + if self.path_cnv_calling and not self.cnv_assay_type: + raise ValueError("CNV assay type must be specified") + return self diff --git a/snappy_pipeline/workflows/somatic_gene_fusion_calling/Snakefile b/snappy_pipeline/workflows/somatic_gene_fusion_calling/Snakefile index ceff7a7c7..6442fe1b9 100644 --- a/snappy_pipeline/workflows/somatic_gene_fusion_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_gene_fusion_calling/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_gene_fusion_calling import SomaticGeneFusionCallingWorkflow +from snappy_pipeline.workflows.somatic_gene_fusion_calling import ( + SomaticGeneFusionCallingWorkflow, +) __author__ = "Manuel Holtgrewe " diff --git a/snappy_pipeline/workflows/somatic_gene_fusion_calling/__init__.py b/snappy_pipeline/workflows/somatic_gene_fusion_calling/__init__.py index 0349a4d43..3e8c33265 100644 --- a/snappy_pipeline/workflows/somatic_gene_fusion_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_gene_fusion_calling/__init__.py @@ -50,10 +50,9 @@ import os -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions from snakemake.io import touch -from snappy_pipeline.base import InvalidConfiguration +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -65,6 +64,8 @@ get_ngs_library_folder_name, ) +from .model import SomaticGeneFusionCalling as SomaticGeneFusionCallingConfigModel + __author__ = "Manuel Holtgrewe " #: HLA 
typing tools @@ -79,51 +80,7 @@ ) #: Default configuration for the somatic_gene_fusion_calling step -DEFAULT_CONFIG = r""" -step_config: - somatic_gene_fusion_calling: - path_link_in: "" # OPTIONAL Override data set configuration search paths for FASTQ files - tools: ['fusioncatcher', 'jaffa', 'arriba', 'defuse', 'hera', 'pizzly', 'star_fusion'] # REQUIRED, available: 'fusioncatcher', 'jaffa', 'arriba', 'defuse', 'hera', 'pizzly', 'star_fusion'. - fusioncatcher: - data_dir: REQUIRED # REQUIRED - configuration: null # optional - num_threads: 16 - pizzly: - kallisto_index: REQUIRED # REQUIRED - transcripts_fasta: REQUIRED # REQUIRED - annotations_gtf: REQUIRED # REQUIRED - kmer_size: 31 - hera: - path_index: REQUIRED # REQUIRED - path_genome: REQUIRED # REQUIRED - star_fusion: - path_ctat_resource_lib: REQUIRED - defuse: - path_dataset_directory: REQUIRED - arriba: - path_index: REQUIRED # REQUIRED STAR path index (preferably 2.7.10 or later) - blacklist: "" # optional (provided in the arriba distribution, see /fast/work/groups/cubi/projects/biotools/static_data/app_support/arriba/v2.3.0) - known_fusions: "" # optional - tags: "" # optional (can be set to the same path as known_fusions) - structural_variants: "" # optional - protein_domains: "" # optional - num_threads: 8 - trim_adapters: false - num_threads_trimming: 2 - star_parameters: - - " --outFilterMultimapNmax 50" - - " --peOverlapNbasesMin 10" - - " --alignSplicedMateMapLminOverLmate 0.5" - - " --alignSJstitchMismatchNmax 5 -1 5 5" - - " --chimSegmentMin 10" - - " --chimOutType WithinBAM HardClip" - - " --chimJunctionOverhangMin 10" - - " --chimScoreDropMax 30" - - " --chimScoreJunctionNonGTAG 0" - - " --chimScoreSeparation 1" - - " --chimSegmentReadGapMax 3" - - " --chimMultimapNmax 50" -""".lstrip() +DEFAULT_CONFIG = SomaticGeneFusionCallingConfigModel.default_config_yaml_string() class SomaticGeneFusionCallingStepPart(BaseStepPart): @@ -141,7 +98,7 @@ def __init__(self, parent): self.parent.work_dir, self.parent.data_set_infos, self.parent.config_lookup_paths, - preprocessed_path=self.config["path_link_in"], + preprocessed_path=self.config.path_link_in, ) @dictify @@ -172,7 +129,7 @@ def _collect_reads(self, wildcards, library_name, prefix): Yields paths to right reads if prefix=='right-' """ folder_name = get_ngs_library_folder_name(self.parent.sheets, wildcards.library_name) - if self.config["path_link_in"]: + if self.config.path_link_in: folder_name = library_name pattern_set_keys = ("right",) if prefix.startswith("right-") else ("left",) for _, path_infix, filename in self.path_gen.run(folder_name, pattern_set_keys): @@ -205,7 +162,7 @@ def args_function(wildcards): assert action == "run", "Unsupported actions" return args_function - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -243,7 +200,7 @@ def args_function(wildcards): assert action == "run", "Unsupported actions" return args_function - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -281,7 +238,7 @@ def args_function(wildcards): assert action == "run", "Unsupported actions" return args_function - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -319,7 +276,7 @@ def args_function(wildcards): assert action == "run", "Unsupported actions" return args_function - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -357,7 +314,7 @@ def args_function(wildcards): assert action == "run", "Unsupported actions" return args_function - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -395,7 +352,7 @@ def args_function(wildcards): assert action == "run", "Unsupported actions" return args_function - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -418,32 +375,6 @@ class ArribaStepPart(SomaticGeneFusionCallingStepPart): #: Step name name = "arriba" - def check_config(self): - """Check parameters in configuration. - - Method checks that all parameters required to execute STAR & arriba are present in the - configuration. It further checks that the provided index has all the expected file - extensions. If invalid configuration, it raises InvalidConfiguration exception. - """ - # Check if tool is at all included in workflow - if self.__class__.name not in self.config["tools"]: - return # arriba not run, don't check configuration # pragma: no cover - - # Check required configuration settings present - self.parent.ensure_w_config( - config_keys=("step_config", "somatic_gene_fusion_calling", "arriba", "path_index"), - msg="Path to STAR indices is required", - ) - - # Check that the path to the STAR index is valid. - for fn in ("Genome", "SA", "SAindex"): - expected_path = self.config["arriba"]["path_index"] + "/" + fn - if not os.path.exists(expected_path): # pragma: no cover - tpl = "Expected STAR indices input path {expected_path} does not exist!".format( - expected_path=expected_path - ) - raise InvalidConfiguration(tpl) - def get_args(self, action): """Return function that maps wildcards to dict for input files""" @@ -496,7 +427,7 @@ def get_log_file(self, action): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -507,7 +438,7 @@ def get_resource_usage(self, action): # Validate action self._validate_action(action) return ResourceUsage( - threads=self.config["arriba"]["num_threads"], time="24:00:00", memory=f"{96 * 1024}M" + threads=self.config.arriba.num_threads, time="24:00:00", memory=f"{96 * 1024}M" ) # 1 day @@ -532,7 +463,14 @@ def default_config_yaml(cls): return DEFAULT_CONFIG def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): - super().__init__(workflow, config, config_lookup_paths, config_paths, workdir) + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + config_model_class=SomaticGeneFusionCallingConfigModel, + ) self.register_sub_step_classes( ( FusioncatcherStepPart, @@ -558,7 +496,7 @@ def get_result_files(self): library_names_list = list(self._get_all_rna_ngs_libraries()) # Get results name_pattern = "{fusion_caller}.{ngs_library}" - for fusion_caller in self.config["tools"]: + for fusion_caller in self.config.tools: for ngs_library in library_names_list: # Constant to all callers name_pattern_value = name_pattern.format( @@ -596,7 +534,3 @@ def _yield_arriba_files(self, ngs_library): for ext in ("Log.out", "Log.std.out", "Log.final.out", "SJ.out.tab"): yield tpl.format(library_name=ngs_library, ext=ext) yield tpl.format(library_name=ngs_library, ext=ext + ".md5") - - def check_config(self): - """Check that the required configurations are present.""" - # TODO: implement check for REQUIRED configurations. diff --git a/snappy_pipeline/workflows/somatic_gene_fusion_calling/model.py b/snappy_pipeline/workflows/somatic_gene_fusion_calling/model.py new file mode 100644 index 000000000..19214c1a1 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_gene_fusion_calling/model.py @@ -0,0 +1,130 @@ +import enum +import os +from typing import Annotated + +from pydantic import Field, model_validator + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + fusioncatcher = "fusioncatcher" + jaffa = "jaffa" + arriba = "arriba" + defuse = "defuse" + hera = "hera" + pizzly = "pizzly" + star_fusion = "star_fusion" + + +class Fusioncatcher(SnappyModel): + data_dir: str + configuration: str = "" + num_threads: int = 16 + + +class Pizzly(SnappyModel): + kallisto_index: str + transcripts_fasta: str + annotations_gtf: str + kmer_size: Annotated[int, Field(31, gt=0)] + + +class Hera(SnappyModel): + path_index: str + path_genome: str + + +class StarFusion(SnappyModel): + path_ctat_resource_lib: str + + +class Defuse(SnappyModel): + path_dataset_directory: str + + +class Jaffa(SnappyModel): # TODO + pass + + +class Arriba(SnappyModel): + path_index: str + """STAR path index (preferably 2.7.10 or later)""" + + blacklist: str = "" + """provided in the arriba distribution, see /fast/work/groups/cubi/projects/biotools/static_data/app_support/arriba/v2.3.0""" + + known_fusions: str = "" + + tags: str = "" + """can be set to the same path as known_fusions""" + + structural_variants: str = "" + + protein_domains: str = "" + + num_threads: int = 8 + + trim_adapters: bool = False + + num_threads_trimming: int = 2 + + star_parameters: list[str] = [ + " --outFilterMultimapNmax 50", + " --peOverlapNbasesMin 10", + " --alignSplicedMateMapLminOverLmate 0.5", + " --alignSJstitchMismatchNmax 5 -1 5 5", + " --chimSegmentMin 10", + " --chimOutType WithinBAM HardClip", + " --chimJunctionOverhangMin 10", + " --chimScoreDropMax 30", + " --chimScoreJunctionNonGTAG 0", + " 
--chimScoreSeparation 1", + " --chimSegmentReadGapMax 3", + " --chimMultimapNmax 50", + ] + + @model_validator(mode="after") + def ensure_star_index_files_exist(self): + full_path = self.path_index + # a lot of files should be in this dir, justtest these + for indfile in ("Genome", "SA", "SAindex"): + expected_path = os.path.join(full_path, indfile) + if not os.path.exists(expected_path): # pragma: no cover + raise ValueError(f"Expected STAR index file {expected_path} does not exist!") + return self + + +class SomaticGeneFusionCalling(SnappyStepModel, validators.ToolsMixin): + path_link_in: str = "" + """Override data set configuration search paths for FASTQ files""" + + tools: Annotated[ + list[Tool], + EnumField( + Tool, + [ + Tool.fusioncatcher, + Tool.jaffa, + Tool.arriba, + Tool.defuse, + Tool.hera, + Tool.pizzly, + Tool.star_fusion, + ], + ), + ] + + fusioncatcher: Fusioncatcher | None = None + + jaffa: Jaffa | None = None + + arriba: Arriba | None = None + + defuse: Defuse | None = None + + hera: Hera | None = None + + pizzly: Pizzly | None = None + + star_fusion: StarFusion | None = None diff --git a/snappy_pipeline/workflows/somatic_hla_loh_calling/Snakefile b/snappy_pipeline/workflows/somatic_hla_loh_calling/Snakefile index 3f00c21b2..3f879d6f7 100644 --- a/snappy_pipeline/workflows/somatic_hla_loh_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_hla_loh_calling/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_hla_loh_calling import SomaticHlaLohCallingWorkflow +from snappy_pipeline.workflows.somatic_hla_loh_calling import ( + SomaticHlaLohCallingWorkflow, +) __author__ = "Clemens Messerschmidt" diff --git a/snappy_pipeline/workflows/somatic_hla_loh_calling/__init__.py b/snappy_pipeline/workflows/somatic_hla_loh_calling/__init__.py index f01b24cf5..f9070d31c 100644 --- a/snappy_pipeline/workflows/somatic_hla_loh_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_hla_loh_calling/__init__.py @@ -25,24 +25,19 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import SomaticHlaLohCalling as SomaticHlaLohCallingConfigModel + __author__ = "Clemens Messerschmidt " #: Default configuration for the somatic_msi_calling step -DEFAULT_CONFIG = r""" -# Default configuration somatic_hla_loh_calling -step_config: - somatic_hla_loh_calling: - path_ngs_mapping: ../ngs_mapping # REQUIRED - path_hla_typing: ../hla_typing # REQUIRED - path_somatic_purity_ploidy: ../somatic_purity_ploidy_estimate # REQUIRED -""" +DEFAULT_CONFIG = SomaticHlaLohCallingConfigModel.default_config_yaml_string() class LohhlaStepPart(BaseStepPart): @@ -68,7 +63,6 @@ def __init__(self, parent): ) def get_input_files(self, action): - # Validate action self._validate_action(action) @@ -155,13 +149,14 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SomaticHlaLohCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available 
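The deleted ArribaStepPart.check_config above is replaced by the model validator in the new Arriba model: the STAR index is now checked when the configuration is parsed, not when the step part is constructed. A self-contained pydantic v2 sketch of the same idea, using a plain BaseModel instead of the pipeline's SnappyModel base:

import os

from pydantic import BaseModel, model_validator


class ArribaLikeConfig(BaseModel):
    # Illustrative stand-in for the Arriba model defined in
    # somatic_gene_fusion_calling/model.py above.
    path_index: str
    num_threads: int = 8

    @model_validator(mode="after")
    def ensure_star_index_files_exist(self):
        # Many files live in the index directory; just test these three,
        # exactly as the validator in the diff does.
        for indfile in ("Genome", "SA", "SAindex"):
            expected_path = os.path.join(self.path_index, indfile)
            if not os.path.exists(expected_path):
                raise ValueError(f"Expected STAR index file {expected_path} does not exist!")
        return self


# An invalid path now fails while the configuration is loaded, before any rule runs:
# ArribaLikeConfig(path_index="/path/to/star/index")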
self.register_sub_step_classes((LohhlaStepPart, LinkOutStepPart)) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) - self.register_sub_workflow("hla_typing", self.config["path_hla_typing"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) + self.register_sub_workflow("hla_typing", self.config.path_hla_typing) @listify def get_result_files(self): @@ -172,12 +167,12 @@ def get_result_files(self): name_pattern = "{mapper}.optitype.lohhla.{tumor_library.name}" yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, ext=".done", ) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, ext=( ".log", ".log.md5", @@ -209,10 +204,3 @@ def _yield_result_files_matched(self, tpl, **kwargs): yield from expand( tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs ) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "somatic_hla_loh_calling", "path_ngs_mapping"), - "Path to NGS mapping not configured but required.", - ) diff --git a/snappy_pipeline/workflows/somatic_hla_loh_calling/model.py b/snappy_pipeline/workflows/somatic_hla_loh_calling/model.py new file mode 100644 index 000000000..66bb5fe60 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_hla_loh_calling/model.py @@ -0,0 +1,9 @@ +from snappy_pipeline.models import SnappyStepModel + + +class SomaticHlaLohCalling(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + path_hla_typing: str + + path_somatic_purity_ploidy: str diff --git a/snappy_pipeline/workflows/somatic_msi_calling/__init__.py b/snappy_pipeline/workflows/somatic_msi_calling/__init__.py index a2831bc31..b581ddce9 100644 --- a/snappy_pipeline/workflows/somatic_msi_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_msi_calling/__init__.py @@ -52,9 +52,9 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -64,6 +64,8 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import SomaticMsiCalling as SomaticMsiCallingConfigModel + __author__ = "Clemens Messerschmidt " #: Extensions of files to create as main payload @@ -86,16 +88,7 @@ #: Default configuration for the somatic_msi_calling step -DEFAULT_CONFIG = r""" -# Default configuration somatic_msi_calling -step_config: - somatic_msi_calling: - tools: ['mantis_msi2'] # REQUIRED - available: 'mantis_msi2' - path_ngs_mapping: ../ngs_mapping # REQUIRED - loci_bed: "" # REQUIRED - # hg19: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed - # hg38: /fast/work/groups/cubi/projects/biotools/Mantis/appData/hg38/GRCh38.d1.vd1.all_loci.bed -""" +DEFAULT_CONFIG = SomaticMsiCallingConfigModel.default_config_yaml_string() class Mantis2StepPart(BaseStepPart): @@ -176,7 +169,7 @@ def _get_log_file(self, action): yield key, prefix + 
ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -218,27 +211,28 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SomaticMsiCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((Mantis2StepPart, LinkOutStepPart)) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @listify def get_result_files(self): """Return list of result files for the MSI calling workflow""" name_pattern = "{mapper}.{msi_caller}.{tumor_library.name}" - for msi_caller in set(self.config["tools"]) & set(MSI_CALLERS_MATCHED): + for msi_caller in set(self.config.tools) & set(MSI_CALLERS_MATCHED): yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, msi_caller=msi_caller, ext=EXT_MATCHED[msi_caller].values() if msi_caller in EXT_MATCHED else EXT_VALUES, ) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, msi_caller=msi_caller, ext=( ".log", @@ -278,7 +272,3 @@ def check_config(self): ("static_data_config", "reference", "path"), "Path to reference FASTA file not configured but required", ) - self.ensure_w_config( - ("step_config", "somatic_msi_calling", "loci_bed"), - "Path to bed file with microsatellite loci needed", - ) diff --git a/snappy_pipeline/workflows/somatic_msi_calling/model.py b/snappy_pipeline/workflows/somatic_msi_calling/model.py new file mode 100644 index 000000000..ad751d5f9 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_msi_calling/model.py @@ -0,0 +1,26 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyStepModel + + +class Tool(enum.StrEnum): + mantis_msi2 = "mantis_msi2" + + +class SomaticMsiCalling(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.mantis_msi2], min_length=1)] + + loci_bed: Annotated[ + str, + Field( + examples=[ + "/fast/groups/cubi/projects/biotools/Mantis/appData/hg19/loci.bed", + "/fast/work/groups/cubi/projects/biotools/Mantis/appData/hg38/GRCh38.d1.vd1.all_loci.bed", + ] + ), + ] diff --git a/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/Snakefile b/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/Snakefile index 4223221bb..69b6290bf 100644 --- a/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/Snakefile +++ b/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/Snakefile @@ -55,7 +55,7 @@ rule somatic_purity_ploidy_estimate_link_out_run: # Decide if WGS or WES rules should be used: -if config["step_config"]["somatic_purity_ploidy_estimate"]["tool_cnv_calling"] == "cnvetti": +if wf.w_config.step_config["somatic_purity_ploidy_estimate"].tool_cnv_calling == "cnvetti": ruleorder: 
somatic_purity_ploidy_estimate_ascat_cnv_tumor > somatic_purity_ploidy_estimate_ascat_cnv_tumor_wes ruleorder: somatic_purity_ploidy_estimate_ascat_cnv_normal > somatic_purity_ploidy_estimate_ascat_cnv_normal_wes diff --git a/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/__init__.py b/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/__init__.py index 055b4170f..4bca9da80 100644 --- a/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/__init__.py +++ b/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/__init__.py @@ -14,34 +14,22 @@ from collections import OrderedDict import os -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions from snakemake.io import touch +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from .model import SomaticPurityPloidyEstimate as SomaticPurityPloidyEstimateConfigModel + __author__ = "Manuel Holtgrewe " #: Tools for estimating purity and ploidy. PURITY_PLOIDY_TOOLS = "ascat" #: Default configuration for the somatic_gene_fusion_calling step -DEFAULT_CONFIG = r""" -step_config: - somatic_purity_ploidy_estimate: - tools: ['ascat'] # REQUIRED - available: 'ascat' - tool_cnv_calling: cnvetti - # Configuration with read mapper and path to mapping output. Will use this - # for generating a pileup using samtools for obtaining the b allele - # fraction and computing coverage. - tool_ngs_mapping: bwa - path_ngs_mapping: ../ngs_mapping - # Configuration of ASCAT method. - ascat: - # BED file with loci for B allele frequency. - b_af_loci: REQUIRED # REQUIRED -""".lstrip() +DEFAULT_CONFIG = SomaticPurityPloidyEstimateConfigModel.default_config_yaml_string() class AscatStepPart(BaseStepPart): @@ -274,7 +262,7 @@ def get_log_file(self, action): } return {"log": log_dict[action]} - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -318,18 +306,16 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SomaticPurityPloidyEstimateConfigModel, + previous_steps=(NgsMappingWorkflow,), ) self.register_sub_step_classes((AscatStepPart, LinkOutStepPart)) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) - # TODO: potential bug here as this step requires an entry that is not available - # in DEFAULT_CONFIG. - if self.config["tool_cnv_calling"] == "copywriter": - + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) + if self.config.tool_cnv_calling == "copywriter": self.register_sub_workflow( "somatic_targeted_seq_cnv_calling", - self.config["path_somatic_targeted_seq_cnv_calling"], + self.config.path_somatic_targeted_seq_cnv_calling, ) @listify @@ -340,7 +326,7 @@ def get_result_files(self): sheets. 
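Each step in this diff replaces its hand-written DEFAULT_CONFIG YAML with Model.default_config_yaml_string(). The helper's implementation is not part of this patch; the sketch below only illustrates the general idea with plain pydantic and PyYAML and a simplified stand-in model, so details such as comments, ordering, and docstring propagation will differ from the real helper in snappy_pipeline.models.

import yaml
from pydantic import BaseModel


class SomaticPurityPloidyEstimateLike(BaseModel):
    # Simplified stand-in carrying a few of the defaults from model.py below.
    tool_cnv_calling: str = "cnvetti"
    tool_ngs_mapping: str = "bwa"
    path_ngs_mapping: str = "../ngs_mapping"


def default_config_yaml_sketch(model_cls: type[BaseModel], step_name: str) -> str:
    # Render the model's field defaults as the familiar step_config YAML block.
    defaults = model_cls().model_dump()
    return yaml.safe_dump({"step_config": {step_name: defaults}}, sort_keys=False)


print(default_config_yaml_sketch(SomaticPurityPloidyEstimateLike, "somatic_purity_ploidy_estimate"))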
""" name_pattern = "{mapper}.{tool}.{ngs_library.name}" - for tool in self.config["tools"]: + for tool in self.config.tools: for sheet in self.shortcut_sheets: for donor in sheet.donors: # Skip all donors that do not have a non-tumor bio sample, estimation only @@ -354,7 +340,7 @@ def get_result_files(self): for _test_sample in bio_sample.test_samples.values(): ngs_library = bio_sample.dna_ngs_library name_pattern_value = name_pattern.format( - mapper=self.config["tool_ngs_mapping"], + mapper=self.config.tool_ngs_mapping, tool=tool, ngs_library=ngs_library, ) diff --git a/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/model.py b/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/model.py new file mode 100644 index 000000000..83acc5451 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_purity_ploidy_estimate/model.py @@ -0,0 +1,44 @@ +import enum +from typing import Annotated + +from pydantic import model_validator + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + ascat = "ascat" + + +class Ascat(SnappyModel): + b_af_loci: str + """BED file with loci for B allele frequency.""" + + +class SomaticPurityPloidyEstimate(SnappyStepModel, validators.ToolsMixin): + tools: Annotated[list[Tool], EnumField(Tool, [Tool.ascat], min_length=1)] + + tool_cnv_calling: str = "cnvetti" + """When set to 'copywriter', will trigger 'somatic_targeted_seq_cnv_calling'""" + + tool_ngs_mapping: str = "bwa" + """ + Configuration with read mapper and path to mapping output. + Will use this for generating a pileup using samtools + for obtaining the b allele fraction and computing coverage. + """ + + path_ngs_mapping: str = "../ngs_mapping" + + path_somatic_targeted_seq_cnv_calling: str = "" + + ascat: Ascat | None = None + + @model_validator(mode="after") + def check_tool_cnv_calling(self): + if self.tool_cnv_calling == "copywriter" and not self.path_somatic_targeted_seq_cnv_calling: + raise ValueError( + "When using 'copywriter' as tool_cnv_calling, " + "path_somatic_targeted_seq_cnv_calling must be set" + ) + return self diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 98863071e..01710d379 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -71,9 +71,9 @@ import os.path import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -84,108 +84,12 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import SomaticTargetedSeqCnvCalling as SomaticTargetedSeqCnvCallingConfigModel + __author__ = "Manuel Holtgrewe " #: Default configuration for the somatic_targeted_seq_cnv_calling step -DEFAULT_CONFIG = r""" -# Default configuration somatic_targeted_seq_cnv_calling -step_config: - somatic_targeted_seq_cnv_calling: - tools: ['cnvkit'] # REQUIRED - available: 'cnvkit', 'sequenza', 'cnvetti_on_target', 'cnvetti_off_target' and 'copywriter' (deprecated) - path_ngs_mapping: ../ngs_mapping # REQUIRED - cnvkit: - path_target: REQUIRED # Usually 
../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - path_antitarget: REQUIRED # Usually ../panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - path_panel_of_normals: REQUIRED # Usually ../panel_of_normals/output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn - plot: True # Generate plots (very slow) - min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth - count: False # [coverage] Alternative couting algorithm - gc_correction: True # [fix] Use GC correction - edge_correction: True # [fix] Use edge correction - rmask_correction: True # [fix] Use rmask correction - # BCBIO uses - # seg_method: haar - # seg_threshold: 0.0001 - # -- OR - # seg_method: cbs - # seg_threshold: 0.000001 - segmentation_method: cbs # [segment] One of cbs, flasso, haar, hmm, hmm-tumor, hmm-germline, none - segmentation_threshold: 0.000001 # [segment] Significance threshold (hmm methods: smoothing window size) - drop_low_coverage: False # [segment, call, genemetrics] Drop very low coverage bins - drop_outliers: 10 # [segment] Drop outlier bins (0 for no outlier filtering) - smooth_cbs: True # [segment] Additional smoothing of CBS segmentation (WARNING- not the default value) - center: "" # [call] Either one of mean, median, mode, biweight, or a constant log2 ratio value. - filter: ampdel # [call] One of ampdel, cn, ci, sem (merging segments flagged with the specified filter), "" for no filtering - calling_method: threshold # [call] One of threshold, clonal, none - call_thresholds: "-1.1,-0.25,0.2,0.7" # [call] Thresholds for calling integer copy number - ploidy: 2 # [call] Ploidy of sample cells - purity: 0 # [call] Estimated tumor cell fraction (0 for discarding tumor cell purity) - gender: "" # [call, diagram] Specify the chromosomal sex of all given samples as male or female. Guess when missing - male_reference: False # [call, diagram] Create male reference - diagram_threshold: 0.5 # [diagram] Copy number change threshold to label genes - diagram_min_probes: 3 # [diagram] Min number of covered probes to label genes - shift_xy: True # [diagram] Shift X & Y chromosomes according to sample sex - breaks_min_probes: 1 # [breaks] Min number of covered probes for a break inside the gene - genemetrics_min_probes: 3 # [genemetrics] Min number of covered probes to consider a gene - genemetrics_threshold: 0.2 # [genemetrics] Min abs log2 change to consider a gene - genemetrics_alpha: 0.05 # [genemetrics] Significance cutoff - genemetrics_bootstrap: 100 # [genemetrics] Number of bootstraps - segmetrics_alpha: 0.05 # [segmetrics] Significance cutoff - segmetrics_bootstrap: 100 # [segmetrics] Number of bootstraps - smooth_bootstrap: False # [segmetrics] Smooth bootstrap results - sequenza: - length: 50 - assembly: "hg19" # Must be hg38 for GRCh38. See copynumber for complete list (augmented with hg38) - extra_args: {} # Extra arguments for sequenza bam2seqz, valid values: - # hom: 0.9 # Threshold to select homozygous positions - # het: 0.25 # Threshold to select heterozygous positions - # qlimit: 20 # Minimum nucleotide quality score for inclusion in the counts - # qformat: "sanger" # Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value - ignore_chroms: # patterns of chromosome names to ignore - [X, Y, MT, NC_007605. 
hs37d5, chrEBV, '*_decoy', 'HLA-*', 'GL000220.*'] # Genome hs37d5 - # [chrX, chrY, chrM, '*_random', 'chrUn_*', chrEBV, '*_decoy', 'HPV*', CMV, HBV, KSHV, MCV, SV40, 'HCV-*', 'HIV-*', 'HTLV-*'] # Genome GRch38.d1.vd1 - extra_args_extract: # Valid arguments: see ?sequenza::sequenza.extract in R - gamma: 60 # scarHRD value - kmin: 50 # scarHRD value - extra_args_fit: # Valid arguments: see ?sequenza::sequenza.fit in R - N.ratio.filter: 10 # scarHRD value - N.BAF.filter: 1 # scarHRD value - segment.filter: 3000000 # scarHRD value - mufreq.treshold: 0.1 # scarHRD value - ratio.priority: False # scarHRD value - ploidy: [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5] - copywriter: - path_target_regions: REQUIRED # REQUIRED - bin_size: 20000 # TODO: make actually configurable - plot_genes: REQUIRED # Path to civic annotation - genome: hg19 # Could be hg38 (consider setting prefix to 'chr' when using GRCh38.v1) - features: EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75 - prefix: '' - nThread: 8 - purecn: - genome_name: "unknown" # Must be one from hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3 - enrichment_kit_name: "unknown" # For filename only... - mappability: "" # GRCh38: /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw - reptiming: "" # Nothing for GRCh38 - seed: 1234567 - extra_commands: # Recommended extra arguments for PureCN, extra_arguments: [] to clear them all - model: betabin - "fun-segmentation": PSCBS - "post-optimize": "" # post-optimize is a flag - # A PureCN panel of normals is required, with the container, the intervals & the PON rds file - path_container: REQUIRED # ../panel_of_normals/work/containers/out/purecn.simg - path_intervals: REQUIRED # ../panel_of_normals/output/purecn/out/_.list - path_panel_of_normals: REQUIRED # ../panel_of_normals/output/bwa.purecn/out/bwa.purecn.panel_of_normals.rds - path_mapping_bias: REQUIRED # ../panel_of_normals/output/bwa.purecn/out/bwa.purecn.mapping_bias.rds - # IMPORTANT NOTE: Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true" - somatic_variant_caller: "mutect2" - path_somatic_variants: ../somatic_variant_calling_for_purecn - cnvetti_on_target: - path_target_regions: REQUIRED # REQUIRED - cnvetti_off_target: - path_target_regions: REQUIRED # REQUIRED - window_length: 20000 -""" +DEFAULT_CONFIG = SomaticTargetedSeqCnvCallingConfigModel.default_config_yaml_string() #: JSON key for "isCancer" KEY_IS_CANCER = "isCancer" @@ -338,8 +242,9 @@ def _get_output_files_segment(self): for infix in ("targets", "segments"): for key, ext in BCF_KEY_EXTS: name = "{}_{}".format(infix, key) - yield name, os.path.join( - "work", name_pattern, "out", name_pattern + "." + infix + ext + yield ( + name, + os.path.join("work", name_pattern, "out", name_pattern + "." 
+ infix + ext), ) @dictify @@ -360,19 +265,11 @@ def _get_output_files_postprocess(self): for infix in ("targets", "targets_segmented", "segments", "gene_call", "gene_log2"): for key, ext in (("txt", ".txt"), ("md5", ".txt.md5")): name = "{}_{}".format(infix, key) - yield name, os.path.join( - "work", name_pattern, "out", name_pattern + "_" + infix + ext + yield ( + name, + os.path.join("work", name_pattern, "out", name_pattern + "_" + infix + ext), ) - def check_config(self): - """Check configuration""" - if self.name not in self.config["tools"]: - return # skip check - self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", self.name, "path_target_regions"), - "Path to target regions is missing for {}".format(self.name), - ) - def _get_log_file(self, action): """Return path to log file for the given action""" # Validate action @@ -381,7 +278,7 @@ def _get_log_file(self, action): prefix = os.path.join("work", name_pattern, "log", name_pattern) return self._get_log_file_from_prefix(prefix) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -490,8 +387,11 @@ def input_function(wildcards): tumor_base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format( **wildcards ) - yield "gc", "work/static_data/out/sequenza.{length}.wig.gz".format( - length=self.config["sequenza"]["length"], + yield ( + "gc", + "work/static_data/out/sequenza.{length}.wig.gz".format( + length=self.config.sequenza.length, + ), ) yield "normal_bam", ngs_mapping(normal_base_path + ".bam") yield "normal_bai", ngs_mapping(normal_base_path + ".bam.bai") @@ -515,7 +415,7 @@ def get_output_files(self, action): elif action == "gcreference": return { "gc": "work/static_data/out/sequenza.{length}.wig.gz".format( - length=self.config["sequenza"]["length"], + length=self.config.sequenza.length, ) } elif action == "coverage": @@ -553,7 +453,7 @@ def get_log_file(self, action): prefix = "work/R_packages/log/sequenza" elif action == "gcreference": prefix = "work/static_data/log/sequenza.{length}".format( - length=self.config["sequenza"]["length"], + length=self.config.sequenza.length, ) else: name_pattern = "{mapper}.sequenza.{library_name}" @@ -596,14 +496,17 @@ def get_input_files(self, action): @dictify def _get_input_files_run(self, wildcards): name_pattern = "{mapper}.purecn.{library_name}".format(**wildcards) - yield "tumor", os.path.join( - "work", - name_pattern, - "out", - name_pattern + "_coverage_loess.txt.gz", - ).format(**wildcards) + yield ( + "tumor", + os.path.join( + "work", + name_pattern, + "out", + name_pattern + "_coverage_loess.txt.gz", + ).format(**wildcards), + ) name_pattern = "{mapper}.{caller}.{library_name}".format( - caller=self.config["purecn"]["somatic_variant_caller"], + caller=self.config.purecn.somatic_variant_caller, **wildcards, ) base_path = os.path.join("output", name_pattern, "out", name_pattern + ".full.vcf.gz") @@ -652,18 +555,6 @@ def get_log_file(self, action): prefix = os.path.join("work", name_pattern, "log", name_pattern + "." 
+ action) return self._get_log_file_from_prefix(prefix) - def check_config(self): - if self.name not in self.config["tools"]: - return # skip check - self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", self.name, "path_panel_of_normals"), - "Path to the PureCN panel of normal is missing", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", self.name, "path_mapping_bias"), - "Path to the PureCN mapping bias file is missing (created in the panel_of_normals step)", - ) - class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): """Perform somatic targeted CNV calling using cnvkit""" @@ -703,23 +594,6 @@ class CnvKitStepPart(SomaticTargetedSeqCnvCallingStepPart): def __init__(self, parent): super().__init__(parent) - def check_config(self): - """Check configuration for cnvkit""" - if "cnvkit" not in self.config["tools"]: - return # cnvkit not enabled, skip - self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_target"), - "Path to target regions is missing for cnvkit", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_antitarget"), - "Path to antitarget regions is missing for cnvkit", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", "cnvkit", "path_panel_of_normals"), - "Path to panel of normals (reference) is missing for cnvkit", - ) - def get_input_files(self, action): """Return input paths input function, dependent on rule""" # Validate action @@ -1051,20 +925,6 @@ def get_output_files(self, action): output_files[k + "_md5"] = tpl + v + ".md5" return output_files - def check_config(self): - """Check configuration""" - if "copywriter" not in self.config["tools"]: - return # skip - self.parent.ensure_w_config( - ( - "step_config", - "somatic_targeted_seq_cnv_calling", - "copywriter", - "path_target_regions", - ), - "Path to target regions is missing", - ) - def get_log_file(self, action): """Return path to log file for the given action""" # Validate action @@ -1104,7 +964,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SomaticTargetedSeqCnvCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -1119,11 +980,11 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) - if "purecn" in self.config["tools"]: + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) + if "purecn" in self.config.tools: self.register_sub_workflow( "somatic_variant_calling", - self.config["purecn"]["path_somatic_variants"], + self.config.purecn.path_somatic_variants, "somatic_variants", ) @@ -1138,7 +999,7 @@ def get_result_files(self): "cnvetti_on_target": ("coverage", "segment", "postprocess"), "cnvetti_off_target": ("coverage", "segment", "postprocess"), } - if "cnvkit" in self.config["tools"] and self.config["cnvkit"]["plot"]: + if "cnvkit" in self.config.tools and self.config.cnvkit.plot: tool_actions["cnvkit"] += ["plot"] for sheet in filter(is_not_background, self.shortcut_sheets): for sample_pair in sheet.all_sample_pairs: @@ -1152,7 +1013,7 @@ def get_result_files(self): ) print(msg.format(sample_pair.tumor_sample.name), 
file=sys.stderr) continue - for tool in self.config["tools"]: + for tool in self.config.tools: for action in tool_actions[tool]: try: tpls = list(self.sub_steps[tool].get_output_files(action).values()) @@ -1165,7 +1026,7 @@ def get_result_files(self): for tpl in tpls: filenames = expand( tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, library_name=[sample_pair.tumor_sample.dna_ngs_library.name], ) for f in filenames: @@ -1174,13 +1035,6 @@ def get_result_files(self): def check_config(self): """Check that the necessary global configuration is present""" - self.ensure_w_config( - ("step_config", "somatic_targeted_seq_cnv_calling", "path_ngs_mapping"), - ( - "Path to somatic variant calling not configured but required for " - "targeted sequencing CNV calling" - ), - ) self.ensure_w_config( ("static_data_config", "reference", "path"), ( diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py new file mode 100644 index 000000000..1fbacf9d7 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/model.py @@ -0,0 +1,241 @@ +import enum +from typing import Annotated, Any, Literal + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators +from snappy_pipeline.models.cnvkit import Cnvkit + + +class Tool(enum.StrEnum): + cnvkit = "cnvkit" + sequenza = "sequenza" + copywriter = "copywriter" + cnvetti_on_target = "cnvetti_on_target" + cnvetti_off_target = "cnvetti_off_target" + purecn = "purecn" + + +class SequenzaExtraArgs(SnappyModel): + hom: float = 0.9 + """Threshold to select homozygous positions""" + + het: float = 0.25 + """Threshold to select heterozygous positions""" + + qlimit: float = 20 + """Minimum nucleotide quality score for inclusion in the counts""" + + qformat: str = "sanger" + """Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value""" + + +class SequenzaExtractExtraArgs(SnappyModel): + gamma: int = 60 + """scarHRD value""" + + kmin: int = 50 + """scarHRD value""" + + +class SequenzaFitExtraArgs(SnappyModel): + N_ratio_filter: int = Field(10, alias="N.ratio.filter") + N_BAF_filter: int = Field(1, alias="N.BAF.filter") + segment_filter: int = Field(3000000, alias="segment.filter") + mufreq_treshold: float = Field(0.1, alias="mufreq.threshold") + ratio_priority: bool = Field(False, alias="ratio_priority") + ploidy: list[float] = [ + 1.0, + 1.1, + 1.2, + 1.3, + 1.4, + 1.5, + 1.6, + 1.7, + 1.8, + 1.9, + 2.0, + 2.1, + 2.2, + 2.3, + 2.4, + 2.5, + 2.6, + 2.7, + 2.8, + 2.9, + 3.0, + 3.1, + 3.2, + 3.3, + 3.4, + 3.5, + 3.6, + 3.7, + 3.8, + 3.9, + 4.0, + 4.1, + 4.2, + 4.3, + 4.4, + 4.5, + 4.6, + 4.7, + 4.8, + 4.9, + 5.0, + 5.1, + 5.2, + 5.3, + 5.4, + 5.5, + ] + + +class Sequenza(SnappyModel): + length: int = 50 + assembly: str = "hg19" + """Must be hg38 for GRCh38. See copynumber for complete list (augmented with hg38)""" + + extra_args: SequenzaExtraArgs | dict[str, Any] = {} + """Extra arguments for sequenza bam2seqz""" + + ignore_chroms: list[str] = [ + "X", + "Y", + "MT", + "NC_007605. 
hs37d5", + "chrEBV", + "*_decoy", + "HLA-*", + "GL000220.*", + ] + """patterns of chromosome names to ignore""" + + extra_args_extract: SequenzaExtractExtraArgs | dict[str, Any] = SequenzaExtractExtraArgs() + """Valid arguments: see ?sequenza::sequenza.extract in R""" + + extra_args_fit: SequenzaFitExtraArgs | dict[str, Any] = SequenzaFitExtraArgs() + """Valid arguments: see ?sequenza::sequenza.fit in R""" + + +class CopyWriter(SnappyModel): + path_target_regions: str + """Path to target regions""" + + bin_size: int = 20000 # TODO: make actually configurable + + plot_genes: str + """Path to civic annotation""" + + genome: str = "hg19" + """Could be hg38 (consider setting prefix to 'chr' when using GRCh38.v1)""" + + features: str = "EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75" + + prefix: str = "" + + nThread: int = 8 + + +class GenomeName(enum.StrEnum): + hg18 = "hg18" + hg19 = "hg19" + hg38 = "hg38" + mm9 = "mm9" + mm10 = "mm10" + rn4 = "rn4" + rn5 = "rn5" + rn6 = "rn6" + canFam3 = "canFam3" + + +class PureCn(SnappyModel): + genome_name: Annotated[ + GenomeName | Literal["unknown"], + EnumField(GenomeName, json_schema_extra={"options": {"unknown"}}), + ] = "unknown" + """Must be one from hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3""" + + enrichment_kit_name: str = "unknown" + """For filename only...""" + + mappability: str = "" + """ + GRCh38: + /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw + """ + + reptiming: str = "" + """Nothing for GRCh38""" + + seed: int = 1234567 + extra_commands: dict[str, Any] = { + "model": "betabin", + "fun-segmentation": "PSCBS", + "post-optimize": "", + } + """Recommended extra arguments for PureCN, extra_commands: {} to clear them all""" + + path_container: Annotated[ + str, Field(examples=["../panel_of_normals/work/containers/out/purecn.simg"]) + ] + """ + A PureCN panel of normals is required, + with the container, the intervals & the PON rds file + """ + + path_intervals: Annotated[ + str, + Field( + examples=[ + "../panel_of_normals/output/purecn/out/_.list" + ] + ), + ] + + path_panel_of_normals: Annotated[ + str, + Field( + examples=["../panel_of_normals/output/bwa.purecn/out/bwa.purecn.panel_of_normals.rds"] + ), + ] + """Path to the PureCN panel of normal""" + + path_mapping_bias: Annotated[ + str, + Field(examples=["../panel_of_normals/output/bwa.purecn/out/bwa.purecn.mapping_bias.rds"]), + ] + """Path to the PureCN mapping bias file""" + + somatic_variant_caller: str = "mutect2" + """ + IMPORTANT NOTE: + Mutect2 must be called with "--genotype-germline-sites true --genotype-pon-sites true + """ + + path_somatic_variants: Annotated[str, Field(examples=["../somatic_variant_calling_for_purecn"])] + + +class CnvettiOnTarget(SnappyModel): + path_target_regions: str + + +class CnvettiOffTarget(SnappyModel): + path_target_regions: str + + window_length: int = 20000 + + +class SomaticTargetedSeqCnvCalling(SnappyStepModel, validators.ToolsMixin): + tools: Annotated[list[Tool], EnumField(Tool, [Tool.cnvkit], min_length=1)] + path_ngs_mapping: str = "../ngs_mapping" + + cnvkit: Cnvkit | None = None + sequenza: Sequenza | None = None + copywriter: CopyWriter | None = None + purecn: PureCn | None = None + cnvetti_on_target: CnvettiOnTarget | None = None + cnvetti_off_target: CnvettiOffTarget | None = None diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/Snakefile b/snappy_pipeline/workflows/somatic_variant_annotation/Snakefile index 7fb41ff1d..2e8cd26a9 100644 --- 
a/snappy_pipeline/workflows/somatic_variant_annotation/Snakefile +++ b/snappy_pipeline/workflows/somatic_variant_annotation/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_variant_annotation import SomaticVariantAnnotationWorkflow +from snappy_pipeline.workflows.somatic_variant_annotation import ( + SomaticVariantAnnotationWorkflow, +) __author__ = "Manuel Holtgrewe " diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py index 9b599887d..6baa58627 100644 --- a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py @@ -77,10 +77,9 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand -from snappy_pipeline.base import InvalidConfiguration +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage @@ -90,6 +89,8 @@ SomaticVariantCallingWorkflow, ) +from .model import SomaticVariantAnnotation as SomaticVariantAnnotationConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -102,49 +103,7 @@ ANNOTATION_TOOLS = ("jannovar", "vep") #: Default configuration for the somatic_variant_calling step -DEFAULT_CONFIG = r""" -# Default configuration variant_annotation -step_config: - somatic_variant_annotation: - tools: ["jannovar", "vep"] - path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED - tools_ngs_mapping: [] # default to those configured for ngs_mapping - tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling - jannovar: - path_jannovar_ser: REQUIRED # REQUIRED - flag_off_target: False # REQUIRED - dbnsfp: # configuration for default genome release, needs change if differing - col_contig: 1 - col_pos: 2 - columns: [] - annotation_tracks_bed: [] - annotation_tracks_tsv: [] - annotation_tracks_vcf: [] - window_length: 50000000 # split input into windows of this size, each triggers a job - num_jobs: 100 # number of windows to process in parallel - use_profile: true # use Snakemake profile for parallel processing - restart_times: 5 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 10 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - ignore_chroms: # patterns of chromosome names to ignore - - NC_007605 # herpes virus - - hs37d5 # GRCh37 decoy - - chrEBV # Eppstein-Barr Virus - - 'GL*' # problematic unplaced loci - - '*_decoy' # decoy contig - - 'HLA-*' # HLA genes - vep: - cache_dir: "" # Defaults to $HOME/.vep Not a good idea on the cluster - species: homo_sapiens - assembly: GRCh38 - cache_version: 102 # WARNING- this must match the wrapper's vep version! - tx_flag: "gencode_basic" # The flag selecting the transcripts. One of "gencode_basic", "refseq", and "merged". 
- pick_order: ["biotype", "mane", "appris", "tsl", "ccds", "canonical", "rank", "length"] - num_threads: 8 - buffer_size: 1000 - output_options: - - everything -""" +DEFAULT_CONFIG = SomaticVariantAnnotationConfigModel.default_config_yaml_string() class AnnotateSomaticVcfStepPart(BaseStepPart): @@ -254,7 +213,7 @@ class JannovarAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart): #: Class available actions actions = ("annotate_somatic_vcf",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -270,14 +229,6 @@ def get_resource_usage(self, action): memory=f"{8 * 1024 * 2}M", ) - def check_config(self): - if self.name not in self.config["tools"]: - return - self.parent.ensure_w_config( - ("step_config", "somatic_variant_annotation", "jannovar", "path_jannovar_ser"), - "Path to serialized Jannovar database", - ) - class VepAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart): """Annotate VCF file from somatic calling using ENSEMBL's VEP""" @@ -297,7 +248,7 @@ class VepAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart): #: Allowed keywords for pick order PICK_ORDER = ("biotype", "mane", "appris", "tsl", "ccds", "canonical", "rank", "length") - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -308,21 +259,11 @@ def get_resource_usage(self, action): # Validate action self._validate_action(action) return ResourceUsage( - threads=self.config["vep"]["num_threads"], + threads=self.config.vep.num_threads, time="24:00:00", # 24 hours memory=f"{16 * 1024 * 1}M", ) - def check_config(self): - if self.name not in self.config["tools"]: - return - if not self.config["vep"]["tx_flag"] in ("merged", "refseq", "gencode_basic"): - raise InvalidConfiguration("tx_flag must be 'gencode_basic', or 'merged' or 'refseq'") - if not all([x in self.PICK_ORDER for x in self.config["vep"]["pick_order"]]): - raise InvalidConfiguration( - "pick order keywords must be in {}".format(", ".join(self.PICK_ORDER)) - ) - class SomaticVariantAnnotationWorkflow(BaseStep): """Perform germline variant annotation""" @@ -345,7 +286,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (SomaticVariantCallingWorkflow, NgsMappingWorkflow), + config_model_class=SomaticVariantAnnotationConfigModel, + previous_steps=(SomaticVariantCallingWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -353,17 +295,15 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) # Register sub workflows self.register_sub_workflow( - "somatic_variant_calling", self.config["path_somatic_variant_calling"] + "somatic_variant_calling", self.config.path_somatic_variant_calling ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_somatic_variant_calling"]: - self.config["tools_somatic_variant_calling"] = self.w_config["step_config"][ + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not 
self.config.tools_somatic_variant_calling: + self.config.tools_somatic_variant_calling = self.w_config.step_config[ "somatic_variant_calling" - ]["tools"] + ].tools @listify def get_result_files(self): @@ -371,19 +311,19 @@ def get_result_files(self): We will process all primary DNA libraries and perform joint calling within pedigrees """ - annotators = set(self.config["tools"]) & set(ANNOTATION_TOOLS) - callers = set(self.config["tools_somatic_variant_calling"]) + annotators = set(self.config.tools) & set(ANNOTATION_TOOLS) + callers = set(self.config.tools_somatic_variant_calling) name_pattern = "{mapper}.{caller}.{annotator}.{tumor_library.name}" yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.config["tools_ngs_mapping"], + mapper=self.config.tools_ngs_mapping, caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), annotator=annotators, ext=EXT_VALUES, ) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.config["tools_ngs_mapping"], + mapper=self.config.tools_ngs_mapping, caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), annotator=annotators, ext=( @@ -399,12 +339,12 @@ def get_result_files(self): full = list( filter( lambda x: self.sub_steps[x].has_full, - set(self.config["tools"]) & set(ANNOTATION_TOOLS), + set(self.config.tools) & set(ANNOTATION_TOOLS), ), ) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + ".full{ext}"), - mapper=self.config["tools_ngs_mapping"], + mapper=self.config.tools_ngs_mapping, caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), annotator=full, ext=EXT_VALUES, @@ -413,7 +353,7 @@ def get_result_files(self): name_pattern = "{mapper}.{caller}.{annotator}.{donor.name}" yield from self._yield_result_files_joint( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller=callers & set(SOMATIC_VARIANT_CALLERS_JOINT), annotator=annotators, ext=EXT_VALUES, @@ -449,10 +389,3 @@ def _yield_result_files_joint(self, tpl, **kwargs): for sheet in filter(is_not_background, self.shortcut_sheets): for donor in sheet.donors: yield from expand(tpl, donor=[donor], **kwargs) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "somatic_variant_annotation", "path_somatic_variant_calling"), - "Path to variant calling not configured but required for somatic variant annotation", - ) diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/model.py b/snappy_pipeline/workflows/somatic_variant_annotation/model.py new file mode 100644 index 000000000..56e2f5b22 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_annotation/model.py @@ -0,0 +1,71 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators +from snappy_pipeline.models.annotation import Vep + + +class Tool(enum.StrEnum): + jannovar = "jannovar" + vep = "vep" + + +class Dbnsfp(SnappyModel): + col_contig: int = 1 + col_pos: int = 2 + columns: list[str] = [] + + +class Jannovar(SnappyModel): + path_jannovar_ser: str + """Path to serialized Jannovar database""" + + flag_off_target: bool + + dbnsfp: Dbnsfp + """configuration for default genome release, needs change if 
differing""" + + annotation_tracks_bed: list[str] = [] + + annotation_tracks_tsv: list[str] = [] + + annotation_tracks_vcf: list[str] = [] + + window_length: int = 50000000 + """split input into windows of this size, each triggers a job""" + + num_jobs: int = 100 + """number of windows to process in parallel""" + + use_profile: bool = True + """use Snakemake profile for parallel processing""" + + restart_times: int = 5 + """number of times to re-launch jobs in case of failure""" + + max_jobs_per_second: int = 10 + """throttling of job creation""" + + max_status_checks_per_second: int = 10 + """throttling of status checks""" + + ignore_chroms: list[str] = ["NC_007605", "hs37d5", "chrEBV", "GL*", "*_decoy", "HLA-*"] + """patterns of chromosome names to ignore""" + + +class SomaticVariantAnnotation(SnappyStepModel, validators.ToolsMixin): + tools: Annotated[list[Tool], EnumField(Tool, [Tool.jannovar, Tool.vep], min_length=1)] + + path_somatic_variant_calling: Annotated[str, Field(examples=["../somatic_variant_calling"])] + + tools_ngs_mapping: list[str] = [] + """default to those configured for ngs_mapping""" + + tools_somatic_variant_calling: list[str] = [] + """default to those configured for somatic_variant_calling""" + + jannovar: Jannovar | None = None + + vep: Vep | None = None diff --git a/snappy_pipeline/workflows/somatic_variant_calling/Snakefile b/snappy_pipeline/workflows/somatic_variant_calling/Snakefile index 5a8bfb139..2f782163b 100644 --- a/snappy_pipeline/workflows/somatic_variant_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_variant_calling/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_variant_calling import SomaticVariantCallingWorkflow +from snappy_pipeline.workflows.somatic_variant_calling import ( + SomaticVariantCallingWorkflow, +) __author__ = "Manuel Holtgrewe " @@ -72,7 +74,7 @@ rule somatic_variant_calling_mutect_run: # Run MuTect 2 ---------------------------------------------------------------- -if config["step_config"]["somatic_variant_calling"]["mutect2"]["common_variants"]: +if wf.w_config.step_config["somatic_variant_calling"].mutect2.common_variants: rule somatic_variant_calling_mutect2_pileup_normal: input: diff --git a/snappy_pipeline/workflows/somatic_variant_calling/__init__.py b/snappy_pipeline/workflows/somatic_variant_calling/__init__.py index c58464fba..b07786ca9 100644 --- a/snappy_pipeline/workflows/somatic_variant_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_calling/__init__.py @@ -99,13 +99,12 @@ """ from collections import OrderedDict -from itertools import chain import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -115,6 +114,8 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import SomaticVariantCalling as SomaticVariantCallingConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -163,197 +164,22 @@ } #: Available somatic variant callers assuming matched samples. 
-SOMATIC_VARIANT_CALLERS_MATCHED = ("mutect", "mutect2", "scalpel") +SOMATIC_VARIANT_CALLERS_MATCHED = {"mutect", "mutect2", "scalpel", "strelka2"} #: Available somatic variant callers that just call all samples from one donor together. -SOMATIC_VARIANT_CALLERS_JOINT = ( +SOMATIC_VARIANT_CALLERS_JOINT = { "bcftools_joint", "platypus_joint", "gatk_hc_joint", "gatk_ug_joint", "varscan_joint", -) +} #: Available somatic variant callers -SOMATIC_VARIANT_CALLERS = tuple( - chain(SOMATIC_VARIANT_CALLERS_MATCHED, SOMATIC_VARIANT_CALLERS_JOINT) -) - -#: Available somatic variant callers assuming matched samples. -SOMATIC_VARIANT_CALLERS_MATCHED = ("mutect", "mutect2", "scalpel", "strelka2") - -#: Available somatic variant callers that just call all samples from one donor together. -SOMATIC_VARIANT_CALLERS_JOINT = ( - "bcftools_joint", - "platypus_joint", - "gatk_hc_joint", - "gatk_ug_joint", - "varscan_joint", -) +SOMATIC_VARIANT_CALLERS = SOMATIC_VARIANT_CALLERS_MATCHED | SOMATIC_VARIANT_CALLERS_JOINT #: Default configuration for the somatic_variant_calling schema -DEFAULT_CONFIG = r""" -# Default configuration somatic_variant_calling -step_config: - somatic_variant_calling: - tools: ['mutect', 'scalpel'] # REQUIRED, examples: 'mutect' and 'scalpel'. - path_ngs_mapping: ../ngs_mapping # REQUIRED - ignore_chroms: # patterns of chromosome names to ignore - - NC_007605 # herpes virus - - hs37d5 # GRCh37 decoy - - chrEBV # Eppstein-Barr Virus - - '*_decoy' # decoy contig - - 'HLA-*' # HLA genes - - 'GL000220.*' # Contig with problematic, repetitive DNA in GRCh37 - # Configuration for joint calling with samtools+bcftools. - bcftools_joint: - max_depth: 4000 - max_indel_depth: 4000 - window_length: 10000000 - num_threads: 16 - # Configuration for joint calling with Platypus. 
- platypus_joint: - split_complex_mnvs: true # whether or not to split complex and MNV variants - num_threads: 16 - # VCF annotation databases are given as mapping from name to - # {'file': '/path.vcf.gz', - # 'info_tag': 'VCF_TAG', - # 'description': 'VCF header description'} - # Configuration for MuTect - mutect: - # Parallelization configuration - num_cores: 2 # number of cores to use locally - window_length: 3500000 # split input into windows of this size, each triggers a job - num_jobs: 500 # number of windows to process in parallel - use_profile: true # use Snakemake profile for parallel processing - restart_times: 5 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 2 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - debug_trunc_tokens: 0 # truncation to first N tokens (0 for none) - keep_tmpdir: never # keep temporary directory, {always, never, onerror} - job_mult_memory: 1 # memory multiplier - job_mult_time: 1 # running time multiplier - merge_mult_memory: 1 # memory multiplier for merging - merge_mult_time: 1 # running time multiplier for merging - # Configuration for MuTect 2 - mutect2: - panel_of_normals: '' # Set path to panel of normals vcf if required - germline_resource: '' # Germline variants resource (same as panel of normals) - common_variants: '' # Common germline variants for contamination estimation - extra_arguments: [] # List additional Mutect2 arguments - # Each additional argument must be in the form: - # "-- " - # For example, to filter reads prior to calling & to - # add annotations to the output vcf: - # - "--read-filter CigarContainsNoNOperator" - # - "--annotation AssemblyComplexity BaseQuality" - # Parallelization configuration - num_cores: 2 # number of cores to use locally - window_length: 50000000 # split input into windows of this size, each triggers a job - num_jobs: 500 # number of windows to process in parallel - use_profile: true # use Snakemake profile for parallel processing - restart_times: 5 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 2 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - debug_trunc_tokens: 0 # truncation to first N tokens (0 for none) - keep_tmpdir: never # keep temporary directory, {always, never, onerror} - job_mult_memory: 1 # memory multiplier - job_mult_time: 1 # running time multiplier - merge_mult_memory: 1 # memory multiplier for merging - merge_mult_time: 1 # running time multiplier for merging - # Configuration for Scalpel - scalpel: - path_target_regions: REQUIRED # REQUIRED - # Configuration for strelka2 - strelka2: - path_target_regions: "" # For exomes: include a bgzipped bed file with tabix index. 
That also triggers the --exome flag - gatk_hc_joint: - # Parallelization configuration - num_cores: 2 # number of cores to use locally - window_length: 50000000 # split input into windows of this size, each triggers a job - num_jobs: 500 # number of windows to process in parallel - use_profile: true # use Snakemake profile for parallel processing - restart_times: 5 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 10 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - debug_trunc_tokens: 0 # truncation to first N tokens (0 for none) - keep_tmpdir: never # keep temporary directory, {always, never, onerror} - job_mult_memory: 1 # memory multiplier - job_mult_time: 1 # running time multiplier - merge_mult_memory: 1 # memory multiplier for merging - merge_mult_time: 1 # running time multiplier for merging - # GATK HC--specific configuration - allow_seq_dict_incompatibility: false - annotations: - - BaseQualityRankSumTest - - FisherStrand - - GCContent - - HaplotypeScore - - HomopolymerRun - - MappingQualityRankSumTest - - MappingQualityZero - - QualByDepth - - ReadPosRankSumTest - - RMSMappingQuality - - DepthPerAlleleBySample - - Coverage - - ClippingRankSumTest - - DepthPerSampleHC - gatk_ug_joint: - # Parallelization configuration - num_cores: 2 # number of cores to use locally - window_length: 50000000 # split input into windows of this size, each triggers a job - num_jobs: 500 # number of windows to process in parallel - use_profile: true # use Snakemake profile for parallel processing - restart_times: 5 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 10 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - debug_trunc_tokens: 0 # truncation to first N tokens (0 for none) - keep_tmpdir: never # keep temporary directory, {always, never, onerror} - job_mult_memory: 1 # memory multiplier - job_mult_time: 1 # running time multiplier - merge_mult_memory: 1 # memory multiplier for merging - merge_mult_time: 1 # running time multiplier for merging - # GATK UG--specific configuration - downsample_to_coverage: 250 - allow_seq_dict_incompatibility: false - annotations: - - BaseQualityRankSumTest - - FisherStrand - - GCContent - - HaplotypeScore - - HomopolymerRun - - MappingQualityRankSumTest - - MappingQualityZero - - QualByDepth - - ReadPosRankSumTest - - RMSMappingQuality - - DepthPerAlleleBySample - - Coverage - - ClippingRankSumTest - - DepthPerSampleHC - varscan_joint: - # Parallelization configuration - num_cores: 2 # number of cores to use locally - window_length: 5000000 # split input into windows of this size, each triggers a job - num_jobs: 500 # number of windows to process in parallel - use_profile: true # use Snakemake profile for parallel processing - restart_times: 5 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 2 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - # Configuration for samtools mpileup - max_depth: 4000 - max_indel_depth: 4000 - min_bq: 13 - no_baq: True - # Configuration for Varscan - min_coverage: 8 - min_reads2: 2 - min_avg_qual: 15 - min_var_freq: 0.01 - min_freq_for_hom: 0.75 - p_value: 99e-02 -""" +DEFAULT_CONFIG = SomaticVariantCallingConfigModel.default_config_yaml_string() class SomaticVariantCallingStepPart(BaseStepPart): @@ -457,7 +283,7 @@ class MutectBaseStepPart(SomaticVariantCallingStepPart): """Base class for Mutect 1 and 2 step 
parts""" def check_config(self): - if self.name not in self.config["tools"]: + if self.name not in self.config.tools: return # Mutect not enabled, skip self.parent.ensure_w_config( ("static_data_config", "cosmic", "path"), @@ -478,7 +304,7 @@ def get_output_files(self, action): output_files[k] = self.base_path_out.format(var_caller=self.name, ext=v) return output_files - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -549,14 +375,12 @@ def __init__(self, parent): super().__init__(parent) def check_config(self): - if self.name not in self.config["tools"]: + if self.name not in self.config.tools: return # Mutect not enabled, skip self.parent.ensure_w_config( ("static_data_config", "reference", "path"), "Path to reference FASTA not configured but required for %s" % (self.name,), ) - if self.config[self.name]["common_variants"]: - self.actions.extend(["contamination", "pileup_normal", "pileup_tumor"]) def get_input_files(self, action): """Return input function for Mutect2 rules. @@ -778,7 +602,7 @@ def get_log_file(self, action): log_files[key + "_md5"] = prefix + ext + ".md5" return log_files - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -803,7 +627,7 @@ class ScalpelStepPart(SomaticVariantCallingStepPart): actions = ("run",) def check_config(self): - if "scalpel" not in self.config["tools"]: + if "scalpel" not in self.config.tools: return # scalpel not enabled, skip self.parent.ensure_w_config( ("static_data_config", "reference", "path"), @@ -826,7 +650,7 @@ def get_output_files(self, action): ) return result - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -880,7 +704,7 @@ def get_output_files(self, action): output_files[k] = self.base_path_out.format(var_caller=self.name, ext=v) return output_files - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -984,8 +808,7 @@ def arg_function(wildcards): for ngs_library in test_sample.ngs_libraries.values() ] } - if "ignore_chroms" in self.parent.config: - ignore_chroms = self.parent.config["ignore_chroms"] + if ignore_chroms := self.parent.config.ignore_chroms: result["ignore_chroms"] = ignore_chroms return result @@ -1002,7 +825,7 @@ class BcftoolsJointStepPart(JointCallingStepPart): #: Step name name = "bcftools_joint" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
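
The hunks around here convert BcftoolsJointStepPart.get_resource_usage to the new typed signature and to attribute access on the pydantic config; the following hunk derives memory from bcftools_joint.num_threads. A minimal standalone sketch of that computation (the helper name is mine, the values mirror the hunk):

from snappy_pipeline.workflows.ngs_mapping import ResourceUsage


def bcftools_joint_resources(num_threads: int) -> ResourceUsage:
    # One GiB of memory per requested thread, matching the computation in the hunk below.
    mem_mb = 1024 * num_threads
    return ResourceUsage(
        threads=num_threads,
        time="2-00:00:00",  # 2 days
        memory=f"{mem_mb}M",
    )
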
@@ -1012,9 +835,9 @@ def get_resource_usage(self, action): """ # Validate action self._validate_action(action) - mem_mb = 1024 * self.parent.config["bcftools_joint"]["num_threads"] + mem_mb = 1024 * self.parent.config.bcftools_joint.num_threads return ResourceUsage( - threads=self.parent.config["bcftools_joint"]["num_threads"], + threads=self.parent.config.bcftools_joint.num_threads, time="2-00:00:00", # 2 days memory=f"{mem_mb}M", ) @@ -1029,7 +852,7 @@ class VarscanJointStepPart(JointCallingStepPart): #: Class available actions actions = ("run", "call_pedigree") - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -1056,7 +879,7 @@ class PlatypusJointStepPart(JointCallingStepPart): #: Step name name = "platypus_joint" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -1066,9 +889,9 @@ def get_resource_usage(self, action): """ # Validate action self._validate_action(action) - mem_mb = int(3.75 * 1024 * self.parent.config["platypus_joint"]["num_threads"]) + mem_mb = int(3.75 * 1024 * self.parent.config.platypus_joint.num_threads) return ResourceUsage( - threads=self.parent.config["platypus_joint"]["num_threads"], + threads=self.parent.config.platypus_joint.num_threads, time="2-00:00:00", # 2 days memory=f"{mem_mb}M", ) @@ -1084,7 +907,7 @@ class GatkHcJointStepPart(JointCallingStepPart): #: Step name name = "gatk_hc_joint" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -1111,7 +934,7 @@ class GatkUgJointStepPart(JointCallingStepPart): #: Step name name = "gatk_ug_joint" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -1153,7 +976,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SomaticVariantCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -1171,7 +995,13 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) + + if "mutect2" in self.config.tools: + if self.config.mutect2.common_variants: + self.sub_steps["mutect2"].actions.extend( + ["contamination", "pileup_normal", "pileup_tumor"] + ) @listify def get_result_files(self): @@ -1180,16 +1010,16 @@ def get_result_files(self): We will process all NGS libraries of all bio samples in all sample sheets. 
""" name_pattern = "{mapper}.{caller}.{tumor_library.name}" - for caller in set(self.config["tools"]) & set(SOMATIC_VARIANT_CALLERS_MATCHED): + for caller in set(self.config.tools) & set(SOMATIC_VARIANT_CALLERS_MATCHED): yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller=caller, ext=EXT_MATCHED[caller].values() if caller in EXT_MATCHED else EXT_VALUES, ) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller=caller, ext=( ".log", @@ -1205,8 +1035,8 @@ def get_result_files(self): name_pattern = "{mapper}.{caller}.{donor.name}" yield from self._yield_result_files_joint( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - caller=set(self.config["tools"]) & set(SOMATIC_VARIANT_CALLERS_JOINT), + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, + caller=set(self.config.tools) & set(SOMATIC_VARIANT_CALLERS_JOINT), ext=EXT_VALUES, ) @@ -1240,10 +1070,3 @@ def _yield_result_files_joint(self, tpl, **kwargs): for sheet in filter(is_not_background, self.shortcut_sheets): for donor in sheet.donors: yield from expand(tpl, donor=[donor], **kwargs) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "somatic_variant_calling", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for somatic variant calling", - ) diff --git a/snappy_pipeline/workflows/somatic_variant_calling/model.py b/snappy_pipeline/workflows/somatic_variant_calling/model.py new file mode 100644 index 000000000..5e017785a --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_calling/model.py @@ -0,0 +1,252 @@ +import enum +from typing import Annotated + +from pydantic import AfterValidator, Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + mutect = "mutect" + mutect2 = "mutect2" + scalpel = "scalpel" + strelka2 = "strelka2" + gatk_hc_joint = "gatk_hc_joint" + gatk_ug_joint = "gatk_ug_joint" + bcftools_joint = "bcftools_joint" + platypus_joint = "platypus_joint" + varscan_joint = "varscan_joint" + + +class BcfToolsJoint(SnappyModel): + max_depth: int = 4000 + max_indel_depth: int = 4000 + window_length: int = 10000000 + num_threads: int = 16 + + +class PlatypusJoint(SnappyModel): + split_complex_mnvs: bool = True + """whether or not to split complex and MNV variants""" + + num_threads: int = 16 + + +class Keep(enum.StrEnum): + ALWAYS = "always" + NEVER = "never" + ONERROR = "onerror" + + +class Parallel(SnappyModel): + num_cores: int = 2 + """number of cores to use locally""" + + window_length: int = 3500000 + """split input into windows of this size, each triggers a job""" + + num_jobs: int = 500 + """number of windows to process in parallel""" + + use_profile: bool = True + """use Snakemake profile for parallel processing""" + + restart_times: int = 5 + """number of times to re-launch jobs in case of failure""" + + max_jobs_per_second: int = 2 + """throttling of job creation""" + + max_status_checks_per_second: int = 10 + """throttling of status checks""" + + 
debug_trunc_tokens: int = 0
+    """truncation to first N tokens (0 for none)"""
+
+    keep_tmpdir: Keep = Keep.NEVER
+    """keep temporary directory, {always, never, onerror}"""
+
+    job_mult_memory: float = 1
+    """memory multiplier"""
+
+    job_mult_time: float = 1
+    """running time multiplier"""
+
+    merge_mult_memory: float = 1
+    """memory multiplier for merging"""
+
+    merge_mult_time: float = 1
+    """running time multiplier for merging"""
+
+
+class Mutect(Parallel):
+    pass
+
+
+def argument(args: list[str]) -> list[str]:
+    def _is_valid_argument(arg: str) -> bool:
+        return arg.startswith("--")
+
+    if not all(_is_valid_argument(arg) for arg in args):
+        raise ValueError(
+            f"invalid arguments: {list(filter(lambda x: not _is_valid_argument(x), args))}"
+        )
+    return args
+
+
+class Mutect2(Parallel):
+    # Sadly a type of
+    # `FilePath | None = None`
+    # still applies `FilePath` validation on `None`, which errors
+    panel_of_normals: str | None = ""
+    """Set path to panel of normals vcf if required"""
+
+    germline_resource: str | None = ""
+    """Germline variants resource (same as panel of normals)"""
+
+    common_variants: str | None = ""
+    """Common germline variants for contamination estimation"""
+
+    extra_arguments: Annotated[
+        list[str],
+        AfterValidator(argument),
+        Field(
+            examples=[
+                "--read-filter CigarContainsNoNOperator",
+                "--annotation AssemblyComplexity BaseQuality",
+            ]
+        ),
+    ] = []
+    """
+    List additional Mutect2 arguments.
+    Each additional argument must be of the form:
+    "--<argument> <value>"
+    For example, to filter reads prior to calling & to add annotations to the output vcf:
+    - "--read-filter CigarContainsNoNOperator"
+    - "--annotation AssemblyComplexity BaseQuality"
+    """
+
+    window_length: int = 50000000
+
+
+class Scalpel(SnappyModel):
+    path_target_regions: str
+
+
+class Strelka2(SnappyModel):
+    path_target_regions: str | None = None
+    """For exomes: include a bgzipped bed file with tabix index.
That also triggers the --exome flag""" + + +class GatkHcJoint(Parallel): + # GATK HC--specific configuration + allow_seq_dict_incompatibility: bool = False + annotations: list[str] = [ + "BaseQualityRankSumTest", + "FisherStrand", + "GCContent", + "HaplotypeScore", + "HomopolymerRun", + "MappingQualityRankSumTest", + "MappingQualityZero", + "QualByDepth", + "ReadPosRankSumTest", + "RMSMappingQuality", + "DepthPerAlleleBySample", + "Coverage", + "ClippingRankSumTest", + "DepthPerSampleHC", + ] + + window_length: int = 50000000 + + +class GatkUgJoint(Parallel): + # GATK UG--specific configuration + downsample_to_coverage: float = 250 + allow_seq_dict_incompatibility: bool = False + annotations: list[str] = [ + "BaseQualityRankSumTest", + "FisherStrand", + "GCContent", + "HaplotypeScore", + "HomopolymerRun", + "MappingQualityRankSumTest", + "MappingQualityZero", + "QualByDepth", + "ReadPosRankSumTest", + "RMSMappingQuality", + "DepthPerAlleleBySample", + "Coverage", + "ClippingRankSumTest", + "DepthPerSampleHC", + ] + + window_length: int = 50000000 + + +class SamtoolsMpileup(SnappyModel): + max_depth: int = 4000 + + max_indel_depth: int = 4000 + + min_bq: int = 13 + + no_baq: bool = True + + +class VarscanJoint(Parallel, SamtoolsMpileup): + min_coverage: float = 8 + + min_reads2: int = 2 + + min_avg_qual: float = 15 + + min_var_freq: Annotated[float, Field(ge=0, le=1)] = 0.01 + + min_freq_for_hom: Annotated[float, Field(ge=0, le=1)] = 0.75 + + p_value: Annotated[float, Field(ge=0, le=1)] = 99e-02 + + window_length: int = 5000000 + + +class SomaticVariantCalling(SnappyStepModel, validators.ToolsMixin): + tools: Annotated[list[Tool], EnumField(Tool, [], min_length=1)] + """List of tools""" + + path_ngs_mapping: str = "../ngs_mapping" + """Path to ngs_mapping""" + + ignore_chroms: Annotated[ + list[str], + Field(examples=["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"]), + ] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "GL000220.*"] + """Patterns of contig names to ignore""" + + bcftools_joint: BcfToolsJoint | None = None + """Configuration for joint calling with samtools+bcftools.""" + + platypus_joint: PlatypusJoint | None = None + """Configuration for joint calling with Platypus.""" + + mutect: Mutect | None = None + """Configuration for MuTect""" + + mutect2: Mutect2 | None = None + """Configuration for MuTect 2""" + + scalpel: Scalpel | None = None + """Configuration for Scalpel""" + + strelka2: Strelka2 | None = None + """Configuration for Strelka2""" + + gatk_hc_joint: GatkHcJoint | None = None + """Configuration for GatkHcJoint""" + + gatk_ug_joint: GatkUgJoint | None = None + """Configuration for GatkUgJoint""" + + varscan_joint: VarscanJoint | None = None + """Configuration for VarscanJoint""" diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile b/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile index 9d8b55792..f8be70fa1 100644 --- a/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile +++ b/snappy_pipeline/workflows/somatic_variant_filtration/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_variant_filtration import SomaticVariantFiltrationWorkflow +from snappy_pipeline.workflows.somatic_variant_filtration import ( + SomaticVariantFiltrationWorkflow, +) __author__ = "Manuel Holtgrewe " @@ -76,7 +78,7 @@ rule somatic_variant_filtration_eb_filter_write_panel: # Run DKFZ Bias Filter -------------------------------------------------------- -if 
config["step_config"]["somatic_variant_filtration"]["filtration_schema"] == "sets": +if wf.w_config.step_config["somatic_variant_filtration"].filtration_schema == "sets": rule somatic_variant_filtration_dkfz_bias_filter_run: input: @@ -158,7 +160,7 @@ if config["step_config"]["somatic_variant_filtration"]["filtration_schema"] == " # Flexible Somatic Variant Filtration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -if config["step_config"]["somatic_variant_filtration"]["filtration_schema"] == "list": +if wf.w_config.step_config["somatic_variant_filtration"].filtration_schema == "list": checkpoint one_dkfz: input: diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py b/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py index f56353c6a..f76fc5b3f 100644 --- a/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_filtration/__init__.py @@ -116,9 +116,9 @@ import random import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -136,6 +136,8 @@ SomaticVariantCallingWorkflow, ) +from .model import SomaticVariantFiltration as SomaticVariantFiltrationConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -145,53 +147,7 @@ EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") #: Default configuration for the somatic_variant_calling step -DEFAULT_CONFIG = r""" -# Default configuration variant_annotation -step_config: - somatic_variant_filtration: - path_somatic_variant: ../somatic_variant_annotation # When annotations are present, otherwise ../somatic_variant_calling - path_ngs_mapping: ../ngs_mapping # Needed for dkfz & ebfilter - tools_ngs_mapping: null # Default: use those defined in ngs_mapping step - tools_somatic_variant_calling: null # Default: use those defined in somatic_variant_calling step - tools_somatic_variant_annotation: null # Default: use those defined in somatic_variant_annotation step - has_annotation: True - filtration_schema: "list" # Either "sets" (old scheme- filter_sets) or "list" (new scheme- filter_list) - filter_sets: # Deprecated filtration method, use filter_list - # no_filter: no_filters # implicit, always defined - dkfz_only: '' # empty - dkfz_and_ebfilter: - ebfilter_threshold: 2.4 - dkfz_and_ebfilter_and_oxog: - vaf_threshold: 0.08 - coverage_threshold: 5 - dkfz_and_oxog: - vaf_threshold: 0.08 - coverage_threshold: 5 - exon_lists: {} # Deprecated filtration method, use filter_list - # genome_wide: null # implicit, always defined - # ensembl74: path/to/ensembl47.bed - eb_filter: # Deprecated filter, use in filter_list - shuffle_seed: 1 - panel_of_normals_size: 25 - min_mapq: 20 - min_baseq: 15 - filter_list: [] - # Available filters - # dkfz: {} # Not parametrisable - # ebfilter: - # ebfilter_threshold: 2.4 - # shuffle_seed: 1 - # panel_of_normals_size: 25 - # min_mapq: 20 - # min_baseq: 15 - # bcftools: - # include: "" # Expression to be used in bcftools view --include - # exclude: "" # Expression to be used in bcftools view --exclude - # regions: - # path_bed: REQUIRED # Bed file of regions to be considered (variants outside are filtered out) - # protected: - # path_bed: REQUIRED # Bed file of regions that should not be filtered out at all. 
-""" +DEFAULT_CONFIG = SomaticVariantFiltrationConfigModel.default_config_yaml_string() class SomaticVariantFiltrationStepPart(BaseStepPart): @@ -201,7 +157,7 @@ def __init__(self, parent): super().__init__(parent) self.config = parent.config self.name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: self.name_pattern += ".{annotator}" self.name_pattern += ".{tumor_library}" # Build shortcut from cancer bio sample name to matched cancer sample @@ -267,18 +223,24 @@ def get_input_files(self, action): def input_function(wildcards): filter_nb = int(wildcards["filter_nb"]) if filter_nb > 1: - prev = list(self.config["filter_list"][filter_nb - 2].keys())[0] + prev = list(self.config.filter_list[filter_nb - 2].keys())[0] n = filter_nb - 1 - yield "vcf", os.path.join( - "work", self.name_pattern, "out", self.name_pattern + f".{prev}_{n}.vcf.gz" + yield ( + "vcf", + os.path.join( + "work", self.name_pattern, "out", self.name_pattern + f".{prev}_{n}.vcf.gz" + ), ) else: - yield "vcf", os.path.join( - self.config["path_somatic_variant"], - "output", - self.name_pattern, - "out", - self.name_pattern + ".vcf.gz", + yield ( + "vcf", + os.path.join( + self.config.path_somatic_variant, + "output", + self.name_pattern, + "out", + self.name_pattern + ".vcf.gz", + ), ) return input_function @@ -313,17 +275,23 @@ def get_log_file(self, action): ("conda_list", ".conda_list.txt"), ) for key, ext in key_ext: - yield key, os.path.join( - "work", - self.name_pattern, - "log", - self.name_pattern + "." + self.filter_name + "_{filter_nb}" + ext, + yield ( + key, + os.path.join( + "work", + self.name_pattern, + "log", + self.name_pattern + "." + self.filter_name + "_{filter_nb}" + ext, + ), ) - yield key + "_md5", os.path.join( - "work", - self.name_pattern, - "log", - self.name_pattern + "." + self.filter_name + "_{filter_nb}" + ext + ".md5", + yield ( + key + "_md5", + os.path.join( + "work", + self.name_pattern, + "log", + self.name_pattern + "." + self.filter_name + "_{filter_nb}" + ext + ".md5", + ), ) def get_params(self, action): @@ -347,20 +315,26 @@ def input_function(wildcards): parent = super(OneFilterWithBamStepPart, self).get_input_files(action) yield from parent(wildcards).items() - yield "bam", os.path.join( - self.config["path_ngs_mapping"], - "output", - "{mapper}.{tumor_library}", - "out", - "{mapper}.{tumor_library}.bam", - ) - if normal_library := self.tumor_to_normal_library.get(wildcards["tumor_library"], None): - yield "normal", os.path.join( - self.config["path_ngs_mapping"], + yield ( + "bam", + os.path.join( + self.config.path_ngs_mapping, "output", - f"{{mapper}}.{normal_library}", + "{mapper}.{tumor_library}", "out", - f"{{mapper}}.{normal_library}.bam", + "{mapper}.{tumor_library}.bam", + ), + ) + if normal_library := self.tumor_to_normal_library.get(wildcards["tumor_library"], None): + yield ( + "normal", + os.path.join( + self.config.path_ngs_mapping, + "output", + f"{{mapper}}.{normal_library}", + "out", + f"{{mapper}}.{normal_library}.bam", + ), ) return input_function @@ -393,9 +367,12 @@ def input_function(wildcards): @dictify def _get_output_files_write_panel(self): - yield "txt", ( - "work/{mapper}.eb_filter.panel_of_normals/out/{mapper}.eb_filter." - "panel_of_normals.txt" + yield ( + "txt", + ( + "work/{mapper}.eb_filter.panel_of_normals/out/{mapper}.eb_filter." 
+ "panel_of_normals.txt" + ), ) def get_params(self, action): @@ -408,9 +385,9 @@ def input_function(wildcards): parent = super(OneFilterEbfilterStepPart, self).get_params(action) parameters = parent(wildcards) filter_nb = int(wildcards["filter_nb"]) - ebfilter_config = self.config["filter_list"][filter_nb - 1][self.filter_name] + ebfilter_config = self.config.filter_list[filter_nb - 1][self.filter_name] parameters.update(ebfilter_config) - parameters["has_annotation"] = self.config.get("has_annotation", False) + parameters["has_annotation"] = self.config.has_annotation return parameters return input_function @@ -428,14 +405,8 @@ def input_function(wildcards): parent = super(OneFilterBcftoolsStepPart, self).get_params(action) parameters = parent(wildcards) filter_nb = int(wildcards["filter_nb"]) - keywords = self.config["filter_list"][filter_nb - 1][self.filter_name] - msg = "Only one include or exclude expression is allowed in {} filter {} (configuration: {})" - assert len(keywords) == 1, msg.format(self.filter_name, filter_nb, keywords) - keyword = list(keywords.keys())[0] - msg = 'Unknown keyword "{}" in {} filter {} (allowed values: include, exclude. Configuration: {})' - assert keyword in ("include", "exclude"), msg.format( - keyword, self.filter_name, filter_nb, keywords - ) + filter = self.config.filter_list[filter_nb - 1][self.filter_name] + keywords = filter.keywords() parameters.update(keywords) return parameters @@ -454,18 +425,8 @@ def input_function(wildcards): parent = super(OneFilterRegionsStepPart, self).get_params(action) parameters = parent(wildcards) filter_nb = int(wildcards["filter_nb"]) - keywords = self.config["filter_list"][filter_nb - 1][self.filter_name] - msg = ( - "Only one include or exclude region is allowed in {} filter {} (configuration: {})" - ) - assert len(keywords) == 1, msg.format(self.filter_name, filter_nb, keywords) - keyword = list(keywords.keys())[0] - msg = 'Unknown keyword "{}" in {} filter {} (allowed values: include, exclude, path_bed (deprecated). Configuration: {})' - assert keyword in ("include", "exclude", "path_bed"), msg.format( - keyword, self.filter_name, filter_nb, keywords - ) - if keyword == "path_bed": - keywords = {"exclude": keywords["path_bed"]} + filter = self.config.filter_list[filter_nb - 1][self.filter_name] + keywords = filter.keywords() parameters.update(keywords) return parameters @@ -484,18 +445,8 @@ def input_function(wildcards): parent = super(OneFilterProtectedStepPart, self).get_params(action) parameters = parent(wildcards) filter_nb = int(wildcards["filter_nb"]) - keywords = self.config["filter_list"][filter_nb - 1][self.filter_name] - msg = ( - "Only one protected region bed file is allowed in {} filter {} (configuration: {})" - ) - assert len(keywords) == 1, msg.format(self.filter_name, filter_nb, keywords) - keyword = list(keywords.keys())[0] - msg = ( - 'Unknown keyword "{}" in {} filter {} (allowed value: path_bed. 
Configuration: {})' - ) - assert keyword in ("path_bed",), msg.format( - keyword, self.filter_name, filter_nb, keywords - ) + filter = self.config.filter_list[filter_nb - 1][self.filter_name] + keywords = filter.keywords() parameters.update(keywords) return parameters @@ -515,8 +466,8 @@ def get_input_files(self, action): # Validate action self._validate_action(action) - filter_names = [list(filter_name.keys())[0] for filter_name in self.config["filter_list"]] - filter_nb = len(self.config["filter_list"]) + filter_names = [list(filter_name.keys())[0] for filter_name in self.config.filter_list] + filter_nb = len(self.config.filter_list) filter_name = filter_names[filter_nb - 1] vcf = os.path.join( "work", @@ -538,7 +489,7 @@ def get_output_files(self, action): # Validate action self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".filtered.{tumor_library}" vcf = os.path.join("work", name_pattern, "out", name_pattern) @@ -561,7 +512,7 @@ def get_log_file(self, action): # Validate action self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".filtered.{tumor_library}" tpl = os.path.join("work", name_pattern, "log", name_pattern) @@ -591,7 +542,7 @@ def get_input_files(self, action): self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" # VCF file and index @@ -614,7 +565,7 @@ def get_output_files(self, action): self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" prefix = ( @@ -639,7 +590,7 @@ def _get_log_file(self, action): self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += f".{self.name}" prefix = os.path.join( @@ -657,7 +608,7 @@ def _get_log_file(self, action): for key, ext in key_ext: yield key, prefix + ext - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -691,11 +642,11 @@ def get_input_files(self, action): @dictify def _get_input_files_run(self, wildcards): name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" # VCF file and index name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" tpl = ( f"work/{name_pattern}." 
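
The get_params hunks above drop the assert-based parsing of filter_list entries in favour of the keywords() helpers defined on the new filter models (see somatic_variant_filtration/model.py further down in this diff). A minimal usage sketch, assuming the new module is importable as shown and using a purely illustrative bcftools expression:

from snappy_pipeline.workflows.somatic_variant_filtration.model import Bcftools

# Exactly one of `include`/`exclude` may be set; the model validator enforces this.
flt = Bcftools(include="INFO/DP >= 10")  # the expression is only an example
print(flt.keywords())  # -> {'include': 'INFO/DP >= 10'}
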
@@ -730,15 +681,15 @@ def get_params(self, action): """Return EBFilter parameters from the config""" # Validate action self._validate_action(action) - parameters = self.config["eb_filter"] - parameters.update(self.config["filter_sets"]["dkfz_and_ebfilter"]) - parameters["has_annotation"] = self.config.get("has_annotation", False) + parameters = self.config.eb_filter + parameters.update(self.config.filter_sets.dkfz_and_ebfilter) + parameters["has_annotation"] = self.config.has_annotation return parameters @dictify def _get_output_files_run(self): name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" # VCF file and index prefix = ( @@ -758,9 +709,12 @@ def _get_output_files_run(self): @dictify def _get_output_files_write_panel(self): - yield "txt", ( - "work/{mapper}.eb_filter.panel_of_normals/out/{mapper}.eb_filter." - "panel_of_normals.txt" + yield ( + "txt", + ( + "work/{mapper}.eb_filter.panel_of_normals/out/{mapper}.eb_filter." + "panel_of_normals.txt" + ), ) @dictify @@ -774,7 +728,7 @@ def _get_log_file(self, action): return {} else: name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".dkfz_bias_filter.eb_filter" prefix = os.path.join( @@ -808,15 +762,15 @@ def _get_panel_of_normal_bams(self, wildcards): if not bio_sample.extra_infos["isTumor"]: libraries.append(bio_sample.dna_ngs_library.name) libraries.sort() - random.seed(self.config["eb_filter"]["shuffle_seed"]) - lib_count = self.config["eb_filter"]["panel_of_normals_size"] + random.seed(self.config.eb_filter.shuffle_seed) + lib_count = self.config.eb_filter.panel_of_normals_size random.shuffle(libraries) ngs_mapping = self.parent.sub_workflows["ngs_mapping"] tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}" for library in libraries[:lib_count]: yield ngs_mapping(tpl.format(normal_library=library, **wildcards) + ".bam") - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
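
The _get_panel_of_normal_bams hunk above now reads shuffle_seed and panel_of_normals_size from config.eb_filter attributes. A self-contained sketch of the selection logic it uses (the function name is mine; sorting, the seeded shuffle, and the slice mirror the hunk):

import random


def select_panel_of_normals(normal_libraries: list[str], shuffle_seed: int, panel_size: int) -> list[str]:
    # Sort first so the result depends only on the seed (not on sample sheet order),
    # then keep the first `panel_size` libraries for EBFilter's panel of normals.
    libraries = sorted(normal_libraries)
    random.seed(shuffle_seed)
    random.shuffle(libraries)
    return libraries[:panel_size]

The defaults for both parameters (shuffle_seed=1, panel_of_normals_size=25) come from the EbfilterSet model added later in this diff.
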
@@ -848,7 +802,7 @@ class ApplyFiltersStepPart(SomaticVariantFiltrationStepPart): def __init__(self, parent): super().__init__(parent) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".dkfz_bias_filter.eb_filter" self.base_path_out = os.path.join( @@ -882,7 +836,7 @@ def get_input_files(self, action): # Validate action self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".dkfz_bias_filter.eb_filter.{tumor_library}" tpl = os.path.join("work", name_pattern, "out", name_pattern) @@ -918,7 +872,7 @@ class FilterToExonsStepPart(SomaticVariantFiltrationStepPart): def __init__(self, parent): super().__init__(parent) name_pattern = "{mapper}.{var_caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".dkfz_bias_filter.eb_filter" self.base_path_out = os.path.join( @@ -947,14 +901,17 @@ def get_input_files(self, action): @dictify def input_function(wildcards): for key, ext in zip(EXT_NAMES, EXT_VALUES): - yield key, self.base_path_in_.format( - tumor_library=wildcards.tumor_library, - mapper=wildcards.mapper, - var_caller=wildcards.var_caller, - annotator=wildcards.get("annotator", ""), - filter_set=wildcards.filter_set, - exon_list=wildcards.exon_list, - ext=ext, + yield ( + key, + self.base_path_in_.format( + tumor_library=wildcards.tumor_library, + mapper=wildcards.mapper, + var_caller=wildcards.var_caller, + annotator=wildcards.get("annotator", ""), + filter_set=wildcards.filter_set, + exon_list=wildcards.exon_list, + ext=ext, + ), ) return input_function @@ -997,7 +954,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (SomaticVariantAnnotationWorkflow, SomaticVariantCallingWorkflow, NgsMappingWorkflow), + config_model_class=SomaticVariantFiltrationConfigModel, + previous_steps=( + SomaticVariantAnnotationWorkflow, + SomaticVariantCallingWorkflow, + NgsMappingWorkflow, + ), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -1017,50 +979,48 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) # Register sub workflows self.register_sub_workflow( - "somatic_variant_annotation" - if self.config["has_annotation"] - else "somatic_variant_calling", - self.config["path_somatic_variant"], + ( + "somatic_variant_annotation" + if self.config.has_annotation + else "somatic_variant_calling" + ), + self.config.path_somatic_variant, "somatic_variant", ) - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_somatic_variant_calling"]: - self.config["tools_somatic_variant_calling"] = self.w_config["step_config"][ + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not self.config.tools_somatic_variant_calling: + self.config.tools_somatic_variant_calling = self.w_config.step_config[ "somatic_variant_calling" - ]["tools"] - 
if not self.config["tools_somatic_variant_annotation"]: - self.config["tools_somatic_variant_annotation"] = self.w_config["step_config"][ + ].tools + if not self.config.tools_somatic_variant_annotation: + self.config.tools_somatic_variant_annotation = self.w_config.step_config[ "somatic_variant_annotation" - ]["tools"] + ].tools @listify def get_result_files(self): """Return list of result files Process all primary DNA libraries and perform pairwise calling for tumor/normal pairs """ - mappers = set(self.config["tools_ngs_mapping"]) & set( - self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + mappers = set(self.config.tools_ngs_mapping) & set( + self.w_config.step_config["ngs_mapping"].tools.dna ) - callers = set(self.config["tools_somatic_variant_calling"]) & set( + callers = set(self.config.tools_somatic_variant_calling) & set( SOMATIC_VARIANT_CALLERS_MATCHED ) - if self.config["has_annotation"]: - annotators = set(self.config["tools_somatic_variant_annotation"]) & set( - ANNOTATION_TOOLS - ) + if self.config.has_annotation: + annotators = set(self.config.tools_somatic_variant_annotation) & set(ANNOTATION_TOOLS) else: annotators = [] log_ext = [e + m for e in ("log", "conda_list.txt", "conda_info.txt") for m in ("", ".md5")] - if self.config["filtration_schema"] == "list": + if self.config.filtration_schema == "list": name_pattern = "{mapper}.{caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".filtered.{tumor_library}" @@ -1086,13 +1046,15 @@ def get_result_files(self): ext=("", ".md5"), ) else: - filter_sets = ["no_filter"] - filter_sets += self.config["filter_sets"].keys() - exon_lists = ["genome_wide"] - exon_lists += list(self.config["exon_lists"].keys()) + filter_sets = {"no_filter"} + if self.config.filter_sets: + filter_sets |= self.config.filter_sets.keys() + exon_lists = {"genome_wide"} + if self.config.exon_lists: + exon_lists |= self.config.exon_lists.keys() name_pattern = "{mapper}.{caller}" - if self.config["has_annotation"]: + if self.config.has_annotation: name_pattern += ".{annotator}" name_pattern += ".dkfz_bias_filter.eb_filter.{tumor_library}.{filter_set}.{exon_list}" @@ -1137,16 +1099,3 @@ def _yield_result_files_matched(self, tpl, **kwargs): continue for ngs_library in test_sample.ngs_libraries.values(): yield from expand(tpl, tumor_library=[ngs_library.name], **kwargs) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "somatic_variant_filtration", "path_somatic_variant"), - "Path to variant calling not configured but required for somatic variant annotation", - ) - assert self.config["filtration_schema"] in ( - "sets", - "list", - ), "Filtration schema must be either 'list' or 'sets' (deprecated)" - if self.config["filtration_schema"] == "list": - assert len(self.config["filter_list"]) > 0, "No filter defined in the filter list" diff --git a/snappy_pipeline/workflows/somatic_variant_filtration/model.py b/snappy_pipeline/workflows/somatic_variant_filtration/model.py new file mode 100644 index 000000000..08e808c15 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_filtration/model.py @@ -0,0 +1,198 @@ +import enum +from typing import Annotated, Any, Self, TypedDict + +from pydantic import Field, model_validator + +from snappy_pipeline.models import SnappyModel, SnappyStepModel + + +class DkfzAndEbfilter(SnappyModel): + ebfilter_threshold: float = 2.4 + + +class DkfzAndEbfilterAndOxog(SnappyModel): 
+ vaf_threshold: float = 0.08 + coverage_threshold: float = 5 + + +class DkfzAndOxog(SnappyModel): + vaf_threshold: float = 0.08 + coverage_threshold: float = 5 + + +class FilterSets(SnappyModel): + no_filter: str | None = None + dkfz_only: str | None = None + dkfz_and_ebfilter: DkfzAndEbfilter | None = None + dkfz_and_ebfilter_and_oxog: DkfzAndEbfilterAndOxog | None = None + dkfz_and_oxog: DkfzAndOxog | None = None + + +class EbfilterSet(SnappyModel): + shuffle_seed: int = 1 + panel_of_normals_size: int = 25 + min_mapq: float = 20 + min_baseq: float = 15 + + +class Ebfilter(SnappyModel): + ebfilter_threshold: float = 2.4 + shuffle_seed: int = 1 + panel_of_normals_size: int = 25 + min_mapq: float = 20 + min_baseq: float = 15 + + +class Dkfz(SnappyModel): + pass + + +class Bcftools(SnappyModel): + include: str = "" + """Expression to be used in bcftools view --include""" + + exclude: str = "" + """Expression to be used in bcftools view --exclude""" + + @model_validator(mode="after") + def ensure_include_or_exclude(self) -> Self: + if not self.include and not self.exclude: + raise ValueError("Either include or exclude must be set") + if self.include and self.exclude: + raise ValueError("Only one of include or exclude may be set") + return self + + def keywords(self) -> dict[str, str]: + if self.include: + return {"include": self.include} + elif self.exclude: + return {"exclude": self.exclude} + return {} + + +class Regions(SnappyModel): + include: str = "" + """Expression to be used in bcftools view --include""" + + exclude: str = "" + """Expression to be used in bcftools view --exclude""" + + path_bed: Annotated[str, Field(deprecated="Use `exclude` instead")] = "" + """Bed file of regions to be considered (variants outside are filtered out)""" + + @model_validator(mode="after") + def ensure_include_or_exclude(self) -> Self: + if not any((self.include, self.exclude, self.path_bed)): + raise ValueError("Either include, exclude or path_bed must be set") + if sum((bool(self.include), bool(self.exclude), bool(self.path_bed))) > 1: + raise ValueError("Only one of include, exclude or path_bed may be set") + return self + + def keywords(self) -> dict[str, str]: + if self.include: + return {"include": self.include} + elif self.exclude: + return {"exclude": self.exclude} + elif self.path_bed: + return {"exclude": self.path_bed} # path_bed is deprecated and replaced by exclude + return {} + + +class Protected(SnappyModel): + path_bed: str + """Bed file of regions that should not be filtered out at all.""" + + def keywords(self) -> dict[str, str]: + if self.path_bed: + return {"path_bed": self.path_bed} + return {} + + +class Filter(TypedDict, total=False): + bcftools: Bcftools + dkfz: Dkfz + ebfilter: Ebfilter + regions: Regions + protected: Protected + + +class FiltrationSchema(enum.StrEnum): + list = "list" + sets = "sets" + + +class SomaticVariantFiltration(SnappyStepModel): + path_somatic_variant: Annotated[ + str, Field(examples=["../somatic_variant_annotation", "../somatic_variant_calling"]) + ] = "../somatic_variant" + + path_ngs_mapping: str = "../ngs_mapping" + """Needed for dkfz & ebfilter""" + + tools_ngs_mapping: list[str] | None = None + """Default: use those defined in ngs_mapping step""" + + tools_somatic_variant_calling: list[str] | None = None + """Default: use those defined in somatic_variant_calling step""" + + tools_somatic_variant_annotation: list[str] | None = None + """Default: use those defined in somatic_variant_annotation step""" + + has_annotation: bool = True + + 
filtration_schema: FiltrationSchema = FiltrationSchema.list + + filter_sets: Annotated[FilterSets | None, Field(deprecated="use filter_list instead")] = None + + exon_lists: Annotated[dict[str, Any], Field(deprecated="use filter_list instead")] = {} + + eb_filter: Annotated[EbfilterSet | None, Field(deprecated="use filter_list instead")] = ( + EbfilterSet() + ) + + filter_list: list[Filter] = [] + """ + Available filters + dkfz: {} # Not parametrisable + ebfilter: + ebfilter_threshold: 2.4 + shuffle_seed: 1 + panel_of_normals_size: 25 + min_mapq: 20 + min_baseq: 15 + bcftools: + include: "" # Expression to be used in bcftools view --include + exclude: "" # Expression to be used in bcftools view --exclude + regions: + path_bed: REQUIRED # Bed file of regions to be considered (variants outside are filtered out) + protected: + path_bed: REQUIRED # Bed file of regions that should not be filtered out at all. + """ + + @model_validator(mode="after") + def ensure_filter_list_is_configured_correctly(self): + if self.filter_list: + # check ebfilter and dkfz are only used at most once + num_ebfilter = num_dkfz = 0 + for f in self.filter_list: + if "ebfilter" in f: + num_ebfilter += 1 + if "dkfz" in f: + num_dkfz += 1 + if num_ebfilter > 1: + raise ValueError("Only one ebfilter is allowed") + if num_dkfz > 1: + raise ValueError("Only one dkfz is allowed") + return self + + @model_validator(mode="after") + def ensure_either_filter_sets_or_filter_list_is_configured(self): + if self.filtration_schema == FiltrationSchema.sets: + if not self.filter_sets: + raise ValueError("filter_sets must be set") + if self.filtration_schema == FiltrationSchema.list: + if not self.filter_list: + raise ValueError("filter_list must be set") + if self.filter_sets and self.filter_list: + raise ValueError("Either filter_sets or filter_list must be set") + return self diff --git a/snappy_pipeline/workflows/somatic_variant_signatures/Snakefile b/snappy_pipeline/workflows/somatic_variant_signatures/Snakefile index c98de0a98..33391ae1f 100644 --- a/snappy_pipeline/workflows/somatic_variant_signatures/Snakefile +++ b/snappy_pipeline/workflows/somatic_variant_signatures/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_variant_signatures import SomaticVariantSignaturesWorkflow +from snappy_pipeline.workflows.somatic_variant_signatures import ( + SomaticVariantSignaturesWorkflow, +) __author__ = "Clemens Messerschmidt" diff --git a/snappy_pipeline/workflows/somatic_variant_signatures/__init__.py b/snappy_pipeline/workflows/somatic_variant_signatures/__init__.py index 98da18bac..390f880a2 100644 --- a/snappy_pipeline/workflows/somatic_variant_signatures/__init__.py +++ b/snappy_pipeline/workflows/somatic_variant_signatures/__init__.py @@ -8,14 +8,13 @@ signature explains as well as a plot. 
""" - from collections import OrderedDict import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage @@ -29,20 +28,12 @@ ) from snappy_pipeline.workflows.somatic_variant_filtration import SomaticVariantFiltrationWorkflow +from .model import SomaticVariantSignatures as SomaticVariantSignaturesConfigModel + __author__ = "Clemens Messerschmidt" # Default configuration variant_signatures -DEFAULT_CONFIG = r""" -step_config: - somatic_variant_signatures: - is_filtered: false # REQUIRED - path_somatic_variant: ../somatic_variant_calling # REQUIRED - tools_ngs_mapping: [] # default to those configured for ngs_mapping - tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling - tools_somatic_variant_annotation: [] # default to those configured for somatic_variant_annotation - filters: [] # When using variants after the somatic_variant_filtration step, use "no_filter", "dkfz_only", "dkfz_and_ebfilter" or "dkfz_and_ebfilter_and_oxog" - filtered_regions: [] # When using variants after the somatic_variant_filtration step, use "genome_wide" or "" -""" +DEFAULT_CONFIG = SomaticVariantSignaturesConfigModel.default_config_yaml_string() class SignaturesStepPart(BaseStepPart): @@ -53,15 +44,15 @@ class SignaturesStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) - self.config = parent.w_config["step_config"]["somatic_variant_signatures"] + self.config = parent.w_config.step_config["somatic_variant_signatures"] self.name_pattern = "{mapper}.{var_caller}" - if self.config["is_filtered"]: - if len(self.config["filters"]) == 0: + if self.config.is_filtered: + if len(self.config.filters) == 0: self.name_pattern += ".{anno_caller}.filtered" else: self.name_pattern += ".{anno_caller}.dkfz_bias_filter.eb_filter" self.name_pattern += "." + self.name + ".{tumor_library}" - if self.config["is_filtered"] and len(self.config["filters"]) > 0: + if self.config.is_filtered and len(self.config.filters) > 0: self.name_pattern += ".{filter}.{region}" # Build shortcut from cancer bio sample name to matched cancre sample self.tumor_ngs_library_to_sample_pair = OrderedDict() @@ -80,7 +71,7 @@ def get_log_file(self, action): self._validate_action(action) return os.path.join("work", self.name_pattern, "log", "snakemake." + self.name + ".log") - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -109,13 +100,13 @@ def get_input_files(self, action): # Validate action self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["is_filtered"]: - if len(self.config["filters"]) == 0: + if self.config.is_filtered: + if len(self.config.filters) == 0: name_pattern += ".{anno_caller}.filtered" else: name_pattern += ".{anno_caller}.dkfz_bias_filter.eb_filter" name_pattern += ".{tumor_library}" - if self.config["is_filtered"] and len(self.config["filters"]) > 0: + if self.config.is_filtered and len(self.config.filters) > 0: name_pattern += ".{filter}.{region}" tpl = os.path.join("output", name_pattern, "out", name_pattern) key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} @@ -167,13 +158,13 @@ def get_input_files(self, action): # Validate action self._validate_action(action) name_pattern = "{mapper}.{var_caller}" - if self.config["is_filtered"]: - if len(self.config["filters"]) == 0: + if self.config.is_filtered: + if len(self.config.filters) == 0: name_pattern += ".{anno_caller}.filtered" else: name_pattern += ".{anno_caller}.dkfz_bias_filter.eb_filter" name_pattern += ".tabulate_vcf.{tumor_library}" - if self.config["is_filtered"] and len(self.config["filters"]) > 0: + if self.config.is_filtered and len(self.config.filters) > 0: name_pattern += ".{filter}.{region}" yield "tsv", os.path.join("work", name_pattern, "out", name_pattern + ".tsv") @@ -211,7 +202,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - ( + config_model_class=SomaticVariantSignaturesConfigModel, + previous_steps=( SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, SomaticVariantFiltrationWorkflow, @@ -219,44 +211,38 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ), ) # Register sub workflows - config = self.w_config["step_config"]["somatic_variant_signatures"] + config = self.w_config.step_config["somatic_variant_signatures"] sub_workflow = "somatic_variant_calling" - if config["is_filtered"]: + if config.is_filtered: sub_workflow = "somatic_variant_filtration" - self.register_sub_workflow(sub_workflow, config["path_somatic_variant"], "somatic_variant") + self.register_sub_workflow(sub_workflow, config.path_somatic_variant, "somatic_variant") # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not config["tools_ngs_mapping"]: - config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not config["tools_somatic_variant_calling"]: - config["tools_somatic_variant_calling"] = self.w_config["step_config"][ + if not config.tools_ngs_mapping: + config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not config.tools_somatic_variant_calling: + config.tools_somatic_variant_calling = self.w_config.step_config[ "somatic_variant_calling" - ]["tools"] - if not config["tools_somatic_variant_annotation"]: - config["tools_somatic_variant_annotation"] = self.w_config["step_config"][ + ].tools + if not config.tools_somatic_variant_annotation: + config.tools_somatic_variant_annotation = self.w_config.step_config[ "somatic_variant_annotation" - ]["tools"] - if config["is_filtered"]: - if len(self.w_config["step_config"]["somatic_variant_filtration"]["filter_list"]) > 0: - config["filters"] = [] - config["filtered_regions"] = [] + ].tools + if config.is_filtered: + if len(self.w_config.step_config["somatic_variant_filtration"].filter_list) > 0: + config.filters = [] + 
config.filtered_regions = [] else: - if not config["filters"]: - config["filters"] = list( - self.w_config["step_config"]["somatic_variant_filtration"][ - "filter_sets" - ].keys() + if not config.filters: + config.filters = list( + self.w_config.step_config["somatic_variant_filtration"].filter_sets.keys() ) - config["filters"].append("no_filter") - if not config["filtered_regions"]: - config["filtered_regions"] = list( - self.w_config["step_config"]["somatic_variant_filtration"][ - "exon_lists" - ].keys() + config.filters.append("no_filter") + if not config.filtered_regions: + config.filtered_regions = list( + self.w_config.step_config["somatic_variant_filtration"].exon_lists.keys() ) - config["filtered_regions"].append("genome_wide") - self.w_config["step_config"]["somatic_variant_signatures"] = config + config.filtered_regions.append("genome_wide") + self.w_config.step_config["somatic_variant_signatures"] = config # Register sub step classes so the sub steps are available self.register_sub_step_classes( (TabulateVariantsStepPart, DeconstructSigsStepPart, LinkOutStepPart) @@ -265,42 +251,41 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) @listify def get_result_files(self): """Return list of result files for workflow""" - config = self.w_config["step_config"]["somatic_variant_signatures"] + config = self.w_config.step_config["somatic_variant_signatures"] name_pattern = "{mapper}.{caller}" - if config["is_filtered"]: - if len(config["filters"]) > 0: + if config.is_filtered: + if len(config.filters) > 0: name_pattern += ".{anno_caller}.dkfz_bias_filter.eb_filter" else: name_pattern += ".{anno_caller}.filtered" name_pattern += ".deconstruct_sigs.{tumor_library.name}" - if config["is_filtered"] and len(config["filters"]) > 0: + if config.is_filtered and len(config.filters) > 0: name_pattern += ".{filter}.{region}" - mappers = set(config["tools_ngs_mapping"]) & set( - self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + mappers = set(config.tools_ngs_mapping) & set( + self.w_config.step_config["ngs_mapping"].tools.dna ) assert len(mappers) > 0, "No valid mapper" - callers = set(config["tools_somatic_variant_calling"]) & set( - SOMATIC_VARIANT_CALLERS_MATCHED - ) + callers = set(config.tools_somatic_variant_calling) & set(SOMATIC_VARIANT_CALLERS_MATCHED) assert len(callers) > 0, "No valid somatic variant caller" - if config["is_filtered"]: - anno_callers = set(config["tools_somatic_variant_annotation"]) & set(ANNOTATION_TOOLS) + + anno_callers = [] + filters = [] + regions = [] + if config.is_filtered: + anno_callers = set(config.tools_somatic_variant_annotation) & set(ANNOTATION_TOOLS) assert len(anno_callers) > 0, "No valid somatic variant annotation tool" - filters = list( - self.w_config["step_config"]["somatic_variant_filtration"]["filter_sets"].keys() - ) - filters.append("no_filter") - filters = set(filters) & set(config["filters"]) - regions = list( - self.w_config["step_config"]["somatic_variant_filtration"]["exon_lists"].keys() - ) - regions.append("genome_wide") - regions = set(regions) & set(config["filtered_regions"]) - else: - anno_callers = [] - filters = [] - regions = [] + if len(config.filters) > 0: + filters = list( + self.w_config.step_config["somatic_variant_filtration"].filter_sets.keys() + ) + filters.append("no_filter") + filters = set(filters) & set(config.filters) + regions = list( + self.w_config.step_config["somatic_variant_filtration"].exon_lists.keys() + ) + regions.append("genome_wide") + regions = set(regions) & 
set(config.filtered_regions) yield from self._yield_result_files_matched( os.path.join("output", name_pattern, "out", name_pattern + ".tsv"), @@ -334,8 +319,9 @@ def _yield_result_files_matched(self, tpl, **kwargs): ) def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "somatic_variant_signatures", "path_somatic_variant"), - "Path to variant calling not configured but required for somatic variant signatures", - ) + if self.config.is_filtered: + self.ensure_w_config( + ("step_config", "somatic_variant_filtration"), + "When is_filtered is set to True, " + "the somatic_variant_filtration step must be configured", + ) diff --git a/snappy_pipeline/workflows/somatic_variant_signatures/model.py b/snappy_pipeline/workflows/somatic_variant_signatures/model.py new file mode 100644 index 000000000..6b0091b6d --- /dev/null +++ b/snappy_pipeline/workflows/somatic_variant_signatures/model.py @@ -0,0 +1,29 @@ +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import SnappyStepModel + + +class SomaticVariantSignatures(SnappyStepModel): + is_filtered: bool = False + + path_somatic_variant: Annotated[str, Field(examples=["../somatic_variant_calling"])] + + tools_ngs_mapping: list[str] = [] + """default to those configured for ngs_mapping""" + + tools_somatic_variant_calling: list[str] = [] + """default to those configured for somatic_variant_calling""" + + tools_somatic_variant_annotation: list[str] = [] + """default to those configured for somatic_variant_annotation""" + + filters: list[str] = [] + """ + When using variants after the somatic_variant_filtration step, + use "no_filter", "dkfz_only", "dkfz_and_ebfilter" or "dkfz_and_ebfilter_and_oxog" + """ + + filtered_regions: list[str] = [] + """When using variants after the somatic_variant_filtration step, use "genome_wide" or """ "" diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile index ea996d6ee..2be8d1f1f 100644 --- a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.somatic_wgs_cnv_calling import SomaticWgsCnvCallingWorkflow +from snappy_pipeline.workflows.somatic_wgs_cnv_calling import ( + SomaticWgsCnvCallingWorkflow, +) __author__ = "Manuel Holtgrewe " diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py index 2785acb1d..d2dd4fb1f 100644 --- a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/__init__.py @@ -78,9 +78,9 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -90,6 +90,8 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import SomaticWgsCnvCalling as SomaticWgsCnvCallingConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -103,80 +105,7 @@ SOMATIC_WGS_CNV_CALLERS = ("canvas", "cnvetti", "control_freec") #: Default configuration for the 
somatic_variant_calling schema -DEFAULT_CONFIG = r""" -# Default configuration somatic_wgs_cnv_calling -step_config: - somatic_wgs_cnv_calling: - path_ngs_mapping: ../ngs_mapping # REQUIRED - path_somatic_variant_calling: ../somatic_variant_calling # REQUIRED - somatic_variant_calling_tool: null # REQUIRED - tools: [cnvetti] # REQUIRED, examples: 'cnvetti' and 'control_freec'. - canvas: - path_reference: REQUIRED # REQUIRED - path_filter_bed: REQUIRED # REQUIRED - path_genome_folder: REQUIRED # REQUIRED - cnvetti: - window_length: null - count_kind: null - segmentation: null - normalization: null - preset: deep_wgs # REQUIRED - presets: - deep_wgs: - window_length: 200 - count_kind: Coverage - segmentation: HaarSeg - normalization: MedianGcBinned - control_freec: - path_chrlenfile: REQUIRED #REQUIRED - path_mappability: REQUIRED #REQUIRED - path_mappability_enabled: False - window_size: -1 #set to a value >=0 you want a specific fixed window size - convert: - org_obj: org.Hs.eg.db::org.Hs.eg.db - tx_obj: TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene - bs_obj: BSgenome.Hsapiens.1000genomes.hs37d5::hs37d5 - cnvkit: - path_target: REQUIRED # Usually ../panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed - path_antitarget: REQUIRED # Usually ../panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed - path_panel_of_normals: REQUIRED # Usually ../panel_of_normals/output/{mapper}.cnvkit.create_panel/out/{mapper}.cnvkit.panel_of_normals.cnn - plot: True # Output plots (very slow) - min_mapq: 0 # [coverage] Mininum mapping quality score to count a read for coverage depth - count: False # [coverage] Alternative couting algorithm - gc_correction: True # [fix] Use GC correction - edge_correction: True # [fix] Use edge correction - rmask_correction: True # [fix] Use rmask correction - # BCBIO uses - # seg_method: haar - # seg_threshold: 0.0001 - # -- OR - # seg_method: cbs - # seg_threshold: 0.000001 - segmentation_method: cbs # [segment] One of cbs, flasso, haar, hmm, hmm-tumor, hmm-germline, none - segmentation_threshold: 0.000001 # [segment] Significance threshold (hmm methods: smoothing window size) - drop_low_coverage: False # [segment, call, genemetrics] Drop very low coverage bins - drop_outliers: 10 # [segment] Drop outlier bins (0 for no outlier filtering) - smooth_cbs: True # [segment] Additional smoothing of CBS segmentation (WARNING- not the default value) - center: "" # [call] Either one of mean, median, mode, biweight, or a constant log2 ratio value. - filter: ampdel # [call] One of ampdel, cn, ci, sem (merging segments flagged with the specified filter), "" for no filtering - calling_method: threshold # [call] One of threshold, clonal, none - call_thresholds: "-1.1,-0.25,0.2,0.7" # [call] Thresholds for calling integer copy number - ploidy: 2 # [call] Ploidy of sample cells - purity: 0 # [call] Estimated tumor cell fraction (0 for discarding tumor cell purity) - gender: "" # [call, diagram] Specify the chromosomal sex of all given samples as male or female. 
Guess when missing - male_reference: False # [call, diagram] Create male reference - diagram_threshold: 0.5 # [diagram] Copy number change threshold to label genes - diagram_min_probes: 3 # [diagram] Min number of covered probes to label genes - shift_xy: True # [diagram] Shift X & Y chromosomes according to sample sex - breaks_min_probes: 1 # [breaks] Min number of covered probes for a break inside the gene - genemetrics_min_probes: 3 # [genemetrics] Min number of covered probes to consider a gene - genemetrics_threshold: 0.2 # [genemetrics] Min abs log2 change to consider a gene - genemetrics_alpha: 0.05 # [genemetrics] Significance cutoff - genemetrics_bootstrap: 100 # [genemetrics] Number of bootstraps - segmetrics_alpha: 0.05 # [segmetrics] Significance cutoff - segmetrics_bootstrap: 100 # [segmetrics] Number of bootstraps - smooth_bootstrap: False # [segmetrics] Smooth bootstrap results -""" +DEFAULT_CONFIG = SomaticWgsCnvCallingConfigModel.default_config_yaml_string() class SomaticWgsCnvCallingStepPart(BaseStepPart): @@ -212,7 +141,6 @@ def __init__(self, parent): ) def get_input_files(self, action): - # Validate action self._validate_action(action) @@ -281,24 +209,7 @@ class CanvasSomaticWgsStepPart(SomaticWgsCnvCallingStepPart): #: Class available actions actions = ("run",) - def check_config(self): - """Check configuration for Canvas Somatic WGS CNV calling""" - if "canvas" not in (self.config["tools"] or []): # pylint: disable=C0325 - return # Canvas not enabled, skip # pragma: no cover - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "canvas", "path_reference"), - "Path to Canvas reference file not configured", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "canvas", "path_filter_bed"), - "Path to Canvas filter BED file not configured", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "canvas", "path_genome_folder"), - "Path to Canvas genome folder not configured", - ) - - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
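The hand-written YAML defaults removed above are now derived from the pydantic configuration model: DEFAULT_CONFIG is whatever SomaticWgsCnvCallingConfigModel.default_config_yaml_string() renders from the field defaults declared in model.py (added further down in this diff). The helper itself is not shown here; the snippet below is only a rough, self-contained sketch of the idea, using plain pydantic and PyYAML with a trimmed-down stand-in model.

import yaml
from pydantic import BaseModel


class CnvCallingDefaults(BaseModel):
    # Trimmed stand-in for SomaticWgsCnvCalling; only fields with declared defaults.
    path_ngs_mapping: str = "../ngs_mapping"
    tools: list[str] = ["cnvetti"]


def default_config_yaml_string(model_cls: type[BaseModel], step_name: str) -> str:
    # Instantiate with defaults only and serialize under the step_config key,
    # mirroring the layout of the removed DEFAULT_CONFIG string.
    defaults = model_cls().model_dump(mode="json")
    return yaml.safe_dump({"step_config": {step_name: defaults}}, sort_keys=False)


print(default_config_yaml_string(CnvCallingDefaults, "somatic_wgs_cnv_calling"))
# step_config:
#   somatic_wgs_cnv_calling:
#     path_ngs_mapping: ../ngs_mapping
#     tools:
#     - cnvetti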
@@ -359,8 +270,11 @@ def _get_input_files_tumor_normal_ratio(self, wildcards): name_pattern = "{mapper}.cnvetti_coverage.{library_name}".format( library_name=library_name, **wildcards ) - yield key, "work/{name_pattern}/out/{name_pattern}{ext}".format( - name_pattern=name_pattern, ext=".bcf" + yield ( + key, + "work/{name_pattern}/out/{name_pattern}{ext}".format( + name_pattern=name_pattern, ext=".bcf" + ), ) @dictify @@ -368,8 +282,11 @@ def _get_input_files_segment(self, wildcards): """Return input files that "cnvetti segment" needs""" for key, ext in self.bcf_dict.items(): name_pattern = "{mapper}.cnvetti_tumor_normal_ratio.{library_name}".format(**wildcards) - yield key, "work/{name_pattern}/out/{name_pattern}{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + key, + "work/{name_pattern}/out/{name_pattern}{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) def get_output_files(self, action): @@ -382,24 +299,33 @@ def get_output_files(self, action): def _get_output_files_coverage(self): for key, ext in self.bcf_dict.items(): name_pattern = "{mapper}.cnvetti_coverage.{library_name}" - yield key, "work/{name_pattern}/out/{name_pattern}{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + key, + "work/{name_pattern}/out/{name_pattern}{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) @dictify def _get_output_files_tumor_normal_ratio(self): for key, ext in self.bcf_dict.items(): name_pattern = "{mapper}.cnvetti_tumor_normal_ratio.{library_name}" - yield key, "work/{name_pattern}/out/{name_pattern}{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + key, + "work/{name_pattern}/out/{name_pattern}{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) @dictify def _get_output_files_segment(self): for key, ext in self.bcf_dict.items(): name_pattern = "{mapper}.cnvetti_segment.{library_name}" - yield key, "work/{name_pattern}/out/{name_pattern}{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + key, + "work/{name_pattern}/out/{name_pattern}{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) @dictify @@ -415,12 +341,7 @@ def get_log_file(self, action): for key, ext in key_ext: yield key, prefix + ext - def check_config(self): - """Check configuration for CNVetti WGS CNV calling""" - if "cnvetti" not in (self.config["tools"] or []): # pylint: disable=C0325 - return # CNVetti not enabled, skip - - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
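The cnvetti coverage/ratio/segment hunks above only change how the yielded pairs are written (ruff formats them as explicit tuples); the @dictify decorator imported from snappy_pipeline.utils still collects them into a dict keyed by output name. That decorator is not part of this diff, so the following is only a minimal sketch of the pattern it implements, with placeholder keys and extensions.

import functools


def dictify(gen):
    """Collect the (key, value) pairs yielded by a generator function into a dict."""

    @functools.wraps(gen)
    def wrapper(*args, **kwargs):
        return dict(gen(*args, **kwargs))

    return wrapper


@dictify
def _get_output_files_segment():
    name_pattern = "{mapper}.cnvetti_segment.{library_name}"
    # Placeholder mapping; the real keys and extensions come from self.bcf_dict.
    for key, ext in {"bcf": ".bcf", "bcf_csi": ".bcf.csi"}.items():
        yield (
            key,
            "work/{name_pattern}/out/{name_pattern}{ext}".format(
                name_pattern=name_pattern, ext=ext
            ),
        )


print(_get_output_files_segment()["bcf"])
# work/{mapper}.cnvetti_segment.{library_name}/out/{mapper}.cnvetti_segment.{library_name}.bcf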
@@ -477,23 +398,6 @@ class CnvkitSomaticWgsStepPart(SomaticWgsCnvCallingStepPart): def __init__(self, parent): super().__init__(parent) - def check_config(self): - """Check configuration for cnvkit""" - if "cnvkit" not in self.config["tools"]: - return # cnvkit not enabled, skip - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_target"), - "Path to target regions is missing for cnvkit", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_antitarget"), - "Path to antitarget regions is missing for cnvkit", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "cnvkit", "path_panel_of_normals"), - "Path to panel of normals (reference) is missing for cnvkit", - ) - def get_input_files(self, action): """Return input paths input function, dependent on rule""" # Validate action @@ -701,7 +605,7 @@ def get_log_file(self, action): log_files[key + "_md5"] = prefix + ext + ".md5" return log_files - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -786,20 +690,7 @@ def get_output_files(self, action): return result - def check_config(self): - """Check configuration for ControlFreec Somatic WGS CNV calling""" - if "control_freec" not in (self.config["tools"] or []): # pylint: disable=C0325 - return # ControlFreec not enabled, skip # pragma: no cover - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "control_freec", "path_chrlenfile"), - "Path to ControlFreec ChrLenFile not configured", - ) - self.parent.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "control_freec", "path_mappability"), - "Path to ControlFreec mappability file not configured", - ) - - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
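The cnvkit and control_freec check_config methods removed in these hunks (like the canvas one above) are covered by the new configuration model: the corresponding paths are declared as required fields on the tool models in model.py below, so a configuration block that omits them is rejected when the step configuration is validated. A minimal sketch of that behaviour, with plain pydantic.BaseModel standing in for SnappyModel, only the two path fields of ControlFreec, and a placeholder path:

from pydantic import BaseModel, ValidationError


class ControlFreec(BaseModel):
    # No defaults: both paths must be given in the step configuration.
    path_chrlenfile: str
    path_mappability: str


try:
    ControlFreec.model_validate({"path_chrlenfile": "/static/GRCh37/chr_lengths.txt"})
except ValidationError as exc:
    # The missing path is reported by the model itself, replacing the
    # ensure_w_config(...) calls that used to live in check_config.
    print(exc.error_count(), "missing field(s)")  # 1 missing field(s)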
@@ -837,7 +728,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SomaticWgsCnvCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -850,19 +742,17 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) self.register_sub_workflow( - "somatic_variant_calling", self.config["path_somatic_variant_calling"] + "somatic_variant_calling", self.config.path_somatic_variant_calling ) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["somatic_variant_calling_tool"]: - self.config["somatic_variant_calling_tool"] = self.w_config["step_config"][ + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not self.config.somatic_variant_calling_tool: + self.config.somatic_variant_calling_tool = self.w_config.step_config[ "somatic_variant_calling" - ]["tools"][0] + ].tools[0] @listify def get_result_files(self): @@ -873,25 +763,25 @@ def get_result_files(self): name_pattern = "{mapper}.{caller}.{cancer_library.name}" tpl = os.path.join("output", name_pattern, "out", name_pattern + "{ext}") vcf_tools = [ - t for t in self.config["tools"] if t not in ("cnvetti", "control_freec", "cnvkit") + t for t in self.config.tools if t not in ("cnvetti", "control_freec", "cnvkit") ] - bcf_tools = [t for t in self.config["tools"] if t in ("cnvetti",)] + bcf_tools = [t for t in self.config.tools if t in ("cnvetti",)] yield from self._yield_result_files( tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller=vcf_tools, ext=EXT_VALUES, ) yield from self._yield_result_files( tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller=bcf_tools, ext=BCF_EXT_VALUES, ) - if "control_freec" in self.config["tools"]: + if "control_freec" in self.config.tools: yield from self._yield_result_files( tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller="control_freec", ext=[ ".ratio.txt", @@ -905,17 +795,17 @@ def get_result_files(self): ], ) # Plots for cnvetti - if "cnvkit" in self.config["tools"]: + if "cnvkit" in self.config.tools: exts = (".cnr", ".cns", ".bed", ".seg", ".vcf.gz", ".vcf.gz.tbi") yield from self._yield_result_files( tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller="cnvkit", ext=exts, ) yield from self._yield_result_files( tpl, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller="cnvkit", ext=[ext + ".md5" for ext in exts], ) @@ -926,10 +816,10 @@ def get_result_files(self): "{mapper}.{caller}.{cancer_library.name}.{ext}" ), [(report, "txt", False) for report in reports], - 
mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller="cnvkit", ) - if self.config["cnvkit"]["plot"]: + if self.config.cnvkit.plot: plots = ( ("diagram", "pdf", False), ("heatmap", "pdf", True), @@ -941,7 +831,7 @@ def get_result_files(self): "{mapper}.{caller}.{cancer_library.name}.{ext}" ), plots, - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, caller="cnvkit", ) if "cnvetti" in bcf_tools: @@ -954,7 +844,7 @@ def get_result_files(self): os.path.join( "output", name_pattern, "out", name_pattern + "_genome" + ext ), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, donor=[donor.name], ) yield from expand( @@ -964,7 +854,7 @@ def get_result_files(self): "out", name_pattern + "_chr{chrom}" + ext, ), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, donor=[donor.name], chrom=map(str, chain(range(1, 23), ("X", "Y"))), ) @@ -1032,17 +922,3 @@ def _yield_report_files(self, tpl, exts, **kwargs): ext=plot + ".chr" + c + "." + ext + ".md5", **kwargs, ) - - def check_config(self): - """Check that the necessary configuration is available for the step""" - self.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for somatic variant calling", - ) - self.ensure_w_config( - ("step_config", "somatic_wgs_cnv_calling", "path_somatic_variant_calling"), - ( - "Path to somatic (small) variant calling not configured but required for somatic " - "WGS CNV calling" - ), - ) diff --git a/snappy_pipeline/workflows/somatic_wgs_cnv_calling/model.py b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/model.py new file mode 100644 index 000000000..05b39d9ee --- /dev/null +++ b/snappy_pipeline/workflows/somatic_wgs_cnv_calling/model.py @@ -0,0 +1,98 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators +from snappy_pipeline.models.cnvkit import Cnvkit + + +class Tool(enum.StrEnum): + cnvetti = "cnvetti" + control_freec = "control_freec" + canvas = "canvas" + cnvkit = "cnvkit" + + +class Canvas(SnappyModel): + path_reference: str + """Path to Canvas reference file""" + + path_filter_bed: str + """Path to Canvas filter BED file""" + + path_genome_folder: str + """Path to Canvas genome folder""" + + +class CnvettiPreset(SnappyModel): + window_length: int + count_kind: str + segmentation: str + normalization: str + + +class Cnvetti(SnappyModel): + window_length: int | None = None + count_kind: str | None = None + segmentation: str | None = None + normalization: str | None = None + presets: dict[str, CnvettiPreset] = { + "deep_wgs": CnvettiPreset( + **{ + "window_length": 200, + "count_kind": "Coverage", + "segmentation": "HaarSeg", + "normalization": "MedianGcBinned", + } + ) + } + preset: str = "deep_wgs" + + +class ControlFreecConvert(SnappyModel): + org_obj: str = "org.Hs.eg.db::org.Hs.eg.db" + + tx_obj: str = "TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene" + + bs_obj: str = "BSgenome.Hsapiens.1000genomes.hs37d5::hs37d5" + + +class ControlFreec(SnappyModel): + path_chrlenfile: str + """Path to ControlFreec ChrLenFile""" + + path_mappability: str + """Path to ControlFreec mappability 
file""" + + path_mappability_enabled: bool = False + + window_size: int = -1 + """set to a value >=0 you want a specific fixed window size""" + + convert: ControlFreecConvert + + +# If defaults need to be overwritten, subclass the model and override the defaults +class CnvkitWgs(Cnvkit): + pass + + +class SomaticWgsCnvCalling(SnappyStepModel, validators.ToolsMixin): + path_ngs_mapping: str = "../ngs_mapping" + + tools_ngs_mapping: list[str] = [] + + path_somatic_variant_calling: Annotated[str, Field(examples=["../somatic_variant_calling"])] + + somatic_variant_calling_tool: str + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.cnvetti], min_length=1)] + + canvas: Canvas | None = None + + cnvetti: Cnvetti | None = None + + control_freec: ControlFreec | None = None + + cnvkit: CnvkitWgs | None = None diff --git a/snappy_pipeline/workflows/somatic_wgs_sv_calling/__init__.py b/snappy_pipeline/workflows/somatic_wgs_sv_calling/__init__.py index 40eb06df8..68b0be6f4 100644 --- a/snappy_pipeline/workflows/somatic_wgs_sv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_wgs_sv_calling/__init__.py @@ -77,9 +77,9 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -89,6 +89,8 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import SomaticWgsSvCalling as SomaticWgsSvCallingConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -101,16 +103,7 @@ SOMATIC_VARIANT_CALLERS = ("manta", "delly2") #: Default configuration for the somatic_wgs_sv_calling schema -DEFAULT_CONFIG = r""" -# Default configuration somatic_wgs_sv_calling -step_config: - somatic_wgs_sv_calling: - path_ngs_mapping: ../ngs_mapping # REQUIRED - tools: [manta] # REQUIRED - available: 'delly2' and 'manta' - delly2: - path_exclude_tsv: null # optional - max_threads: 16 -""" +DEFAULT_CONFIG = SomaticWgsSvCallingConfigModel.default_config_yaml_string() class SomaticWgsSvCallingStepPart(BaseStepPart): @@ -196,7 +189,7 @@ class MantaStepPart(SomaticWgsSvCallingStepPart): #: Class available actions actions = ("run",) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -271,8 +264,9 @@ def _get_input_files_call(self, wildcards): normal_tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}{ext}" norm_lib = self.get_normal_lib_name(wildcards) for name, ext in {"normal_bam": ".bam", "normal_bai": ".bam.bai"}.items(): - yield name, ngs_mapping( - normal_tpl.format(ext=ext, normal_library=norm_lib, **wildcards) + yield ( + name, + ngs_mapping(normal_tpl.format(ext=ext, normal_library=norm_lib, **wildcards)), ) tumor_tpl = "output/{mapper}.{cancer_library}/out/{mapper}.{cancer_library}{ext}" for name, ext in {"tumor_bam": ".bam", "tumor_bai": ".bam.bai"}.items(): @@ -389,7 +383,7 @@ def get_log_file(self, action): infix = self.dir_infixes[action].replace(r",[^\.]+", "") return "work/" + infix + "/log/snakemake.log" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -431,12 +425,13 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SomaticWgsSvCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((Delly2StepPart, MantaStepPart, LinkOutStepPart)) # Initialize sub-workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @listify def get_result_files(self): @@ -447,8 +442,8 @@ def get_result_files(self): name_pattern = "{mapper}.{caller}.{cancer_library.name}" yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - caller=self.config["tools"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, + caller=self.config.tools, ext=EXT_VALUES, ) @@ -474,10 +469,6 @@ def _yield_result_files(self, tpl, **kwargs): def check_config(self): """Check that the necessary configuration is available for the step""" - self.ensure_w_config( - ("step_config", "somatic_wgs_sv_calling", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for somatic WGS SV calling", - ) self.ensure_w_config( ("static_data_config", "reference", "path"), "Path to reference FASTA file required by not available", diff --git a/snappy_pipeline/workflows/somatic_wgs_sv_calling/model.py b/snappy_pipeline/workflows/somatic_wgs_sv_calling/model.py new file mode 100644 index 000000000..d24935f61 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_wgs_sv_calling/model.py @@ -0,0 +1,27 @@ +import enum +from typing import Annotated + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + manta = "manta" + delly2 = "delly2" + + +class Manta(SnappyModel): + pass + + +class Delly2(SnappyModel): + path_exclude_tsv: str | None = None + max_threads: int = 16 + + +class SomaticWgsSvCalling(SnappyStepModel, validators.ToolsMixin): + path_ngs_mapping: str = "../ngs_mapping" + tools: Annotated[list[Tool], EnumField(Tool, [Tool.manta], min_length=1)] + + manta: Manta | None = None + + delly2: Delly2 | None = None diff --git a/snappy_pipeline/workflows/sv_calling_targeted/__init__.py b/snappy_pipeline/workflows/sv_calling_targeted/__init__.py index 3cb8d4675..fa804c1c8 100644 --- 
a/snappy_pipeline/workflows/sv_calling_targeted/__init__.py +++ b/snappy_pipeline/workflows/sv_calling_targeted/__init__.py @@ -7,8 +7,7 @@ import re from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background - -from snappy_pipeline.utils import DictQuery, dictify, listify +from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart from snappy_pipeline.workflows.common.delly import Delly2StepPart from snappy_pipeline.workflows.common.gcnv.gcnv_run import RunGcnvStepPart @@ -16,6 +15,8 @@ from snappy_pipeline.workflows.common.melt import MeltStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import SvCallingTargeted as SvCallingTargetedConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload (VCF) @@ -31,68 +32,7 @@ GCNV_MIN_KIT_SAMPLES = 10 #: Default configuration for the sv_calling_targeted step -DEFAULT_CONFIG = r""" -# Default configuration sv_calling_targeted -step_config: - sv_calling_targeted: - # Path to the ngs_mapping step. - path_ngs_mapping: ../ngs_mapping - - # List of used tools - tools: [gcnv, delly2, manta] # REQUIRED - - # The following allows to define one or more set of target intervals. This is only used by gcnv. - # - # Example: - # - # - name: "Agilent SureSelect Human All Exon V6" - # pattern: "Agilent SureSelect Human All Exon V6.*" - # path: "path/to/targets.bed" - path_target_interval_list_mapping: [] - - gcnv: - # Path to interval block list with PAR region for contig calling. - path_par_intervals: null # REQUIRED - # Path to gCNV model - will execute analysis in CASE MODE. - # - # Example: - # - # - library: "Agilent SureSelect Human All Exon V6" # Kit name, match in path_target_interval_list_mapping - # contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy` - # model_pattern: /path/to/model_* # Output from `GermlineCNVCaller` - precomputed_model_paths: [] - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - - delly2: - path_exclude_tsv: null # optional - map_qual: 1 - geno_qual: 5 - qual_tra: 20 - mad_cutoff: 9 - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - - manta: - num_threads: 16 - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - - melt: - me_refs_infix: 1KGP_Hg19 - me_types: - - ALU - - LINE1 - - SVA - jar_file: REQUIRED - genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed # adjust, e.g., Hg38/Hg38.genes.bed - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. 
- skip_libraries: [] -""" +DEFAULT_CONFIG = SvCallingTargetedConfigModel.default_config_yaml_string() class GcnvTargetedStepPart(RunGcnvStepPart): @@ -119,7 +59,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SvCallingTargetedConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Build mapping from NGS library name to kit self.ngs_library_to_kit = self._build_ngs_library_to_kit() @@ -134,22 +75,20 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Build dictionary with sample count per library kit _, _, self.library_kit_counts_dict = self.pick_kits_and_donors() @dictify def _build_ngs_library_to_kit(self): - config = DictQuery(self.w_config).get("step_config/sv_calling_targeted/gcnv") - if not config["path_target_interval_list_mapping"]: + config = self.w_config.step_config["sv_calling_targeted"].gcnv + if not config.path_target_interval_list_mapping: # No mapping given, we will use the "default" one for all. for donor in self.all_donors(): if donor.dna_ngs_library: yield donor.dna_ngs_library.name, "default" # Build mapping - regexes = { - item["pattern"]: item["name"] for item in config["path_target_interval_list_mapping"] - } + regexes = {item.pattern: item.name for item in config.path_target_interval_list_mapping} result = {} for donor in self.all_donors(): if donor.dna_ngs_library and donor.dna_ngs_library.extra_infos.get("libraryKit"): @@ -216,10 +155,3 @@ def pick_kits_and_donors(self): if donor.dna_ngs_library and donor.dna_ngs_library.name in self.ngs_library_to_kit ] return list(sorted(set(self.ngs_library_to_kit.values()))), donors, kit_counts - - def check_config(self): - """Check that the necessary configuration is available for the step""" - self.ensure_w_config( - config_keys=("step_config", "sv_calling_targeted", "path_ngs_mapping"), - msg="Path to NGS mapping not configured but required for targeted seq. CNV calling", - ) diff --git a/snappy_pipeline/workflows/sv_calling_targeted/model.py b/snappy_pipeline/workflows/sv_calling_targeted/model.py new file mode 100644 index 000000000..72c829825 --- /dev/null +++ b/snappy_pipeline/workflows/sv_calling_targeted/model.py @@ -0,0 +1,110 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators +from snappy_pipeline.models.gcnv import PrecomputedModelEntry, TargetIntervalEntry + + +class Tool(enum.StrEnum): + gcnv = "gcnv" + delly2 = "delly2" + manta = "manta" + melt = "melt" + + +class Gcnv(SnappyModel): + # path_par_intervals: str = "" + # """Path to interval block list with PAR region for contig calling.""" + + # path_uniquely_mapable_bed: str + # """path to BED file with uniquely mappable regions.""" + + path_target_interval_list_mapping: list[TargetIntervalEntry] + """ + The following allows to define one or more set of target intervals. This is only used by gcnv. + Example: + - name: "Agilent SureSelect Human All Exon V6" + pattern: "Agilent SureSelect Human All Exon V6.*" + path: "path/to/targets.bed" + """ + + # Not sure if this can/should be empty be default; can we set a required flag? 
+ precomputed_model_paths: list[PrecomputedModelEntry] = [] + """ + Path to gCNV model - will execute analysis in CASE MODE. + Example: + - library: "Agilent SureSelect Human All Exon V6" # Kit name, match in path_target_interval_list_mapping + contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy` + model_pattern: /path/to/model_* # Output from `GermlineCNVCaller` + """ + + skip_libraries: list[str] = [] + """ + Skip processing of the following libraries. + If the library is in family/pedigree then all of the family/pedigree will be skipped. + """ + + +class Delly2(SnappyModel): + path_exclude_tsv: str | None = None + + map_qual: int = 1 + + geno_qual: int = 5 + + qual_tra: int = 20 + + mad_cutoff: int = 9 + + skip_libraries: list[str] = [] + """ + Skip processing of the following libraries. + If the library is in family/pedigree then all of the family/pedigree will be skipped. + """ + + +class Manta(SnappyModel): + num_threads: int = 16 + + skip_libraries: list[str] = [] + """ + Skip processing of the following libraries. + If the library is in family/pedigree then all of the family/pedigree will be skipped. + """ + + +class Melt(SnappyModel): + me_refs_infix: str = Field(examples=["1KGP_Hg19"]) + + me_types: list[str] = ["ALU", "LINE1", "SVA"] + + jar_file: str = Field(examples=["MELT.jar"]) + + genes_file: str = Field(examples=["add_bed_files/1KGP_Hg19/hg19.genes.bed"]) + """adjust, e.g., Hg38/Hg38.genes.bed""" + + me_refs_path: str = Field(examples=["me_refs"]) + + skip_libraries: list[str] = [] + """ + Skip processing of the following libraries. + If the library is in family/pedigree then all of the family/pedigree will be skipped. + """ + + +class SvCallingTargeted(SnappyStepModel, validators.ToolsMixin): + path_ngs_mapping: str = "../ngs_mapping" + + tools: Annotated[ + list[Tool], EnumField(Tool, [Tool.gcnv, Tool.delly2, Tool.manta], min_length=1) + ] + + gcnv: Gcnv | None = None + + delly2: Delly2 | None = None + + manta: Manta | None = None + + melt: Melt | None = None diff --git a/snappy_pipeline/workflows/sv_calling_wgs/Snakefile b/snappy_pipeline/workflows/sv_calling_wgs/Snakefile index 7f1574698..a0a70e04a 100644 --- a/snappy_pipeline/workflows/sv_calling_wgs/Snakefile +++ b/snappy_pipeline/workflows/sv_calling_wgs/Snakefile @@ -484,7 +484,7 @@ rule sv_calling_wgs_gcnv_contig_ploidy: wf.get_log_file("gcnv", "contig_ploidy"), params: args=wf.get_params("gcnv", "contig_ploidy"), - step_key="sv_calling_targeted", + step_key="sv_calling_wgs", wrapper: wf.wrapper_path("gcnv/contig_ploidy_case_mode") diff --git a/snappy_pipeline/workflows/sv_calling_wgs/__init__.py b/snappy_pipeline/workflows/sv_calling_wgs/__init__.py index 5e8569d1f..2b258c9a6 100644 --- a/snappy_pipeline/workflows/sv_calling_wgs/__init__.py +++ b/snappy_pipeline/workflows/sv_calling_wgs/__init__.py @@ -1,11 +1,9 @@ -"""Implementation of the ``sv_calling_wgs`` step -""" +"""Implementation of the ``sv_calling_wgs`` step""" from itertools import chain import re from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background - from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -28,6 +26,8 @@ from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_wrappers.tools.genome_windows import yield_regions +from .model import SvCallingWgs as SvCallingWgsConfigModel + __author__ = "Manuel Holtgrewe " #: Available (short) DNA WGS SV callers @@ -37,79 +37,7 @@ LONG_DNA_WGS_SV_CALLERS = ("pb_honey_spots", 
"sniffles", "sniffles2") #: Default configuration for the sv_calling_wgs step -DEFAULT_CONFIG = r""" -# Default configuration -step_config: - sv_calling_wgs: - tools: - dna: [delly2] # Required if short-read mapper used; otherwise, leave empty. Example: 'delly2'. - dna_long: [] # Required if long-read mapper used (PacBio/Oxford Nanopore); otherwise, leave empty. Example: 'sniffles'. - - path_ngs_mapping: ../ngs_mapping # REQUIRED - - # Short-read SV calling tool configuration - delly2: - path_exclude_tsv: null # optional - map_qual: 1 - geno_qual: 5 - qual_tra: 20 - mad_cutoff: 9 - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - manta: - num_threads: 16 - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - popdel: - window_size: 10000000 - max_sv_size: 20000 # == padding - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - gcnv: - # Path to interval block list with PAR region for contig calling. - path_par_intervals: null # REQUIRED - # Path to gCNV model - will execute analysis in CASE MODE. - # - # Example of precomputed model: - # - library: "Agilent SureSelect Human All Exon V6" # Library name - # contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy` - # model_pattern: /path/to/model_* # Output from `GermlineCNVCaller` - # Path to BED file with uniquely mappable regions. - path_uniquely_mapable_bed: null # REQUIRED - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - melt: - me_refs_infix: 1KGP_Hg19 - me_types: - - ALU - - LINE1 - - SVA - jar_file: REQUIRED - genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed # adjust, e.g., Hg38/Hg38.genes.bed - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. - skip_libraries: [] - - # Long-read SV calling tool configuration - sniffles2: - tandem_repeats: /fast/groups/cubi/work/projects/biotools/sniffles2/trf/GRCh37/human_hs37d5.trf.bed # REQUIRED - # Skip processing of the following libraries. If the library is in - # family/pedigree then all of the family/pedigree will be skipped. 
- skip_libraries: [] - - # Common configuration - ignore_chroms: - - NC_007605 # herpes virus - - hs37d5 # GRCh37 decoy - - chrEBV # Eppstein-Barr Virus - - '*_decoy' # decoy contig - - 'HLA-*' # HLA genes - - 'chrUn_*' # unplaced contigs -""" +DEFAULT_CONFIG = SvCallingWgsConfigModel.default_config_yaml_string() class GcnvWgsStepPart(RunGcnvStepPart): @@ -217,8 +145,8 @@ def _get_log_file_infix_call(self): @dictify def _get_input_files_concat_calls(self, wildcards): - window_size = self.config["popdel"]["window_size"] - padding = self.config["popdel"]["max_sv_size"] + window_size = self.config.popdel.window_size + padding = self.config.popdel.max_sv_size vcfs = [] with open(self._get_fai_path(), "rt") as fai_file: for r in yield_regions( @@ -235,10 +163,10 @@ def _get_input_files_concat_calls(self, wildcards): yield "vcf", vcfs def _get_fai_path(self): - return self.w_config["static_data_config"]["reference"]["path"] + ".fai" + return self.w_config.static_data_config.reference.path + ".fai" def _get_ignore_chroms(self): - return self.config["ignore_chroms"] + return self.config.ignore_chroms @dictify def _get_output_files_concat_calls(self): @@ -265,10 +193,15 @@ def _get_output_files_reorder_vcf(self): work_files["vcf_tbi"] = f"work/{infix}/out/{infix}.vcf.gz.tbi" work_files["vcf_tbi_md5"] = f"work/{infix}/out/{infix}.vcf.gz.tbi.md5" yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("reorder_vcf").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain( + work_files.values(), self.get_log_file("reorder_vcf").values() + ) + ], + ) def get_ped_members(self, wildcards): """Used in Snakefile to rule ``sv_calling_wgs_popdel_reorder_vcf``""" @@ -346,7 +279,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=SvCallingWgsConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -361,7 +295,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @listify def all_donors(self, include_background=True): @@ -384,24 +318,7 @@ def get_result_files(self): def check_config(self): """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "sv_calling_wgs", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for variant calling", - ) self.ensure_w_config( ("static_data_config", "reference", "path"), "Path to reference FASTA not configured but required for variant calling", ) - # Check that only valid tools are selected - selected = set(self.w_config["step_config"]["sv_calling_wgs"]["tools"]["dna"]) - invalid = selected - set(DNA_WGS_SV_CALLERS) - if invalid: - raise Exception( - "Invalid short-read WGS SV caller selected: {}".format(list(sorted(invalid))) - ) - selected = set(self.w_config["step_config"]["sv_calling_wgs"]["tools"]["dna_long"]) - invalid = selected - set(LONG_DNA_WGS_SV_CALLERS) - if invalid: - raise Exception( - "Invalid long-read WGS SV caller selected: {}".format(list(sorted(invalid))) - ) diff --git 
a/snappy_pipeline/workflows/sv_calling_wgs/model.py b/snappy_pipeline/workflows/sv_calling_wgs/model.py
new file mode 100644
index 000000000..43a59a84a
--- /dev/null
+++ b/snappy_pipeline/workflows/sv_calling_wgs/model.py
@@ -0,0 +1,151 @@
+import enum
+from typing import Annotated
+
+from pydantic import Field, model_validator
+
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel
+from snappy_pipeline.models.gcnv import PrecomputedModelEntry
+
+
+class DnaTool(enum.StrEnum):
+    delly2 = "delly2"
+    manta = "manta"
+    popdel = "popdel"
+    gcnv = "gcnv"
+    melt = "melt"
+
+
+class DnaLongTool(enum.StrEnum):
+    sniffles2 = "sniffles2"
+    # These seem to be unused:
+    # sniffles = "sniffles"
+    # pb_honey_spots = "pb_honey_spots"
+
+
+class Tools(SnappyModel):
+    dna: Annotated[list[DnaTool], EnumField(DnaTool, [DnaTool.delly2])]
+    dna_long: Annotated[list[DnaLongTool], EnumField(DnaLongTool, [])]
+
+
+class Gcnv(SnappyModel):
+    # path_par_intervals: str
+    # """Path to interval block list with PAR region for contig calling."""
+
+    # path_uniquely_mapable_bed: str
+    # """path to BED file with uniquely mappable regions."""
+
+    precomputed_model_paths: list[PrecomputedModelEntry] = []
+    """
+    Path to gCNV model - will execute analysis in CASE MODE.
+    Example:
+      - library: "Agilent SureSelect Human All Exon V6" # Kit name, match in path_target_interval_list_mapping
+        contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy`
+        model_pattern: /path/to/model_* # Output from `GermlineCNVCaller`
+    """
+
+    skip_libraries: list[str] = []
+    """
+    Skip processing of the following libraries.
+    If the library is in family/pedigree then all of the family/pedigree will be skipped.
+    """
+
+
+class Delly2(SnappyModel):
+    path_exclude_tsv: str | None = None
+
+    map_qual: int = 1
+
+    geno_qual: int = 5
+
+    qual_tra: int = 20
+
+    mad_cutoff: int = 9
+
+    skip_libraries: list[str] = []
+    """
+    Skip processing of the following libraries.
+    If the library is in family/pedigree then all of the family/pedigree will be skipped.
+    """
+
+
+class Manta(SnappyModel):
+    num_threads: int = 16
+
+    skip_libraries: list[str] = []
+    """
+    Skip processing of the following libraries.
+    If the library is in family/pedigree then all of the family/pedigree will be skipped.
+    """
+
+
+class Melt(SnappyModel):
+    me_refs_infix: str = "1KGP_Hg19"
+    me_types: list[str] = ["ALU", "LINE1", "SVA"]
+    jar_file: str
+    genes_file: str = "add_bed_files/1KGP_Hg19/hg19.genes.bed"
+    """adjust, e.g., Hg38/Hg38.genes.bed"""
+
+    skip_libraries: list[str] = []
+    """
+    Skip processing of the following libraries.
+    If the library is in family/pedigree then all of the family/pedigree will be skipped.
+    """
+
+
+class Popdel(SnappyModel):
+    window_size: int = 10000000
+
+    max_sv_size: int = 20000
+    """== padding"""
+
+    skip_libraries: list[str] = []
+    """
+    Skip processing of the following libraries.
+    If the library is in family/pedigree then all of the family/pedigree will be skipped.
+    """
+
+
+class Sniffles2(SnappyModel):
+    tandem_repeats: Annotated[
+        str,
+        Field(
+            examples=[
+                "/fast/groups/cubi/work/projects/biotools/sniffles2/trf/GRCh37/human_hs37d5.trf.bed"
+            ]
+        ),
+    ]
+
+    skip_libraries: list[str] = []
+    """
+    Skip processing of the following libraries.
+    If the library is in family/pedigree then all of the family/pedigree will be skipped.
+ """ + + +class SvCallingWgs(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + tools: Tools + + delly2: Delly2 | None = None + + manta: Manta | None = None + + popdel: Popdel | None = None + + gcnv: Gcnv | None = None + + melt: Melt | None = None + + sniffles2: Sniffles2 | None = None + + ignore_chroms: list[str] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*", "chrUn_*"] + + @model_validator(mode="after") + def ensure_tools_are_configured(self): + for data_type in ("dna", "dna_long"): + tool_list = getattr(self.tools, data_type) + for tool in tool_list: + if not getattr(self, tool): + raise ValueError(f"Tool {tool} not configured") + return self diff --git a/snappy_pipeline/workflows/sv_calling_wgs/test_workflows_wgs_sv_calling.py b/snappy_pipeline/workflows/sv_calling_wgs/test_workflows_wgs_sv_calling.py index 12b67df03..57cda98b7 100644 --- a/snappy_pipeline/workflows/sv_calling_wgs/test_workflows_wgs_sv_calling.py +++ b/snappy_pipeline/workflows/sv_calling_wgs/test_workflows_wgs_sv_calling.py @@ -32,8 +32,6 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa diff --git a/snappy_pipeline/workflows/targeted_seq_mei_calling/__init__.py b/snappy_pipeline/workflows/targeted_seq_mei_calling/__init__.py index 42bc0ca60..bf8f55abe 100644 --- a/snappy_pipeline/workflows/targeted_seq_mei_calling/__init__.py +++ b/snappy_pipeline/workflows/targeted_seq_mei_calling/__init__.py @@ -78,11 +78,12 @@ Not available. """ + import os -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.base import InvalidConfiguration from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -93,28 +94,14 @@ ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import TargetedSeqMeiCalling as TargetedSeqMeiCallingConfigModel + #: Extensions of files to create as main payload. EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") #: Default configuration for the targeted_seq_mei_calling step. -DEFAULT_CONFIG = r""" -# Default configuration -step_config: - targeted_seq_mei_calling: - # Path to the ngs_mapping step - path_ngs_mapping: ../ngs_mapping - - tools: [scramble] # REQUIRED - available: 'scramble' - - scramble: - blast_ref: null # REQUIRED: path to FASTA reference with BLAST DB (`makeblastdb`) - mei_refs: null # OPTIONAL: MEI reference file (FASTA), if none provided will use default. - n_cluster: 5 # OPTIONAL: minimum cluster size, depth of soft-clipped reads. - mei_score: 50 # OPTIONAL: minimum MEI alignment score. - indel_score: 80 # OPTIONAL: minimum INDEL alignment score. - mei_polya_frac: 0.75 # OPTIONAL: minimum fraction of clipped length for calling polyA tail. 
-""" +DEFAULT_CONFIG = TargetedSeqMeiCallingConfigModel.default_config_yaml_string() class ScrambleStepPart(BaseStepPart): @@ -220,8 +207,11 @@ def _get_output_files_cluster(): """Yield output files' patterns for scramble cluster call.""" name_pattern = "{mapper}.scramble.{library_name}" ext = "txt" - yield ext, "work/{name_pattern}/out/{name_pattern}_cluster.{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + ext, + "work/{name_pattern}/out/{name_pattern}_cluster.{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) @staticmethod @@ -239,8 +229,11 @@ def _get_output_files_analysis(): "vcf_tbi_md5": ".vcf.gz.tbi.md5", } for key, ext in ext_dict.items(): - yield key, "work/{name_pattern}/out/{name_pattern}{ext}".format( - name_pattern=name_pattern, ext=ext + yield ( + key, + "work/{name_pattern}/out/{name_pattern}{ext}".format( + name_pattern=name_pattern, ext=ext + ), ) def _get_analysis_parameters(self, _wildcards): @@ -254,7 +247,7 @@ def _get_analysis_parameters(self, _wildcards): :raises InvalidConfiguration: if information provided in configuration isn't enough to run the analysis. """ - blast_ref_path = self.config["scramble"]["blast_ref"] + blast_ref_path = self.config.scramble.blast_ref try: if not os.path.isfile(blast_ref_path): raise InvalidConfiguration( @@ -265,15 +258,15 @@ def _get_analysis_parameters(self, _wildcards): raise TypeError("Path to reference genome ('blast_ref') cannot be empty.") from e params = { "reference_genome": blast_ref_path, - "mei_refs": self.config["scramble"]["mei_refs"], - "n_cluster": self.config["scramble"]["n_cluster"], - "mei_score": self.config["scramble"]["mei_score"], - "indel_score": self.config["scramble"]["indel_score"], - "mei_polya_frac": self.config["scramble"]["mei_polya_frac"], + "mei_refs": self.config.scramble.mei_refs, + "n_cluster": self.config.scramble.n_cluster, + "mei_score": self.config.scramble.mei_score, + "indel_score": self.config.scramble.indel_score, + "mei_polya_frac": self.config.scramble.mei_polya_frac, } return params - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -308,12 +301,13 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=TargetedSeqMeiCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((LinkOutStepPart, ScrambleStepPart)) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @classmethod def default_config_yaml(cls): @@ -342,7 +336,7 @@ def get_result_files(self): name_pattern = "{mapper}.{tool}.{donor.dna_ngs_library.name}" yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + mapper=self.w_config.step_config["ngs_mapping"].tools.dna, tool=tools, ext=EXT_VALUES, ) @@ -352,14 +346,3 @@ def _yield_result_files(self, tpl, **kwargs): for donor in self._all_donors(include_background=False): if donor.dna_ngs_library: # ignores samples without DNA library yield from expand(tpl, donor=[donor], **kwargs) - - def check_config(self): - """Check that the necessary configuration is available for the step""" - # Requires path to ngs_mapping output, i.e., the BAM files - self.ensure_w_config( - config_keys=("step_config", "targeted_seq_mei_calling", "path_ngs_mapping"), - msg=( - "Path to NGS mapping not configured but required for mobile " - "element insertion detection." - ), - ) diff --git a/snappy_pipeline/workflows/targeted_seq_mei_calling/model.py b/snappy_pipeline/workflows/targeted_seq_mei_calling/model.py new file mode 100644 index 000000000..cc92e7d67 --- /dev/null +++ b/snappy_pipeline/workflows/targeted_seq_mei_calling/model.py @@ -0,0 +1,38 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, validators + + +class Tool(enum.StrEnum): + scramble = "scramble" + + +class Scramble(SnappyModel): + blast_ref: str + """path to FASTA reference with BLAST DB (`makeblastdb`)""" + + mei_refs: str | None = None + """MEI reference file (FASTA), if none provided will use default.""" + + n_cluster: int = 5 + """minimum cluster size, depth of soft-clipped reads.""" + + mei_score: int = 50 + """minimum MEI alignment score.""" + + indel_score: int = 80 + """minimum INDEL alignment score.""" + + mei_polya_frac: Annotated[float, Field(ge=0, le=1)] = 0.75 + """minimum fraction of clipped length for calling polyA tail.""" + + +class TargetedSeqMeiCalling(SnappyStepModel, validators.ToolsMixin): + path_ngs_mapping: str = "../ngs_mapping" + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.scramble], min_length=1)] + + scramble: Scramble | None = None diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py index 83090819f..894d58573 100644 --- a/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py +++ b/snappy_pipeline/workflows/tumor_mutational_burden/__init__.py @@ -2,9 +2,9 @@ import os import sys -from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract 
import BaseStep, BaseStepPart, LinkOutStepPart from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage @@ -18,6 +18,8 @@ ) from snappy_pipeline.workflows.somatic_variant_filtration import SomaticVariantFiltrationWorkflow +from .model import TumorMutationalBurden as TumorMutationalBurdenConfigModel + #: Extensions of files to create as main payload EXT_VALUES = (".json", ".json.md5") @@ -25,20 +27,7 @@ EXT_NAMES = ("json", "json_md5") #: Default configuration for the tmb calculation step -DEFAULT_CONFIG = r""" -step_config: - tumor_mutational_burden: - has_annotation: true # REQUIRED - is_filtered: false # REQUIRED - path_somatic_variant: ../somatic_variant_annotation # REQUIRED - tools_ngs_mapping: [] # default to those configured for ngs_mapping - tools_somatic_variant_calling: [] # default to those configured for somatic_variant_calling - tools_somatic_variant_annotation: [] # default to those configured for somatic_variant_annotation - filters: [] # When using variants after the somatic_variant_filtration step, use "no_filter", "dkfz_only", "dkfz_and_ebfilter" or "dkfz_and_ebfilter_and_oxog" - filtered_regions: [] # When using variants after the somatic_variant_filtration step, use "genome_wide" or "" - target_regions: # REQUIRED - missense_regex: '.*[\|&]missense_variant[\|&].*' #change if the annotation tool doesn't use 'missense_variant' to indicate missense variant -""" +DEFAULT_CONFIG = TumorMutationalBurdenConfigModel.default_config_yaml_string() class TumorMutationalBurdenCalculationStepPart(BaseStepPart): @@ -50,7 +39,7 @@ class TumorMutationalBurdenCalculationStepPart(BaseStepPart): def __init__(self, parent): super().__init__(parent) - self.config = parent.w_config["step_config"]["tumor_mutational_burden"] + self.config = parent.w_config.step_config["tumor_mutational_burden"] # Build shortcut from cancer bio sample name to matched cancer sample self.tumor_ngs_library_to_sample_pair = OrderedDict() for sheet in self.parent.shortcut_sheets: @@ -69,17 +58,17 @@ def get_input_files(self, action): self._validate_action(action) additional_steps = "" - if self.config["has_annotation"]: + if self.config.has_annotation: additional_steps += ".{anno_caller}" - if self.config["is_filtered"]: - if len(self.config["filters"]) == 0: + if self.config.is_filtered: + if len(self.config.filters) == 0: additional_steps += ".filtered" else: additional_steps += ".dkfz_bias_filter.eb_filter" base_name = "{mapper}.{var_caller}" + additional_steps + ".{tumor_library}" - if self.config["filters"]: + if self.config.filters: base_name += ".{filter}" - if self.config["filtered_regions"]: + if self.config.filtered_regions: base_name += ".{region}" tpl = os.path.join("output", base_name, "out", base_name) @@ -94,17 +83,17 @@ def get_output_files(self, action): self._validate_action(action) additional_steps = "" - if self.config["has_annotation"]: + if self.config.has_annotation: additional_steps += ".{anno_caller}" - if self.config["is_filtered"]: - if len(self.config["filters"]) == 0: + if self.config.is_filtered: + if len(self.config.filters) == 0: additional_steps += ".filtered" else: additional_steps += ".dkfz_bias_filter.eb_filter" base_name = "{mapper}.{var_caller}" + additional_steps + ".tmb.{tumor_library}" - if self.config["filters"]: + if self.config.filters: base_name += ".{filter}" - if self.config["filtered_regions"]: + if self.config.filtered_regions: base_name += ".{region}" tpl = os.path.join("output", base_name, "out", base_name) @@ -118,17 +107,17 @@ def 
_get_log_file(self, action): self._validate_action(action) additional_steps = "" - if self.config["has_annotation"]: + if self.config.has_annotation: additional_steps += ".{anno_caller}" - if self.config["is_filtered"]: - if len(self.config["filters"]) == 0: + if self.config.is_filtered: + if len(self.config.filters) == 0: additional_steps += ".filtered" else: additional_steps += ".dkfz_bias_filter.eb_filter" base_name = "{mapper}.{var_caller}" + additional_steps + ".tmb.{tumor_library}" - if self.config["filters"]: + if self.config.filters: base_name += ".{filter}" - if self.config["filtered_regions"]: + if self.config.filtered_regions: base_name += ".{region}" tpl = os.path.join("output", base_name, "log", base_name) @@ -140,7 +129,7 @@ def _get_log_file(self, action): for key, ext in key_ext: yield key, tpl + ext - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) mem_mb = 4 * 1024 # 4GB return ResourceUsage( @@ -154,15 +143,13 @@ def get_params(self, action): return getattr(self, "_get_params_run") def _get_params_run(self, wildcards): - return { - "missense_re": self.w_config["step_config"]["tumor_mutational_burden"]["missense_regex"] - } + return {"missense_re": self.w_config.step_config["tumor_mutational_burden"].missense_regex} class TumorMutationalBurdenCalculationWorkflow(BaseStep): """Perform TMB calculation""" - name = "tumormutation" + name = "tumor_mutational_burden" sheet_shortcut_class = CancerCaseSheet sheet_shortcut_kwargs = { "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) @@ -180,7 +167,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - ( + config_model_class=TumorMutationalBurdenConfigModel, + previous_steps=( SomaticVariantCallingWorkflow, SomaticVariantAnnotationWorkflow, SomaticVariantFiltrationWorkflow, @@ -188,88 +176,80 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ), ) # Register sub workflows - config = self.w_config["step_config"]["tumor_mutational_burden"] + config = self.config sub_workflow = "somatic_variant_calling" - if config["has_annotation"]: + if config.has_annotation: sub_workflow = "somatic_variant_annotation" - if config["is_filtered"]: + if config.is_filtered: sub_workflow = "somatic_variant_filtration" - self.register_sub_workflow(sub_workflow, config["path_somatic_variant"], "somatic_variant") + self.register_sub_workflow(sub_workflow, config.path_somatic_variant, "somatic_variant") # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not config["tools_ngs_mapping"]: - config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not config["tools_somatic_variant_calling"]: - config["tools_somatic_variant_calling"] = self.w_config["step_config"][ + if not config.tools_ngs_mapping: + config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not config.tools_somatic_variant_calling: + config.tools_somatic_variant_calling = self.w_config.step_config[ "somatic_variant_calling" - ]["tools"] - if not config["tools_somatic_variant_annotation"]: - config["tools_somatic_variant_annotation"] = self.w_config["step_config"][ + ].tools + if not config.tools_somatic_variant_annotation: + config.tools_somatic_variant_annotation = self.w_config.step_config[ "somatic_variant_annotation" - ]["tools"] - if 
config["is_filtered"]: - if len(self.w_config["step_config"]["somatic_variant_filtration"]["filter_list"]) > 0: - config["filters"] = [] - config["filtered_regions"] = [] + ].tools + if config.is_filtered: + if len(self.w_config.step_config["somatic_variant_filtration"].filter_list) > 0: + config.filters = [] + config.filtered_regions = [] else: - if not config["filters"]: - config["filters"] = list( - self.w_config["step_config"]["somatic_variant_filtration"][ - "filter_sets" - ].keys() + if not config.filters: + config.filters = list( + self.w_config.step_config["somatic_variant_filtration"].filter_sets.keys() ) - config["filters"].append("no_filter") - if not config["filtered_regions"]: - config["filtered_regions"] = list( - self.w_config["step_config"]["somatic_variant_filtration"][ - "exon_lists" - ].keys() + config.filters.append("no_filter") + if not config.filtered_regions: + config.filtered_regions = list( + self.w_config.step_config["somatic_variant_filtration"].exon_lists.keys() ) - config["filtered_regions"].append("genome_wide") + config.filtered_regions.append("genome_wide") # Register sub step classes so the sub steps are available - self.w_config["step_config"]["tumor_mutational_burden"] = config + self.w_config.step_config["tumor_mutational_burden"] = config self.register_sub_step_classes((TumorMutationalBurdenCalculationStepPart, LinkOutStepPart)) @listify def get_result_files(self): - config = self.w_config["step_config"]["tumor_mutational_burden"] + config = self.w_config.step_config["tumor_mutational_burden"] name_pattern = "{mapper}.{caller}" - if config["has_annotation"]: + if config.has_annotation: name_pattern += ".{anno_caller}" - if config["is_filtered"]: - if len(config["filters"]) > 0: + if config.is_filtered: + if len(config.filters) > 0: name_pattern += ".dkfz_bias_filter.eb_filter" else: name_pattern += ".filtered" name_pattern += ".tmb.{tumor_library.name}" - if config["is_filtered"] and len(config["filters"]) > 0: + if config.is_filtered and len(config.filters) > 0: name_pattern += ".{filter}.{region}" - mappers = set(config["tools_ngs_mapping"]) & set( - self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"] + mappers = set(config.tools_ngs_mapping) & set( + self.w_config.step_config["ngs_mapping"].tools.dna ) assert len(mappers) > 0, "No valid mapper" - callers = set(config["tools_somatic_variant_calling"]) & set( - SOMATIC_VARIANT_CALLERS_MATCHED - ) + callers = set(config.tools_somatic_variant_calling) & set(SOMATIC_VARIANT_CALLERS_MATCHED) assert len(callers) > 0, "No valid somatic variant caller" - if config["has_annotation"]: - anno_callers = set(config["tools_somatic_variant_annotation"]) & set(ANNOTATION_TOOLS) + if config.has_annotation: + anno_callers = set(config.tools_somatic_variant_annotation) & set(ANNOTATION_TOOLS) assert len(anno_callers) > 0, "No valid somatic variant annotation tool" else: anno_callers = [] - if config["is_filtered"]: + if config.is_filtered: filters = list( - self.w_config["step_config"]["somatic_variant_filtration"]["filter_sets"].keys() + self.w_config.step_config["somatic_variant_filtration"].filter_sets.keys() ) filters.append("no_filter") - filters = set(filters) & set(config["filters"]) + filters = set(filters) & set(config.filters) regions = list( - self.w_config["step_config"]["somatic_variant_filtration"]["exon_lists"].keys() + self.w_config.step_config["somatic_variant_filtration"].exon_lists.keys() ) regions.append("genome_wide") - regions = set(regions) & set(config["filtered_regions"]) + regions = 
set(regions) & set(config.filtered_regions) else: filters = [] regions = [] @@ -323,21 +303,3 @@ def _yield_result_files_matched(self, tpl, **kwargs): tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs, ) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "tumor_mutational_burden", "path_somatic_variant"), - "Path to variant (directory of vcf files) not configured but required for tmb calculation", - ) - - self.ensure_w_config( - ("step_config", "tumor_mutational_burden", "target_regions"), - "Path to target_regions file (bed format)" - "not configured but required for tmb calculation", - ) - - self.ensure_w_config( - ("step_config", "tumor_mutational_burden", "has_annotation"), - "TMB needs to know whether the vcf is annotated or not", - ) diff --git a/snappy_pipeline/workflows/tumor_mutational_burden/model.py b/snappy_pipeline/workflows/tumor_mutational_burden/model.py new file mode 100644 index 000000000..7fa8f5974 --- /dev/null +++ b/snappy_pipeline/workflows/tumor_mutational_burden/model.py @@ -0,0 +1,41 @@ +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import SnappyStepModel + + +class TumorMutationalBurden(SnappyStepModel): + has_annotation: bool + """TMB needs to know whether the vcf is annotated or not""" + + is_filtered: bool + + path_somatic_variant: Annotated[ + str, Field(examples=["../somatic_variant_annotation", "../somatic_variant_calling"]) + ] + """Path to variant (directory of vcf files)""" + + tools_ngs_mapping: list[str] = [] + """default to those configured for ngs_mapping""" + + tools_somatic_variant_calling: list[str] = [] + """default to those configured for somatic_variant_calling""" + + tools_somatic_variant_annotation: list[str] = [] + """default to those configured for somatic_variant_annotation""" + + filters: list[str] = [] + """ + When using variants after the somatic_variant_filtration step, + use "no_filter", "dkfz_only", "dkfz_and_ebfilter" or "dkfz_and_ebfilter_and_oxog" + """ + + filtered_regions: list[str] = [] + """When using variants after the somatic_variant_filtration step, use "genome_wide" or "" """ + + target_regions: str + """Path to target_regions file (bed format)""" + + missense_regex: str = r".*[\|&]missense_variant[\|&].*" + """change if the annotation tool doesn't use 'missense_variant' to indicate missense variant""" diff --git a/snappy_pipeline/workflows/varfish_export/__init__.py b/snappy_pipeline/workflows/varfish_export/__init__.py index 7e2c32f49..339a9e71b 100644 --- a/snappy_pipeline/workflows/varfish_export/__init__.py +++ b/snappy_pipeline/workflows/varfish_export/__init__.py @@ -54,17 +54,17 @@ .. 
include:: DEFAULT_CONFIG_varfish_export.rst """ -from itertools import chain import re import typing import warnings +from itertools import chain from biomedsheets.shortcuts import GermlineCaseSheet, Pedigree, is_not_background from matplotlib.cbook import flatten from snakemake.io import Wildcards, expand from snappy_pipeline.base import SkipLibraryWarning -from snappy_pipeline.utils import DictQuery, dictify, listify +from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, BaseStepPart, @@ -82,6 +82,8 @@ VariantCallingWorkflow, ) +from .model import VarfishExport as VarfishExportConfigModel + __author__ = "Manuel Holtgrewe " #: Extension of files @@ -90,37 +92,7 @@ # TODO: the number of restart times is high because tabix in HTSJDK/Jannovar is flaky... #: Default configuration for the somatic_variant_calling step -DEFAULT_CONFIG = r""" -# Default configuration varfish_export. -step_config: - varfish_export: - # Configuration of the input path enables export from the corresponding pipeline step. - # - # Used output of ngs_mapping is alignment quality control data - path_ngs_mapping: ../ngs_mapping - # Used output of variant_calling is variant calls - path_variant_calling: ../variant_calling - # Used output of targeted SV calling is variant calls - path_sv_calling_targeted: null # REQUIRED; optional - # Used output of WGS SV calling is variant calls - path_sv_calling_wgs: null # REQUIRED; optional - - # Optionally, you can override the exported mappers and variant callers by setting - # the following variables. - tools_ngs_mapping: null - tools_variant_calling: null - tools_sv_calling_targeted: null - tools_sv_calling_wgs: null - - # The following configuration is used for parameterizing the output itself. - # - # The release of the genome reference that data has been aligned to. - release: GRCh37 # REQUIRED: default 'GRCh37' - # Path to BED file with exons; used for reducing data to near-exon small variants. - path_exon_bed: null # REQUIRED: exon BED file to use - # Path to mehari database. - path_mehari_db: REQUIRED # REQUIRED: path to mehari database -""" +DEFAULT_CONFIG = VarfishExportConfigModel.default_config_yaml_string() class MehariStepPart(VariantCallingGetLogFileMixin, BaseStepPart): @@ -166,7 +138,7 @@ def get_params(self, action): self._validate_action(action) return getattr(self, f"_get_params_{action}") - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) return ResourceUsage( threads=2, @@ -182,8 +154,8 @@ def get_result_files(self, action): elif action == "annotate_strucvars": # Only annotate_seqvars SVs if path to step for calling them is configured. if ( - not self.parent.config["path_sv_calling_targeted"] - and not self.parent.config["path_sv_calling_wgs"] + not self.parent.config.path_sv_calling_targeted + and not self.parent.config.path_sv_calling_wgs ): return raw_path_tpls = self._get_output_files_annotate_strucvars().values() @@ -195,13 +167,13 @@ def get_result_files(self, action): # Create concrete paths for all pedigrees in the sample sheet. 
index_ngs_libraries = self._get_index_ngs_libraries( require_consistent_pedigree_kits=( - bool(self.parent.config["path_sv_calling_targeted"]) + bool(self.parent.config.path_sv_calling_targeted) and (action == "annotate_strucvars") ) ) kwargs = { "index_ngs_library": list(index_ngs_libraries.keys()), - "mapper": [self.parent.config["tools_ngs_mapping"][0]], + "mapper": [self.parent.config.tools_ngs_mapping[0]], } for path_tpl in path_tpls: yield from expand(path_tpl, **kwargs) @@ -251,7 +223,7 @@ def _get_input_files_annotate_seqvars(self, wildcards): ) vcfs = [] - for var_caller in self.parent.config["tools_variant_calling"]: + for var_caller in self.parent.config.tools_variant_calling: vcfs.append( variant_calling(path).format( mapper=wildcards.mapper, @@ -278,12 +250,15 @@ def _get_output_files_annotate_seqvars(self): } yield from work_paths.items() # Generate paths in "output/" directory - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain( - work_paths.values(), self.get_log_file("annotate_seqvars").values() - ) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain( + work_paths.values(), self.get_log_file("annotate_seqvars").values() + ) + ], + ) def _get_params_annotate_seqvars(self, wildcards: Wildcards) -> typing.Dict[str, typing.Any]: pedigree = self.index_ngs_library_to_pedigree[wildcards.index_ngs_library] @@ -299,22 +274,22 @@ def _get_params_annotate_seqvars(self, wildcards: Wildcards) -> typing.Dict[str, def _get_input_files_annotate_strucvars(self, wildcards): yield "ped", "work/write_pedigree.{index_ngs_library}/out/{index_ngs_library}.ped" - if self.parent.config["path_sv_calling_targeted"]: + if self.parent.config.path_sv_calling_targeted: sv_calling = self.parent.sub_workflows["sv_calling_targeted"] - sv_callers = self.parent.config["tools_sv_calling_targeted"] + sv_callers = self.parent.config.tools_sv_calling_targeted skip_libraries = { - sv_caller: self.parent.w_config["step_config"]["sv_calling_targeted"] - .get(sv_caller, {}) - .get("skip_libraries", []) + sv_caller: getattr( + self.parent.w_config.step_config["sv_calling_targeted"], sv_caller + ).skip_libraries for sv_caller in sv_callers } - elif self.parent.config["path_sv_calling_wgs"]: + elif self.parent.config.path_sv_calling_wgs: sv_calling = self.parent.sub_workflows["sv_calling_wgs"] - sv_callers = self.parent.config["tools_sv_calling_wgs"]["dna"] + sv_callers = self.parent.config.tools_sv_calling_wgs.dna skip_libraries = { - sv_caller: self.parent.w_config["step_config"]["sv_calling_wgs"] - .get(sv_caller, {}) - .get("skip_libraries", []) + sv_caller: getattr( + self.parent.w_config.step_config["sv_calling_wgs"], sv_caller + ).skip_libraries for sv_caller in sv_callers } else: @@ -395,12 +370,15 @@ def _get_output_files_annotate_strucvars(self): } yield from work_paths.items() # Generate paths in "output/" directory - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain( - work_paths.values(), self.get_log_file("annotate_strucvars").values() - ) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain( + work_paths.values(), self.get_log_file("annotate_strucvars").values() + ) + ], + ) #: Alias the get params function. 
_get_params_annotate_strucvars = _get_params_annotate_seqvars @@ -439,10 +417,13 @@ def _get_output_files_bam_qc(self) -> SnakemakeDictItemsGenerator: "bam_qc_md5": f"{prefix}.bam-qc.tsv.gz.md5", } yield from work_paths.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_paths.values(), self.get_log_file("bam_qc").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_paths.values(), self.get_log_file("bam_qc").values()) + ], + ) def _get_params_bam_qc(self, wildcards: Wildcards) -> typing.Dict[str, str]: """Get parameters for wrapper ``variant_annotator/bam_qc`` @@ -467,9 +448,9 @@ def _get_params_bam_qc(self, wildcards: Wildcards) -> typing.Dict[str, str]: pedigree = self.index_ngs_library_to_pedigree[wildcards.index_ngs_library] for donor in pedigree.donors: if donor.dna_ngs_library: - library_name_to_file_identifier[ + library_name_to_file_identifier[donor.dna_ngs_library.name] = ( donor.dna_ngs_library.name - ] = donor.dna_ngs_library.name + ) return library_name_to_file_identifier @@ -491,38 +472,34 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (VariantCallingWorkflow, SvCallingTargetedWorkflow, NgsMappingWorkflow), + config_model_class=VarfishExportConfigModel, + previous_steps=(VariantCallingWorkflow, SvCallingTargetedWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((WritePedigreeStepPart, MehariStepPart, LinkOutStepPart)) # Register sub workflows - self.register_sub_workflow("variant_calling", self.config["path_variant_calling"]) - if self.config["path_sv_calling_targeted"]: - self.register_sub_workflow( - "sv_calling_targeted", self.config["path_sv_calling_targeted"] - ) - if self.config["path_sv_calling_wgs"]: - self.register_sub_workflow("sv_calling_wgs", self.config["path_sv_calling_wgs"]) - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("variant_calling", self.config.path_variant_calling) + if self.config.path_sv_calling_targeted: + self.register_sub_workflow("sv_calling_targeted", self.config.path_sv_calling_targeted) + if self.config.path_sv_calling_wgs: + self.register_sub_workflow("sv_calling_wgs", self.config.path_sv_calling_wgs) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Copy over "tools" setting from variant_calling/ngs_mapping if not set here - step_config = self.w_config["step_config"] - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = step_config["ngs_mapping"]["tools"]["dna"] - if not self.config["tools_variant_calling"] and "variant_calling" in step_config: - self.config["tools_variant_calling"] = step_config["variant_calling"]["tools"] + step_config = self.w_config.step_config + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = step_config["ngs_mapping"].tools.dna + if not self.config.tools_variant_calling and "variant_calling" in step_config: + self.config.tools_variant_calling = step_config["variant_calling"].tools if ( - not self.config["tools_sv_calling_targeted"] - and "sv_calling_targeted" in self.w_config["step_config"] + not self.config.tools_sv_calling_targeted + and "sv_calling_targeted" in self.w_config.step_config ): - self.config["tools_sv_calling_targeted"] = step_config["sv_calling_targeted"]["tools"] - if ( - not 
self.config["tools_sv_calling_wgs"] - and "sv_calling_wgs" in self.w_config["step_config"] - ): - self.config["tools_sv_calling_wgs"] = step_config["sv_calling_wgs"]["tools"] + self.config.tools_sv_calling_targeted = step_config["sv_calling_targeted"].tools + if not self.config.tools_sv_calling_wgs and "sv_calling_wgs" in self.w_config.step_config: + self.config.tools_sv_calling_wgs = step_config["sv_calling_wgs"].tools # Build additional information self.ngs_library_to_kit = self._build_ngs_library_to_kit() @@ -530,11 +507,11 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) @dictify def _build_ngs_library_to_kit(self): """Build mapping of NGS library to kit based on the ``ngs_mapping`` configuration""" - cov_config = DictQuery(self.w_config).get("step_config/ngs_mapping/target_coverage_report") + cov_config = self.w_config.step_config["ngs_mapping"].target_coverage_report regexes = { - item["pattern"]: item["name"] - for item in cov_config["path_target_interval_list_mapping"] - if item["name"] != "__default__" + item.pattern: item.name + for item in cov_config.path_target_interval_list_mapping + if item.name != "__default__" } result = {} for sheet in self.shortcut_sheets: @@ -557,13 +534,3 @@ def get_result_files(self): """ for action in self.sub_steps["mehari"].actions: yield from self.sub_steps["mehari"].get_result_files(action) - - def check_config(self): - self.ensure_w_config( - ("step_config", "varfish_export", "path_ngs_mapping"), - "Path to ngs_mapping not configured but required for variant annotation", - ) - self.ensure_w_config( - ("step_config", "varfish_export", "path_variant_calling"), - "Path to variant_calling not configured but required for variant annotation", - ) diff --git a/snappy_pipeline/workflows/varfish_export/model.py b/snappy_pipeline/workflows/varfish_export/model.py new file mode 100644 index 000000000..64ed6da2e --- /dev/null +++ b/snappy_pipeline/workflows/varfish_export/model.py @@ -0,0 +1,45 @@ +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import SnappyStepModel + + +class VarfishExport(SnappyStepModel): + """Configuration of the input path enables export from the corresponding pipeline step.""" + + path_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])] + """Used output of ngs_mapping is alignment quality control data""" + + path_variant_calling: Annotated[str, Field(examples=["../variant_calling"])] + """Used output of variant_calling is variant calls""" + + path_sv_calling_targeted: str | None = None + """Used output of targeted SV calling is variant calls""" + + path_sv_calling_wgs: str | None = None + """Used output of WGS SV calling is variant calls""" + + # Optionally, you can override the exported mappers and variant callers by setting + # the following variables. + tools_ngs_mapping: list[str] = [] + """Can be used to override the exported mappers and variant callers""" + + tools_variant_calling: list[str] = [] + """Can be used to override the exported mappers and variant callers""" + + tools_sv_calling_targeted: list[str] = [] + """Can be used to override the exported mappers and variant callers""" + + tools_sv_calling_wgs: list[str] = [] + """Can be used to override the exported mappers and variant callers""" + + # The following configuration is used for parameterizing the output itself. 
+ release: str = "GRCh37" + """The release of the genome reference that data has been aligned to.""" + + path_exon_bed: str + """Path to BED file with exons; used for reducing data to near-exon small variants.""" + + path_mehari_db: str + """Path to mehari database.""" diff --git a/snappy_pipeline/workflows/variant_annotation/__init__.py b/snappy_pipeline/workflows/variant_annotation/__init__.py index 3e512bd8f..571433cf3 100644 --- a/snappy_pipeline/workflows/variant_annotation/__init__.py +++ b/snappy_pipeline/workflows/variant_annotation/__init__.py @@ -57,14 +57,14 @@ import re from biomedsheets.shortcuts import GermlineCaseSheet - from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, ResourceUsage from snappy_pipeline.workflows.abstract.common import SnakemakeListItemsGenerator -from snappy_pipeline.workflows.abstract.exceptions import InvalidConfigurationException from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.variant_calling import GetResultFilesMixin, VariantCallingWorkflow +from .model import VariantAnnotation as VariantAnnotationConfigModel + __author__ = "Manuel Holtgrewe " #: Valid tools for variant annotation. @@ -79,30 +79,7 @@ # TODO: the number of restart times is high because tabix in HTSJDK/Jannovar is flaky... #: Default configuration for the somatic_variant_calling step -DEFAULT_CONFIG = r""" -# Default configuration variant_annotation -step_config: - variant_annotation: - path_variant_calling: ../variant_calling - tools: - - vep - vep: - # We will always run VEP in cache mode. You have to provide the directory to the - # cache to use (VEP would be ``~/.vep``). - cache_dir: null # OPTIONAL - # The cache version to use. gnomAD v2 used 85, gnomAD v3.1 uses 101. - cache_version: "85" - # The assembly to use. gnomAD v2 used "GRCh37", gnomAD v3.1 uses "GRCh38". - assembly: "GRCh37" - # The flag selecting the transcripts. One of "gencode_basic", "refseq", and "merged". - tx_flag: "gencode_basic" - # Number of threads to use with forking, set to 0 to disable forking. - num_threads: 16 - # Additional flags. 
- more_flags: "--af_gnomade --af_gnomadg" - # The --buffer_size parameter - buffer_size: 100000 -""" +DEFAULT_CONFIG = VariantAnnotationConfigModel.default_config_yaml_string() class VepStepPart(GetResultFilesMixin, BaseStepPart): @@ -136,13 +113,16 @@ def get_output_files(self, action): "vcf_tbi_md5": f"work/{token}/out/{token}.vcf.gz.tbi.md5", } yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("run").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_files.values(), self.get_log_file("run").values()) + ], + ) def get_extra_kv_pairs(self): - return {"var_caller": self.parent.w_config["step_config"]["variant_calling"]["tools"]} + return {"var_caller": self.parent.w_config.step_config["variant_calling"].tools} @dictify def _get_log_file(self, action): @@ -160,9 +140,9 @@ def _get_log_file(self, action): yield key, f"{prefix}{ext}" yield f"{key}_md5", f"{prefix}{ext}.md5" - def get_resource_usage(self, action) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) - num_threads = self.config[self.name]["num_threads"] + num_threads = self.config[self.name].num_threads return ResourceUsage( threads=num_threads, time="1-00", @@ -188,33 +168,25 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (VariantCallingWorkflow, NgsMappingWorkflow), + config_model_class=VariantAnnotationConfigModel, + previous_steps=(VariantCallingWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((VepStepPart,)) # Register sub workflows self.register_sub_workflow( - "ngs_mapping", self.w_config["step_config"]["variant_calling"]["path_ngs_mapping"] + "ngs_mapping", self.w_config.step_config["variant_calling"].path_ngs_mapping ) - self.register_sub_workflow("variant_calling", self.config["path_variant_calling"]) + self.register_sub_workflow("variant_calling", self.config.path_variant_calling) @listify def get_result_files(self) -> SnakemakeListItemsGenerator: - for tool in self.config["tools"]: + for tool in self.config.tools: yield from self.sub_steps[tool].get_result_files() def check_config(self): """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "variant_annotation", "path_variant_calling"), - "Path to variant calling not configured but required for variant annotation", - ) self.ensure_w_config( ("static_data_config", "reference", "path"), "Path to reference FASTA not configured but required for variant calling", ) - # Check that only valid tools are selected - selected = set(self.w_config["step_config"]["variant_annotation"]["tools"]) - invalid = list(sorted(selected - set(VARIANT_ANNOTATORS))) - if invalid: - raise InvalidConfigurationException(f"Invalid variant callers selected: {invalid}") diff --git a/snappy_pipeline/workflows/variant_annotation/model.py b/snappy_pipeline/workflows/variant_annotation/model.py new file mode 100644 index 000000000..5417210af --- /dev/null +++ b/snappy_pipeline/workflows/variant_annotation/model.py @@ -0,0 +1,35 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyStepModel, validators +from snappy_pipeline.models.annotation import Vep + + +class Tool(enum.StrEnum): + vep 
= "vep" + + +class VepCustom(Vep): + buffer_size: int = 100000 + num_threads: int = 16 + + cache_version: str = "85" + """The cache version to use. gnomAD v2 used 85, gnomAD v3.1 uses 101.""" + + assembly: str = "GRCh37" + """The assembly to use. gnomAD v2 used "GRCh37", gnomAD v3.1 uses "GRCh38".""" + + more_flags: str = "--af_gnomade --af_gnomadg" + + +class VariantAnnotation(SnappyStepModel, validators.ToolsMixin): + path_variant_calling: Annotated[str, Field(examples=["../variant_calling"])] = ( + "../variant_calling" + ) + """Path to variant calling""" + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.vep], min_length=1)] + + vep: VepCustom | None = None diff --git a/snappy_pipeline/workflows/variant_calling/__init__.py b/snappy_pipeline/workflows/variant_calling/__init__.py index 8ad08932b..5c0b2ff74 100644 --- a/snappy_pipeline/workflows/variant_calling/__init__.py +++ b/snappy_pipeline/workflows/variant_calling/__init__.py @@ -253,9 +253,9 @@ import typing import warnings -from biomedsheets.shortcuts import GermlineCaseSheet, Pedigree, is_not_background from snakemake.io import Wildcards, expand +from biomedsheets.shortcuts import GermlineCaseSheet, Pedigree, is_not_background from snappy_pipeline.utils import dictify, flatten, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -268,10 +268,11 @@ SnakemakeDictItemsGenerator, SnakemakeListItemsGenerator, ) -from snappy_pipeline.workflows.abstract.exceptions import InvalidConfigurationException from snappy_pipeline.workflows.abstract.warnings import InconsistentPedigreeWarning from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from .model import VariantCalling as VariantCallingConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -290,65 +291,7 @@ ) #: Default configuration for the variant_calling step -DEFAULT_CONFIG = r""" -# Default configuration variant_calling -step_config: - variant_calling: - # Common configuration - path_ngs_mapping: ../ngs_mapping # REQUIRED - - # Report generation - baf_file_generation: - enabled: true - min_dp: 10 # minimal DP of variant, must be >=1 - bcftools_stats: - enabled: true - jannovar_stats: - enabled: true - path_ser: REQUIRED # REQUIRED - bcftools_roh: - enabled: true - path_targets: null # REQUIRED; optional - path_af_file: null # REQUIRED - ignore_homref: false - skip_indels: false - rec_rate: 1e-8 - - # Variant calling tools and their configuration - # - # Common configuration - tools: ['gatk4_hc_gvcf'] # REQUIRED - ignore_chroms: - - '^NC_007605$' # herpes virus - - '^hs37d5$' # GRCh37 decoy - - '^chrEBV$' # Eppstein-Barr Virus - - '_decoy$' # decoy contig - - '^HLA-' # HLA genes - - # Variant caller specific configuration - bcftools_call: - max_depth: 250 - max_indel_depth: 250 - window_length: 10000000 - num_threads: 16 - gatk3_hc: - num_threads: 16 - window_length: 10000000 - allow_seq_dict_incompatibility: false - gatk3_ug: - num_threads: 16 - window_length: 10000000 - allow_seq_dict_incompatibility: false - downsample_to_coverage: 250 - gatk4_hc_joint: - window_length: 10000000 - num_threads: 16 - allow_seq_dict_incompatibility: false - gatk4_hc_gvcf: - window_length: 10000000 - num_threads: 16 - allow_seq_dict_incompatibility: false -""" +DEFAULT_CONFIG = VariantCallingConfigModel.default_config_yaml_string() class GetResultFilesMixin: @@ -375,7 +318,7 @@ def strip_tpl(tpl): for path_tpl in result_paths_tpls: for index_library_name, member_library_names in index_dna_ngs_libraries.items(): kwargs = 
{ - "mapper": self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + "mapper": self.w_config.step_config["ngs_mapping"].tools.dna, } if "index_library_name" in path_tpl: kwargs["index_library_name"] = [index_library_name] @@ -391,7 +334,7 @@ def strip_tpl(tpl): ) def get_extra_kv_pairs(self): - return {"var_caller": self.parent.config["tools"]} + return {"var_caller": self.parent.config.tools} @dictify def _get_index_dna_ngs_libraries( @@ -511,10 +454,13 @@ def _get_output_files_run(self) -> SnakemakeDictItemsGenerator: "vcf_tbi_md5": f"work/{token}/out/{token}.vcf.gz.tbi.md5", } yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("run").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_files.values(), self.get_log_file("run").values()) + ], + ) class BcftoolsCallStepPart(VariantCallingStepPart): @@ -523,7 +469,7 @@ class BcftoolsCallStepPart(VariantCallingStepPart): #: Step name name = "bcftools_call" - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) return ResourceUsage( threads=16, @@ -536,16 +482,16 @@ class GatkCallerStepPartBase(VariantCallingStepPart): """Base class for GATK v3/v4 variant callers""" def check_config(self): - if self.__class__.name not in self.config["tools"]: + if self.__class__.name not in self.config.tools: return # caller not enabled, skip # pragma: no cover self.parent.ensure_w_config( ("static_data_config", "dbsnp", "path"), "dbSNP not configured but required for {}".format(self.__class__.name), ) - def get_resource_usage(self, action) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) - num_threads = self.config[self.name]["num_threads"] + num_threads = self.config[self.name].num_threads mem_per_thread = 5.5 mem_total = int(mem_per_thread * num_threads + 0.5) return ResourceUsage( @@ -651,10 +597,13 @@ def _get_output_files_genotype(self) -> SnakemakeDictItemsGenerator: "vcf_tbi_md5": f"work/{infix}/out/{infix}.vcf.gz.tbi.md5", } yield from result.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(result.values(), self.get_log_file("genotype").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(result.values(), self.get_log_file("genotype").values()) + ], + ) class ReportGetLogFileMixin: @@ -706,9 +655,12 @@ def get_input_files(self, action: str) -> SnakemakeDict: @dictify def _get_input_files_run(self) -> SnakemakeDictItemsGenerator: - yield "vcf", ( - "work/{mapper}.{var_caller}.{index_library_name}/out/" - "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + yield ( + "vcf", + ( + "work/{mapper}.{var_caller}.{index_library_name}/out/" + "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + ), ) def get_output_files(self, action: str) -> SnakemakeDict: @@ -725,12 +677,15 @@ def _get_output_files_run(self) -> SnakemakeDictItemsGenerator: ) work_files = {key: f"{base_path}{ext}" for key, ext in ext_names.items()} yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("run").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in 
chain(work_files.values(), self.get_log_file("run").values()) + ], + ) - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -760,9 +715,12 @@ def get_input_files(self, action: str) -> SnakemakeDict: @dictify def _get_input_files_run(self) -> SnakemakeDictItemsGenerator: - yield "vcf", ( - "output/{mapper}.{var_caller}.{index_library_name}/out/" - "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + yield ( + "vcf", + ( + "output/{mapper}.{var_caller}.{index_library_name}/out/" + "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + ), ) def get_output_files(self, action: str) -> SnakemakeDict: @@ -779,12 +737,15 @@ def _get_output_files_run(self) -> SnakemakeDictItemsGenerator: ) work_files = {key: f"{base_path}{ext}" for key, ext in ext_names.items()} yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("run").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_files.values(), self.get_log_file("run").values()) + ], + ) - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -814,9 +775,12 @@ class JannovarStatisticsStepPart(GetResultFilesMixin, ReportGetLogFileMixin, Bas def get_input_files(self, action) -> SnakemakeDictItemsGenerator: """Return path to input files""" self._validate_action(action) - yield "vcf", ( - "work/{mapper}.{var_caller}.{index_library_name}/out/" - "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + yield ( + "vcf", + ( + "work/{mapper}.{var_caller}.{index_library_name}/out/" + "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + ), ) def get_output_files(self, action) -> SnakemakeDict: @@ -836,12 +800,15 @@ def _get_output_files_run(self) -> SnakemakeDictItemsGenerator: ext_names = {"report": ".txt", "report_md5": ".txt.md5"} work_files = {key: f"{base_path}{ext}" for key, ext in ext_names.items()} yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("run").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_files.values(), self.get_log_file("run").values()) + ], + ) - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -875,9 +842,12 @@ class BafFileGenerationStepPart(GetResultFilesMixin, ReportGetLogFileMixin, Base @dictify def get_input_files(self, action: str) -> SnakemakeDictItemsGenerator: self._validate_action(action) - yield "vcf", ( - "work/{mapper}.{var_caller}.{index_library_name}/out/" - "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + yield ( + "vcf", + ( + "work/{mapper}.{var_caller}.{index_library_name}/out/" + "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + ), ) @dictify @@ -892,12 +862,15 @@ def get_output_files(self, action: str) -> SnakemakeDictItemsGenerator: for key, ext in ext_names.items(): work_files[key] = f"work/{base_path}{ext}" yield from work_files.items() - yield "output_links", [ - re.sub(r"^work/", "output/", work_path) - for work_path in chain(work_files.values(), self.get_log_file("run").values()) - ] + yield ( + "output_links", + [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_files.values(), self.get_log_file("run").values()) + ], + ) - def get_resource_usage(self, action: str) -> ResourceUsage: + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: self._validate_action(action) return ResourceUsage( threads=1, @@ -924,7 +897,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + config_model_class=VariantCallingConfigModel, + previous_steps=(NgsMappingWorkflow,), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -942,28 +916,20 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) @listify def get_result_files(self) -> SnakemakeListItemsGenerator: - for tool in self.config["tools"]: + for tool in self.config.tools: yield from self.sub_steps[tool].get_result_files() for name in ("baf_file_generation", "bcftools_stats", "jannovar_stats", "bcftools_roh"): - if self.w_config["step_config"]["variant_calling"][name]["enabled"]: + name_config = self.config.get(name) + if name_config and name_config.enabled: yield from self.sub_steps[name].get_result_files() def check_config(self): # Checks for static data - self.ensure_w_config( - ("step_config", "variant_calling", "path_ngs_mapping"), - "Path to NGS mapping not configured but required for variant calling", - ) self.ensure_w_config( ("static_data_config", "reference", "path"), "Path to reference FASTA not configured but required for variant calling", ) - # Check that only valid tools are selected - selected = set(self.w_config["step_config"]["variant_calling"]["tools"]) - invalid = list(sorted(selected - set(VARIANT_CALLERS))) - if invalid: - raise InvalidConfigurationException(f"Invalid variant callers selected: {invalid}") diff --git a/snappy_pipeline/workflows/variant_calling/model.py b/snappy_pipeline/workflows/variant_calling/model.py new file mode 100644 index 000000000..6d6d0a0e6 --- /dev/null +++ b/snappy_pipeline/workflows/variant_calling/model.py @@ -0,0 +1,97 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel, ToggleModel, validators + + +class BafFileGeneration(ToggleModel): + min_dp: Annotated[int, Field(ge=1)] = 10 + """minimal DP of variant, must be >=1""" + + +class BcftoolsStats(ToggleModel): + pass + 
+ +class JannovarStats(ToggleModel): + path_ser: str + + +class BcftoolsRoh(ToggleModel): + path_targets: str | None = None # FIXME this says "REQUIRED; optional" in the original code + + path_af_file: str + + ignore_homref: bool = False + + skip_indels: bool = False + + rec_rate: float = 1e-8 + + +class Tool(enum.StrEnum): + bcftools_call = "bcftools_call" + gatk3_hc = "gatk3_hc" + gatk3_ug = "gatk3_ug" + gatk4_hc_joint = "gatk4_hc_joint" + gatk4_hc_gvcf = "gatk4_hc_gvcf" + + +class BcftoolsCall(SnappyModel): + max_depth: int = 250 + max_indel_depth: int = 250 + window_length: int = 10000000 + num_threads: int = 16 + + +class Gatk3Hc(SnappyModel): + num_threads: int = 16 + window_length: int = 10000000 + allow_seq_dict_incompatibility: bool = False + + +class Gatk3Ug(SnappyModel): + num_threads: int = 16 + window_length: int = 10000000 + allow_seq_dict_incompatibility: bool = False + downsample_to_coverage: int = 250 + + +class Gatk4HcJoint(SnappyModel): + window_length: int = 10000000 + num_threads: int = 16 + allow_seq_dict_incompatibility: bool = False + + +class Gatk4HcGvcf(SnappyModel): + window_length: int = 10000000 + num_threads: int = 16 + allow_seq_dict_incompatibility: bool = False + + +class VariantCalling(SnappyStepModel, validators.ToolsMixin): + path_ngs_mapping: str = "../ngs_mapping" + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.gatk4_hc_gvcf], min_length=1)] + + ignore_chroms: list[str] = ["^NC_007605$", "^hs37d5$", "^chrEBV$", "_decoy$", "^HLA-"] + + baf_file_generation: BafFileGeneration = BafFileGeneration() + + bcftools_stats: BcftoolsStats | None = None + + jannovar_stats: JannovarStats | None = None + + bcftools_roh: BcftoolsRoh | None = None + + bcftools_call: BcftoolsCall | None = None + + gatk3_hc: Gatk3Hc | None = None + + gatk3_ug: Gatk3Ug | None = None + + gatk4_hc_joint: Gatk4HcJoint | None = None + + gatk4_hc_gvcf: Gatk4HcGvcf | None = None diff --git a/snappy_pipeline/workflows/variant_checking/__init__.py b/snappy_pipeline/workflows/variant_checking/__init__.py index ab4114cd5..04c6d7347 100644 --- a/snappy_pipeline/workflows/variant_checking/__init__.py +++ b/snappy_pipeline/workflows/variant_checking/__init__.py @@ -50,9 +50,9 @@ import sys -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -64,20 +64,15 @@ from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.variant_calling import VariantCallingWorkflow +from .model import VariantChecking as VariantCheckingConfigModel + __author__ = "Manuel Holtgrewe " #: Available tools for checking variants VARIANT_CHECKERS = "peddy" #: Default configuration for the somatic_gene_fusion_calling step -DEFAULT_CONFIG = r""" -step_config: - variant_checking: - tools_ngs_mapping: [] # optional, copied from ngs mapping config - tools_variant_calling: [] # optional, copied from variant calling config - path_variant_calling: ../variant_calling # REQUIRED - tools: ['peddy'] # REQUIRED - available: 'peddy' -""" +DEFAULT_CONFIG = VariantCheckingConfigModel.default_config_yaml_string() class PeddyStepPart(BaseStepPart): @@ -137,7 +132,7 @@ def get_log_file(self, action): self._validate_action(action) return self.log_path - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get 
Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -177,19 +172,18 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (VariantCallingWorkflow, NgsMappingWorkflow), + config_model_class=VariantCheckingConfigModel, + previous_steps=(VariantCallingWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes((PeddyStepPart, WritePedigreeStepPart, LinkOutStepPart)) # Register sub workflows - self.register_sub_workflow("variant_calling", self.config["path_variant_calling"]) + self.register_sub_workflow("variant_calling", self.config.path_variant_calling) # Copy over "tools" setting from ngs_mapping/variant_calling if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"] - if not self.config["tools_variant_calling"]: - self.config["tools_variant_calling"] = self.w_config["step_config"]["variant_calling"][ - "tools" - ] + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools + if not self.config.tools_variant_calling: + self.config.tools_variant_calling = self.w_config.step_config["variant_calling"].tools @listify def get_result_files(self): @@ -208,14 +202,7 @@ def _yield_peddy_results(self): for path in self.sub_steps["peddy"].get_output_files("run").values(): yield from expand( path, - mapper=self.config["tools_ngs_mapping"], - var_caller=self.config["tools_variant_calling"], + mapper=self.config.tools_ngs_mapping, + var_caller=self.config.tools_variant_calling, index_ngs_library=[pedigree.index.dna_ngs_library.name], ) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - ("step_config", "variant_checking", "path_variant_calling"), - "Path to variant calling not configured but required for variant checking", - ) diff --git a/snappy_pipeline/workflows/variant_checking/model.py b/snappy_pipeline/workflows/variant_checking/model.py new file mode 100644 index 000000000..f4e24da8c --- /dev/null +++ b/snappy_pipeline/workflows/variant_checking/model.py @@ -0,0 +1,23 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyStepModel + + +class Tool(enum.StrEnum): + peddy = "peddy" + + +class VariantChecking(SnappyStepModel): + tools_ngs_mapping: list[str] = [] + """copied from ngs mapping config""" + + tools_variant_calling: list[str] = [] + """copied from variant calling config""" + + path_variant_calling: Annotated[str, Field(examples=["../variant_calling"])] + """Path to variant calling""" + + tools: Annotated[list[Tool], EnumField(Tool, [Tool.peddy], min_length=1)] diff --git a/snappy_pipeline/workflows/variant_denovo_filtration/Snakefile b/snappy_pipeline/workflows/variant_denovo_filtration/Snakefile index 67068875f..838c219ed 100644 --- a/snappy_pipeline/workflows/variant_denovo_filtration/Snakefile +++ b/snappy_pipeline/workflows/variant_denovo_filtration/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.variant_denovo_filtration import VariantDeNovoFiltrationWorkflow +from snappy_pipeline.workflows.variant_denovo_filtration import ( + VariantDeNovoFiltrationWorkflow, +) __author__ = "Manuel Holtgrewe " diff --git a/snappy_pipeline/workflows/variant_denovo_filtration/__init__.py 
b/snappy_pipeline/workflows/variant_denovo_filtration/__init__.py index f3b822332..cc3140dc5 100644 --- a/snappy_pipeline/workflows/variant_denovo_filtration/__init__.py +++ b/snappy_pipeline/workflows/variant_denovo_filtration/__init__.py @@ -92,9 +92,9 @@ import itertools import os -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -107,6 +107,8 @@ from snappy_pipeline.workflows.variant_annotation import VariantAnnotationWorkflow from snappy_pipeline.workflows.variant_phasing import VariantPhasingWorkflow +from .model import VariantDenovoFiltration as VariantDenovoFiltrationConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -116,35 +118,10 @@ EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") #: Default configuration for the variant_denovo_filtration step -DEFAULT_CONFIG = r""" -step_config: - variant_denovo_filtration: - # One of the following must be given! - path_variant_phasing: '' - path_variant_annotation: '' - path_variant_calling: '' - path_ngs_mapping: ../ngs_mapping - tools_ngs_mapping: null # defaults to ngs_mapping tool - tools_variant_calling: null # defaults to variant_annotation tool - info_key_reliable_regions: [] # optional INFO keys with reliable regions - info_key_unreliable_regions: [] # optional INFO keys with unreliable regions - params_besenbacher: # parameters for Besenbacher quality filter - min_gq: 50 - min_dp: 10 - max_dp: 120 - min_ab: 0.20 - max_ab: 0.9 - max_ad2: 1 - bad_region_expressions: [] - # e.g., - # - 'UCSC_CRG_MAPABILITY36 == 1' - # - 'UCSC_SIMPLE_REPEAT == 1' - collect_msdn: True # whether or not to collect MSDN (requires GATK HC+UG) -""" +DEFAULT_CONFIG = VariantDenovoFiltrationConfigModel.default_config_yaml_string() class FilterDeNovosBaseStepPart(BaseStepPart): - #: Class available actions actions = ("run",) @@ -164,7 +141,7 @@ def __init__(self, parent): self.ngs_library_to_pedigree[donor.dna_ngs_library.name] = pedigree self.ngs_library_to_donor[donor.dna_ngs_library.name] = donor - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -245,7 +222,7 @@ def get_log_file(self, action): self._validate_action(action) return self.path_log - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -436,7 +413,7 @@ def get_input_files(self, action): elif not donor.mother or not donor.mother.dna_ngs_library: continue else: - for caller in self.config["tools_variant_calling"]: + for caller in self.config.tools_variant_calling: yield tpl.format( mapper="{mapper}", caller=caller, @@ -448,8 +425,9 @@ def get_output_files(self, action): # Validate action self._validate_action(action) yield "txt", "work/{mapper}.denovo_count_summary/out/{mapper}.denovo_count_summary.txt" - yield "txt_md5", ( - "work/{mapper}.denovo_count_summary/out/{mapper}.denovo_count_summary.txt.md5" + yield ( + "txt_md5", + ("work/{mapper}.denovo_count_summary/out/{mapper}.denovo_count_summary.txt.md5"), ) def get_log_file(self, action): @@ -479,17 +457,18 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (VariantPhasingWorkflow, VariantAnnotationWorkflow, NgsMappingWorkflow), + config_model_class=VariantDenovoFiltrationConfigModel, + previous_steps=(VariantPhasingWorkflow, VariantAnnotationWorkflow, NgsMappingWorkflow), ) # Register sub workflows for prev in ("variant_phasing", "variant_annotation", "variant_calling"): - if self.config["path_%s" % prev]: + if cfg := self.config.get(f"path_{prev}"): self.previous_step = prev - self.register_sub_workflow(prev, self.config["path_%s" % prev]) + self.register_sub_workflow(prev, cfg) break else: raise Exception("No path to previous step given!") # pragma: no cover - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) #: Name token for input self.prev_token = { "variant_phasing": "jannovar_annotate_vcf.gatk_pbt.gatk_rbp.", @@ -508,14 +487,10 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Copy over "tools" setting from variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_variant_calling"]: - self.config["tools_variant_calling"] = self.w_config["step_config"]["variant_calling"][ - "tools" - ] + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not self.config.tools_variant_calling: + self.config.tools_variant_calling = self.w_config.step_config["variant_calling"].tools @listify def get_result_files(self): @@ -525,22 +500,22 @@ def get_result_files(self): ext_values = list(itertools.chain(EXT_VALUES, (".summary.txt", ".summary.txt.md5"))) yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.config["tools_ngs_mapping"], - caller=self.config["tools_variant_calling"], + mapper=self.config.tools_ngs_mapping, + caller=self.config.tools_variant_calling, ext=ext_values, ) # Summarise counts yield from expand( "output/{mapper}.denovo_count_summary/out/{mapper}.denovo_count_summary{ext}", - mapper=self.config["tools_ngs_mapping"], - caller=self.config["tools_variant_calling"], + mapper=self.config.tools_ngs_mapping, + caller=self.config.tools_variant_calling, ext=(".txt", ".txt.md5"), ) # Collect MSDN statistics - if self.w_config["step_config"]["variant_denovo_filtration"]["collect_msdn"]: + if self.w_config.step_config["variant_denovo_filtration"].collect_msdn: yield from expand( "output/{mapper}.multisite_de_novo/out/{mapper}.multisite_de_novo{ext}", - 
mapper=self.config["tools_ngs_mapping"], + mapper=self.config.tools_ngs_mapping, ext=(".txt", ".txt.md5"), ) @@ -571,8 +546,13 @@ def _yield_result_files(self, tpl, **kwargs): yield from expand(tpl, index_library=[donor.dna_ngs_library], **kwargs) def check_config(self): - """Check that the path to the variant annotation step is present.""" - self.ensure_w_config( - ("step_config", "variant_denovo_filtration", "path_ngs_mapping"), - "Path to ngs_mapping not configured but required for variant_denovo_filtration", - ) + if not self.config.tools_ngs_mapping: + self.ensure_w_config( + ("step_config", "ngs_mapping", "tools", "dna"), + "Either define tools_ngs_mapping or provide a configuration for ngs_mapping", + ) + if not self.config.tools_variant_calling: + self.ensure_w_config( + ("step_config", "variant_calling", "tools"), + "Either define tools_variant_calling or provide a configuration for variant_calling", + ) diff --git a/snappy_pipeline/workflows/variant_denovo_filtration/model.py b/snappy_pipeline/workflows/variant_denovo_filtration/model.py new file mode 100644 index 000000000..40c6945b4 --- /dev/null +++ b/snappy_pipeline/workflows/variant_denovo_filtration/model.py @@ -0,0 +1,54 @@ +from typing import Annotated + +from pydantic import Field, model_validator + +from snappy_pipeline.models import SnappyModel, SnappyStepModel + + +class BesenbacherParams(SnappyModel): + """parameters for Besenbacher quality filter""" + + min_gq: int = 50 + min_dp: int = 10 + max_dp: int = 120 + min_ab: float = 0.20 + max_ab: float = 0.90 + max_ad2: int = 1 + + +class VariantDenovoFiltration(SnappyStepModel): + path_variant_phasing: str = "" + + path_variant_annotation: str = "" + + path_variant_calling: str = "" + + path_ngs_mapping: str = "../ngs_mapping" + + tools_ngs_mapping: list[str] = [] + """defaults to ngs_mapping tool""" + + tools_variant_calling: list[str] = [] + """defaults to variant_annotation tool""" + + info_key_reliable_regions: list[str] = [] + """optional INFO keys with reliable regions""" + + info_key_unreliable_regions: list[str] = [] + """optional INFO keys with unreliable regions""" + + params_besenbacher: BesenbacherParams = BesenbacherParams() + + bad_region_expressions: Annotated[ + list[str], Field(examples=[["'UCSC_CRG_MAPABILITY36 == 1'", "'UCSC_SIMPLE_REPEAT == 1'"]]) + ] = [] + + collect_msdn: bool = True + """whether or not to collect MSDN (requires GATK HC+UG)""" + + @model_validator(mode="after") + def ensure_variant_paths_are_configured(self): + assert ( + self.path_variant_phasing or self.path_variant_annotation or self.path_variant_calling + ) + return self diff --git a/snappy_pipeline/workflows/variant_export_external/Snakefile b/snappy_pipeline/workflows/variant_export_external/Snakefile index 926850b58..9397167ca 100644 --- a/snappy_pipeline/workflows/variant_export_external/Snakefile +++ b/snappy_pipeline/workflows/variant_export_external/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.variant_export_external import VariantExportExternalWorkflow +from snappy_pipeline.workflows.variant_export_external import ( + VariantExportExternalWorkflow, +) # Configuration =============================================================== @@ -97,7 +99,7 @@ rule variant_export_external_write_pedigree_run: # Convert gVCF to VCF | Merge VCFs ------------------------------------------------ -if config["step_config"]["variant_export_external"]["gvcf_option"]: +if 
wf.w_config.step_config["variant_export_external"].gvcf_option: rule variant_export_external_gvcf_to_vcf: input: diff --git a/snappy_pipeline/workflows/variant_export_external/__init__.py b/snappy_pipeline/workflows/variant_export_external/__init__.py index b504e0b1c..96ca91726 100644 --- a/snappy_pipeline/workflows/variant_export_external/__init__.py +++ b/snappy_pipeline/workflows/variant_export_external/__init__.py @@ -78,10 +78,9 @@ import os import sys -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand -from snappy_pipeline.base import MissingConfiguration +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -96,30 +95,10 @@ ) from snappy_pipeline.workflows.ngs_mapping import TargetCovReportStepPart +from .model import VariantExportExternal as VariantExportExternalConfigModel + #: Default configuration for the somatic_variant_calling step -DEFAULT_CONFIG = r""" -# Default configuration variant_export_external. -step_config: - variant_export_external: - external_tool: dragen # OPTIONAL: external tool name. - bam_available_flag: false # REQUIRED: BAM QC only possible if BAM files are present. - merge_vcf_flag: false # OPTIONAL: true if pedigree VCFs still need merging (not recommended). - merge_option: null # OPTIONAL: How to merge VCF, used in `bcftools --merge` argument. - gvcf_option: true # OPTIONAL: Flag to indicate if inputs are genomic VCFs. - search_paths: [] # REQUIRED: list of paths to VCF files. - search_patterns: [] # REQUIRED: list of search patterns, ex.: [{"vcf": "*.vcf.gz"}, {"bam": "*.bam"}, {"bai": "*.bam.bai"}] - release: GRCh37 # OPTIONAL: genome release; default 'GRCh37'. - # Path to BED file with exons; used for reducing data to near-exon small variants. - path_exon_bed: null # REQUIRED: exon BED file to use - path_refseq_ser: REQUIRED # REQUIRED: path to RefSeq .ser file. - path_ensembl_ser: REQUIRED # REQUIRED: path to ENSEMBL .ser file. - path_db: REQUIRED # REQUIRED: path to annotator DB file to use. - target_coverage_report: - # Mapping from enrichment kit to target region BED file, for either computing per target - # region coverage or selecting targeted exons. Only used if 'bam_available_flag' is True. - # It will not generated detailed reporting. - path_targets_bed: OPTIONAL # OPTIONAL -""" +DEFAULT_CONFIG = VariantExportExternalConfigModel.default_config_yaml_string() class BamReportsExternalStepPart(TargetCovReportStepPart): @@ -209,7 +188,7 @@ def _get_params_run(self, wildcards): return { "bam": sorted(list(self._collect_bam_files(wildcards))), "bam_count": len(sorted(list(self._collect_bam_files(wildcards)))), - "path_targets_bed": self.config["target_coverage_report"]["path_targets_bed"], + "path_targets_bed": self.config.target_coverage_report.path_targets_bed, } def _get_params_bam_qc(self, wildcards): @@ -275,8 +254,8 @@ def _get_external_source_prefix(self): :return: Returns external tool prefix if any provided, example: "dragen.". Otherwise, returns empty string. """ - if self.config["external_tool"]: - return self.config["external_tool"].lower() + "." + if self.config.external_tool: + return self.config.external_tool.lower() + "." 
return "" def get_input_files(self, action): @@ -290,7 +269,7 @@ def _get_input_files_gvcf_to_vcf(self, wildcards): @listify def _get_input_files_merge_vcf(self, wildcards): - if self.config["merge_vcf_flag"]: + if self.config.merge_vcf_flag: pedigree = self.index_ngs_library_to_pedigree.get(wildcards.index_ngs_library) for donor in filter(lambda d: d.dna_ngs_library, pedigree.donors): for bio_sample in donor.bio_samples.values(): @@ -424,7 +403,7 @@ def _get_log_file_annotation_generic(action): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -461,7 +440,7 @@ def _get_params_merge_vcf(self, wildcards): result = { "input": list(sorted(self._collect_vcfs(wildcards))), "sample_names": list(sorted(self._collect_sample_ids(wildcards))), - "merge_option": self.config["merge_option"], + "merge_option": self.config.merge_option, } return result @@ -567,7 +546,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (), + config_model_class=VariantExportExternalConfigModel, + previous_steps=(), ) # Load external data search information self.data_search_infos = list(self._load_data_search_infos()) @@ -607,7 +587,7 @@ def get_result_files(self): # Define infixes and actions - check if BAM QC is possible infixes = ("gts", "db-infos") performed_actions = ("annotate",) - if self.config["bam_available_flag"]: + if self.config.bam_available_flag: infixes += ("bam-qc",) performed_actions += ("bam_qc",) @@ -657,53 +637,3 @@ def _yield_result_files(self, tpl, **kwargs): ) continue # pragma: no cover yield from expand(tpl, index=[pedigree.index], **kwargs) - - def check_config(self): - """Check configuration - - :raises: MissingConfiguration: on missing or invalid configuration. 
- """ - # Initialise variables - fail_test_bool = False - error_msg = "Missing or invalid configuration issue(s):\n" - required_file_keys = ("path_refseq_ser", "path_ensembl_ser", "path_db") - - # Test files - for key in required_file_keys: - path_ = self.config[key] - try: - if not os.path.isfile(path_): - error_msg += f"- Value for '{key}' is not a file: {path_}\n" - fail_test_bool = True - except (ValueError, KeyError): - error_msg += f"- Value '{key}' is not properly defined: {path_}\n" - fail_test_bool = True - - # Test search paths - search_paths = [item for item in self.config["search_paths"]] - if len(search_paths) == 0: - error_msg += "- Value for 'search_paths' cannot be empty.\n" - fail_test_bool = True - else: - for path_ in search_paths: - if not os.path.isdir(path_): - error_msg += f"- Path in 'search_paths' is not a directory: {path_}\n" - fail_test_bool = True - - # Test search pattern - search_patterns = [item for item in self.config["search_patterns"]] - if len(search_patterns) == 0: - error_msg += "- Value for 'search_patterns' cannot be empty.\n" - fail_test_bool = True - else: - for value in search_patterns: - if not isinstance(value, dict): - error_msg += ( - "- Value in 'search_patterns' is not a dictionary.\n" - "Expected: [{'vcf': '*/*.vcf.gz'}]\n" - f"Observed {type(value)}: '{value}'\n" - ) - fail_test_bool = True - # Assert - if fail_test_bool: - raise MissingConfiguration(error_msg) diff --git a/snappy_pipeline/workflows/variant_export_external/model.py b/snappy_pipeline/workflows/variant_export_external/model.py new file mode 100644 index 000000000..733562786 --- /dev/null +++ b/snappy_pipeline/workflows/variant_export_external/model.py @@ -0,0 +1,57 @@ +from typing import Annotated + +from pydantic import DirectoryPath, Field, FilePath + +from snappy_pipeline.models import SnappyModel, SnappyStepModel + + +class TargetCoverageReport(SnappyModel): + path_targets_bed: str | None = None + """ + Mapping from enrichment kit to target region BED file, for either computing per target + region coverage or selecting targeted exons. Only used if 'bam_available_flag' is True. + It will not generate detailed reporting. 
+ """ + + +class VariantExportExternal(SnappyStepModel): + external_tool: str = "dragen" + """external tool name.""" + + bam_available_flag: bool + """BAM QC only possible if BAM files are present.""" + + merge_vcf_flag: bool = False + """true if pedigree VCFs still need merging (not recommended).""" + + merge_option: str | None = None + """How to merge VCF, used in `bcftools --merge` argument.""" + + gvcf_option: bool = True + """Flag to indicate if inputs are genomic VCFs.""" + + search_paths: Annotated[list[DirectoryPath], Field(min_length=1)] + """list of paths to VCF files.""" + + search_patterns: Annotated[ + list[dict[str, str]], + Field(examples=[{"vcf": "*.vcf.gz"}, {"bam": "*.bam"}, {"bai": "*.bam.bai"}], min_length=1), + ] + """list of search patterns""" + + release: str = "GRCh37" + """genome release; default 'GRCh37'.""" + + path_exon_bed: str = "" + """Path to BED file with exons; used for reducing data to near-exon small variants.""" + + path_refseq_ser: FilePath + """path to RefSeq .ser file.""" + + path_ensembl_ser: FilePath + """path to ENSEMBL .ser file.""" + + path_db: FilePath + """path to annotator DB file to use.""" + + target_coverage_report: TargetCoverageReport = TargetCoverageReport() diff --git a/snappy_pipeline/workflows/variant_filtration/__init__.py b/snappy_pipeline/workflows/variant_filtration/__init__.py index 6e215b235..376e8dab9 100644 --- a/snappy_pipeline/workflows/variant_filtration/__init__.py +++ b/snappy_pipeline/workflows/variant_filtration/__init__.py @@ -96,9 +96,9 @@ import os.path import sys -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -111,6 +111,8 @@ from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.variant_annotation import VariantAnnotationWorkflow +from .model import VariantFiltration as VariantFiltrationConfigModel + __author__ = "Manuel Holtgrewe " #: Extensions of files to create as main payload @@ -120,97 +122,7 @@ EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") #: Default configuration for the somatic_variant_calling step -DEFAULT_CONFIG = r""" -step_config: - variant_filtration: - path_variant_annotation: ../variant_annotation - tools_ngs_mapping: null # defaults to ngs_mapping tool - tools_variant_calling: null # defaults to variant_annotation tool - thresholds: # quality filter sets, "keep_all" implicitely defined - conservative: - min_gq: 40 - min_dp_het: 10 - min_dp_hom: 5 - include_expressions: - - 'MEDGEN_COHORT_INCONSISTENT_AC=0' - relaxed: - min_gq: 20 - min_dp_het: 6 - min_dp_hom: 3 - include_expressions: - - 'MEDGEN_COHORT_INCONSISTENT_AC=0' - frequencies: # values to use for frequency filtration - af_dominant: 0.001 # AF (allele frequency) values - af_recessive: 0.01 - ac_dominant: 3 # AC (allele count in gnomAD) values - region_beds: # regions to filter to, "whole_genome" implicitely defined - all_tads: /fast/projects/medgen_genomes/static_data/GRCh37/hESC_hg19_allTads.bed - all_genes: /fast/projects/medgen_genomes/static_data/GRCh37/gene_bed/ENSEMBL_v75.bed.gz - limb_tads: /fast/projects/medgen_genomes/static_data/GRCh37/newlimb_tads.bed - lifted_enhancers: /fast/projects/medgen_genomes/static_data/GRCh37/all_but_onlyMB.bed - vista_enhancers: /fast/projects/medgen_genomes/static_data/GRCh37/vista_limb_enhancers.bed - 
score_thresholds: # thresholds on scores to filter to, "all_scores" implictely defined - coding: - require_coding: true - require_gerpp_gt2: false - min_cadd: null - conservative: # unused; TODO: rename? - require_coding: false - require_gerpp_gt2: false - min_cadd: 0 - conserved: # TODO: rename? - require_coding: false - require_gerpp_gt2: true - min_cadd: null - filter_combinations: # dot-separated {thresholds}.{inherit}.{freq}.{region}.{score}.{het_comp} - - conservative.de_novo.dominant_freq.lifted_enhancers.all_scores.passthrough - - conservative.de_novo.dominant_freq.lifted_enhancers.conserved.passthrough - - conservative.de_novo.dominant_freq.limb_tads.all_scores.passthrough - - conservative.de_novo.dominant_freq.limb_tads.coding.passthrough - - conservative.de_novo.dominant_freq.limb_tads.conserved.passthrough - - conservative.de_novo.dominant_freq.vista_enhancers.all_scores.passthrough - - conservative.de_novo.dominant_freq.vista_enhancers.conserved.passthrough - - conservative.de_novo.dominant_freq.whole_genome.all_scores.passthrough - - conservative.de_novo.dominant_freq.whole_genome.coding.passthrough - - conservative.de_novo.dominant_freq.whole_genome.conserved.passthrough - - conservative.dominant.dominant_freq.lifted_enhancers.all_scores.passthrough - - conservative.dominant.dominant_freq.lifted_enhancers.conserved.passthrough - - conservative.dominant.dominant_freq.limb_tads.all_scores.passthrough - - conservative.dominant.dominant_freq.limb_tads.coding.passthrough - - conservative.dominant.dominant_freq.limb_tads.conserved.passthrough - - conservative.dominant.dominant_freq.vista_enhancers.all_scores.passthrough - - conservative.dominant.dominant_freq.vista_enhancers.conserved.passthrough - - conservative.dominant.dominant_freq.whole_genome.all_scores.passthrough - - conservative.dominant.dominant_freq.whole_genome.coding.passthrough - - conservative.dominant.dominant_freq.whole_genome.conserved.passthrough - - conservative.dominant.recessive_freq.lifted_enhancers.all_scores.intervals500 - - conservative.dominant.recessive_freq.lifted_enhancers.conserved.intervals500 - - conservative.dominant.recessive_freq.lifted_enhancers.conserved.tads - - conservative.dominant.recessive_freq.limb_tads.all_scores.intervals500 - - conservative.dominant.recessive_freq.limb_tads.coding.gene - - conservative.dominant.recessive_freq.limb_tads.conserved.intervals500 - - conservative.dominant.recessive_freq.limb_tads.conserved.tads - - conservative.dominant.recessive_freq.vista_enhancers.all_scores.intervals500 - - conservative.dominant.recessive_freq.vista_enhancers.conserved.intervals500 - - conservative.dominant.recessive_freq.vista_enhancers.conserved.tads - - conservative.dominant.recessive_freq.whole_genome.all_scores.intervals500 - - conservative.dominant.recessive_freq.whole_genome.coding.gene - - conservative.dominant.recessive_freq.whole_genome.conserved.intervals500 - - conservative.dominant.recessive_freq.whole_genome.conserved.tads - - conservative.recessive_hom.recessive_freq.lifted_enhancers.all_scores.passthrough - - conservative.recessive_hom.recessive_freq.lifted_enhancers.conserved.passthrough - - conservative.recessive_hom.recessive_freq.limb_tads.all_scores.passthrough - - conservative.recessive_hom.recessive_freq.limb_tads.coding.passthrough - - conservative.recessive_hom.recessive_freq.limb_tads.conserved.passthrough - - conservative.recessive_hom.recessive_freq.vista_enhancers.all_scores.passthrough - - 
conservative.recessive_hom.recessive_freq.vista_enhancers.conserved.passthrough - - conservative.recessive_hom.recessive_freq.whole_genome.all_scores.passthrough - - conservative.recessive_hom.recessive_freq.whole_genome.coding.passthrough - - conservative.recessive_hom.recessive_freq.whole_genome.conserved.passthrough - # The following are for input to variant_combination. - - conservative.dominant.recessive_freq.whole_genome.coding.passthrough - - conservative.dominant.recessive_freq.whole_genome.conserved.passthrough -""" +DEFAULT_CONFIG = VariantFiltrationConfigModel.default_config_yaml_string() class FiltersVariantsStepPartBase(BaseStepPart): @@ -236,7 +148,7 @@ def __init__(self, parent): "work", name_pattern, "out", name_pattern.replace(r",[^\.]+", "") + ".log" ) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -292,9 +204,12 @@ def get_input_files(self, action): @dictify def input_function(wildcards): - yield "ped", os.path.realpath( - "work/write_pedigree.{index_library}/out/{index_library}.ped" - ).format(**wildcards) + yield ( + "ped", + os.path.realpath( + "work/write_pedigree.{index_library}/out/{index_library}.ped" + ).format(**wildcards), + ) variant_annotation = self.parent.sub_workflows["variant_annotation"] for key, ext in zip(EXT_NAMES, EXT_VALUES): output_path = ( @@ -454,7 +369,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (VariantAnnotationWorkflow, NgsMappingWorkflow), + config_model_class=VariantFiltrationConfigModel, + previous_steps=(VariantAnnotationWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -470,16 +386,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("variant_annotation", self.config["path_variant_annotation"]) + self.register_sub_workflow("variant_annotation", self.config.path_variant_annotation) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_variant_calling"]: - self.config["tools_variant_calling"] = self.w_config["step_config"]["variant_calling"][ - "tools" - ] + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not self.config.tools_variant_calling: + self.config.tools_variant_calling = self.w_config.step_config["variant_calling"].tools @listify def get_result_files(self): @@ -490,8 +402,8 @@ def get_result_files(self): ) yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.config["tools_ngs_mapping"], - caller=self.config["tools_variant_calling"], + mapper=self.config.tools_ngs_mapping, + caller=self.config.tools_variant_calling, ext=EXT_VALUES, ) @@ -510,13 +422,6 @@ def _yield_result_files(self, tpl, **kwargs): yield from expand( tpl, index_library=[pedigree.index.dna_ngs_library], - filters=self.config["filter_combinations"], + filters=self.config.filter_combinations, **kwargs, ) - - def check_config(self): - """Check that the path to the NGS mapping is present""" - self.ensure_w_config( - 
("step_config", "variant_filtration", "path_variant_annotation"), - "Path to variant_annotation not configured but required for variant_filtration", - ) diff --git a/snappy_pipeline/workflows/variant_filtration/model.py b/snappy_pipeline/workflows/variant_filtration/model.py new file mode 100644 index 000000000..b9b813c67 --- /dev/null +++ b/snappy_pipeline/workflows/variant_filtration/model.py @@ -0,0 +1,182 @@ +import re +from typing import Annotated + +from pydantic import AfterValidator, Field, model_validator + +from snappy_pipeline.models import SnappyModel, SnappyStepModel + + +class Threshold(SnappyModel): + min_gq: int + min_dp_het: int + min_dp_hom: int + include_expressions: list[str] + + +class Frequencies(SnappyModel): + af_dominant: float = 0.001 + """AF (allele frequency) values""" + + af_recessive: float = 0.01 + """AF (allele frequency) values""" + + ac_dominant: int = 3 + """AC (allele count in gnomAD) values""" + + +class ScoreThreshold(SnappyModel): + require_coding: bool + require_gerpp_gt2: bool + min_cadd: int | None + + +def check_combination(s: str) -> str: + """ + A very simple validator that checks if the string has 5 dots. + The actual validation is done in the model validator, because the sets of valid pattern strings + can only be known at that point. + """ + if s.count(".") != 5: + raise ValueError( + f"Invalid combination: {s}, has to have 6 parts separated by dots" + "({thresholds}.{inherit}.{freq}.{region}.{score}.{het_comp})" + ) + return s + + +FilterCombination = Annotated[str, AfterValidator(check_combination)] + +FILTER_COMBINATION_EXAMPLES = [ + "conservative.de_novo.dominant_freq.lifted_enhancers.all_scores.passthrough", + "conservative.de_novo.dominant_freq.lifted_enhancers.conserved.passthrough", + "conservative.de_novo.dominant_freq.limb_tads.all_scores.passthrough", + "conservative.de_novo.dominant_freq.limb_tads.coding.passthrough", + "conservative.de_novo.dominant_freq.limb_tads.conserved.passthrough", + "conservative.de_novo.dominant_freq.vista_enhancers.all_scores.passthrough", + "conservative.de_novo.dominant_freq.vista_enhancers.conserved.passthrough", + "conservative.de_novo.dominant_freq.whole_genome.all_scores.passthrough", + "conservative.de_novo.dominant_freq.whole_genome.coding.passthrough", + "conservative.de_novo.dominant_freq.whole_genome.conserved.passthrough", + "conservative.dominant.dominant_freq.lifted_enhancers.all_scores.passthrough", + "conservative.dominant.dominant_freq.lifted_enhancers.conserved.passthrough", + "conservative.dominant.dominant_freq.limb_tads.all_scores.passthrough", + "conservative.dominant.dominant_freq.limb_tads.coding.passthrough", + "conservative.dominant.dominant_freq.limb_tads.conserved.passthrough", + "conservative.dominant.dominant_freq.vista_enhancers.all_scores.passthrough", + "conservative.dominant.dominant_freq.vista_enhancers.conserved.passthrough", + "conservative.dominant.dominant_freq.whole_genome.all_scores.passthrough", + "conservative.dominant.dominant_freq.whole_genome.coding.passthrough", + "conservative.dominant.dominant_freq.whole_genome.conserved.passthrough", + "conservative.dominant.recessive_freq.lifted_enhancers.all_scores.intervals500", + "conservative.dominant.recessive_freq.lifted_enhancers.conserved.intervals500", + "conservative.dominant.recessive_freq.lifted_enhancers.conserved.tads", + "conservative.dominant.recessive_freq.limb_tads.all_scores.intervals500", + "conservative.dominant.recessive_freq.limb_tads.coding.gene", + 
"conservative.dominant.recessive_freq.limb_tads.conserved.intervals500", + "conservative.dominant.recessive_freq.limb_tads.conserved.tads", + "conservative.dominant.recessive_freq.vista_enhancers.all_scores.intervals500", + "conservative.dominant.recessive_freq.vista_enhancers.conserved.intervals500", + "conservative.dominant.recessive_freq.vista_enhancers.conserved.tads", + "conservative.dominant.recessive_freq.whole_genome.all_scores.intervals500", + "conservative.dominant.recessive_freq.whole_genome.coding.gene", + "conservative.dominant.recessive_freq.whole_genome.conserved.intervals500", + "conservative.dominant.recessive_freq.whole_genome.conserved.tads", + "conservative.recessive_hom.recessive_freq.lifted_enhancers.all_scores.passthrough", + "conservative.recessive_hom.recessive_freq.lifted_enhancers.conserved.passthrough", + "conservative.recessive_hom.recessive_freq.limb_tads.all_scores.passthrough", + "conservative.recessive_hom.recessive_freq.limb_tads.coding.passthrough", + "conservative.recessive_hom.recessive_freq.limb_tads.conserved.passthrough", + "conservative.recessive_hom.recessive_freq.vista_enhancers.all_scores.passthrough", + "conservative.recessive_hom.recessive_freq.vista_enhancers.conserved.passthrough", + "conservative.recessive_hom.recessive_freq.whole_genome.all_scores.passthrough", + "conservative.recessive_hom.recessive_freq.whole_genome.coding.passthrough", + "conservative.recessive_hom.recessive_freq.whole_genome.conserved.passthrough", + # The following are for input to variant_combination. + "conservative.dominant.recessive_freq.whole_genome.coding.passthrough", + "conservative.dominant.recessive_freq.whole_genome.conserved.passthrough", +] + + +class VariantFiltration(SnappyStepModel): + path_variant_annotation: str = "../variant_annotation" + + tools_ngs_mapping: list[str] = [] + """defaults to ngs_mapping tool""" + + tools_variant_calling: list[str] = [] + """defaults to variant_annotation tool""" + + thresholds: dict[str, Threshold] = { + "conservative": Threshold( + **dict( + min_gq=40, + min_dp_het=10, + min_dp_hom=5, + include_expressions=["'MEDGEN_COHORT_INCONSISTENT_AC=0'"], + ) + ), + "relaxed": Threshold( + **dict( + min_gq=20, + min_dp_het=6, + min_dp_hom=3, + include_expressions=["'MEDGEN_COHORT_INCONSISTENT_AC=0'"], + ) + ), + } + """quality filter sets, "keep_all" implicitly defined""" + + frequencies: Frequencies = Frequencies() + + region_beds: Annotated[ + dict[str, str], + Field( + examples=[ + { + "all_tads": "/fast/projects/medgen_genomes/static_data/GRCh37/hESC_hg19_allTads.bed", + "all_genes": "/fast/projects/medgen_genomes/static_data/GRCh37/gene_bed/ENSEMBL_v75.bed.gz", + "limb_tads": "/fast/projects/medgen_genomes/static_data/GRCh37/newlimb_tads.bed", + "lifted_enhancers": "/fast/projects/medgen_genomes/static_data/GRCh37/all_but_onlyMB.bed", + "vista_enhancers": "/fast/projects/medgen_genomes/static_data/GRCh37/vista_limb_enhancers.bed", + } + ] + ), + ] = {} + """regions to filter to, "whole_genome" implicitly defined""" + + score_thresholds: dict[str, ScoreThreshold] = { + "coding": ScoreThreshold( + **dict(require_coding=True, require_gerpp_gt2=False, min_cadd=None) + ), + "conservative": ScoreThreshold( + **dict(require_coding=False, require_gerpp_gt2=False, min_cadd=0) + ), + "conserved": ScoreThreshold( + **dict(require_coding=False, require_gerpp_gt2=True, min_cadd=None) + ), + } + + filter_combinations: Annotated[ + list[FilterCombination], Field(examples=FILTER_COMBINATION_EXAMPLES) + ] = [] + """dot-separated 
{thresholds}.{inherit}.{freq}.{region}.{score}.{het_comp}""" + + @model_validator(mode="after") + def ensure_filter_combinations_are_valid(self): + thresholds: set[str] = set(self.thresholds.keys()) + inherit: set[str] = {"de_novo", "dominant", "recessive_hom"} + freq: set[str] = {"dominant_freq", "recessive_freq"} + region: set[str] = set(self.region_beds.keys()) | {"whole_genome"} + score: set[str] = set(self.score_thresholds.keys()) | {"all_scores"} + het_comp: set[str] = {"passthrough", "intervals500", "tads", "gene"} + pattern: str = r".".join( + f'({"|".join(p)})' for p in [thresholds, inherit, freq, region, score, het_comp] + ) + pattern: re.Pattern[str] = re.compile(pattern) + for combination in self.filter_combinations: + if pattern.fullmatch(combination) is None: + raise ValueError( + f"Invalid combination: {combination}, " f"must match pattern {pattern.pattern}" + ) + + return self diff --git a/snappy_pipeline/workflows/variant_phasing/__init__.py b/snappy_pipeline/workflows/variant_phasing/__init__.py index 9f87880a1..46fecf7aa 100644 --- a/snappy_pipeline/workflows/variant_phasing/__init__.py +++ b/snappy_pipeline/workflows/variant_phasing/__init__.py @@ -63,9 +63,9 @@ from collections import OrderedDict import os -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.base import UnsupportedActionException from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( @@ -77,6 +77,8 @@ from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from snappy_pipeline.workflows.variant_annotation import VariantAnnotationWorkflow +from .model import VariantPhasing as VariantPhasingConfigModel + #: Extensions of files to create as main payload EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") @@ -91,39 +93,7 @@ } #: Default configuration of the wgs_sv_filtration step -DEFAULT_CONFIG = r""" -# Default configuration wgs_sv_filtration -step_config: - variant_phasing: - path_ngs_mapping: ../ngs_mapping - path_variant_annotation: ../variant_annotation - tools_ngs_mapping: [] # expected tools for ngs mapping - tools_variant_calling: [] # expected tools for variant calling - phasings: - - gatk_phasing_both - ignore_chroms: # patterns of chromosome names to ignore - - NC_007605 # herpes virus - - hs37d5 # GRCh37 decoy - - chrEBV # Eppstein-Barr Virus - - '*_decoy' # decoy contig - - 'HLA-*' # HLA genes - gatk_read_backed_phasing: - phase_quality_threshold: 20.0 # quality threshold for phasing - window_length: 5000000 # split input into windows of this size, each triggers a job - num_jobs: 1000 # number of windows to process in parallel - use_profil: true # use Snakemake profile for parallel processing - restart_times: 0 # number of times to re-launch jobs in case of failure - max_jobs_per_second: 10 # throttling of job creation - max_status_checks_per_second: 10 # throttling of status checks - debug_trunc_tokens: 0 # truncation to first N tokens (0 for none) - keep_tmpdir: never # keep temporary directory, {always, never, onerror} - job_mult_memory: 1 # memory multiplier - job_mult_time: 1 # running time multiplier - merge_mult_memory: 1 # memory multiplier for merging - merge_mult_time: 1 # running time multiplier for merging - gatk_phase_by_transmission: - de_novo_prior: 1e-8 # default, use 1e-6 when interested in phasing de novos -""" +DEFAULT_CONFIG = 
VariantPhasingConfigModel.default_config_yaml_string() class WriteTrioPedigreeStepPart(BaseStepPart): @@ -238,8 +208,9 @@ def get_input_files(self, action): @dictify def input_function(wildcards): # Pedigree file required for PhaseByTransmission. - yield "ped", "work/write_pedigree.{index_library}/out/{index_library}.ped".format( - **wildcards + yield ( + "ped", + "work/write_pedigree.{index_library}/out/{index_library}.ped".format(**wildcards), ) # Get name of real index real_index = self.ngs_library_to_pedigree[wildcards.index_library].index @@ -255,7 +226,7 @@ def input_function(wildcards): assert action == "run", "Unsupported actions" return input_function - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -268,7 +239,7 @@ def get_resource_usage(self, action): return ResourceUsage( threads=1, time="1-00:00:00", # 1 day - memory=f"{ 14 * 1024}M", + memory=f"{14 * 1024}M", ) @@ -305,7 +276,7 @@ def _yield_bams(self, wildcards): ] yield key, list(map(ngs_mapping, files)) - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -406,7 +377,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (VariantAnnotationWorkflow, NgsMappingWorkflow), + config_model_class=VariantPhasingConfigModel, + previous_steps=(VariantAnnotationWorkflow, NgsMappingWorkflow), ) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -419,17 +391,13 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) ) ) # Register sub workflows - self.register_sub_workflow("variant_annotation", self.config["path_variant_annotation"]) - self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + self.register_sub_workflow("variant_annotation", self.config.path_variant_annotation) + self.register_sub_workflow("ngs_mapping", self.config.path_ngs_mapping) # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_variant_calling"]: - self.config["tools_variant_calling"] = self.w_config["step_config"]["variant_calling"][ - "tools" - ] + if not self.config.tools_ngs_mapping: + self.config.tools_ngs_mapping = self.w_config.step_config["ngs_mapping"].tools.dna + if not self.config.tools_variant_calling: + self.config.tools_variant_calling = self.w_config.step_config["variant_calling"].tools @listify def get_result_files(self): @@ -437,12 +405,12 @@ def get_result_files(self): # Generate output paths without extracting individuals. 
name_pattern = "{mapper}.{caller}.jannovar_annotate_vcf.{phasing}.{index_library.name}" phasings = [ - token for name, token in CONFIG_TO_TOKEN.items() if name in self.config["phasings"] + token for name, token in CONFIG_TO_TOKEN.items() if name in self.config.phasings ] yield from self._yield_result_files( os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), - mapper=self.config["tools_ngs_mapping"], - caller=self.config["tools_variant_calling"], + mapper=self.config.tools_ngs_mapping, + caller=self.config.tools_variant_calling, phasing=phasings, ext=EXT_VALUES, ) @@ -464,10 +432,3 @@ def _yield_result_files(self, tpl, **kwargs): and donor.mother.dna_ngs_library ): # only phase if both parents present yield from expand(tpl, index_library=[donor.dna_ngs_library], **kwargs) - - def check_config(self): - """Check that the path to the variant annotation step is present""" - self.ensure_w_config( - ("step_config", "variant_phasing", "path_variant_annotation"), - "Path to variant calling not configured but required for somatic variant annotation", - ) diff --git a/snappy_pipeline/workflows/variant_phasing/model.py b/snappy_pipeline/workflows/variant_phasing/model.py new file mode 100644 index 000000000..4879cf51f --- /dev/null +++ b/snappy_pipeline/workflows/variant_phasing/model.py @@ -0,0 +1,74 @@ +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import KeepTmpdir, SnappyModel, SnappyStepModel + + +class GatkReadBackedPhasing(SnappyModel): + phase_quality_threshold: float = 20.0 + """quality threshold for phasing""" + + window_length: int = 5000000 + """split input into windows of this size, each triggers a job""" + + num_jobs: int = 1000 + """number of windows to process in parallel""" + + use_profile: bool = True + """use Snakemake profile for parallel processing""" + + restart_times: int = 0 + """number of times to re-launch jobs in case of failure""" + + max_jobs_per_second: int = 10 + """throttling of job creation""" + + max_status_checks_per_second: int = 10 + """throttling of status checks""" + + debug_trunc_tokens: int = 0 + """truncation to first N tokens (0 for none)""" + + keep_tmpdir: KeepTmpdir = KeepTmpdir.never + """keep temporary directory, {always, never, onerror}""" + + job_mult_memory: float = 1 + """memory multiplier""" + + job_mult_time: float = 1 + """running time multiplier""" + + merge_mult_memory: float = 1 + """memory multiplier for merging""" + + merge_mult_time: float = 1 + """running time multiplier for merging""" + + +class GatkPhaseByTransmission(SnappyModel): + de_novo_prior: float = 1e-8 + """use 1e-6 when interested in phasing de novos""" + + +class VariantPhasing(SnappyStepModel): + path_ngs_mapping: str = "../ngs_mapping" + + path_variant_annotation: Annotated[str, Field(examples=["../variant_annotation"])] = ( + "../variant_annotation" + ) + + tools_ngs_mapping: list[str] = [] + """expected tools for ngs mapping""" + + tools_variant_calling: list[str] = [] + """expected tools for variant calling""" + + phasings: list[str] = ["gatk_phasing_both"] + + ignore_chroms: list[str] = ["NC_007605", "hs37d5", "chrEBV", "*_decoy", "HLA-*"] + """patterns of chromosome names to ignore""" + + gatk_read_backed_phasing: GatkReadBackedPhasing = GatkReadBackedPhasing() + + gatk_phase_by_transmission: GatkPhaseByTransmission = GatkPhaseByTransmission() diff --git a/snappy_pipeline/workflows/wgs_cnv_export_external/Snakefile b/snappy_pipeline/workflows/wgs_cnv_export_external/Snakefile index 1191a7423..82f4bde8f 100644 
--- a/snappy_pipeline/workflows/wgs_cnv_export_external/Snakefile +++ b/snappy_pipeline/workflows/wgs_cnv_export_external/Snakefile @@ -4,7 +4,9 @@ import os from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.wgs_cnv_export_external import WgsCnvExportExternalWorkflow +from snappy_pipeline.workflows.wgs_cnv_export_external import ( + WgsCnvExportExternalWorkflow, +) # Configuration =============================================================== diff --git a/snappy_pipeline/workflows/wgs_cnv_export_external/__init__.py b/snappy_pipeline/workflows/wgs_cnv_export_external/__init__.py index 4af27db97..21e4674f3 100644 --- a/snappy_pipeline/workflows/wgs_cnv_export_external/__init__.py +++ b/snappy_pipeline/workflows/wgs_cnv_export_external/__init__.py @@ -69,10 +69,9 @@ import os import sys -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand -from snappy_pipeline.base import MissingConfiguration +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -84,6 +83,8 @@ WritePedigreeSampleNameStepPart, ) +from .model import WgsCnvExportExternal as WgsCnvExportExternalConfigModel + #: Extension of files EXTS = (".tsv.gz", ".tsv.gz.md5") @@ -99,22 +100,7 @@ } #: Default configuration for the wgs_cnv_export_external step -DEFAULT_CONFIG = r""" -# Default configuration wgs_cnv_export_external. -step_config: - wgs_cnv_export_external: - tool_ngs_mapping: null # OPTIONAL: used to create output file prefix. - tool_wgs_cnv_calling: null # OPTIONAL: used to create output file prefix. - merge_vcf_flag: false # OPTIONAL: true if pedigree VCFs still need merging (not recommended). - merge_option: id # How to merge VCF, used in `bcftools --merge` call. - search_paths: [] # REQUIRED: path to all VCF files. - search_patterns: [] # REQUIRED: list of search pattern, ex.: [{"vcf": "*/*.vcf.gz"}] - release: GRCh37 # REQUIRED: default 'GRCh37' - path_refseq_ser: REQUIRED # REQUIRED: path to RefSeq .ser file - path_ensembl_ser: REQUIRED # REQUIRED: path to ENSEMBL .ser file - path_db: REQUIRED # REQUIRED: path to annotator DB file to use - varfish_server_compatibility: false # OPTIONAL: build output compatible with varfish-server v1.2 (Anthenea) and early versions of the v2 (Bollonaster) -""" +DEFAULT_CONFIG = WgsCnvExportExternalConfigModel.default_config_yaml_string() class VarfishAnnotatorExternalStepPart(BaseStepPart): @@ -147,7 +133,7 @@ def get_input_files(self, action): @listify def _get_input_files_merge_vcf(self, wildcards): """""" - if self.config["merge_vcf_flag"]: + if self.config.merge_vcf_flag: pedigree = self.index_ngs_library_to_pedigree.get(wildcards.index_ngs_library) for donor in filter(lambda d: d.dna_ngs_library, pedigree.donors): for bio_sample in donor.bio_samples.values(): @@ -235,7 +221,7 @@ def _get_log_file_annotate(self): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. 
@@ -265,13 +251,13 @@ def _get_params_merge_vcf(self, wildcards): result = { "input": list(sorted(self._collect_vcfs(wildcards))), "sample_names": list(sorted(self._collect_sample_ids(wildcards))), - "merge_option": self.config["merge_option"], + "merge_option": self.config.merge_option, "gvcf_option": False, } return result def _get_params_annotate(self, wildcards): - varfish_server_compatibility_flag = self.config["varfish_server_compatibility"] + varfish_server_compatibility_flag = self.config.varfish_server_compatibility return { "step_name": "wgs_cnv_export_external", "varfish_server_compatibility": varfish_server_compatibility_flag, @@ -309,8 +295,8 @@ def _get_mapper_caller_tag(self): :return: Returns tag to be used to name intermediate and final files. Tag based on information provided in configuration. Output examples: 'bwa.delly2.', 'dragen.', or ''. """ - mapper = self.config["tool_ngs_mapping"] - caller = self.config["tool_wgs_cnv_calling"] + mapper = self.config.tool_ngs_mapping + caller = self.config.tool_wgs_cnv_calling if mapper and caller: return f"{mapper}.{caller}." elif mapper or caller: @@ -342,7 +328,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (), + config_model_class=WgsCnvExportExternalConfigModel, + previous_steps=(), ) # Load external data search information self.data_search_infos = list(self._load_data_search_infos()) @@ -404,53 +391,3 @@ def _yield_result_files(self, tpl, **kwargs): ) continue # pragma: no cover yield from expand(tpl, index_library=[pedigree.index.dna_ngs_library], **kwargs) - - def check_config(self): - """Check configuration - - :raises: MissingConfiguration: on missing or invalid configuration. - """ - # Initialise variables - fail_test_bool = False - error_msg = "Missing or invalid configuration issue(s):\n" - required_file_keys = ("path_refseq_ser", "path_ensembl_ser", "path_db") - - # Test files - for key in required_file_keys: - path_ = self.config[key] - try: - if not os.path.isfile(path_): - error_msg += f"- Value for '{key}' is not a file: {path_}\n" - fail_test_bool = True - except (ValueError, KeyError): - error_msg += f"- Value '{key}' is not properly defined: {path_}\n" - fail_test_bool = True - - # Test search paths - search_paths = [item for item in self.config["search_paths"]] - if len(search_paths) == 0: - error_msg += "- Value for 'search_paths' cannot be empty.\n" - fail_test_bool = True - else: - for path_ in search_paths: - if not os.path.isdir(path_): - error_msg += f"- Path in 'search_paths' is not a directory: {path_}\n" - fail_test_bool = True - - # Test search pattern - search_patterns = [item for item in self.config["search_patterns"]] - if len(search_patterns) == 0: - error_msg += "- Value for 'search_patterns' cannot be empty.\n" - fail_test_bool = True - else: - for value in search_patterns: - if not isinstance(value, dict): - error_msg += ( - "- Value in 'search_patterns' is not a dictionary.\n" - "Expected: [{'vcf': '*/*.vcf.gz'}]\n" - f"Observed {type(value)}: '{value}'\n" - ) - fail_test_bool = True - # Assert - if fail_test_bool: - raise MissingConfiguration(error_msg) diff --git a/snappy_pipeline/workflows/wgs_cnv_export_external/model.py b/snappy_pipeline/workflows/wgs_cnv_export_external/model.py new file mode 100644 index 000000000..046d4f71d --- /dev/null +++ b/snappy_pipeline/workflows/wgs_cnv_export_external/model.py @@ -0,0 +1,44 @@ +from typing import Annotated + +from pydantic import DirectoryPath, Field, FilePath 
+ +from snappy_pipeline.models import SnappyStepModel + + +class WgsCnvExportExternal(SnappyStepModel): + tool_ngs_mapping: str | None = None + """used to create output file prefix.""" + + tool_wgs_cnv_calling: str | None = None + """used to create output file prefix.""" + + merge_vcf_flag: bool = False + """true if pedigree VCFs still need merging (not recommended).""" + + merge_option: str = "id" + """How to merge VCF, used in `bcftools --merge` call.""" + + search_paths: Annotated[list[DirectoryPath], Field(min_length=1)] + """path to all VCF files.""" + + search_patterns: Annotated[ + list[dict[str, str]], Field(examples=[{"vcf": "*/*.vcf.gz"}], min_length=1) + ] + """list of search pattern""" + + release: str = "GRCh37" + + path_refseq_ser: FilePath + """path to RefSeq .ser file""" + + path_ensembl_ser: FilePath + """path to ENSEMBL .ser file""" + + path_db: FilePath + """path to annotator DB file to use""" + + varfish_server_compatibility: bool = False + """ + build output compatible with + varfish-server v1.2 (Anthenea) and early versions of the v2 (Bollonaster) + """ diff --git a/snappy_pipeline/workflows/wgs_sv_export_external/__init__.py b/snappy_pipeline/workflows/wgs_sv_export_external/__init__.py index bff8afc82..8992ddf17 100644 --- a/snappy_pipeline/workflows/wgs_sv_export_external/__init__.py +++ b/snappy_pipeline/workflows/wgs_sv_export_external/__init__.py @@ -69,10 +69,9 @@ import os import sys -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snakemake.io import expand -from snappy_pipeline.base import MissingConfiguration +from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -84,6 +83,8 @@ WritePedigreeSampleNameStepPart, ) +from .model import WgsSvExportExternal as WgsSvExportExternalConfigModel + #: Extension of files EXTS = (".tsv.gz", ".tsv.gz.md5") @@ -99,22 +100,7 @@ } #: Default configuration for the wgs_sv_export_external step -DEFAULT_CONFIG = r""" -# Default configuration wgs_sv_export_external. -step_config: - wgs_sv_export_external: - tool_ngs_mapping: null # OPTIONAL: used to create output file prefix. - tool_sv_calling_wgs: null # OPTIONAL: used to create output file prefix. - merge_vcf_flag: false # OPTIONAL: true if pedigree VCFs still need merging (not recommended). - merge_option: id # How to merge VCF, used in `bcftools --merge` call. - search_paths: [] # REQUIRED: path to all VCF files. - search_patterns: [] # REQUIRED: list of search pattern, ex.: [{"vcf": "*/*.vcf.gz"}] - release: GRCh37 # REQUIRED: default 'GRCh37' - path_refseq_ser: REQUIRED # REQUIRED: path to RefSeq .ser file - path_ensembl_ser: REQUIRED # REQUIRED: path to ENSEMBL .ser file - path_db: REQUIRED # REQUIRED: path to annotator DB file to use - varfish_server_compatibility: false # OPTIONAL: build output compatible with varfish-server v1.2 (Anthenea) and early versions of the v2 (Bollonaster). 
-""" +DEFAULT_CONFIG = WgsSvExportExternalConfigModel.default_config_yaml_string() class VarfishAnnotatorExternalStepPart(BaseStepPart): @@ -147,7 +133,7 @@ def get_input_files(self, action): @listify def _get_input_files_merge_vcf(self, wildcards): """""" - if self.config["merge_vcf_flag"]: + if self.config.merge_vcf_flag: pedigree = self.index_ngs_library_to_pedigree.get(wildcards.index_ngs_library) for donor in filter(lambda d: d.dna_ngs_library, pedigree.donors): for bio_sample in donor.bio_samples.values(): @@ -235,7 +221,7 @@ def _get_log_file_annotate(self): yield key, prefix + ext yield key + "_md5", prefix + ext + ".md5" - def get_resource_usage(self, action): + def get_resource_usage(self, action: str, **kwargs) -> ResourceUsage: """Get Resource Usage :param action: Action (i.e., step) in the workflow, example: 'run'. @@ -265,7 +251,7 @@ def _get_params_merge_vcf(self, wildcards): result = { "input": list(sorted(self._collect_vcfs(wildcards))), "sample_names": list(sorted(self._collect_sample_ids(wildcards))), - "merge_option": self.config["merge_option"], + "merge_option": self.config.merge_option, "gvcf_option": False, } return result @@ -273,7 +259,7 @@ def _get_params_merge_vcf(self, wildcards): def _get_params_annotate(self, wildcards): return { "step_name": "wgs_sv_export_external", - "varfish_server_compatibility": self.config["varfish_server_compatibility"], + "varfish_server_compatibility": self.config.varfish_server_compatibility, } def _collect_vcfs(self, wildcards): @@ -308,8 +294,8 @@ def _get_mapper_caller_tag(self): :return: Returns tag to be used to name intermediate and final files. Tag based on information provided in configuration. Output examples: 'bwa.delly2.', 'dragen.', or ''. """ - mapper = self.config["tool_ngs_mapping"] - caller = self.config["tool_sv_calling_wgs"] + mapper = self.config.tool_ngs_mapping + caller = self.config.tool_sv_calling_wgs if mapper and caller: return f"{mapper}.{caller}." elif mapper or caller: @@ -341,7 +327,8 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (), + config_model_class=WgsSvExportExternalConfigModel, + previous_steps=(), ) # Load external data search information self.data_search_infos = list(self._load_data_search_infos()) @@ -403,53 +390,3 @@ def _yield_result_files(self, tpl, **kwargs): ) continue # pragma: no cover yield from expand(tpl, index_library=[pedigree.index.dna_ngs_library], **kwargs) - - def check_config(self): - """Check configuration - - :raises: MissingConfiguration: on missing or invalid configuration. 
- """ - # Initialise variables - fail_test_bool = False - error_msg = "Missing or invalid configuration issue(s):\n" - required_file_keys = ("path_refseq_ser", "path_ensembl_ser", "path_db") - - # Test files - for key in required_file_keys: - path_ = self.config[key] - try: - if not os.path.isfile(path_): - error_msg += f"- Value for '{key}' is not a file: {path_}\n" - fail_test_bool = True - except (ValueError, KeyError): - error_msg += f"- Value '{key}' is not properly defined: {path_}\n" - fail_test_bool = True - - # Test search paths - search_paths = [item for item in self.config["search_paths"]] - if len(search_paths) == 0: - error_msg += "- Value for 'search_paths' cannot be empty.\n" - fail_test_bool = True - else: - for path_ in search_paths: - if not os.path.isdir(path_): - error_msg += f"- Path in 'search_paths' is not a directory: {path_}\n" - fail_test_bool = True - - # Test search pattern - search_patterns = [item for item in self.config["search_patterns"]] - if len(search_patterns) == 0: - error_msg += "- Value for 'search_patterns' cannot be empty.\n" - fail_test_bool = True - else: - for value in search_patterns: - if not isinstance(value, dict): - error_msg += ( - "- Value in 'search_patterns' is not a dictionary.\n" - "Expected: [{'vcf': '*/*.vcf.gz'}]\n" - f"Observed {type(value)}: '{value}'\n" - ) - fail_test_bool = True - # Assert - if fail_test_bool: - raise MissingConfiguration(error_msg) diff --git a/snappy_pipeline/workflows/wgs_sv_export_external/model.py b/snappy_pipeline/workflows/wgs_sv_export_external/model.py new file mode 100644 index 000000000..966a4dfa4 --- /dev/null +++ b/snappy_pipeline/workflows/wgs_sv_export_external/model.py @@ -0,0 +1,44 @@ +from typing import Annotated + +from pydantic import DirectoryPath, Field, FilePath + +from snappy_pipeline.models import SnappyStepModel + + +class WgsSvExportExternal(SnappyStepModel): + tool_ngs_mapping: str | None = None + """used to create output file prefix.""" + + tool_sv_calling_wgs: str | None = None + """used to create output file prefix.""" + + merge_vcf_flag: bool = False + """true if pedigree VCFs still need merging (not recommended).""" + + merge_option: str = "id" + """How to merge VCF, used in `bcftools --merge` call.""" + + search_paths: Annotated[list[DirectoryPath], Field(min_length=1)] + """path to all VCF files.""" + + search_patterns: Annotated[ + list[dict[str, str]], Field(examples=[{"vcf": "*/*.vcf.gz"}], min_length=1) + ] + """list of search pattern""" + + release: str = "GRCh37" + + path_refseq_ser: FilePath + """path to RefSeq .ser file""" + + path_ensembl_ser: FilePath + """path to ENSEMBL .ser file""" + + path_db: FilePath + """path to annotator DB file to use""" + + varfish_server_compatibility: bool = False + """ + build output compatible with + varfish-server v1.2 (Anthenea) and early versions of the v2 (Bollonaster) + """ diff --git a/snappy_wrappers/genome_regions.py b/snappy_wrappers/genome_regions.py index 18e3985f8..49aeac060 100644 --- a/snappy_wrappers/genome_regions.py +++ b/snappy_wrappers/genome_regions.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Code for genome regions -""" +"""Code for genome regions""" import re diff --git a/snappy_wrappers/runner_seq.py b/snappy_wrappers/runner_seq.py index 357f6d63b..cbc02188f 100644 --- a/snappy_wrappers/runner_seq.py +++ b/snappy_wrappers/runner_seq.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Code for running a sequential CUBI+Snakemake wrappers -""" +"""Code for running a sequential CUBI+Snakemake wrappers""" import json 
import os diff --git a/snappy_wrappers/tools/fix_vcf.py b/snappy_wrappers/tools/fix_vcf.py index 5e609da72..8c8b2d6b1 100644 --- a/snappy_wrappers/tools/fix_vcf.py +++ b/snappy_wrappers/tools/fix_vcf.py @@ -136,9 +136,11 @@ def main(argv=None): group.add_argument( "--output-vcf", type=argparse.FileType("wt"), help="output VCF file", default=sys.stdout ) - group.add_argument( - "--faidx", type=argparse.FileType("rt"), help="FAI file for generating ##contig lines" - ), + ( + group.add_argument( + "--faidx", type=argparse.FileType("rt"), help="FAI file for generating ##contig lines" + ), + ) group.add_argument("--sample", nargs="+", default=[], action="append", dest="samples") args = parser.parse_args(argv) diff --git a/snappy_wrappers/tools/gcnv_merge_vcfs.py b/snappy_wrappers/tools/gcnv_merge_vcfs.py index 337596e57..8473d6005 100644 --- a/snappy_wrappers/tools/gcnv_merge_vcfs.py +++ b/snappy_wrappers/tools/gcnv_merge_vcfs.py @@ -120,9 +120,9 @@ class ContigCnvs: ncls: ncls.NCLS @staticmethod - def from_cnvs( - contig: str, cnvs: typing.Iterable[CopyNumberVariant] - ) -> typing.TypeVar("ContigCnvs"): + def from_cnvs(contig: str, cnvs: typing.Iterable[CopyNumberVariant]) -> typing.TypeVar( + "ContigCnvs" + ): """Build from name and list of CopyNumberVariant.""" start = pd.Series([cnv.pos_begin for cnv in cnvs]) ends = pd.Series([cnv.pos_end for cnv in cnvs]) diff --git a/snappy_wrappers/tools/quickvenn.py b/snappy_wrappers/tools/quickvenn.py index 7aac55cc8..0dada9927 100644 --- a/snappy_wrappers/tools/quickvenn.py +++ b/snappy_wrappers/tools/quickvenn.py @@ -18,6 +18,7 @@ 6 """ + import argparse import itertools import math diff --git a/snappy_wrappers/tools/vcf_filter_denovo.py b/snappy_wrappers/tools/vcf_filter_denovo.py index 05a6a2678..79778de4e 100644 --- a/snappy_wrappers/tools/vcf_filter_denovo.py +++ b/snappy_wrappers/tools/vcf_filter_denovo.py @@ -92,7 +92,6 @@ def to_string(self): class ClippedRegion(genome_regions.GenomeRegion): - NONE = 0 LEFT = 1 RIGHT = 2 diff --git a/snappy_wrappers/wrapper_parallel.py b/snappy_wrappers/wrapper_parallel.py index 3b71750f0..61ae053f4 100644 --- a/snappy_wrappers/wrapper_parallel.py +++ b/snappy_wrappers/wrapper_parallel.py @@ -948,8 +948,9 @@ def construct_parallel_rules(self): "wrapper_prefix": "file://" + self.wrapper_base_dir, "inner_wrapper": self.inner_wrapper, } - yield textwrap.dedent( - r""" + yield ( + textwrap.dedent( + r""" rule chunk_{jobno}: input: {input_bam}, @@ -965,7 +966,10 @@ def construct_parallel_rules(self): **{params} wrapper: '{wrapper_prefix}/snappy_wrappers/wrappers/{inner_wrapper}' """ - ).format(**vals).lstrip() + ) + .format(**vals) + .lstrip() + ) class ParallelVariantAnnotationBaseWrapper(ParallelVcfOutputBaseWrapper): @@ -998,8 +1002,9 @@ def construct_parallel_rules(self): "wrapper_prefix": "file://" + self.wrapper_base_dir, "inner_wrapper": self.inner_wrapper, } - yield textwrap.dedent( - r""" + yield ( + textwrap.dedent( + r""" rule chunk_{jobno}: input: **{input_}, @@ -1015,7 +1020,10 @@ def construct_parallel_rules(self): **{params} wrapper: '{wrapper_prefix}/snappy_wrappers/wrappers/{inner_wrapper}' """ - ).format(**vals).lstrip() + ) + .format(**vals) + .lstrip() + ) class ParallelSomaticVariantCallingBaseWrapper(ParallelVcfOutputBaseWrapper): @@ -1048,8 +1056,9 @@ def construct_parallel_rules(self): "wrapper_prefix": "file://" + self.wrapper_base_dir, "inner_wrapper": self.inner_wrapper, } - yield textwrap.dedent( - r""" + yield ( + textwrap.dedent( + r""" rule chunk_{jobno}: input: 
tumor_bam={tumor_bam}, @@ -1066,7 +1075,10 @@ def construct_parallel_rules(self): **{params} wrapper: '{wrapper_prefix}/snappy_wrappers/wrappers/{inner_wrapper}' """ - ).format(**vals).lstrip() + ) + .format(**vals) + .lstrip() + ) class ParallelSomaticVariantAnnotationBaseWrapper(ParallelVcfOutputBaseWrapper): @@ -1099,8 +1111,9 @@ def construct_parallel_rules(self): "wrapper_prefix": "file://" + self.wrapper_base_dir, "inner_wrapper": self.inner_wrapper, } - yield textwrap.dedent( - r""" + yield ( + textwrap.dedent( + r""" rule chunk_{jobno}: input: **{input_}, @@ -1116,7 +1129,10 @@ def construct_parallel_rules(self): **{params} wrapper: '{wrapper_prefix}/snappy_wrappers/wrappers/{inner_wrapper}' """ - ).format(**vals).lstrip() + ) + .format(**vals) + .lstrip() + ) class ParallelMutect2BaseWrapper(ParallelBaseWrapper): @@ -1319,8 +1335,9 @@ def construct_parallel_rules(self): "wrapper_prefix": "file://" + self.wrapper_base_dir, "inner_wrapper": self.inner_wrapper, } - yield textwrap.dedent( - r""" + yield ( + textwrap.dedent( + r""" rule chunk_{jobno}: input: **{input} @@ -1337,7 +1354,10 @@ def construct_parallel_rules(self): **{params} wrapper: '{wrapper_prefix}/snappy_wrappers/wrappers/{inner_wrapper}' """ - ).format(**vals).lstrip() + ) + .format(**vals) + .lstrip() + ) def construct_merge_rule(self): """Join the overall result files. diff --git a/snappy_wrappers/wrappers/bed_venn/__init__.py b/snappy_wrappers/wrappers/bed_venn/__init__.py index 7373bcf43..8fe3c9378 100644 --- a/snappy_wrappers/wrappers/bed_venn/__init__.py +++ b/snappy_wrappers/wrappers/bed_venn/__init__.py @@ -1,3 +1,2 @@ # -*- coding: utf-8 -*- -"""Create Venn diagrams from BED file overlaps based on Jaccard-index -""" +"""Create Venn diagrams from BED file overlaps based on Jaccard-index""" diff --git a/snappy_wrappers/wrappers/eb_filter_par/parallel_eb_filter.py b/snappy_wrappers/wrappers/eb_filter_par/parallel_eb_filter.py index 9a9c9fe59..c1bb179b6 100644 --- a/snappy_wrappers/wrappers/eb_filter_par/parallel_eb_filter.py +++ b/snappy_wrappers/wrappers/eb_filter_par/parallel_eb_filter.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Definition for running EasyBayes-Filter in parallel, genome is split into windows -""" +"""Definition for running EasyBayes-Filter in parallel, genome is split into windows""" import os import sys @@ -69,8 +68,9 @@ def construct_parallel_rules(self): "wrapper_prefix": "file://" + self.wrapper_base_dir, "inner_wrapper": self.inner_wrapper, } - yield textwrap.dedent( - r""" + yield ( + textwrap.dedent( + r""" rule chunk_{jobno}: input: vcf={input_vcf}, @@ -90,4 +90,7 @@ def construct_parallel_rules(self): """ - ).format(**vals).lstrip() + ) + .format(**vals) + .lstrip() + ) diff --git a/snappy_wrappers/wrappers/gatk_hc_par/parallel_gatk_hc.py b/snappy_wrappers/wrappers/gatk_hc_par/parallel_gatk_hc.py index e39d7bd6e..5b956e03a 100644 --- a/snappy_wrappers/wrappers/gatk_hc_par/parallel_gatk_hc.py +++ b/snappy_wrappers/wrappers/gatk_hc_par/parallel_gatk_hc.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Definition for GATK HC variant caller in parallel, genome is split into windows -""" +"""Definition for GATK HC variant caller in parallel, genome is split into windows""" import os import sys diff --git a/snappy_wrappers/wrappers/gatk_read_backed_phasing_par/parallel_read_backed_phasing.py b/snappy_wrappers/wrappers/gatk_read_backed_phasing_par/parallel_read_backed_phasing.py index 2957cc7ae..40663ccd4 100644 --- 
a/snappy_wrappers/wrappers/gatk_read_backed_phasing_par/parallel_read_backed_phasing.py +++ b/snappy_wrappers/wrappers/gatk_read_backed_phasing_par/parallel_read_backed_phasing.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Definition for GATK ReadBackedPhasing paralllel wrapper -""" +"""Definition for GATK ReadBackedPhasing paralllel wrapper""" import os import sys @@ -75,8 +74,9 @@ def construct_parallel_rules(self): "wrapper_prefix": "file://" + self.wrapper_base_dir, "inner_wrapper": self.inner_wrapper, } - yield textwrap.dedent( - r""" + yield ( + textwrap.dedent( + r""" rule chunk_{jobno}: input: **{input_}, @@ -93,4 +93,7 @@ def construct_parallel_rules(self): wrapper: '{wrapper_prefix}/snappy_wrappers/wrappers/{inner_wrapper}' """ - ).format(**vals).lstrip() + ) + .format(**vals) + .lstrip() + ) diff --git a/snappy_wrappers/wrappers/gatk_ug_par/parallel_gatk_ug.py b/snappy_wrappers/wrappers/gatk_ug_par/parallel_gatk_ug.py index 2791a5188..d2ed2b939 100644 --- a/snappy_wrappers/wrappers/gatk_ug_par/parallel_gatk_ug.py +++ b/snappy_wrappers/wrappers/gatk_ug_par/parallel_gatk_ug.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Definition for running GATK UG variant caller in parallel, genome is split into windows -""" +"""Definition for running GATK UG variant caller in parallel, genome is split into windows""" import os import sys diff --git a/snappy_wrappers/wrappers/gcnv/annotate_gc/wrapper.py b/snappy_wrappers/wrappers/gcnv/annotate_gc/wrapper.py index cf611d780..539d35b9b 100644 --- a/snappy_wrappers/wrappers/gcnv/annotate_gc/wrapper.py +++ b/snappy_wrappers/wrappers/gcnv/annotate_gc/wrapper.py @@ -6,7 +6,8 @@ from snakemake.shell import shell # Pick the target BED file to use. -config = DictQuery(snakemake.config).get("step_config/sv_calling_targeted/gcnv") +# FIXME: why is 'target_interval_bed' not used? +config = DictQuery(snakemake.config).get("step_config/helper_gcnv_model_targeted/gcnv") for item in config["path_target_interval_list_mapping"]: if item["name"] == snakemake.wildcards.library_kit: target_interval_bed = item["path"] @@ -15,7 +16,7 @@ raise Exception("Found no target intervals for %s" % item["name"]) map_bed = DictQuery(snakemake.config).get( - "step_config/sv_calling_targeted/gcnv/path_uniquely_mapable_bed" + "step_config/helper_gcnv_model_targeted/gcnv/path_uniquely_mapable_bed" ) shell( diff --git a/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py b/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py index 6974d42ce..2f802e27a 100644 --- a/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py +++ b/snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py @@ -26,7 +26,7 @@ paths_tsv = " ".join(snakemake.input.tsv) # Add interval block list for PAR regions if configured. -par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"]["gcnv"].get( +par_intervals = snakemake.config["step_config"][snakemake.params.step_key]["gcnv"].get( "path_par_intervals" ) if par_intervals: diff --git a/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py b/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py index 134ac416d..46a849543 100644 --- a/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py +++ b/snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py @@ -27,7 +27,7 @@ paths_tsv = " ".join(snakemake.input.tsv) ## Add interval block list for PAR regions if configured. 
-# par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"].get("path_par_intervals") +# par_intervals = snakemake.config["step_config"][snakemake.params.step_key].get("path_par_intervals") # if par_intervals: # par_args = f"-XL {par_intervals}" # else: diff --git a/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py b/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py index 065e03b68..92980b388 100644 --- a/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py +++ b/snappy_wrappers/wrappers/gcnv/preprocess_intervals/wrapper.py @@ -7,7 +7,7 @@ # Pick the target BED file to use. -config = DictQuery(snakemake.config).get("step_config/sv_calling_targeted/gcnv") +config = snakemake.config["step_config"][snakemake.params.step_key]["gcnv"] for item in config["path_target_interval_list_mapping"]: if item["name"] == snakemake.wildcards.library_kit: target_interval_bed = item["path"] diff --git a/snappy_wrappers/wrappers/hts_screen/match_vector_to_report.py b/snappy_wrappers/wrappers/hts_screen/match_vector_to_report.py index c0e2f1f5a..a18eec194 100644 --- a/snappy_wrappers/wrappers/hts_screen/match_vector_to_report.py +++ b/snappy_wrappers/wrappers/hts_screen/match_vector_to_report.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -"""Convert match_vector.tsv file from snappy-hts_screen.sh -""" +"""Convert match_vector.tsv file from snappy-hts_screen.sh""" import argparse import sys diff --git a/snappy_wrappers/wrappers/jannovar_par/annotate_somatic_vcf/parallel_annotate_somatic_vcf.py b/snappy_wrappers/wrappers/jannovar_par/annotate_somatic_vcf/parallel_annotate_somatic_vcf.py index 4b0f7f356..9f70eb9f0 100644 --- a/snappy_wrappers/wrappers/jannovar_par/annotate_somatic_vcf/parallel_annotate_somatic_vcf.py +++ b/snappy_wrappers/wrappers/jannovar_par/annotate_somatic_vcf/parallel_annotate_somatic_vcf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -"""Definition for Jannovar somatic annotation in parallel, genome is split into windows -""" +"""Definition for Jannovar somatic annotation in parallel, genome is split into windows""" + import os import sys diff --git a/snappy_wrappers/wrappers/jannovar_par/annotate_vcf/parallel_annotate_vcf.py b/snappy_wrappers/wrappers/jannovar_par/annotate_vcf/parallel_annotate_vcf.py index 056cd0207..dce491b5b 100644 --- a/snappy_wrappers/wrappers/jannovar_par/annotate_vcf/parallel_annotate_vcf.py +++ b/snappy_wrappers/wrappers/jannovar_par/annotate_vcf/parallel_annotate_vcf.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Definition for Jannovar germline annotation in parallel, genome is split into windows -""" +"""Definition for Jannovar germline annotation in parallel, genome is split into windows""" import os import sys diff --git a/snappy_wrappers/wrappers/mbcs/wrapper.py b/snappy_wrappers/wrappers/mbcs/wrapper.py index b49cb9cd6..ad6791676 100644 --- a/snappy_wrappers/wrappers/mbcs/wrapper.py +++ b/snappy_wrappers/wrappers/mbcs/wrapper.py @@ -21,6 +21,7 @@ shell.executable("/bin/bash") + # Helper functions ------------------------------------------------------------ def pair_fastq_files(input_left, input_right): r1s = input_left.copy() diff --git a/snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py b/snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py index 8b306e707..67c11b672 100644 --- a/snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py +++ b/snappy_wrappers/wrappers/mehari/annotate_strucvars/fix_manta_invs.py @@ -19,7 +19,7 @@ def looks_like_manta(header): 
@enum.unique -class InversionType(enum.Enum): +class InversionType(enum.StrEnum): """Inversion type.""" INV3 = "INV3" diff --git a/snappy_wrappers/wrappers/mutect2/environment.yaml b/snappy_wrappers/wrappers/mutect2/environment.yaml index a74188fee..ae348dbcd 100644 --- a/snappy_wrappers/wrappers/mutect2/environment.yaml +++ b/snappy_wrappers/wrappers/mutect2/environment.yaml @@ -3,7 +3,7 @@ channels: - bioconda dependencies: - - python==3.9.19 + - python=3.12 - gatk4==4.3.0.0 - htslib==1.19.1 - bcftools==1.19 diff --git a/snappy_wrappers/wrappers/mutect2_par/prepare_panel/parallel_prepare_panel.py b/snappy_wrappers/wrappers/mutect2_par/prepare_panel/parallel_prepare_panel.py index 0e2f0bbd5..77d9e2d4a 100644 --- a/snappy_wrappers/wrappers/mutect2_par/prepare_panel/parallel_prepare_panel.py +++ b/snappy_wrappers/wrappers/mutect2_par/prepare_panel/parallel_prepare_panel.py @@ -23,7 +23,6 @@ class ParallelMutect2Wrapper(ParallelMutect2BaseWrapper): - inner_wrapper = "mutect2/prepare_panel" step_name = "panel_of_normals" tool_name = "mutect2" diff --git a/snappy_wrappers/wrappers/mutect2_par/run/parallel_mutect2.py b/snappy_wrappers/wrappers/mutect2_par/run/parallel_mutect2.py index 9de25ac92..21923923a 100644 --- a/snappy_wrappers/wrappers/mutect2_par/run/parallel_mutect2.py +++ b/snappy_wrappers/wrappers/mutect2_par/run/parallel_mutect2.py @@ -23,7 +23,6 @@ class ParallelMutect2Wrapper(ParallelMutect2BaseWrapper): - inner_wrapper = "mutect2/run" step_name = "somatic_variant_calling" tool_name = "mutect2" diff --git a/snappy_wrappers/wrappers/mutect_par/parallel_mutect.py b/snappy_wrappers/wrappers/mutect_par/parallel_mutect.py index 8d2d46202..5fad61c20 100644 --- a/snappy_wrappers/wrappers/mutect_par/parallel_mutect.py +++ b/snappy_wrappers/wrappers/mutect_par/parallel_mutect.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -"""Definition for Mutect variant caller in parallel, genome is split into windows -""" +"""Definition for Mutect variant caller in parallel, genome is split into windows""" import os import sys diff --git a/snappy_wrappers/wrappers/platypus/call_joint/splitMNPsAndComplex.py b/snappy_wrappers/wrappers/platypus/call_joint/splitMNPsAndComplex.py index b61b01d10..6a3a97c77 100644 --- a/snappy_wrappers/wrappers/platypus/call_joint/splitMNPsAndComplex.py +++ b/snappy_wrappers/wrappers/platypus/call_joint/splitMNPsAndComplex.py @@ -71,7 +71,6 @@ def splitMAVariant(chrom, pos, theId, ref, alts, qual, filters, info, theRest): ################################################################################################### for line in sys.stdin: - if line.startswith("#"): print(line.strip()) continue diff --git a/snappy_wrappers/wrappers/purecn/prepare/wrapper.py b/snappy_wrappers/wrappers/purecn/prepare/wrapper.py index f72addccc..16799408e 100644 --- a/snappy_wrappers/wrappers/purecn/prepare/wrapper.py +++ b/snappy_wrappers/wrappers/purecn/prepare/wrapper.py @@ -17,12 +17,16 @@ bound_files = { "genome": os.path.normpath(genome), "path_bait_regions": os.path.normpath(config["path_bait_regions"]), - "mappability": os.path.normpath(config["mappability"]) - if "mappability" in config and config["mappability"] - else "", - "reptiming": os.path.normpath(config["reptiming"]) - if "reptiming" in config and config["reptiming"] - else "", + "mappability": ( + os.path.normpath(config["mappability"]) + if "mappability" in config and config["mappability"] + else "" + ), + "reptiming": ( + os.path.normpath(config["reptiming"]) + if "reptiming" in config and config["reptiming"] + else "" + ), } 
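Switching InversionType from enum.Enum to enum.StrEnum (available since Python 3.11, in line with the 3.12 bump elsewhere in this patch) makes the members plain strings, so they compare equal to string literals and format without reaching for .value. A small illustration with a hypothetical enum, not the pipeline's own class:

import enum


@enum.unique
class Strand(enum.StrEnum):
    """Illustrative StrEnum: members are str instances."""

    FORWARD = "+"
    REVERSE = "-"


# StrEnum members behave like the underlying strings...
assert Strand.FORWARD == "+"
assert f"strand={Strand.REVERSE}" == "strand=-"

# ...whereas a plain enum.Enum member would need .value for the same result.
assert isinstance(Strand.FORWARD, str)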
keys = list(bound_files.keys()) diff --git a/snappy_wrappers/wrappers/scramble/analysis/wrapper.py b/snappy_wrappers/wrappers/scramble/analysis/wrapper.py index a662c3423..0270a2d14 100644 --- a/snappy_wrappers/wrappers/scramble/analysis/wrapper.py +++ b/snappy_wrappers/wrappers/scramble/analysis/wrapper.py @@ -1,4 +1,5 @@ """CUBI+Snakemake wrapper code for scramble (analysis): Snakemake wrapper.py""" + import os from snakemake import shell diff --git a/snappy_wrappers/wrappers/varscan_par/call_joint/environment.yaml b/snappy_wrappers/wrappers/varscan_par/call_joint/environment.yaml index eb1f4045e..4b10c4c19 100644 --- a/snappy_wrappers/wrappers/varscan_par/call_joint/environment.yaml +++ b/snappy_wrappers/wrappers/varscan_par/call_joint/environment.yaml @@ -2,6 +2,7 @@ channels: - conda-forge - bioconda dependencies: +- python =3.12 - bcftools ==1.5 - htslib ==1.5 - parallel ==20170422 diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/common_functions.py b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/common_functions.py index 6429233be..c7cf926d2 100644 --- a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/common_functions.py +++ b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/common_functions.py @@ -101,7 +101,7 @@ def variant_type(x, args=None): return [variant_type] -strip_sequence_version_pattern = re.compile("\.[0-9]+$") # noqa: W605 +strip_sequence_version_pattern = re.compile(r"\.[0-9]+$") def strip_sequence_version(x, args): diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/parser.py b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/parser.py index 6c2410732..8bedaf22d 100644 --- a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/parser.py +++ b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/parser.py @@ -8,7 +8,7 @@ class VcfParser: - pattern = re.compile("^([^\[\]\s]+)(\[([0-9]+|REF)\])?$") # noqa: W605 + pattern = re.compile(r"^([^\[\]\s]+)(\[([0-9]+|REF)\])?$") def __init__( self, @@ -59,9 +59,9 @@ def __init__( ): self.annotation = Annotation( annotation_id=config["annotation"]["id"], - allele_column=config["annotation"]["allele"] - if "allele" in config["annotation"] - else None, + allele_column=( + config["annotation"]["allele"] if "allele" in config["annotation"] else None + ), extract=config["annotation"]["extract"], split=config["annotation"]["split"], ) diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/protein_mutation_parser.py b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/protein_mutation_parser.py index 94ab53b00..7b02f5958 100644 --- a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/protein_mutation_parser.py +++ b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/protein_mutation_parser.py @@ -167,28 +167,14 @@ class ProteinMutationFormatException(Exception): @functools.lru_cache def _build_protein_pattern(): - prefix = "^(([A-z0-9_\.\(\)-]+):)?p\.\(?" # noqa: W605 - postfix = "\)?$" # noqa: W605 + prefix = r"^(([A-z0-9_\.\(\)-]+):)?p\.\(?" + postfix = r"\)?$" aa = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + ")" - aaTer = ( - "(" - + "|".join(aa_codes_short) - + "|" - + "|".join(aa_codes_long) - + "|\*|Ter" # noqa: W605 - + ")" - ) - aaAll = ( - "(" - + "|".join(aa_codes_short) - + "|" - + "|".join(aa_codes_long) - + "|\*|Ter|=|\?" # noqa: W605 - + ")" - ) + aaTer = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + r"|\*|Ter" + ")" + aaAll = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + r"|\*|Ter|=|\?" 
+ ")" nb = "([0-9]+)" - nb_unknown = "([0-9]+|\?)" # noqa: W605 + nb_unknown = r"([0-9]+|\?)" interval = aa + nb + "_" + aa + nb one_or_interval = aa + nb + "(_" + aa + nb + ")?" @@ -202,17 +188,17 @@ def _build_protein_pattern(): delins = one_or_interval + "delins" + "(" + aa + "*)" + aaTer - frameshift = aaTer + nb + aa + "fs" + "(Ter|\*)" + nb_unknown # noqa: W605 + frameshift = aaTer + nb + aa + "fs" + r"(Ter|\*)" + nb_unknown - extensionN = "(Met|M)1" + "ext(-[0-9]+|\?)" # noqa: W605 - extensionC = "(Ter|\*)" + nb + aa + "ext" + "(Ter|\*)" + nb_unknown # noqa: W605 + extensionN = "(Met|M)1" + r"ext(-[0-9]+|\?)" + extensionC = r"(Ter|\*)" + nb + aa + "ext" + r"(Ter|\*)" + nb_unknown pattern = ( prefix + "(" + "|".join( [ - "(0\??|=|\?)", # noqa: W605 + r"(0\??|=|\?)", substitution, # 4: ref, 5: position, 6: alt duplication, # 7: start, 8: start pos, 10: end, 11: end pos deletion, # 12: start, 13: start pos, 15: end, 16: end pos @@ -232,18 +218,11 @@ def _build_protein_pattern(): @functools.lru_cache def _build_silent_dinucleotide(): - prefix = "^(([A-z0-9_\.\(\)-]+):)?p\.\(?" # noqa: W605 - postfix = "\)?$" # noqa: W605 + prefix = r"^(([A-z0-9_\.\(\)-]+):)?p\.\(?" + postfix = r"\)?$" aa = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + ")" - aaTer = ( - "(" - + "|".join(aa_codes_short) - + "|" - + "|".join(aa_codes_long) - + "|\*|Ter" # noqa: W605 - + ")" - ) + aaTer = "(" + "|".join(aa_codes_short) + "|" + "|".join(aa_codes_long) + r"|\*|Ter" + ")" nb = "([0-9]+)" return re.compile(prefix + aa + aaTer + "*" + nb + "=" + postfix) diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/variant_classification.py b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/variant_classification.py index 62bf85891..45c94365f 100644 --- a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/variant_classification.py +++ b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/variant_classification.py @@ -125,7 +125,7 @@ def variant_classification_vep(x, args=None): inFrame = (abs(len(ref) - len(alt)) % 3) == 0 found = False - for (k, v) in variant_classes_vep: + for k, v in variant_classes_vep: if ensembl == k: tcga.append(v) found = True @@ -179,7 +179,7 @@ def variant_classification_jannovar(x, args=None): # inFrame = (abs(len(ref) - len(alt)) % 3) == 0 found = False - for (k, v) in variant_classes_jannovar: + for k, v in variant_classes_jannovar: if jannovar == k: tcga.append(v) found = True diff --git a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/vcf_to_table.py b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/vcf_to_table.py index 42034b3e0..1d57d3a31 100644 --- a/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/vcf_to_table.py +++ b/snappy_wrappers/wrappers/vcf2maf/vcf_to_table/vcf_to_table.py @@ -71,13 +71,16 @@ def _check_config(config: typing.Dict[str, typing.Any]) -> typing.Dict[str, typi ), '"set" and/or "column" definition missing from element {} of inputs definition for output column {}'.format( i + 1, k ) - assert x["set"] in ( - "annotation", - "constant", - "fixed", - "format", - "info", - "variant", + assert ( + x["set"] + in ( + "annotation", + "constant", + "fixed", + "format", + "info", + "variant", + ) ), 'Illegal set "{}" for input column "{}" of inputs definition for output column {}, must be one of "annotation", "constant", "fixed", "format", "info" or "variant"'.format( x["set"], x["column"], k ) @@ -88,11 +91,14 @@ def _check_config(config: typing.Dict[str, typing.Any]) -> typing.Dict[str, typi x["set"], x["column"], k ) if "on_missing" in x: - assert x["on_missing"] in ( - 
"stop", - "skip", - "default", - "ignore", + assert ( + x["on_missing"] + in ( + "stop", + "skip", + "default", + "ignore", + ) ), 'Illegal value of requested action for missing input "{}" for output column "{}", must be either "stop", "skip", "default" or "ignore"'.format( x["column"], k ) @@ -107,11 +113,14 @@ def _check_config(config: typing.Dict[str, typing.Any]) -> typing.Dict[str, typi else: config["output"][k]["input"][i]["default"] = None if "on_missing" in v: - assert v["on_missing"] in ( - "stop", - "skip", - "default", - "ignore", + assert ( + v["on_missing"] + in ( + "stop", + "skip", + "default", + "ignore", + ) ), 'Illegal value of requested action for output column "{}", must be either "stop", "skip", "default" or "ignore"'.format( k ) diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py index 4ef49107e..466ea97d8 100644 --- a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py +++ b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py @@ -55,6 +55,7 @@ """ ) + # Implmentation of R's quantile function for continuous quantiles. def quantile(x, probs, na_rm=False, method=7): if na_rm: diff --git a/tests/snappy_pipeline/apps/conftest.py b/tests/snappy_pipeline/apps/conftest.py index 946d0f7f5..c9dd8fd1b 100644 --- a/tests/snappy_pipeline/apps/conftest.py +++ b/tests/snappy_pipeline/apps/conftest.py @@ -104,7 +104,7 @@ def germline_sheet_fake_project_ngs_mapping_fs( create_missing_dirs=True, ) # Make the snappy_pipeline workflows visible - fake_fs.fs.add_real_directory(snappy_pipeline.workflows.__path__._path[0]) + fake_fs.fs.add_real_directory(snappy_pipeline.workflows.__path__[0]) # Go into pipeline step fake_fs.os.chdir("/project-dir/ngs_mapping") return fake_fs diff --git a/tests/snappy_pipeline/apps/test_snappy_snake.py b/tests/snappy_pipeline/apps/test_snappy_snake.py index 78f27ffd3..1cc6e73ac 100644 --- a/tests/snappy_pipeline/apps/test_snappy_snake.py +++ b/tests/snappy_pipeline/apps/test_snappy_snake.py @@ -37,7 +37,7 @@ def test_snappy_snake_list_output(germline_sheet_fake_project_ngs_mapping_fs, mo # Run the code under test assert 0 == snappy_snake.main(["-S", "--verbose"]) # Check assersions - p = os.path.realpath(snappy_pipeline.workflows.__path__._path[0] + "/..") + p = os.path.realpath(snappy_pipeline.workflows.__path__[0] + "/..") m.assert_called_once_with( [ "--directory", diff --git a/tests/snappy_pipeline/test_find_file.py b/tests/snappy_pipeline/test_find_file.py index 3316aa81d..23f60601a 100644 --- a/tests/snappy_pipeline/test_find_file.py +++ b/tests/snappy_pipeline/test_find_file.py @@ -54,9 +54,11 @@ def test_file_system_crawler_result_with_names(): assert obj.named_files == {"first": "foo_R1.fastq.gz", "second": "foo_R2.fastq.gz"} assert obj.to_dict() == {"first": "foo_R1.fastq.gz", "second": "foo_R2.fastq.gz"} assert str(obj) == ( - "FileSystemCrawlerResult('/base', ('foo_R1.fastq.gz', 'foo_R2.fastq.gz'), " - "('first', 'second'), OrderedDict([('first', 'foo_R1.fastq.gz'), " - "('second', 'foo_R2.fastq.gz')]))" + "FileSystemCrawlerResult('/base', " + "('foo_R1.fastq.gz', 'foo_R2.fastq.gz'), " + "('first', 'second'), " + "OrderedDict({'first': 'foo_R1.fastq.gz', 'second': 'foo_R2.fastq.gz'})" + ")" ) @@ -106,7 +108,6 @@ def test_file_system_crawler_invalidate_cache(sample_cache_dict): with patch("snappy_pipeline.find_file.os", fake_os), patch( "snappy_pipeline.find_file.InterProcessLock", mock_lock ), patch("snappy_pipeline.find_file.open", fake_open, create=True): - # Get the original 
modification time original_cache_file_time = fake_os.path.getmtime(CACHE_PATH) diff --git a/tests/snappy_pipeline/workflows/common.py b/tests/snappy_pipeline/workflows/common.py index b569459ba..f67c314a3 100644 --- a/tests/snappy_pipeline/workflows/common.py +++ b/tests/snappy_pipeline/workflows/common.py @@ -1,6 +1,5 @@ """Shared method used in methods.""" - import typing diff --git a/tests/snappy_pipeline/workflows/conftest.py b/tests/snappy_pipeline/workflows/conftest.py index c7723397c..710901ead 100644 --- a/tests/snappy_pipeline/workflows/conftest.py +++ b/tests/snappy_pipeline/workflows/conftest.py @@ -8,12 +8,13 @@ import textwrap from unittest.mock import MagicMock, patch -from biomedsheets.io_tsv import read_germline_tsv_sheet -from biomedsheets.shortcuts import GenericSampleSheet, GermlineCaseSheet +from pydantic import ConfigDict from pyfakefs import fake_filesystem import pytest -from ruamel.yaml.comments import CommentedMap +from biomedsheets.io_tsv import read_germline_tsv_sheet +from biomedsheets.shortcuts import GenericSampleSheet, GermlineCaseSheet +from snappy_pipeline.models import SnappyStepModel from snappy_pipeline.workflows.abstract import BaseStep @@ -27,7 +28,7 @@ def mock_settings_env_vars(): @pytest.fixture def dummy_config(): """Return dummy configuration OrderedDicts""" - return CommentedMap([("data_sets", CommentedMap())]) + return {"data_sets": {}, "step_config": {"dummy": {"key": "value"}}} @pytest.fixture @@ -61,6 +62,11 @@ def config_paths(): return [] +class DummyModel(SnappyStepModel): + model_config = ConfigDict(extra="allow") + key: str = "value" + + @pytest.fixture def dummy_generic_step( dummy_workflow, dummy_config, dummy_cluster_config, work_dir, config_lookup_paths @@ -79,7 +85,12 @@ def default_config_yaml(cls): return "step_config:\n dummy:\n key: value" return DummyBaseStep( - dummy_workflow, dummy_config, dummy_cluster_config, config_lookup_paths, work_dir + dummy_workflow, + dummy_config, + dummy_cluster_config, + config_lookup_paths, + work_dir, + config_model_class=DummyModel, ) @@ -899,7 +910,16 @@ def cancer_sheet_fake_fs_path_link_in(fake_fs, cancer_sheet_tsv): def aligner_indices_fake_fs(fake_fs): """Return fake file system setup with files for aligner indices""" d = { - "bwa": (".fasta.amb", ".fasta.ann", ".fasta.bwt", ".fasta.pac", ".fasta.sa"), + "bwa": [ + pre + ext + for ext in (".amb", ".ann", ".bwt", ".pac", ".sa", "") + for pre in (".fasta", ".fa") + ], + "bwa_mem2": [ + pre + ext + for ext in (".0123", ".amb", ".ann", ".bwt.2bit.64", ".pac", "") + for pre in (".fasta", ".fa") + ], "star": ("/Genome", "/SA", "/SAindex"), } for aligner, suffixes in d.items(): @@ -908,15 +928,26 @@ def aligner_indices_fake_fs(fake_fs): return fake_fs -def patch_module_fs(module_name, fake_fs, mocker): +def patch_module_fs(module_name: str, fake_fs, mocker): """Helper function to mock out the file-system related things in the module with the given name using the given fake_fs and pytest-mock mocker """ - mocker.patch(f"{module_name}.open", fake_fs.open, create=True) - try: - mocker.patch(f"{module_name}.os", fake_fs.os) - except AttributeError: - pass # swallo, "os" not imported + + # Because workflows have a .model module which potentially uses filesystem operations for + # validation, make sure to patch both the main module and the model module + modules = [module_name] + if module_name.startswith("snappy_pipeline.workflows.") and not module_name.endswith(".model"): + # TODO replace with more robust solution + if not 
module_name.endswith("abstract") and not ".common." in module_name: + modules.append(module_name + ".model") + + for module_name in modules: + mocker.patch(f"{module_name}.open", fake_fs.open, create=True) + try: + mocker.patch(f"{module_name}.os", fake_fs.os) + except AttributeError: + pass # swallo, "os" not imported + mocker.patch("snappy_pipeline.find_file.InterProcessLock", fake_fs.inter_process_lock) mocker.patch("snappy_pipeline.find_file.open", fake_fs.open, create=True) mocker.patch("snappy_pipeline.find_file.os", fake_fs.os) diff --git a/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py b/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py index 0ed8a5ef4..86a53dbfe 100644 --- a/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py +++ b/tests/snappy_pipeline/workflows/test_tumor_mutational_burden.py @@ -34,8 +34,6 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa @@ -43,22 +41,27 @@ def minimal_config(): tools: - mutect2 - scalpel + mutect2: {} scalpel: path_target_regions: /path/to/target/regions.bed somatic_variant_annotation: + path_somatic_variant_calling: ../somatic_variant_calling tools: ["vep", "jannovar"] jannovar: path_jannovar_ser: /path/to/jannover.ser + flag_off_target: False + dbnsfp: {} vep: - path_dir_cache: /path/to/dir/cache + cache_dir: /path/to/dir/cache somatic_variant_filtration: - filters: + filtration_schema: sets + filter_sets: dkfz_only: ~ - dkfz_and_ebfilter: ~ - dkfz_and_oxog: ~ - dkfz_and_ebfilter_and_oxog: ~ + dkfz_and_ebfilter: {} + dkfz_and_oxog: {} + dkfz_and_ebfilter_and_oxog: {} exon_lists: {} tumor_mutational_burden: @@ -90,11 +93,13 @@ def tumor_mutational_burden_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return TumorMutationalBurdenCalculationWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = { @@ -157,7 +162,7 @@ def test_tumor_mutational_step_part_get_resource_usage(tumor_mutational_burden_w # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
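patch_module_fs above now also patches the .model submodule of a workflow, because the new pydantic configuration models may touch the filesystem during validation. The core mechanism is simply mocker.patch of open and os inside every module that should see the fake filesystem; a reduced sketch of the helper after the patch (fake_fs and mocker stand for the pyfakefs and pytest-mock fixtures used in these tests, and the exclusions for abstract/common modules are omitted for brevity):

def patch_module_fs(module_name: str, fake_fs, mocker) -> None:
    """Point a module (and, for workflows, its .model submodule) at a fake filesystem."""
    modules = [module_name]
    if module_name.startswith("snappy_pipeline.workflows.") and not module_name.endswith(".model"):
        # Config models may validate paths, so they need the fake filesystem too.
        modules.append(module_name + ".model")
    for name in modules:
        mocker.patch(f"{name}.open", fake_fs.open, create=True)
        try:
            mocker.patch(f"{name}.os", fake_fs.os)
        except AttributeError:
            pass  # module does not import os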
- actual = tumor_mutational_burden_workflow.get_resource("tmb_gathering", "run", resource) + actual = tumor_mutational_burden_workflow.get_resource("tmb_gathering", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_abstract.py b/tests/snappy_pipeline/workflows/test_workflows_abstract.py index 7cee7a7f8..e6dd35fd0 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_abstract.py +++ b/tests/snappy_pipeline/workflows/test_workflows_abstract.py @@ -1,17 +1,24 @@ # -*- coding: utf-8 -*- """Code for testing the code in the "abstract" workflow """ +from copy import deepcopy import filecmp from pathlib import Path from tempfile import NamedTemporaryFile import textwrap +from typing import TypedDict +from unittest.mock import MagicMock -from biomedsheets.shortcuts import GenericSampleSheet, GermlineCaseSheet import pytest +import ruamel.yaml import ruamel.yaml as ruamel_yaml from snakemake.io import OutputFiles, Wildcards +import yaml -from snappy_pipeline.base import MissingConfiguration +from biomedsheets.shortcuts import GenericSampleSheet, GermlineCaseSheet +from snappy_pipeline.base import MissingConfiguration, merge_dictlikes +import snappy_pipeline.workflow_model +from snappy_pipeline.workflow_model import ConfigModel from snappy_pipeline.workflows.abstract import ( BaseStep, DataSearchInfo, @@ -24,10 +31,11 @@ WritePedigreeStepPart, ) -from .conftest import patch_module_fs +from .conftest import DummyModel, patch_module_fs __author__ = "Manuel Holtgrewe " + # Tests for DataSetInfo --------------------------------------------------------------------------- @@ -224,7 +232,9 @@ def dummy_config(): textwrap.dedent( r""" step_config: {} - static_data_config: {} + static_data_config: + reference: + path: /path/to/reference.fasta data_sets: first_batch: # example for a matched cancer data set file: sheet.tsv @@ -276,12 +286,25 @@ def default_config_yaml(cls): ).lstrip() patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + + class DummyStepConfig(TypedDict, total=False): + dummy: DummyModel + + mocker.patch("snappy_pipeline.workflow_model.StepConfig", DummyStepConfig) + + config = deepcopy(dummy_config) + yaml = ruamel_yaml.YAML() + local_config = yaml.load(DummyBaseStep.default_config_yaml()) + dummy_model = DummyModel(**local_config["step_config"]["dummy"]) + dummy_config = merge_dictlikes(config, {"step_config": {"dummy": dummy_model}}) + return DummyBaseStep( dummy_workflow, dummy_config, config_lookup_paths, config_paths, work_dir, + config_model_class=DummyModel, ) @@ -324,12 +347,25 @@ def default_config_yaml(cls): patch_module_fs( "snappy_pipeline.workflows.abstract", germline_sheet_fake_fs_path_link_in, mocker ) + + class DummyStepConfig(TypedDict, total=False): + dummy: DummyModel + + mocker.patch("snappy_pipeline.workflow_model.StepConfig", DummyStepConfig) + + config = deepcopy(dummy_config) + yaml = ruamel_yaml.YAML() + local_config = yaml.load(DummyBaseStep.default_config_yaml()) + dummy_model = DummyModel(**local_config["step_config"]["dummy"]) + dummy_config = merge_dictlikes(config, {"step_config": {"dummy": dummy_model}}) + return DummyBaseStep( dummy_workflow, dummy_config, config_lookup_paths, config_paths, work_dir, + config_model_class=DummyModel, ) @@ -422,7 +458,9 @@ def vcf_dummy_config(): textwrap.dedent( r""" step_config: {} - static_data_config: {} + static_data_config: + reference: + path: /path/to/reference.fasta data_sets: first_batch: # example for a matched 
cancer data set file: sheet.tsv @@ -478,12 +516,25 @@ def default_config_yaml(cls): patch_module_fs( "snappy_pipeline.workflows.abstract", germline_sheet_with_ext_vcf_fake_fs, mocker ) + + class DummyStepConfig(TypedDict, total=False): + dummy: DummyModel + + mocker.patch("snappy_pipeline.workflow_model.StepConfig", DummyStepConfig) + + config = deepcopy(vcf_dummy_config) + yaml = ruamel_yaml.YAML() + local_config = yaml.load(DummyBaseStep.default_config_yaml()) + dummy_model = DummyModel(**local_config["step_config"]["dummy"]) + dummy_config = merge_dictlikes(config, {"step_config": {"dummy": dummy_model}}) + return DummyBaseStep( dummy_workflow, - vcf_dummy_config, + dummy_config, config_lookup_paths, config_paths, work_dir, + config_model_class=DummyModel, ) @@ -593,12 +644,25 @@ def default_config_yaml(cls): ).lstrip() patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + + class DummyStepConfig(TypedDict, total=False): + dummy: DummyModel + + mocker.patch("snappy_pipeline.workflow_model.StepConfig", DummyStepConfig) + + config = deepcopy(dummy_config) + yaml = ruamel_yaml.YAML() + local_config = yaml.load(DummyBaseStep.default_config_yaml()) + dummy_model = DummyModel(**local_config["step_config"]["dummy"]) + dummy_config = merge_dictlikes(config, {"step_config": {"dummy": dummy_model}}) + return DummyBaseStep( dummy_workflow, dummy_config, config_lookup_paths, config_paths, work_dir, + config_model_class=DummyModel, ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_adapter_trimming.py b/tests/snappy_pipeline/workflows/test_workflows_adapter_trimming.py index 233700442..044bf0c3a 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_adapter_trimming.py +++ b/tests/snappy_pipeline/workflows/test_workflows_adapter_trimming.py @@ -29,7 +29,10 @@ def minimal_config(): adapter_trimming: tools: ["bbduk", "fastp"] bbduk: - adapter_sequences: /path/to/adapter_sequences.fa + adapter_sequences: + - /path/to/adapter_sequences.fa + fastp: + num_threads: 4 data_sets: first_batch: file: sheet.tsv @@ -163,7 +166,7 @@ def test_bbduk_step_part_get_resource_usage(adapter_trimming_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = adapter_trimming_workflow.get_resource("bbduk", "run", resource) + actual = adapter_trimming_workflow.get_resource("bbduk", "run", resource)() assert actual == expected, msg_error @@ -236,7 +239,7 @@ def test_fastp_step_part_get_resource_usage(adapter_trimming_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
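The test fixtures now pass a config_model_class (see DummyModel above, derived from SnappyStepModel with extra="allow"), so each step configuration is validated by a pydantic model instead of being consumed as a raw CommentedMap. That is also why the minimal_config fixtures gain previously implicit keys, such as bbduk.adapter_sequences as a list and fastp.num_threads. A toy model along the same lines, assuming pydantic v2; the class and field names below are illustrative and mix keys from different tools for brevity:

from pydantic import BaseModel, ConfigDict


class ExampleToolConfig(BaseModel):
    # Unknown keys are kept instead of rejected, mirroring extra="allow".
    model_config = ConfigDict(extra="allow")

    adapter_sequences: list[str]  # required: validation fails if missing
    num_threads: int = 8          # optional with a default


cfg = ExampleToolConfig(adapter_sequences=["/path/to/adapter_sequences.fa"])
assert cfg.num_threads == 8
# ExampleToolConfig()  # would raise pydantic.ValidationError: adapter_sequences missing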
- actual = adapter_trimming_workflow.get_resource("fastp", "run", resource) + actual = adapter_trimming_workflow.get_resource("fastp", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_cbioportal_export.py b/tests/snappy_pipeline/workflows/test_workflows_cbioportal_export.py index 2a41b8c97..d7949bdc8 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_cbioportal_export.py +++ b/tests/snappy_pipeline/workflows/test_workflows_cbioportal_export.py @@ -28,10 +28,14 @@ def minimal_config(): step_config: ngs_mapping: + tools: + rna: [star] star: + path_index: /path/to/star/index cbioportal_export: # Paths to snappy steps containing results to be uploaded path_ngs_mapping: /NGS_MAPPING + path_gene_id_mappings: DUMMY expression_tool: star path_somatic_variant: /SOM_VAR_FILTRATION somatic_variant_calling_tool: mutect2 @@ -40,6 +44,9 @@ def minimal_config(): path_copy_number: /COPY_NUMBER copy_number_tool: cnvkit exclude_variant_with_flag: LowFisherScore + vcf2maf: + ncbi_build: GRCh37 + Center: DUMMY # Description of dataset in cBioPortal study: type_of_cancer: mixed @@ -47,6 +54,7 @@ def minimal_config(): study_description: "PeDiOn project A02P" study_name: PeDiOn_A02P study_name_short: A02P + reference_genome: grch37 data_sets: first_batch: @@ -69,11 +77,13 @@ def cbioportal_export_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return cbioportalExportWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = { "ngs_mapping": lambda x: "/NGS_MAPPING/" + x, "somatic_variant": lambda x: "/SOM_VAR_FILTRATION/" + x, @@ -133,7 +143,7 @@ def test_cbioportal_meta_files_step_part_get_resource_usage(cbioportal_export_wo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = cbioportal_export_workflow.get_resource("cbioportal_meta_files", "run", resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_meta_files", "run", resource)() assert actual == expected, msg_error @@ -178,7 +188,7 @@ def test_cbioportal_clinical_data_step_part_get_resource_usage(cbioportal_export msg_error = f"Assertion error for resource '{resource}'." actual = cbioportal_export_workflow.get_resource( "cbioportal_clinical_data", "run", resource - ) + )() assert actual == expected, msg_error @@ -238,7 +248,7 @@ def test_cbioportal_case_lists_step_part_get_resource_usage(cbioportal_export_wo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = cbioportal_export_workflow.get_resource("cbioportal_case_lists", "run", resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_case_lists", "run", resource)() assert actual == expected, msg_error @@ -359,7 +369,7 @@ def test_cbioportal_vcf2maf_step_part_get_resource_usage(cbioportal_export_workf # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
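Throughout these test updates, get_resource(step, action, resource) is now followed by a second call: the method returns a callable that yields the value (or an attempt-aware function, as in the gCNV memory checks further down). A sketch of the pattern with made-up names, not the actual BaseStep API:

from typing import Any, Callable


class ExampleStep:
    """Illustrative only: resource lookup that returns a callable."""

    _resources = {("fastp", "run"): {"threads": 4, "memory": "12G", "time": "03:59:59"}}

    def get_resource(self, name: str, action: str, resource: str) -> Callable[[], Any]:
        value = self._resources[(name, action)][resource]
        return lambda: value


step = ExampleStep()
# Old style: step.get_resource("fastp", "run", "threads") returned 4 directly.
# New style: the extra () resolves the callable, as in the updated tests.
assert step.get_resource("fastp", "run", "threads")() == 4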
- actual = cbioportal_export_workflow.get_resource("cbioportal_vcf2maf", "run", resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_vcf2maf", "run", resource)() assert actual == expected, msg_error @@ -408,7 +418,7 @@ def test_cbioportal_mutations_step_part_get_resource_usage(cbioportal_export_wor # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = cbioportal_export_workflow.get_resource("cbioportal_mutations", "run", resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_mutations", "run", resource)() assert actual == expected, msg_error @@ -464,7 +474,7 @@ def test_cbioportal_cns2cna_step_part_get_resource_usage(cbioportal_export_workf # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = cbioportal_export_workflow.get_resource("cbioportal_cns2cna", "run", resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_cns2cna", "run", resource)() assert actual == expected, msg_error @@ -557,7 +567,7 @@ def test_cbioportal_cna_step_part_get_resource_usage(cbioportal_export_workflow) for action in all_actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = cbioportal_export_workflow.get_resource("cbioportal_cna", action, resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_cna", action, resource)() assert actual == expected, msg_error @@ -603,7 +613,7 @@ def test_cbioportal_segment_step_part_get_resource_usage(cbioportal_export_workf # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = cbioportal_export_workflow.get_resource("cbioportal_segment", "run", resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_segment", "run", resource)() assert actual == expected, msg_error @@ -660,7 +670,7 @@ def test_cbioportal_expression_step_part_get_resource_usage(cbioportal_export_wo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = cbioportal_export_workflow.get_resource("cbioportal_expression", "run", resource) + actual = cbioportal_export_workflow.get_resource("cbioportal_expression", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification.py b/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification.py index 48d770a40..1e9318fab 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification.py +++ b/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification.py @@ -33,6 +33,22 @@ def minimal_config(): rna: ['star'] star: path_index: /path/to/star/index + gene_expression_quantification: + path_ngs_mapping: ../ngs_mapping + tools: [strandedness, featurecounts, dupradar, duplication, rnaseqc, salmon, stats] + featurecounts: + path_annotation_gtf: /path/to/annotation.gtf + strandedness: + path_exon_bed: /path/to/exon.bed + rnaseqc: + rnaseqc_path_annotation_gtf: /path/to/rnaseqc.gtf + dupradar: + dupradar_path_annotation_gtf: /path/to/dupradar.gtf + duplication: {} + stats: {} + salmon: + path_transcript_to_gene: /path/to/salmon/transcript_to_gene + path_index: /path/to/salmon/index data_sets: first_batch: @@ -55,11 +71,13 @@ def gene_expression_quantification_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return GeneExpressionQuantificationWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping.model", aligner_indices_fake_fs, mocker) dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} # Construct the workflow object return GeneExpressionQuantificationWorkflow( @@ -124,7 +142,7 @@ def test_featurecounts_step_part_get_resource(gene_expression_quantification_wor msg_error = f"Assertion error for resource '{resource}'." actual = gene_expression_quantification_workflow.get_resource( "featurecounts", "run", resource - ) + )() assert actual == expected, msg_error @@ -138,7 +156,7 @@ def test_salmon_step_part_get_resource(gene_expression_quantification_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = gene_expression_quantification_workflow.get_resource("salmon", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("salmon", "run", resource)() assert actual == expected, msg_error @@ -154,7 +172,7 @@ def test_duplication_step_part_get_resource(gene_expression_quantification_workf msg_error = f"Assertion error for resource '{resource}'." actual = gene_expression_quantification_workflow.get_resource( "duplication", "run", resource - ) + )() assert actual == expected, msg_error @@ -168,7 +186,7 @@ def test_dupradar_step_part_get_resource(gene_expression_quantification_workflow # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = gene_expression_quantification_workflow.get_resource("dupradar", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("dupradar", "run", resource)() assert actual == expected, msg_error @@ -182,7 +200,7 @@ def test_rnaseqc_step_part_get_resource(gene_expression_quantification_workflow) # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = gene_expression_quantification_workflow.get_resource("rnaseqc", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("rnaseqc", "run", resource)() assert actual == expected, msg_error @@ -196,7 +214,7 @@ def test_stats_step_part_get_resource(gene_expression_quantification_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = gene_expression_quantification_workflow.get_resource("stats", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("stats", "run", resource)() assert actual == expected, msg_error @@ -326,4 +344,6 @@ def test_gene_expression_quantification_workflow_files(gene_expression_quantific # Get actual actual = set(gene_expression_quantification_workflow.get_result_files()) - assert actual == expected + assert ( + actual == expected + ), f"Missing from actual {expected - actual}\nMissing from expected {actual - expected}" diff --git a/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification_processed_fastq.py b/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification_processed_fastq.py index fa09a0e15..6d375675f 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification_processed_fastq.py +++ b/tests/snappy_pipeline/workflows/test_workflows_gene_expression_quantification_processed_fastq.py @@ -29,7 +29,21 @@ def minimal_config(): step_config: gene_expression_quantification: + tools: [strandedness, featurecounts, dupradar, duplication, rnaseqc, salmon, stats] path_link_in: "/preprocess" + featurecounts: + path_annotation_gtf: /path/to/annotation.gtf + strandedness: + path_exon_bed: /path/to/exon.bed + rnaseqc: + rnaseqc_path_annotation_gtf: /path/to/rnaseqc.gtf + dupradar: + dupradar_path_annotation_gtf: /path/to/dupradar.gtf + duplication: {} + stats: {} + salmon: + path_transcript_to_gene: /path/to/salmon/transcript_to_gene + path_index: /path/to/salmon/index ngs_mapping: tools: rna: ['star'] @@ -57,11 +71,13 @@ def gene_expression_quantification_workflow( work_dir, config_paths, cancer_sheet_fake_fs_path_link_in, + aligner_indices_fake_fs, mocker, ): """Return GeneExpressionQuantificationWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs_path_link_in, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping.model", aligner_indices_fake_fs, mocker) dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} # Construct the workflow object return GeneExpressionQuantificationWorkflow( @@ -126,7 +142,7 @@ def test_featurecounts_step_part_get_resource(gene_expression_quantification_wor msg_error = f"Assertion error for resource '{resource}'." 
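The result-file comparison in the gene expression quantification test now reports both set differences in the assertion message, which makes failures far easier to read than a bare equality assert over hundreds of paths. The same idiom in isolation:

expected = {"output/a.txt", "output/b.txt"}
actual = {"output/a.txt", "output/b.txt"}

# On mismatch the message pinpoints the offending paths on both sides.
assert actual == expected, (
    f"Missing from actual {expected - actual}\n"
    f"Missing from expected {actual - expected}"
)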
actual = gene_expression_quantification_workflow.get_resource( "featurecounts", "run", resource - ) + )() assert actual == expected, msg_error @@ -140,7 +156,7 @@ def test_salmon_step_part_get_resource(gene_expression_quantification_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = gene_expression_quantification_workflow.get_resource("salmon", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("salmon", "run", resource)() assert actual == expected, msg_error @@ -156,7 +172,7 @@ def test_duplication_step_part_get_resource(gene_expression_quantification_workf msg_error = f"Assertion error for resource '{resource}'." actual = gene_expression_quantification_workflow.get_resource( "duplication", "run", resource - ) + )() assert actual == expected, msg_error @@ -170,7 +186,7 @@ def test_dupradar_step_part_get_resource(gene_expression_quantification_workflow # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = gene_expression_quantification_workflow.get_resource("dupradar", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("dupradar", "run", resource)() assert actual == expected, msg_error @@ -184,7 +200,7 @@ def test_rnaseqc_step_part_get_resource(gene_expression_quantification_workflow) # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = gene_expression_quantification_workflow.get_resource("rnaseqc", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("rnaseqc", "run", resource)() assert actual == expected, msg_error @@ -198,7 +214,7 @@ def test_stats_step_part_get_resource(gene_expression_quantification_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = gene_expression_quantification_workflow.get_resource("stats", "run", resource) + actual = gene_expression_quantification_workflow.get_resource("stats", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_gene_expression_report.py b/tests/snappy_pipeline/workflows/test_workflows_gene_expression_report.py index 8ad2ca294..c71fd0dcb 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_gene_expression_report.py +++ b/tests/snappy_pipeline/workflows/test_workflows_gene_expression_report.py @@ -30,7 +30,10 @@ def minimal_config(): path_index: /path/to/star/index gene_expression_quantification: + path_ngs_mapping: ../ngs_mapping tools: ['strandedness'] + strandedness: + path_exon_bed: /path/to/exon.bed gene_expression_report: path_gene_expression_quantification: /work @@ -56,11 +59,13 @@ def gene_expression_report_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return GeneExpressionReportWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "gene_expression_quantification": lambda x: "GENE_EXPRESSION_QUANTIFICATION/" + x, @@ -93,7 +98,7 @@ def test_all_steps_get_resource_usage(gene_expression_report_workflow): for step in steps: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in step '{step}'." - actual = gene_expression_report_workflow.get_resource(step, "run", resource) + actual = gene_expression_report_workflow.get_resource(step, "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_targeted.py b/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_targeted.py index 082ad2883..d0ee4dfa7 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_targeted.py +++ b/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_targeted.py @@ -52,17 +52,17 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa helper_gcnv_model_targeted: + path_ngs_mapping: ../ngs_mapping gcnv: path_target_interval_list_mapping: - pattern: "Agilent SureSelect Human All Exon V6.*" name: "Agilent_SureSelect_Human_All_Exon_V6" path: /path/to/Agilent/SureSelect_Human_All_Exon_V6_r2/GRCh37/Exons.bed path_uniquely_mapable_bed: /path/to/map_track.bed # REQUIRED + path_par_intervals: /path/to/par.intervals data_sets: first_batch: @@ -85,11 +85,13 @@ def helper_gcnv_model_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return HelperBuildTargetSeqGcnvModelWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a 
NGSMappingPipelineStep there dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} diff --git a/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_wgs.py index 1692bcb8e..6b0aa6531 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_wgs.py +++ b/tests/snappy_pipeline/workflows/test_workflows_helper_gcnv_model_wgs.py @@ -45,13 +45,14 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa - gcnv: - path_uniquely_mapable_bed: /path/to/map_track.bed # REQUIRED + helper_gcnv_model_wgs: + path_ngs_mapping: ../ngs_mapping + gcnv: + path_uniquely_mapable_bed: /path/to/map_track.bed # REQUIRED + path_par_intervals: /path/to/par.intervals data_sets: first_batch: @@ -74,11 +75,13 @@ def helper_gcnv_model_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return HelperBuildGcnvModelWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -125,24 +128,24 @@ def test_gcnv_get_resource(helper_gcnv_model_workflow): for action in actions: for resource in expected_low.keys(): if action == "filter_intervals" and resource == "memory": - actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)( + actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)()( None, attempt=1 ) assert actual == "20480M" - actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)( + actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)()( None, attempt=2 ) assert actual == "24576M" - actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)( + actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)()( None, attempt=3 ) assert actual == "28672M" else: if action in high_resource_action_list: - actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource) + actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)() assert actual == expected_high.get(resource), f"action = {action}" else: - actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource) + actual = helper_gcnv_model_workflow.get_resource("gcnv", action, resource)() assert actual == expected_low.get(resource), f"action = {action}" diff --git a/tests/snappy_pipeline/workflows/test_workflows_hla_typing.py b/tests/snappy_pipeline/workflows/test_workflows_hla_typing.py index f8cd6efb2..648ff8b90 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_hla_typing.py +++ b/tests/snappy_pipeline/workflows/test_workflows_hla_typing.py @@ -25,6 +25,15 @@ def minimal_config(): reference: path: /path/to/ref.fa + step_config: + hla_typing: + path_ngs_mapping: ../ngs_mapping + tools: [optitype, arcashla] + optitype: + max_reads: 5000 + arcashla: + mapper: star + data_sets: first_batch: file: sheet.tsv @@ -127,7 +136,7 @@ def 
test_optitype_step_part_get_resource_usage(hla_typing_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = hla_typing_workflow.get_resource("optitype", "run", resource) + actual = hla_typing_workflow.get_resource("optitype", "run", resource)() assert actual == expected, msg_error @@ -168,7 +177,7 @@ def test_arcashla_step_part_get_resource_usage(hla_typing_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = hla_typing_workflow.get_resource("arcashla", "run", resource) + actual = hla_typing_workflow.get_resource("arcashla", "run", resource)() assert actual == expected, msg_error @@ -183,31 +192,41 @@ def test_hla_typing_workflow(hla_typing_workflow): assert actual == expected # Check result file construction - samples = ( + dna_samples = { "P001-N1-DNA1-WGS1", "P001-T1-DNA1-WGS1", - "P001-T1-RNA1-mRNA_seq1", "P002-N1-DNA1-WGS1", "P002-T1-DNA1-WGS1", "P002-T2-DNA1-WGS1", + } + + rna_samples = { + "P001-T1-RNA1-mRNA_seq1", "P002-T2-RNA1-mRNA_seq1", - ) + } + expected = [] + tools = [("star.arcashla", rna_samples), ("optitype", dna_samples | rna_samples)] expected += [ - "output/optitype.{sample}/out/optitype.{sample}.{ext}{chksum}".format( - sample=sample, ext=ext, chksum=chksum + "output/{tool}.{sample}/out/{tool}.{sample}.{ext}{chksum}".format( + tool=tool, sample=sample, ext=ext, chksum=chksum ) + for tool, samples in tools for sample in samples for ext in ("txt",) for chksum in ("", ".md5") ] expected += [ - "output/optitype.{sample}/log/optitype.{sample}.{ext}{chksum}".format( - sample=sample, ext=ext, chksum=chksum + "output/{tool}.{sample}/log/{tool}.{sample}.{ext}{chksum}".format( + tool=tool, sample=sample, ext=ext, chksum=chksum ) + for tool, samples in tools for sample in samples for ext in ("log", "conda_list.txt", "conda_info.txt") for chksum in ("", ".md5") ] + expected.sort() actual = hla_typing_workflow.get_result_files() + actual.sort() + assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_hla_typing_processed_fastq.py b/tests/snappy_pipeline/workflows/test_workflows_hla_typing_processed_fastq.py index 9cea63658..3eb2db53a 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_hla_typing_processed_fastq.py +++ b/tests/snappy_pipeline/workflows/test_workflows_hla_typing_processed_fastq.py @@ -28,7 +28,11 @@ def minimal_config(): step_config: hla_typing: path_link_in: /preprocess - tools: [optitype] + tools: [optitype, arcashla] + optitype: + max_reads: 5000 + arcashla: + mapper: star data_sets: first_batch: file: sheet.tsv @@ -131,7 +135,7 @@ def test_optitype_step_part_get_resource_usage(hla_typing_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = hla_typing_workflow.get_resource("optitype", "run", resource) + actual = hla_typing_workflow.get_resource("optitype", "run", resource)() assert actual == expected, msg_error @@ -172,7 +176,7 @@ def test_arcashla_step_part_get_resource_usage(hla_typing_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
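The HLA typing tests now build the expected outputs per tool, pairing star.arcashla with the RNA libraries and optitype with all libraries, via a nested comprehension over (tool, samples) pairs. The construction in isolation, with the sample lists shortened for brevity:

dna_samples = {"P001-N1-DNA1-WGS1"}
rna_samples = {"P001-T1-RNA1-mRNA_seq1"}

tools = [("star.arcashla", rna_samples), ("optitype", dna_samples | rna_samples)]

expected = sorted(
    f"output/{tool}.{sample}/out/{tool}.{sample}.txt{chksum}"
    for tool, samples in tools
    for sample in samples
    for chksum in ("", ".md5")
)
# e.g. "output/optitype.P001-N1-DNA1-WGS1/out/optitype.P001-N1-DNA1-WGS1.txt"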
- actual = hla_typing_workflow.get_resource("arcashla", "run", resource) + actual = hla_typing_workflow.get_resource("arcashla", "run", resource)() assert actual == expected, msg_error @@ -187,31 +191,41 @@ def test_hla_typing_workflow(hla_typing_workflow): assert actual == expected # Check result file construction - samples = ( + dna_samples = { "P001-N1-DNA1-WGS1", "P001-T1-DNA1-WGS1", - "P001-T1-RNA1-mRNA_seq1", "P002-N1-DNA1-WGS1", "P002-T1-DNA1-WGS1", "P002-T2-DNA1-WGS1", + } + + rna_samples = { + "P001-T1-RNA1-mRNA_seq1", "P002-T2-RNA1-mRNA_seq1", - ) + } + expected = [] + tools = [("star.arcashla", rna_samples), ("optitype", dna_samples | rna_samples)] expected += [ - "output/optitype.{sample}/out/optitype.{sample}.{ext}{chksum}".format( - sample=sample, ext=ext, chksum=chksum + "output/{tool}.{sample}/out/{tool}.{sample}.{ext}{chksum}".format( + tool=tool, sample=sample, ext=ext, chksum=chksum ) + for tool, samples in tools for sample in samples for ext in ("txt",) for chksum in ("", ".md5") ] expected += [ - "output/optitype.{sample}/log/optitype.{sample}.{ext}{chksum}".format( - sample=sample, ext=ext, chksum=chksum + "output/{tool}.{sample}/log/{tool}.{sample}.{ext}{chksum}".format( + tool=tool, sample=sample, ext=ext, chksum=chksum ) + for tool, samples in tools for sample in samples for ext in ("log", "conda_list.txt", "conda_info.txt") for chksum in ("", ".md5") ] + expected.sort() actual = hla_typing_workflow.get_result_files() + actual.sort() + assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py b/tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py index 4ec289fd5..22a3214bb 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py +++ b/tests/snappy_pipeline/workflows/test_workflows_homologous_recombination_deficiency.py @@ -32,11 +32,16 @@ def minimal_config(): ngs_mapping: tools: dna: [bwa] + bwa: + path_index: /path/to/bwa/index.fasta.amb somatic_targeted_seq_cnv_calling: tools: ['sequenza'] + sequenza: {} homologous_recombination_deficiency: tools: ['scarHRD'] path_cnv_calling: ../somatic_targeted_seq_cnv_calling # REQUIRED + scarHRD: + genome_name: grch37 data_sets: first_batch: @@ -59,11 +64,13 @@ def homologous_recombination_deficiency_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return HomologousRecombinationDeficiencyWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = {"cnv_calling": lambda x: "SOMATIC_CNV_CALLING/" + x} # Construct the workflow object return HomologousRecombinationDeficiencyWorkflow( @@ -127,7 +134,7 @@ def test_scarHRD_step_part_get_resource_usage_run(homologous_recombination_defic msg_error = f"Assertion error for resource '{resource}'." actual = homologous_recombination_deficiency_workflow.get_resource( "scarHRD", "run", resource - ) + )() assert actual == expected, msg_error @@ -157,7 +164,7 @@ def test_scarHRD_step_part_get_resource_usage_install(homologous_recombination_d msg_error = f"Assertion error for resource '{resource}'." 
actual = homologous_recombination_deficiency_workflow.get_resource( "scarHRD", "install", resource - ) + )() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_igv_session_generation_from_variant_calling.py b/tests/snappy_pipeline/workflows/test_workflows_igv_session_generation_from_variant_calling.py index 0ef670193..33e473bfd 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_igv_session_generation_from_variant_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_igv_session_generation_from_variant_calling.py @@ -29,14 +29,13 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa variant_calling: tools: - gatk3_hc + gatk3_hc: {} igv_session_generation: path_ngs_mapping: ../ngs_mapping @@ -64,11 +63,13 @@ def igv_session_generation( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return VariantCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_calling", germline_sheet_fake_fs, mocker) patch_module_fs( "snappy_pipeline.workflows.igv_session_generation", germline_sheet_fake_fs, mocker @@ -140,7 +141,7 @@ def test_igv_session_generation_from_variant_calling_step_part_get_resource_usag # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = igv_session_generation.get_resource("write_igv_session_file", "run", resource) + actual = igv_session_generation.get_resource("write_igv_session_file", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py index 7056f602c..cedc6314a 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py +++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc.py @@ -28,6 +28,8 @@ def minimal_config(): ngs_mapping: tools: dna: [bwa] + bwa: + path_index: /path/to/bwa/index.fasta.amb ngs_data_qc: tools: ['picard'] picard: @@ -71,6 +73,7 @@ def ngs_data_qc( patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) # Patch out files for aligner indices patch_module_fs("snappy_pipeline.workflows.ngs_data_qc", aligner_indices_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Construct the workflow object return NgsDataQcWorkflow( dummy_workflow, @@ -172,7 +175,7 @@ def test_picard_step_part_get_resource_usage(ngs_data_qc): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = ngs_data_qc.get_resource("picard", "metrics", resource) + actual = ngs_data_qc.get_resource("picard", "metrics", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py index ac14b7f88..9671fe3ab 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py +++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_data_qc_processed_fastq.py @@ -27,6 +27,7 @@ def minimal_config(): ngs_data_qc: path_link_in: "/preprocess" tools: ['fastqc'] + fastqc: {} data_sets: first_batch: @@ -124,7 +125,7 @@ def test_fastqc_step_part_get_resource_usage(ngs_data_qc): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = ngs_data_qc.get_resource("fastqc", "run", resource) + actual = ngs_data_qc.get_resource("fastqc", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py index 8c906262d..655122c8e 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py +++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping.py @@ -6,11 +6,11 @@ import io import textwrap -from biomedsheets.io_tsv import read_generic_tsv_sheet, read_germline_tsv_sheet import pytest import ruamel.yaml as ruamel_yaml from snakemake.io import Wildcards +from biomedsheets.io_tsv import read_generic_tsv_sheet, read_germline_tsv_sheet from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from .common import get_expected_log_files_dict @@ -39,14 +39,22 @@ def minimal_config(): - pattern: "Agilent SureSelect Human All Exon V6.*" name: Agilent_SureSelect_Human_All_Exon_V6 path: path/to/SureSelect_Human_All_Exon_V6_r2.bed - compute_coverage_bed: true bwa: - path_index: /path/to/bwa/index.fasta + path_index: /path/to/bwa/index.fasta.amb bwa_mem2: - path_index: /path/to/bwa_mem2/index.fasta + path_index: /path/to/bwa_mem2/index.fasta.amb + minimap2: + mapping_threads: 16 + star: + path_index: /path/to/star/index + transcriptome: false + out_filter_intron_motifs: "" + out_sam_strand_field: "" mbcs: mapping_tool: bwa - bsqr: + use_barcodes: True + recalibrate: True + bqsr: common_variants: /path/to/common/variants agent: prepare: @@ -54,6 +62,7 @@ def minimal_config(): lib_prep_type: v2 mark_duplicates: path: /path/to/creak + path_baits: /path/to/baits consensus_mode: HYBRID bam_collect_doc: enabled: true @@ -208,6 +217,7 @@ def test_project_validation_germline( minimal_config_dict = deepcopy(minimal_config) minimal_config_dict = dict(minimal_config_dict) minimal_config_dict = minimal_config_dict["step_config"].get("ngs_mapping", OrderedDict()) + config = ngs_mapping_workflow.config_model_class(**minimal_config_dict) # Create germline sample sheet germline_sheet_io = io.StringIO(germline_sheet_tsv) @@ -218,31 +228,27 @@ def test_project_validation_germline( rna_sheet = read_generic_tsv_sheet(rna_sheet_io) # Method returns None without exception, cause DNA sample sheet and DNA tool defined in config - out = ngs_mapping_workflow.validate_project( - config_dict=minimal_config_dict, sample_sheets_list=[germline_sheet] - ) + out = ngs_mapping_workflow.validate_project(config=config, sample_sheets_list=[germline_sheet]) assert out is None, "No exception expected: DNA sample sheet and DNA tool defined in 
config." # Exception raised cause no RNA mapper defined in config with pytest.raises(Exception) as exec_info: - ngs_mapping_workflow.validate_project( - config_dict=minimal_config_dict, sample_sheets_list=[rna_sheet] - ) + ngs_mapping_workflow.validate_project(config=config, sample_sheets_list=[rna_sheet]) error_msg = "RNA sample provided, but config only contains DNA mapper." assert exec_info.value.args[0] is not None, error_msg # Exception raised cause only DNA mapper defined in config with pytest.raises(Exception) as exec_info: ngs_mapping_workflow.validate_project( - config_dict=minimal_config_dict, sample_sheets_list=[germline_sheet, rna_sheet] + config=config, sample_sheets_list=[germline_sheet, rna_sheet] ) error_msg = "DNA and RNA sample provided, but config only contains DNA mapper." assert exec_info.value.args[0] is not None, error_msg # Update config and remove RNA exception - minimal_config_dict["tools"]["rna"] = ["rna_mapper"] + config.tools.rna = ["rna_mapper"] out = ngs_mapping_workflow.validate_project( - config_dict=minimal_config_dict, sample_sheets_list=[germline_sheet, rna_sheet] + config=config, sample_sheets_list=[germline_sheet, rna_sheet] ) error_msg = ( "No exception expected: DNA, RNA sample sheet and respective tools defined in config." @@ -250,19 +256,17 @@ def test_project_validation_germline( assert out is None, error_msg # Update config and introduce DNA exception - minimal_config_dict["tools"]["dna"] = [] + config.tools.dna = [] with pytest.raises(Exception) as exec_info: ngs_mapping_workflow.validate_project( - config_dict=minimal_config_dict, sample_sheets_list=[germline_sheet, rna_sheet] + config=config, sample_sheets_list=[germline_sheet, rna_sheet] ) error_msg = "DNA and RNA sample provided, but config only contains RNA mapper." assert exec_info.value.args[0] is not None, error_msg # Exception raised cause no DNA mapper defined in config with pytest.raises(Exception) as exec_info: - ngs_mapping_workflow.validate_project( - config_dict=minimal_config_dict, sample_sheets_list=[germline_sheet] - ) + ngs_mapping_workflow.validate_project(config=config, sample_sheets_list=[germline_sheet]) error_msg = "DNA and RNA sample provided, but config only contains RNA mapper." assert exec_info.value.args[0] is not None, error_msg @@ -339,7 +343,7 @@ def test_bwa_step_part_get_resource(ngs_mapping_workflow): for tool, v in expected_dict.items(): for resource, expected in v.items(): msg_error = f"Assertion error for tool '{tool}' & resource '{resource}'." - actual = ngs_mapping_workflow.get_resource(tool, "run", resource) + actual = ngs_mapping_workflow.get_resource(tool, "run", resource)() assert actual == expected, msg_error @@ -424,7 +428,7 @@ def test_star_step_part_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = ngs_mapping_workflow.get_resource("star", "run", resource) + actual = ngs_mapping_workflow.get_resource("star", "run", resource)() assert actual == expected, msg_error @@ -490,7 +494,7 @@ def test_minimap2_step_part_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
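In `test_project_validation_germline` (and its cancer counterpart further down), `validate_project` now receives a validated configuration object built via `ngs_mapping_workflow.config_model_class(**minimal_config_dict)` instead of a raw dict, and the tool lists are then mutated as attributes (`config.tools.rna = [...]`). A minimal sketch of what such a model could look like, assuming a pydantic-style schema; the field layout below is illustrative only, not the pipeline's actual model:

from pydantic import BaseModel, Field


class Tools(BaseModel):
    dna: list[str] = Field(default_factory=list)
    rna: list[str] = Field(default_factory=list)


class NgsMappingConfig(BaseModel):
    tools: Tools = Field(default_factory=Tools)


# Construction from a plain dict, analogous to config_model_class(**config_dict):
config = NgsMappingConfig(**{"tools": {"dna": ["bwa"]}})
assert config.tools.dna == ["bwa"] and config.tools.rna == []

# Attribute mutation mirrors `config.tools.rna = ["rna_mapper"]` in the test.
config.tools.rna = ["rna_mapper"]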
- actual = ngs_mapping_workflow.get_resource("minimap2", "run", resource) + actual = ngs_mapping_workflow.get_resource("minimap2", "run", resource)() assert actual == expected, msg_error @@ -537,7 +541,7 @@ def test_external_step_part_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = ngs_mapping_workflow.get_resource("external", "run", resource) + actual = ngs_mapping_workflow.get_resource("external", "run", resource)() assert actual == expected, msg_error @@ -678,7 +682,7 @@ def test_generate_doc_files_step_part_get_resource(ngs_mapping_workflow): """Tests BamCollectDocStepPart.get_resource()""" expected_dict = {"threads": 1, "time": "24:00:00", "memory": "2G", "partition": "medium"} for resource, expected in expected_dict.items(): - actual = ngs_mapping_workflow.get_resource("bam_collect_doc", "run", resource) + actual = ngs_mapping_workflow.get_resource("bam_collect_doc", "run", resource)() assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py index 83b3ae836..ff15f6bb3 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py +++ b/tests/snappy_pipeline/workflows/test_workflows_ngs_mapping_processed_fastq.py @@ -6,11 +6,11 @@ import io import textwrap -from biomedsheets.io_tsv import read_cancer_tsv_sheet import pytest import ruamel.yaml as ruamel_yaml from snakemake.io import Wildcards +from biomedsheets.io_tsv import read_cancer_tsv_sheet from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow from .common import get_expected_log_files_dict @@ -40,11 +40,14 @@ def minimal_config(): name: Agilent_SureSelect_Human_All_Exon_V6 path: path/to/SureSelect_Human_All_Exon_V6_r2.bed bwa: - path_index: /path/to/bwa/index.fasta + path_index: /path/to/bwa/index.fasta.amb star: path_index: /path/to/star/index - path_features: /path/to/features.gtf transcriptome: true + out_filter_intron_motifs: "" + out_sam_strand_field: "" + minimap2: + mapping_threads: 16 bam_collect_doc: enabled: true @@ -175,15 +178,14 @@ def test_project_validation_cancer(ngs_mapping_workflow, cancer_sheet_tsv, minim minimal_config_dict = deepcopy(minimal_config) minimal_config_dict = dict(minimal_config_dict) minimal_config_dict = minimal_config_dict["step_config"].get("ngs_mapping", OrderedDict()) + config = ngs_mapping_workflow.config_model_class(**minimal_config_dict) # Create germline sample sheet cancer_sheet_io = io.StringIO(cancer_sheet_tsv) cancer_sheet = read_cancer_tsv_sheet(cancer_sheet_io) # Method returns None without exception, cause DNA sample sheet and DNA tool defined in config - out = ngs_mapping_workflow.validate_project( - config_dict=minimal_config_dict, sample_sheets_list=[cancer_sheet] - ) + out = ngs_mapping_workflow.validate_project(config=config, sample_sheets_list=[cancer_sheet]) assert out is None, "No exception expected: DNA sample sheet and DNA tool defined in config." @@ -250,7 +252,7 @@ def test_bwa_step_part_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = ngs_mapping_workflow.get_resource("bwa", "run", resource) + actual = ngs_mapping_workflow.get_resource("bwa", "run", resource)() assert actual == expected, msg_error @@ -342,7 +344,7 @@ def test_star_step_part_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = ngs_mapping_workflow.get_resource("star", "run", resource) + actual = ngs_mapping_workflow.get_resource("star", "run", resource)() assert actual == expected, msg_error @@ -431,7 +433,7 @@ def test_strandedness_step_part_infer_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = ngs_mapping_workflow.get_resource("strandedness", "infer", resource) + actual = ngs_mapping_workflow.get_resource("strandedness", "infer", resource)() assert actual == expected, msg_error @@ -499,7 +501,7 @@ def test_minimap2_step_part_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = ngs_mapping_workflow.get_resource("minimap2", "run", resource) + actual = ngs_mapping_workflow.get_resource("minimap2", "run", resource)() assert actual == expected, msg_error @@ -546,7 +548,7 @@ def test_external_step_part_get_resource(ngs_mapping_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = ngs_mapping_workflow.get_resource("external", "run", resource) + actual = ngs_mapping_workflow.get_resource("external", "run", resource)() assert actual == expected, msg_error @@ -616,7 +618,7 @@ def test_target_coverage_report_step_part_get_resource(ngs_mapping_workflow): for action in actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." 
- actual = ngs_mapping_workflow.get_resource("target_coverage_report", action, resource) + actual = ngs_mapping_workflow.get_resource("target_coverage_report", action, resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py index 6688b88db..212d32198 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- """Tests for the panel_of_normals workflow module code""" -from collections import OrderedDict import textwrap import pytest @@ -33,8 +32,6 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa @@ -45,12 +42,13 @@ def minimal_config(): germline_resource: /path/to/germline_resource.vcf path_normals_list: "" cnvkit: - path_excluded_regions: "" path_target_regions: /path/to/regions.bed # WES mode path_normals_list: "" purecn: path_normals_list: "" path_bait_regions: /path/to/baits/regions.bed + path_genomicsDB: /path/to/mutect2/genomicsDB + genome_name: "unknown" data_sets: first_batch: @@ -73,11 +71,13 @@ def panel_of_normals_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -185,13 +185,13 @@ def test_mutect2_step_part_get_resource_usage(panel_of_normals_workflow): # Evaluate action `create_panel` for resource, expected in create_panel_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." - actual = panel_of_normals_workflow.get_resource("mutect2", "create_panel", resource) + actual = panel_of_normals_workflow.get_resource("mutect2", "create_panel", resource)() assert actual == expected, msg_error # Evaluate action `prepare_panel` for resource, expected in prepare_panel_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'prepare_panel'." - actual = panel_of_normals_workflow.get_resource("mutect2", "prepare_panel", resource) + actual = panel_of_normals_workflow.get_resource("mutect2", "prepare_panel", resource)() assert actual == expected, msg_error @@ -424,31 +424,31 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): # Evaluate action `target` for resource, expected in target_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'target'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)() assert actual == expected, msg_error # Evaluate action `antitarget` for resource, expected in antitarget_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." 
- actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource)() assert actual == expected, msg_error # Evaluate action `coverage` for resource, expected in coverage_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource)() assert actual == expected, msg_error # Evaluate action `create_panel` for resource, expected in reference_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() assert actual == expected, msg_error # Evaluate action `report` for resource, expected in report_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'report'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource)() assert actual == expected, msg_error @@ -488,7 +488,7 @@ def test_access_step_part_get_resource_usage(panel_of_normals_workflow): } for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'run'." - actual = panel_of_normals_workflow.get_resource("access", "run", resource) + actual = panel_of_normals_workflow.get_resource("access", "run", resource)() assert actual == expected, msg_error @@ -619,7 +619,7 @@ def test_purecn_step_part_get_resource_usage(panel_of_normals_workflow): } for action, resources in expected.items(): for resource, value in resources.items(): - actual = panel_of_normals_workflow.get_resource("purecn", action, resource) + actual = panel_of_normals_workflow.get_resource("purecn", action, resource)() assert actual == value diff --git a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py index 67ce2de37..e1f4c2b26 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py +++ b/tests/snappy_pipeline/workflows/test_workflows_panel_of_normals_wgs.py @@ -9,7 +9,7 @@ from snappy_pipeline.workflows.panel_of_normals import PanelOfNormalsWorkflow -from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .common import get_expected_log_files_dict from .conftest import patch_module_fs @@ -32,17 +32,15 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa panel_of_normals: - tools: ['cnvkit'] - cnvkit: - path_excluded_regions: "" - path_target_regions: "" # WGS mode - path_normals_list: "" + path_ngs_mapping: NGS_MAPPING/ + tools: ['cnvkit'] + cnvkit: + path_target_regions: "" # WGS mode + path_normals_list: "" data_sets: first_batch: @@ -65,11 +63,13 @@ def panel_of_normals_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return PanelOfNormalsWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", 
cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -319,31 +319,31 @@ def test_cnvkit_step_part_get_resource_usage(panel_of_normals_workflow): # Evaluate action `target` for resource, expected in target_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'target'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "target", resource)() assert actual == expected, msg_error # Evaluate action `antitarget` for resource, expected in antitarget_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'antitarget'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "antitarget", resource)() assert actual == expected, msg_error # Evaluate action `coverage` for resource, expected in coverage_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'coverage'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "coverage", resource)() assert actual == expected, msg_error # Evaluate action `create_panel` for resource, expected in reference_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'create_panel'." - actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "create_panel", resource)() assert actual == expected, msg_error # Evaluate action `report` for resource, expected in report_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action 'report'." 
- actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource) + actual = panel_of_normals_workflow.get_resource("cnvkit", "report", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_repeat_expansion.py b/tests/snappy_pipeline/workflows/test_workflows_repeat_expansion.py index 47b54ced5..31f907bcf 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_repeat_expansion.py +++ b/tests/snappy_pipeline/workflows/test_workflows_repeat_expansion.py @@ -29,6 +29,9 @@ def minimal_config(): dna: ['bwa'] bwa: path_index: /path/to/bwa/index.fasta + repeat_expansion: + repeat_catalog: DUMMY + repeat_annotation: DUMMY data_sets: first_batch: @@ -51,11 +54,13 @@ def repeat_expansion_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return RepeatExpansionWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep here dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -168,7 +173,7 @@ def test_expansionhunter_step_part_get_resource_usage(repeat_expansion_workflow) # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = repeat_expansion_workflow.get_resource("expansionhunter", "run", resource) + actual = repeat_expansion_workflow.get_resource("expansionhunter", "run", resource)() assert actual == expected, msg_error @@ -210,5 +215,5 @@ def test_expansionhunter_annotate_step_part_get_resource_usage(repeat_expansion_ # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = repeat_expansion_workflow.get_resource("expansionhunter", "annotate", resource) + actual = repeat_expansion_workflow.get_resource("expansionhunter", "annotate", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py index 92c42cabf..01d1d7ec7 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py @@ -33,9 +33,15 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] + bwa: + path_index: /path/to/bwa/index.fa somatic_targeted_seq_cnv_calling: tools: ["cnvkit"] + cnvkit: + path_target: DUMMY + path_antitarget: DUMMY + path_panel_of_normals: DUMMY somatic_cnv_checking: path_ngs_mapping: ../ngs_mapping @@ -63,11 +69,13 @@ def somatic_cnv_checking_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticTargetedSeqCnvCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep here dummy_workflow.globals = { @@ -119,7 +127,7 @@ def test_pileup_normal_step_part_get_resource(somatic_cnv_checking_workflow): expected_dict = {"threads": 2, "time": "12:00:00", "memory": "7577M", "partition": "medium"} for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_cnv_checking_workflow.get_resource("pileup", "normal", resource) + actual = somatic_cnv_checking_workflow.get_resource("pileup", "normal", resource)() assert actual == expected, msg_error @@ -157,7 +165,7 @@ def test_pileup_tumor_step_part_get_resource(somatic_cnv_checking_workflow): expected_dict = {"threads": 2, "time": "01:00:00", "memory": "7577M", "partition": "medium"} for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_cnv_checking_workflow.get_resource("pileup", "tumor", resource) + actual = somatic_cnv_checking_workflow.get_resource("pileup", "tumor", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling.py index f504dc333..c81561e7d 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling.py @@ -24,6 +24,8 @@ def minimal_config(): static_data_config: reference: path: /path/to/ref.fa + features: + path: /path/to/features.gtf step_config: ngs_mapping: @@ -48,7 +50,7 @@ def minimal_config(): path_dataset_directory: REQUIRED arriba: path_index: /path/to/star/index - features: /path/to/features.gtf + jaffa: {} data_sets: first_batch: @@ -77,6 +79,7 @@ def somatic_gene_fusion_calling_workflow( """Return SomaticGeneFusionCallingWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Patch out files for aligner indices patch_module_fs( "snappy_pipeline.workflows.somatic_gene_fusion_calling", aligner_indices_fake_fs, mocker @@ -123,7 +126,9 @@ def test_fusioncatcher_step_part_get_resource_usage(somatic_gene_fusion_calling_ # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("fusioncatcher", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource( + "fusioncatcher", "run", resource + )() assert actual == expected, msg_error @@ -158,7 +163,7 @@ def test_jaffa_step_part_get_resource_usage(somatic_gene_fusion_calling_workflow # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("jaffa", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("jaffa", "run", resource)() assert actual == expected, msg_error @@ -193,7 +198,7 @@ def test_pizzly_step_part_get_resource_usage(somatic_gene_fusion_calling_workflo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("pizzly", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("pizzly", "run", resource)() assert actual == expected, msg_error @@ -228,7 +233,7 @@ def test_star_fusion_step_part_get_resource_usage(somatic_gene_fusion_calling_wo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("star_fusion", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("star_fusion", "run", resource)() assert actual == expected, msg_error @@ -263,7 +268,7 @@ def test_defuse_step_part_get_resource_usage(somatic_gene_fusion_calling_workflo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_gene_fusion_calling_workflow.get_resource("defuse", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("defuse", "run", resource)() assert actual == expected, msg_error @@ -298,7 +303,7 @@ def test_hera_step_part_get_resource_usage(somatic_gene_fusion_calling_workflow) # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("hera", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("hera", "run", resource)() assert actual == expected, msg_error @@ -377,7 +382,7 @@ def test_arriba_step_part_get_resource_usage(somatic_gene_fusion_calling_workflo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("arriba", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("arriba", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling_processed_fastq.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling_processed_fastq.py index 99cfed4f2..8e1d1af31 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling_processed_fastq.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_gene_fusion_calling_processed_fastq.py @@ -51,6 +51,7 @@ def minimal_config(): path_dataset_directory: REQUIRED arriba: path_index: /path/to/star/index + jaffa: {} data_sets: first_batch: @@ -79,6 +80,7 @@ def somatic_gene_fusion_calling_workflow( """Return SomaticGeneFusionCallingWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs_path_link_in, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Patch out files for aligner indices patch_module_fs( "snappy_pipeline.workflows.somatic_gene_fusion_calling", aligner_indices_fake_fs, mocker @@ -125,7 +127,9 @@ def test_fusioncatcher_step_part_get_resource_usage(somatic_gene_fusion_calling_ # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("fusioncatcher", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource( + "fusioncatcher", "run", resource + )() assert actual == expected, msg_error @@ -160,7 +164,7 @@ def test_jaffa_step_part_get_resource_usage(somatic_gene_fusion_calling_workflow # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("jaffa", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("jaffa", "run", resource)() assert actual == expected, msg_error @@ -195,7 +199,7 @@ def test_pizzly_step_part_get_resource_usage(somatic_gene_fusion_calling_workflo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_gene_fusion_calling_workflow.get_resource("pizzly", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("pizzly", "run", resource)() assert actual == expected, msg_error @@ -230,7 +234,7 @@ def test_star_fusion_step_part_get_resource_usage(somatic_gene_fusion_calling_wo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("star_fusion", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("star_fusion", "run", resource)() assert actual == expected, msg_error @@ -265,7 +269,7 @@ def test_defuse_step_part_get_resource_usage(somatic_gene_fusion_calling_workflo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("defuse", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("defuse", "run", resource)() assert actual == expected, msg_error @@ -300,7 +304,7 @@ def test_hera_step_part_get_resource_usage(somatic_gene_fusion_calling_workflow) # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("hera", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("hera", "run", resource)() assert actual == expected, msg_error @@ -379,7 +383,7 @@ def test_arriba_step_part_get_resource_usage(somatic_gene_fusion_calling_workflo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_gene_fusion_calling_workflow.get_resource("arriba", "run", resource) + actual = somatic_gene_fusion_calling_workflow.get_resource("arriba", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_hla_loh_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_hla_loh_calling.py index f82881af0..3489beaca 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_hla_loh_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_hla_loh_calling.py @@ -56,11 +56,13 @@ def somatic_hla_loh_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticHlaLohCallingWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "hla_typing": lambda x: "HLA_TYPING/" + x, @@ -123,7 +125,7 @@ def test_lohhla_step_part_get_resource_usage(somatic_hla_loh_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_hla_loh_calling_workflow.get_resource("lohhla", "run", resource) + actual = somatic_hla_loh_calling_workflow.get_resource("lohhla", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py index c3c486107..3b9153767 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_msi_calling.py @@ -58,11 +58,13 @@ def somatic_msi_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticMsiCallingWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} # Construct the workflow object return SomaticMsiCallingWorkflow( @@ -122,7 +124,7 @@ def test_mantis_msi2_step_part_get_resource_usage( # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_msi_calling_workflow.get_resource("mantis_msi2", "run", resource) + actual = somatic_msi_calling_workflow.get_resource("mantis_msi2", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_purity_ploidy_estimate.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_purity_ploidy_estimate.py index 4c15f819a..6ae583da1 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_purity_ploidy_estimate.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_purity_ploidy_estimate.py @@ -36,6 +36,8 @@ def minimal_config(): tools: ['ascat'] tool_cnv_calling: cnvetti path_somatic_targeted_seq_cnv_calling: ../somatic_targeted_seq_cnv_calling + ascat: + b_af_loci: DUMMY data_sets: first_batch: @@ -68,11 +70,13 @@ def somatic_purity_ploidy_estimate_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticPurityPloidyEstimateWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} # Construct the workflow object return SomaticPurityPloidyEstimateWorkflow( @@ -85,18 +89,20 @@ def somatic_purity_ploidy_estimate_workflow( @pytest.fixture -def somatic_purity_ploidy_estimate_workflow_w_copywritter( +def somatic_purity_ploidy_estimate_workflow_w_copywriter( dummy_workflow, minimal_config_copywritter, config_lookup_paths, work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticPurityPloidyEstimateWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) 
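Many fixtures in this diff gain an `aligner_indices_fake_fs` argument plus a second `patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker)` call, which suggests the mapping step now verifies that aligner index files (note the `.amb` suffixes in the configs) exist on disk. `patch_module_fs` is the project's own test helper; the sketch below only illustrates the general idea of replacing a module's filesystem access during a test, using plain `unittest.mock` with made-up function names:

import os
from unittest import mock


def index_exists(path_index: str) -> bool:
    """Hypothetical production check that hits the real filesystem."""
    return os.path.exists(path_index + ".amb")


def test_index_exists_with_fake_fs():
    # Patch the `os` module as seen by *this* module, analogous to patching
    # snappy_pipeline.workflows.ngs_mapping with a fake filesystem.
    with mock.patch(f"{__name__}.os") as fake_os:
        fake_os.path.exists.return_value = True
        assert index_exists("/path/to/bwa/index.fasta")


test_index_exists_with_fake_fs()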
dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "somatic_targeted_seq_cnv_calling": lambda x: "SOMATIC_CNV_CALLING/" + x, @@ -167,7 +173,7 @@ def test_ascat_step_part_get_input_files_cnv_normal(somatic_purity_ploidy_estima def test_ascat_step_part_get_input_files_cnv_tumor_wes( - somatic_purity_ploidy_estimate_workflow_w_copywritter, + somatic_purity_ploidy_estimate_workflow_w_copywriter, ): """Tests AscatStepPart._get_input_files_cnv_tumor_wes()""" wildcards = Wildcards(fromdict={"tumor_library_name": "P001-T1-DNA1-WGS1", "mapper": "bwa"}) @@ -177,14 +183,14 @@ def test_ascat_step_part_get_input_files_cnv_tumor_wes( "bwa.copywriter.P001-T1-DNA1-WGS1_bins.txt" ) } - actual = somatic_purity_ploidy_estimate_workflow_w_copywritter.get_input_files( + actual = somatic_purity_ploidy_estimate_workflow_w_copywriter.get_input_files( "ascat", "cnv_tumor_wes" )(wildcards) assert actual == expected def test_ascat_step_part_get_input_files_cnv_normal_wes( - somatic_purity_ploidy_estimate_workflow_w_copywritter, + somatic_purity_ploidy_estimate_workflow_w_copywriter, ): """Tests AscatStepPart._get_input_files_cnv_normal_wes()""" wildcards = Wildcards(fromdict={"normal_library_name": "P001-N1-DNA1-WGS1", "mapper": "bwa"}) @@ -194,7 +200,7 @@ def test_ascat_step_part_get_input_files_cnv_normal_wes( "bwa.copywriter.P001-T1-DNA1-WGS1_bins.txt" ) } - actual = somatic_purity_ploidy_estimate_workflow_w_copywritter.get_input_files( + actual = somatic_purity_ploidy_estimate_workflow_w_copywriter.get_input_files( "ascat", "cnv_normal_wes" )(wildcards) assert actual == expected @@ -288,7 +294,9 @@ def test_ascat_step_part_get_resource_usage(somatic_purity_ploidy_estimate_workf for action in actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' for action '{action}'." - actual = somatic_purity_ploidy_estimate_workflow.get_resource("ascat", action, resource) + actual = somatic_purity_ploidy_estimate_workflow.get_resource( + "ascat", action, resource + )() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index a05198407..f9f1b45b1 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -36,8 +36,6 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa @@ -57,6 +55,13 @@ def minimal_config(): path_intervals: /path/to/interval/list path_panel_of_normals: /path/to/purecn/pon path_mapping_bias: /path/to/mapping/bias + path_somatic_variants: /path/to/somatic/variants + cnvetti_on_target: + path_target_regions: /path/to/target/regions + copywriter: + path_target_regions: /path/to/target/regions + plot_genes: "/path/to/civic/annotation??" + sequenza: {} # use defaults, no required fields. 
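The `sequenza: {}` entry above, like the `mutect: {}`, `jaffa: {}` and `fastqc: {}` additions elsewhere in this diff, suggests that every enabled tool now needs its own (possibly empty) configuration section: tools whose fields all have defaults accept `{}`, while tools with required fields (for instance the cnvkit paths added in the somatic_cnv_checking config above) must spell them out. A small illustration of that validation behaviour with a hypothetical pydantic-style schema (the field names are invented for the example):

from pydantic import BaseModel, ValidationError


class SequenzaConfig(BaseModel):
    # Only optional fields -> an empty mapping `{}` is a valid section.
    window: int = 50
    assembly: str = "hg19"


class CnvkitConfig(BaseModel):
    # Required fields -> the section must provide them explicitly.
    path_target: str
    path_antitarget: str
    path_panel_of_normals: str


assert SequenzaConfig(**{}).assembly == "hg19"

try:
    CnvkitConfig(**{})
except ValidationError as exc:
    print(len(exc.errors()), "missing fields")  # -> 3 missing fields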
data_sets: first_batch: @@ -79,11 +84,13 @@ def somatic_targeted_seq_cnv_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticTargetedSeqCnvCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep here dummy_workflow.globals = { @@ -280,7 +287,7 @@ def test_cnvetti_on_target_step_part_get_resource_usage(somatic_targeted_seq_cnv msg_error = f"Assertion error for resource '{resource}' in action {action}." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "cnvetti_on_target", action, resource - ) + )() assert actual == expected, msg_error @@ -334,7 +341,7 @@ def test_cnvkit_coverage_step_part_get_resource(somatic_targeted_seq_cnv_calling msg_error = f"Assertion error for resource '{resource}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "cnvkit", "coverage", resource - ) + )() assert actual == expected, msg_error @@ -376,7 +383,7 @@ def test_cnvkit_fix_step_part_get_resource(somatic_targeted_seq_cnv_calling_work # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource("cnvkit", "fix", resource) + actual = somatic_targeted_seq_cnv_calling_workflow.get_resource("cnvkit", "fix", resource)() assert actual == expected, msg_error @@ -419,7 +426,7 @@ def test_cnvkit_segment_step_part_get_resource(somatic_targeted_seq_cnv_calling_ msg_error = f"Assertion error for resource '{resource}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "cnvkit", "segment", resource - ) + )() assert actual == expected, msg_error @@ -461,7 +468,9 @@ def test_cnvkit_call_step_part_get_resource(somatic_targeted_seq_cnv_calling_wor # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource("cnvkit", "call", resource) + actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( + "cnvkit", "call", resource + )() assert actual == expected, msg_error @@ -511,7 +520,7 @@ def test_cnvkit_postprocess_step_part_get_resource(somatic_targeted_seq_cnv_call msg_error = f"Assertion error for resource '{resource}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "cnvkit", "postprocess", resource - ) + )() assert actual == expected, msg_error @@ -573,7 +582,9 @@ def test_cnvkit_plot_step_part_get_resource(somatic_targeted_seq_cnv_calling_wor # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_targeted_seq_cnv_calling_workflow.get_resource("cnvkit", "plot", resource) + actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( + "cnvkit", "plot", resource + )() assert actual == expected, msg_error @@ -627,7 +638,7 @@ def test_cnvkit_export_step_part_get_resource(somatic_targeted_seq_cnv_calling_w msg_error = f"Assertion error for resource '{resource}'." 
actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "cnvkit", "export", resource - ) + )() assert actual == expected, msg_error @@ -688,7 +699,7 @@ def test_cnvkit_report_step_part_get_resource(somatic_targeted_seq_cnv_calling_w msg_error = f"Assertion error for resource '{resource}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "cnvkit", "report", resource - ) + )() assert actual == expected, msg_error @@ -795,7 +806,7 @@ def test_copywriter_step_part_get_resource_usage_prepare(somatic_targeted_seq_cn msg_error = f"Assertion error for resource '{resource}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "copywriter", "prepare", resource - ) + )() assert actual == expected, msg_error @@ -808,7 +819,7 @@ def test_copywriter_step_part_get_resource_usage_run(somatic_targeted_seq_cnv_ca msg_error = f"Assertion error for resource '{resource}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "copywriter", "run", resource - ) + )() assert actual == expected, msg_error @@ -821,7 +832,7 @@ def test_copywriter_step_part_get_resource_usage_call(somatic_targeted_seq_cnv_c msg_error = f"Assertion error for resource '{resource}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "copywriter", "call", resource - ) + )() assert actual == expected, msg_error @@ -939,7 +950,7 @@ def test_sequenza_step_part_get_resource_usage_call(somatic_targeted_seq_cnv_cal msg_error = f"Assertion error for resource '{resource}' in '{action}'." actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "sequenza", action, resource - ) + )() assert actual == expected, msg_error @@ -1023,7 +1034,7 @@ def test_purecn_step_part_get_resource_usage(somatic_targeted_seq_cnv_calling_wo msg_error = f"Assertion error for resource '{resource}' in '{action}'." 
actual = somatic_targeted_seq_cnv_calling_workflow.get_resource( "purecn", action, resource - ) + )() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_annotation.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_annotation.py index 78eba204d..1d6ebc6f8 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_annotation.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_annotation.py @@ -32,8 +32,6 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa @@ -43,13 +41,17 @@ def minimal_config(): - scalpel scalpel: path_target_regions: /path/to/target/regions.bed - + mutect: {} + somatic_variant_annotation: + path_somatic_variant_calling: /path/to/somatic_variant_calling tools: ["jannovar", "vep"] jannovar: + dbnsfp: {} + flag_off_target: true path_jannovar_ser: /path/to/jannover.ser vep: - path_dir_cache: /path/to/dir/cache + cache_dir: /path/to/dir/cache data_sets: first_batch: @@ -72,11 +74,13 @@ def somatic_variant_annotation_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticVariantAnnotationWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = { @@ -153,7 +157,7 @@ def test_jannovar_step_part_get_resource_usage(somatic_variant_annotation_workfl msg_error = f"Assertion error for resource '{resource}'." actual = somatic_variant_annotation_workflow.get_resource( "jannovar", "annotate_somatic_vcf", resource - ) + )() assert actual == expected, msg_error @@ -213,7 +217,7 @@ def test_vep_step_part_get_resource_usage(somatic_variant_annotation_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_variant_annotation_workflow.get_resource("vep", "run", resource) + actual = somatic_variant_annotation_workflow.get_resource("vep", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_calling.py index d13bd9284..277d3c7af 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_calling.py @@ -34,20 +34,31 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa somatic_variant_calling: tools: - - mutect - - scalpel - - mutect2 - scalpel: - path_target_regions: /path/to/target/regions.bed + - mutect + - mutect2 + - scalpel + - strelka2 + - bcftools_joint + - platypus_joint + - gatk_hc_joint + - gatk_ug_joint + - varscan_joint + mutect: {} mutect2: common_variants: /path/to/common_variants.vcf + scalpel: + path_target_regions: /path/to/target/regions.bed + strelka2: {} + bcftools_joint: {} + platypus_joint: {} + gatk_hc_joint: {} + gatk_ug_joint: {} + varscan_joint: {} data_sets: first_batch: @@ -94,11 +105,13 @@ def somatic_variant_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticVariantCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -168,7 +181,7 @@ def test_mutect_step_part_get_resource_usage(somatic_variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("mutect", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("mutect", "run", resource)() assert actual == expected, msg_error @@ -430,7 +443,7 @@ def test_mutect2_step_part_get_resource_usage_run(somatic_variant_calling_workfl # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("mutect2", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("mutect2", "run", resource)() assert actual == expected, msg_error @@ -441,7 +454,7 @@ def test_mutect2_step_part_get_resource_usage_filter(somatic_variant_calling_wor # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("mutect2", "filter", resource) + actual = somatic_variant_calling_workflow.get_resource("mutect2", "filter", resource)() assert actual == expected, msg_error @@ -452,7 +465,9 @@ def test_mutect2_step_part_get_resource_usage_contamination(somatic_variant_call # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_variant_calling_workflow.get_resource("mutect2", "contamination", resource) + actual = somatic_variant_calling_workflow.get_resource( + "mutect2", "contamination", resource + )() assert actual == expected, msg_error @@ -463,7 +478,9 @@ def test_mutect2_step_part_get_resource_usage_pileup_normal(somatic_variant_call # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("mutect2", "pileup_normal", resource) + actual = somatic_variant_calling_workflow.get_resource( + "mutect2", "pileup_normal", resource + )() assert actual == expected, msg_error @@ -474,7 +491,9 @@ def test_mutect2_step_part_get_resource_usage_pileup_tumor(somatic_variant_calli # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("mutect2", "pileup_tumor", resource) + actual = somatic_variant_calling_workflow.get_resource( + "mutect2", "pileup_tumor", resource + )() assert actual == expected, msg_error @@ -533,7 +552,7 @@ def test_scalpel_step_part_get_resource_usage(somatic_variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("scalpel", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("scalpel", "run", resource)() assert actual == expected, msg_error @@ -598,7 +617,7 @@ def test_strelka2_step_part_get_resource_usage(somatic_variant_calling_workflow) # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("strelka2", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("strelka2", "run", resource)() assert actual == expected, msg_error @@ -662,7 +681,7 @@ def test_bcftools_joint_step_part_get_resource_usage(somatic_variant_calling_wor # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("bcftools_joint", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("bcftools_joint", "run", resource)() assert actual == expected, msg_error @@ -726,7 +745,7 @@ def test_varscan_joint_step_part_get_resource_usage(somatic_variant_calling_work # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("varscan_joint", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("varscan_joint", "run", resource)() assert actual == expected, msg_error @@ -790,7 +809,7 @@ def test_platypus_joint_step_part_get_resource_usage(somatic_variant_calling_wor # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("platypus_joint", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("platypus_joint", "run", resource)() assert actual == expected, msg_error @@ -854,7 +873,7 @@ def test_gatk_hc_joint_step_part_get_resource_usage(somatic_variant_calling_work # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_variant_calling_workflow.get_resource("gatk_hc_joint", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("gatk_hc_joint", "run", resource)() assert actual == expected, msg_error @@ -918,7 +937,7 @@ def test_gatk_ug_joint_step_part_get_resource_usage(somatic_variant_calling_work # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_calling_workflow.get_resource("gatk_ug_joint", "run", resource) + actual = somatic_variant_calling_workflow.get_resource("gatk_ug_joint", "run", resource)() assert actual == expected, msg_error @@ -927,64 +946,102 @@ def test_gatk_ug_joint_step_part_get_resource_usage(somatic_variant_calling_work def test_somatic_variant_calling_workflow(somatic_variant_calling_workflow): """Test simple functionality of the workflow""" - # Perform the tests - # + + matched_callers = { + "mutect", + "mutect2", + "scalpel", + "strelka2", + } + joint_callers = { + "bcftools_joint", + "gatk_hc_joint", + "gatk_ug_joint", + "platypus_joint", + "varscan_joint", + } + callers = matched_callers | joint_callers + expected = {"link_out"} | callers # Check created sub steps - expected = ["link_out", "mutect", "scalpel"] - assert set(expected).issubset(list(sorted(somatic_variant_calling_workflow.sub_steps.keys()))) + assert expected == set(somatic_variant_calling_workflow.sub_steps.keys()) # Check result file construction - tpl = ( + mappers = ("bwa",) + matched_tpl = ( "output/{mapper}.{var_caller}.P00{i}-T{t}-DNA1-WGS1/out/" "{mapper}.{var_caller}.P00{i}-T{t}-DNA1-WGS1.{ext}" ) + output_exts = ( + "vcf.gz", + "vcf.gz.md5", + "vcf.gz.tbi", + "vcf.gz.tbi.md5", + "full.vcf.gz", + "full.vcf.gz.md5", + "full.vcf.gz.tbi", + "full.vcf.gz.tbi.md5", + ) expected = [ - tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext) + matched_tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ( - "vcf.gz", - "vcf.gz.md5", - "vcf.gz.tbi", - "vcf.gz.tbi.md5", - "full.vcf.gz", - "full.vcf.gz.md5", - "full.vcf.gz.tbi", - "full.vcf.gz.tbi.md5", - ) - for mapper in ("bwa",) - for var_caller in ("mutect", "scalpel", "mutect2") + for ext in output_exts + for mapper in mappers + for var_caller in matched_callers ] # add special cases expected += [ - tpl.format(mapper=mapper, var_caller="mutect", i=i, t=t, ext=ext) + matched_tpl.format(mapper=mapper, var_caller="mutect", i=i, t=t, ext=ext) for i, t in ((1, 1), (2, 1), (2, 2)) for ext in ("txt", "txt.md5", "wig", "wig.md5") - for mapper in ("bwa",) + for mapper in mappers ] expected += [ - tpl.format(mapper=mapper, var_caller="scalpel", i=i, t=t, ext=ext) + matched_tpl.format(mapper=mapper, var_caller="scalpel", i=i, t=t, ext=ext) for i, t in ((1, 1), (2, 1), (2, 2)) for ext in ("tar.gz", "tar.gz.md5") - for mapper in ("bwa",) + for mapper in mappers ] # add log files - tpl = ( + matched_tpl = ( "output/{mapper}.{var_caller}.P00{i}-T{t}-DNA1-WGS1/log/" "{mapper}.{var_caller}.P00{i}-T{t}-DNA1-WGS1.{ext}" ) + meta_exts = ( + "conda_info.txt", + "conda_list.txt", + "log", + "conda_info.txt.md5", + "conda_list.txt.md5", + "log.md5", + ) expected += [ - tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext) + matched_tpl.format(mapper=mapper, var_caller=var_caller, i=i, t=t, ext=ext) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ( - "conda_info.txt", - "conda_list.txt", - "log", - "conda_info.txt.md5", - "conda_list.txt.md5", - "log.md5", - ) + for 
ext in meta_exts + for mapper in ("bwa",) + for var_caller in matched_callers + ] + + output_exts = ( + "vcf.gz", + "vcf.gz.md5", + "vcf.gz.tbi", + "vcf.gz.tbi.md5", + ) + + joint_tpl = "output/{mapper}.{var_caller}.P00{donor}/out/{mapper}.{var_caller}.P00{donor}.{ext}" + expected += [ + joint_tpl.format(mapper=mapper, var_caller=var_caller, donor=donor, ext=ext) + for donor in (1, 2) + for ext in output_exts for mapper in ("bwa",) - for var_caller in ("mutect", "scalpel", "mutect2") + for var_caller in joint_callers ] + expected = list(sorted(expected)) actual = list(sorted(somatic_variant_calling_workflow.get_result_files())) + + # TODO properly model strelka2 output files, skipping for now + expected = [s for s in expected if "strelka2" not in s] + actual = [s for s in actual if "strelka2" not in s] + assert expected == actual diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py index e0bf4b74a..f2389dd72 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_filtration.py @@ -36,6 +36,8 @@ def minimal_config(): tools_somatic_variant_calling: ['mutect2'] tools_somatic_variant_annotation: ['jannovar'] filtration_schema: sets + filter_sets: {} + path_somatic_variant: "../somatic_variant_annotation" data_sets: first_batch: @@ -58,11 +60,13 @@ def somatic_variant_filtration_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticVariantFiltrationWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "somatic_variant": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x, @@ -129,7 +133,7 @@ def test_dkfz_bias_filter_step_part_get_resource_usage(somatic_variant_filtratio msg_error = f"Assertion error for resource '{resource}'." actual = somatic_variant_filtration_workflow.get_resource( "dkfz_bias_filter", "run", resource - ) + )() assert actual == expected, msg_error @@ -233,7 +237,9 @@ def test_eb_filter_step_part_get_resource_usage(somatic_variant_filtration_workf for action in actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = somatic_variant_filtration_workflow.get_resource("eb_filter", action, resource) + actual = somatic_variant_filtration_workflow.get_resource( + "eb_filter", action, resource + )() assert actual == expected, msg_error @@ -285,7 +291,9 @@ def test_apply_filters_step_part_get_resource_usage(somatic_variant_filtration_w # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_filtration_workflow.get_resource("apply_filters", "run", resource) + actual = somatic_variant_filtration_workflow.get_resource( + "apply_filters", "run", resource + )() assert actual == expected, msg_error @@ -349,7 +357,7 @@ def test_filter_to_exons_step_part_get_resource_usage(somatic_variant_filtration msg_error = f"Assertion error for resource '{resource}'." 
actual = somatic_variant_filtration_workflow.get_resource( "filter_to_exons", "run", resource - ) + )() assert actual == expected, msg_error @@ -445,11 +453,12 @@ def minimal_config_list(): tools_ngs_mapping: ['bwa'] tools_somatic_variant_calling: ['mutect2'] tools_somatic_variant_annotation: ['jannovar'] + path_somatic_variant: "../somatic_variant_annotation" filtration_schema: list filter_list: - dkfz: {} - ebfilter: - threshold: 2.3 + ebfilter_threshold: 2.3 - bcftools: include: "include_statment" - regions: @@ -479,11 +488,13 @@ def somatic_variant_filtration_workflow_list( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticVariantFiltrationWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "somatic_variant": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x, @@ -562,7 +573,15 @@ def test_one_filter_step_part_get_params(somatic_variant_filtration_workflow_lis assert actual == expected wildcards = Wildcards(fromdict={"filter_nb": 2}) - expected = {"filter_name": "ebfilter_2", "threshold": 2.3, "has_annotation": True} + expected = { + "filter_name": "ebfilter_2", + "ebfilter_threshold": 2.3, + "has_annotation": True, + "shuffle_seed": 1, + "panel_of_normals_size": 25, + "min_mapq": 20, + "min_baseq": 15, + } actual = somatic_variant_filtration_workflow_list.get_params("one_ebfilter", "run")(wildcards) assert actual == expected @@ -591,7 +610,7 @@ def test_one_filter_step_part_get_resource_usage(somatic_variant_filtration_work msg_error = f"Assertion error for resource '{resource}'." 
actual = somatic_variant_filtration_workflow_list.get_resource( "one_ebfilter", "run", resource - ) + )() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_signatures.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_signatures.py index 3a9f6aca6..23e502e43 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_signatures.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_variant_signatures.py @@ -30,8 +30,15 @@ def minimal_config(): bwa: path_index: /path/to/bwa/index.fasta + somatic_variant_filtration: + tools_somatic_variant_calling: ['mutect'] + filter_list: + - dkfz: {} + somatic_variant_signatures: path_somatic_variant: ../SOMATIC_VARIANT_FILTRATION + tools_somatic_variant_annotation: ['vep'] + tools_somatic_variant_calling: ['mutect'] is_filtered: True data_sets: @@ -55,11 +62,13 @@ def somatic_variant_signatures_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticVariantSignaturesWorkflow object pre-configured with cancer sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) dummy_workflow.globals = { "ngs_mapping": lambda x: "NGS_MAPPING/" + x, "somatic_variant": lambda x: "SOMATIC_VARIANT_FILTRATION/" + x, @@ -80,8 +89,8 @@ def somatic_variant_signatures_workflow( def test_tabulate_vcf_step_part_get_input_files(somatic_variant_signatures_workflow): """Tests TabulateVariantsStepPart.get_input_files()""" base_name = ( - "SOMATIC_VARIANT_FILTRATION/output/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter}.{region}/out/" - "{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.{tumor_library}.{filter}.{region}" + "SOMATIC_VARIANT_FILTRATION/output/{mapper}.{var_caller}.{anno_caller}.filtered.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.filtered.{tumor_library}" ) expected = { "vcf": base_name + ".vcf.gz", @@ -95,8 +104,8 @@ def test_tabulate_vcf_step_part_get_output_files(somatic_variant_signatures_work """Tests TabulateVariantsStepPart.get_output_files()""" expected = { "tsv": ( - "work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}/out/" - "{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}.tsv" + "work/{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}.tsv" ) } actual = somatic_variant_signatures_workflow.get_output_files("tabulate_vcf", "run") @@ -105,7 +114,7 @@ def test_tabulate_vcf_step_part_get_output_files(somatic_variant_signatures_work def test_tabulate_vcf_step_part_get_log_file(somatic_variant_signatures_workflow): """Tests TabulateVariantsStepPart.get_log_file()""" - expected = "work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}/log/snakemake.tabulate_vcf.log" + expected = "work/{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}/log/snakemake.tabulate_vcf.log" actual = somatic_variant_signatures_workflow.get_log_file("tabulate_vcf", "run") assert actual == expected @@ -125,7 +134,7 @@ def 
test_tabulate_vcf_step_part_get_resource_usage(somatic_variant_signatures_wo # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_variant_signatures_workflow.get_resource("tabulate_vcf", "run", resource) + actual = somatic_variant_signatures_workflow.get_resource("tabulate_vcf", "run", resource)() assert actual == expected, msg_error @@ -136,8 +145,8 @@ def test_deconstruct_sigs_step_part_get_input_files(somatic_variant_signatures_w """Tests DeconstructSigsStepPart.get_input_files()""" expected = { "tsv": ( - "work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}/out/" - "{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.tabulate_vcf.{tumor_library}.{filter}.{region}.tsv" + "work/{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.filtered.tabulate_vcf.{tumor_library}.tsv" ) } actual = somatic_variant_signatures_workflow.get_input_files("deconstruct_sigs", "run") @@ -147,8 +156,8 @@ def test_deconstruct_sigs_step_part_get_input_files(somatic_variant_signatures_w def test_deconstruct_sigs_step_part_get_output_files(somatic_variant_signatures_workflow): """Tests DeconstructSigsStepPart.get_output_files()""" base_name_out = ( - "work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.deconstruct_sigs.{tumor_library}.{filter}.{region}/out/" - "{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.deconstruct_sigs.{tumor_library}.{filter}.{region}" + "work/{mapper}.{var_caller}.{anno_caller}.filtered.deconstruct_sigs.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.filtered.deconstruct_sigs.{tumor_library}" ) expected = { "tsv": base_name_out + ".tsv", @@ -161,7 +170,7 @@ def test_deconstruct_sigs_step_part_get_output_files(somatic_variant_signatures_ def test_deconstruct_sigs_step_part_get_log_file(somatic_variant_signatures_workflow): """Tests DeconstructSigsStepPart.get_log_file()""" expected = ( - "work/{mapper}.{var_caller}.{anno_caller}.dkfz_bias_filter.eb_filter.deconstruct_sigs.{tumor_library}.{filter}.{region}/log/" + "work/{mapper}.{var_caller}.{anno_caller}.filtered.deconstruct_sigs.{tumor_library}/log/" "snakemake.deconstruct_sigs.log" ) actual = somatic_variant_signatures_workflow.get_log_file("deconstruct_sigs", "run") @@ -177,7 +186,7 @@ def test_deconstruct_sigs_step_part_get_resource_usage(somatic_variant_signature msg_error = f"Assertion error for resource '{resource}'." 
actual = somatic_variant_signatures_workflow.get_resource( "deconstruct_sigs", "run", resource - ) + )() assert actual == expected, msg_error @@ -192,7 +201,7 @@ def test_somatic_variant_signatures_workflow(somatic_variant_signatures_workflow assert actual == expected # Check result file construction - name_pattern = "{mapper}.{caller}.{annotator}.dkfz_bias_filter.eb_filter.deconstruct_sigs.P00{i}-T{t}-DNA1-WGS1.{filt}.{region}" + name_pattern = "{mapper}.{caller}.{annotator}.filtered.deconstruct_sigs.P00{i}-T{t}-DNA1-WGS1" tpl = "output/" + name_pattern + "/out/" + name_pattern + ".tsv" expected = [ tpl.format( @@ -201,21 +210,11 @@ def test_somatic_variant_signatures_workflow(somatic_variant_signatures_workflow annotator=annotator, i=i, t=t, - filt=filt, - region=region, ) for i, t in ((1, 1), (2, 1), (2, 2)) for mapper in ("bwa",) - for caller in ("mutect", "scalpel") - for annotator in ("vep", "jannovar") - for filt in ( - "no_filter", - "dkfz_only", - "dkfz_and_ebfilter", - "dkfz_and_ebfilter_and_oxog", - "dkfz_and_oxog", - ) - for region in ("genome_wide",) + for caller in ("mutect",) + for annotator in ("vep",) ] expected = set(expected) actual = set(somatic_variant_signatures_workflow.get_result_files()) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py index 35a62fa08..c4f4e3668 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_cnv_calling.py @@ -38,12 +38,11 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa somatic_wgs_cnv_calling: + path_somatic_variant_calling: ../somatic_variant_calling somatic_variant_calling_tool: mutect tools: - canvas @@ -53,13 +52,18 @@ def minimal_config(): tools_ngs_mapping: - bwa canvas: - reference: /path/to/reference.fasta - filter_bed: /path/to/filter.bed - genome_folder: /path/to/genome/folder + path_reference: /path/to/reference.fasta + path_filter_bed: /path/to/filter.bed + path_genome_folder: /path/to/genome/folder cnvkit: path_target: /path/to/panel_of_normals/output/cnvkit.target/out/cnvkit.target.bed path_antitarget: /path/to/panel_of_normals/output/cnvkit.antitarget/out/cnvkit.antitarget.bed path_panel_of_normals: /path/to/panel_of_normals/output/bwa.cnvkit.create_panel/out/bwa.cnvkit.panel_of_normals.cnn + cnvetti: {} + control_freec: + path_chrlenfile: /path/to/chrlenfile + path_mappability: /path/to/mappability + convert: {} data_sets: @@ -83,11 +87,13 @@ def somatic_wgs_cnv_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticWgsCnvCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had NGSMappingPipelineStep etc. 
here dummy_workflow.globals = { @@ -143,7 +149,7 @@ def test_canvas_step_part_get_resource(somatic_wgs_cnv_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("canvas", "run", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("canvas", "run", resource)() assert actual == expected, msg_error @@ -218,13 +224,13 @@ def test_control_freec_step_part_get_resource(somatic_wgs_cnv_calling_workflow): # Evaluate action `run` for resource, expected in run_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action 'run'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("control_freec", "run", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("control_freec", "run", resource)() assert actual == expected, msg_error # Evaluate action `plot` for resource, expected in plot_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action 'plot" - actual = somatic_wgs_cnv_calling_workflow.get_resource("control_freec", "plot", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("control_freec", "plot", resource)() assert actual == expected, msg_error # Evaluate action `transform` @@ -232,7 +238,7 @@ def test_control_freec_step_part_get_resource(somatic_wgs_cnv_calling_workflow): msg_error = f"Assertion error for resource '{resource}' in action 'transform" actual = somatic_wgs_cnv_calling_workflow.get_resource( "control_freec", "transform", resource - ) + )() assert actual == expected, msg_error @@ -282,7 +288,7 @@ def test_cnvkit_coverage_step_part_get_resource(somatic_wgs_cnv_calling_workflow # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "coverage", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "coverage", resource)() assert actual == expected, msg_error @@ -324,7 +330,7 @@ def test_cnvkit_fix_step_part_get_resource(somatic_wgs_cnv_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "fix", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "fix", resource)() assert actual == expected, msg_error @@ -362,7 +368,7 @@ def test_cnvkit_segment_step_part_get_resource(somatic_wgs_cnv_calling_workflow) # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "segment", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "segment", resource)() assert actual == expected, msg_error @@ -403,7 +409,7 @@ def test_cnvkit_call_step_part_get_resource(somatic_wgs_cnv_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "call", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "call", resource)() assert actual == expected, msg_error @@ -444,7 +450,7 @@ def test_cnvkit_postprocess_step_part_get_resource(somatic_wgs_cnv_calling_workf # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "postprocess", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "postprocess", resource)() assert actual == expected, msg_error @@ -506,7 +512,7 @@ def test_cnvkit_plot_step_part_get_resource(somatic_wgs_cnv_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "plot", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "plot", resource)() assert actual == expected, msg_error @@ -550,7 +556,7 @@ def test_cnvkit_export_step_part_get_resource(somatic_wgs_cnv_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "export", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "export", resource)() assert actual == expected, msg_error @@ -607,7 +613,7 @@ def test_cnvkit_report_step_part_get_resource(somatic_wgs_cnv_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "report", resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvkit", "report", resource)() assert actual == expected, msg_error @@ -742,7 +748,7 @@ def test_cnvetti_step_part_get_resource(somatic_wgs_cnv_calling_workflow): for action in all_actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." 
- actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvetti", action, resource) + actual = somatic_wgs_cnv_calling_workflow.get_resource("cnvetti", action, resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_sv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_sv_calling.py index 19b41a1ac..8bce07f78 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_sv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_wgs_sv_calling.py @@ -31,14 +31,13 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa somatic_wgs_sv_calling: path_ngs_mapping: ../ngs_mapping tools: ['manta'] + manta: {} data_sets: first_batch: @@ -61,11 +60,13 @@ def somatic_wgs_sv_calling_workflow( work_dir, config_paths, cancer_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return SomaticWgsSvCallingWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had NGSMappingPipelineStep etc. here dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -117,7 +118,7 @@ def test_manta_somatic_step_part_get_resource(somatic_wgs_sv_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = somatic_wgs_sv_calling_workflow.get_resource("manta", "run", resource) + actual = somatic_wgs_sv_calling_workflow.get_resource("manta", "run", resource)() assert actual == expected, msg_error @@ -134,7 +135,7 @@ def test_delly2_somatic_step_part_get_resource(somatic_wgs_sv_calling_workflow): for action in all_actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." 
- actual = somatic_wgs_sv_calling_workflow.get_resource("delly2", action, resource) + actual = somatic_wgs_sv_calling_workflow.get_resource("delly2", action, resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py b/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py index 20f774f31..4910db5a3 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py +++ b/tests/snappy_pipeline/workflows/test_workflows_sv_calling_targeted.py @@ -32,8 +32,6 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa @@ -42,12 +40,14 @@ def minimal_config(): - delly2 - manta - gcnv + delly2: {} # use defaults + manta: {} # use defaults gcnv: + # path_uniquely_mapable_bed: /path/to/uniquely/mappable/variable/GRCh37/file.bed.gz path_target_interval_list_mapping: - pattern: "Agilent SureSelect Human All Exon V6.*" name: "Agilent_SureSelect_Human_All_Exon_V6" path: /path/to/Agilent/SureSelect_Human_All_Exon_V6_r2/GRCh37/Exons.bed - path_uniquely_mapable_bed: /path/to/uniquely/mappable/variable/GRCh37/file.bed.gz precomputed_model_paths: - library: "Agilent SureSelect Human All Exon V6" contig_ploidy: /path/to/ploidy-model @@ -101,6 +101,7 @@ def sv_calling_targeted_workflow( work_dir, config_paths, germline_sheet_fake_fs2_gcnv_model, + aligner_indices_fake_fs, mocker, ): """ @@ -110,6 +111,7 @@ def sv_calling_targeted_workflow( patch_module_fs( "snappy_pipeline.workflows.abstract", germline_sheet_fake_fs2_gcnv_model, mocker ) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs( "snappy_pipeline.workflows.common.gcnv.gcnv_run", germline_sheet_fake_fs2_gcnv_model, @@ -141,6 +143,7 @@ def sv_calling_targeted_workflow_large_cohort( work_dir, config_paths, germline_sheet_fake_fs2, + aligner_indices_fake_fs, mocker, ): """ @@ -149,6 +152,7 @@ def sv_calling_targeted_workflow_large_cohort( """ # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs2, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep here dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -170,12 +174,14 @@ def sv_calling_targeted_workflow_large_cohort_background( work_dir, config_paths, germline_sheet_fake_fs2, + aligner_indices_fake_fs, mocker, ): """Return SvCallingTargetedWorkflow object pre-configured with germline sheet - large trio cohort as background.""" # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs2, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep here dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} @@ -199,6 +205,7 @@ def test_validate_request( work_dir, config_paths, germline_sheet_fake_fs2_gcnv_model, + aligner_indices_fake_fs, mocker, ): 
"""Tests SvCallingTargetedWorkflow.validate_request()""" @@ -206,6 +213,7 @@ def test_validate_request( patch_module_fs( "snappy_pipeline.workflows.abstract", germline_sheet_fake_fs2_gcnv_model, mocker ) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs( "snappy_pipeline.workflows.common.gcnv.gcnv_run", germline_sheet_fake_fs2_gcnv_model, @@ -387,14 +395,14 @@ def test_gcnv_step_part_get_resource_usage(sv_calling_targeted_workflow): for action in high_resource_actions: for resource, expected in high_res_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = sv_calling_targeted_workflow.get_resource("gcnv", action, resource) + actual = sv_calling_targeted_workflow.get_resource("gcnv", action, resource)() assert actual == expected, msg_error # Evaluate - all other actions for action in default_actions: for resource, expected in default_expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = sv_calling_targeted_workflow.get_resource("gcnv", action, resource) + actual = sv_calling_targeted_workflow.get_resource("gcnv", action, resource)() assert actual == expected, msg_error @@ -418,37 +426,6 @@ def test_gcnv_get_params(sv_calling_targeted_workflow): sv_calling_targeted_workflow.get_params("gcnv", action) -def test_gcnv_validate_precomputed_model_paths_config(sv_calling_targeted_workflow): - """Tests RunGcnvTargetSeqStepPart.validate_model_requirements()""" - # Initialise input - valid_dict = { - "library": "library", - "contig_ploidy": "/path/to/ploidy-model", - "model_pattern": "/path/to/model_*", - } - typo_dict = { - "library_n": "library", - "contig_ploidy": "/path/to/ploidy-model", - "model_pattern": "/path/to/model_*", - } - missing_key_dict = {"model_pattern": "/path/to/model_*"} - - # Sanity check - sv_calling_targeted_workflow.substep_getattr("gcnv", "validate_precomputed_model_paths_config")( - config=[valid_dict] - ) - # Test key typo - with pytest.raises(InvalidConfiguration): - sv_calling_targeted_workflow.substep_getattr( - "gcnv", "validate_precomputed_model_paths_config" - )(config=[valid_dict, typo_dict]) - # Test key missing - with pytest.raises(InvalidConfiguration): - sv_calling_targeted_workflow.substep_getattr( - "gcnv", "validate_precomputed_model_paths_config" - )(config=[valid_dict, missing_key_dict]) - - def test_gcnv_validate_ploidy_model_directory( fake_fs, mocker, sv_calling_targeted_workflow, ploidy_model_files ): diff --git a/tests/snappy_pipeline/workflows/test_workflows_sv_calling_wgs.py b/tests/snappy_pipeline/workflows/test_workflows_sv_calling_wgs.py index 7249ee1a7..8b924b2f9 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_sv_calling_wgs.py +++ b/tests/snappy_pipeline/workflows/test_workflows_sv_calling_wgs.py @@ -7,15 +7,9 @@ import ruamel.yaml as ruamel_yaml from snakemake.io import Wildcards -from snappy_pipeline.base import InvalidConfiguration, UnsupportedActionException from snappy_pipeline.workflows.sv_calling_wgs import SvCallingWgsWorkflow -from .common import ( - get_expected_gcnv_log_file, - get_expected_log_files_dict, - get_expected_output_bcf_files_dict, - get_expected_output_vcf_files_dict, -) +from .common import get_expected_output_bcf_files_dict from .conftest import patch_module_fs __author__ = "Manuel Holtgrewe " @@ -39,26 +33,25 @@ def minimal_config(): tools: dna: - bwa - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: 
path_index: /path/to/bwa/index.fa sv_calling_wgs: - variant_calling_tool: gatk3_ug tools: dna: - delly2 - gcnv - melt + delly2: {} gcnv: + # path_par_intervals: /path/to/par.intervals + # path_uniquely_mapable_bed: /path/to/uniquely_mapable.bed precomputed_model_paths: - library: "default" contig_ploidy: /path/to/ploidy-model model_pattern: "/data/model_*" melt: - path_genes_bed: /path/to/genes.bed - path_me_refs: /path/to/me/refs + jar_file: /path/to/melt.jar data_sets: first_batch: @@ -81,10 +74,12 @@ def sv_calling_wgs_workflow( work_dir, config_paths, germline_sheet_fake_fs2_gcnv_model, + aligner_indices_fake_fs, mocker, ): """Return SvCallingWgsWorkflow object pre-configured with germline sheet""" # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs( "snappy_pipeline.workflows.abstract", germline_sheet_fake_fs2_gcnv_model, mocker ) @@ -287,14 +282,14 @@ def test_delly2_step_part_get_output_files_call(sv_calling_wgs_workflow): # for action in high_resource_actions: # for resource, expected in high_res_expected_dict.items(): # msg_error = f"Assertion error for resource '{resource}' in action '{action}'." -# actual = sv_calling_wgs_workflow.get_resource("gcnv", action, resource) +# actual = sv_calling_wgs_workflow.get_resource("gcnv", action, resource)() # assert actual == expected, msg_error # # Evaluate - all other actions # for action in default_actions: # for resource, expected in default_expected_dict.items(): # msg_error = f"Assertion error for resource '{resource}' in action '{action}'." -# actual = sv_calling_wgs_workflow.get_resource("gcnv", action, resource) +# actual = sv_calling_wgs_workflow.get_resource("gcnv", action, resource)() # assert actual == expected, msg_error @@ -794,7 +789,7 @@ def test_delly2_step_part_get_output_files_call(sv_calling_wgs_workflow): # for action in all_actions: # for resource, expected in expected_dict.items(): # msg_error = f"Assertion error for resource '{resource}' for action '{action}'." -# actual = sv_calling_wgs_workflow.get_resource("melt", action, resource) +# actual = sv_calling_wgs_workflow.get_resource("melt", action, resource)() # assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_targeted_seq_mei_calling.py b/tests/snappy_pipeline/workflows/test_workflows_targeted_seq_mei_calling.py index 57f41ca5d..78657871b 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_targeted_seq_mei_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_targeted_seq_mei_calling.py @@ -55,6 +55,7 @@ def mei_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return MeiWorkflow object pre-configured with germline sheet""" @@ -66,6 +67,7 @@ def mei_workflow( ) # Patch out file-system related things in abstract (the crawling link in step is defined there) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs( "snappy_pipeline.workflows.targeted_seq_mei_calling", germline_sheet_fake_fs, mocker ) @@ -170,7 +172,7 @@ def test_scramble_analysis_step_part_get_resource_usage(mei_workflow): msg_error = ( f"Assertion error for resource '{resource}' associated with action '{action}'." 
) - actual = mei_workflow.get_resource("scramble", action, resource) + actual = mei_workflow.get_resource("scramble", action, resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py b/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py index bf645a4d0..0708ca633 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py +++ b/tests/snappy_pipeline/workflows/test_workflows_varfish_export.py @@ -28,7 +28,6 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true bwa: path_index: /path/to/bwa/index.fa target_coverage_report: @@ -40,8 +39,16 @@ def minimal_config(): variant_calling: tools: - gatk3_hc + gatk3_hc: {} variant_annotation: - path_jannovar_ser: /path/to/jannovar.ser + tools: + - vep + vep: {} + varfish_export: + path_ngs_mapping: ../ngs_mapping + path_variant_calling: ../variant_calling + path_exon_bed: /path/to/exons.bed + path_mehari_db: /path/to/mehari.db data_sets: first_batch: @@ -284,7 +291,7 @@ def test_mehari_step_part_get_resource_usage(varfish_export_workflow): for action in all_actions: for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = varfish_export_workflow.get_resource("mehari", action, resource) + actual = varfish_export_workflow.get_resource("mehari", action, resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_annotation.py b/tests/snappy_pipeline/workflows/test_workflows_variant_annotation.py index 6a22d6abb..a63bef109 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_annotation.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_annotation.py @@ -29,14 +29,13 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa variant_calling: tools: - gatk3_hc + gatk3_hc: {} variant_annotation: vep: @@ -148,7 +147,7 @@ def test_vep_run_step_part_get_resource_usage(variant_annotation_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = variant_annotation_workflow.get_resource("vep", "run", resource) + actual = variant_annotation_workflow.get_resource("vep", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py b/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py index 31160eb43..ddede247b 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py @@ -33,22 +33,27 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true bwa: path_index: /path/to/bwa/index.fa - target_cov_report: - path_target_interval_list_mapping: - - name: "Agilent SureSelect Human All Exon V6" - pattern: "Agilent SureSelect Human All Exon V6*" - path: "path/to/targets.bed" variant_calling: baf_file_generation: enabled: true + jannovar_stats: + enabled: true + path_ser: /path/to/jannovar.ser + bcftools_stats: + enabled: true + bcftools_roh: + enabled: true + path_af_file: /path/to/af_file.txt tools: - bcftools_call - gatk3_hc - gatk3_ug + bcftools_call: {} + gatk3_hc: {} + gatk3_ug: {} data_sets: first_batch: @@ -71,6 +76,7 @@ def variant_calling_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return VariantCallingWorkflow object pre-configured with germline sheet""" @@ -81,6 +87,7 @@ def variant_calling_workflow( create_missing_dirs=True, ) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_calling", germline_sheet_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep there @@ -144,7 +151,7 @@ def test_bcftools_call_step_part_get_resource(variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_calling_workflow.get_resource("bcftools_call", "run", resource) + actual = variant_calling_workflow.get_resource("bcftools_call", "run", resource)() assert actual == expected, msg_error @@ -208,7 +215,7 @@ def test_gatk3_hc_step_part_get_resource(variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_calling_workflow.get_resource("gatk3_hc", "run", resource) + actual = variant_calling_workflow.get_resource("gatk3_hc", "run", resource)() assert actual == expected, msg_error @@ -272,7 +279,7 @@ def test_gatk3_ug_step_part_get_resource(variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_calling_workflow.get_resource("gatk3_ug", "run", resource) + actual = variant_calling_workflow.get_resource("gatk3_ug", "run", resource)() assert actual == expected, msg_error @@ -338,7 +345,7 @@ def test_bcftools_stats_step_part_get_resource(variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = variant_calling_workflow.get_resource("bcftools_stats", "run", resource) + actual = variant_calling_workflow.get_resource("bcftools_stats", "run", resource)() assert actual == expected, msg_error @@ -402,7 +409,7 @@ def test_jannovar_stats_stats_step_part_get_resource(variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_calling_workflow.get_resource("jannovar_stats", "run", resource) + actual = variant_calling_workflow.get_resource("jannovar_stats", "run", resource)() assert actual == expected, msg_error @@ -473,7 +480,7 @@ def test_baf_file_generation_step_part_get_resource(variant_calling_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_calling_workflow.get_resource("baf_file_generation", "run", resource) + actual = variant_calling_workflow.get_resource("baf_file_generation", "run", resource)() assert actual == expected, msg_error @@ -486,12 +493,12 @@ def test_variant_calling_workflow(variant_calling_workflow): expected = [ "baf_file_generation", "bcftools_call", - "bcftools_roh", "bcftools_stats", + "bcftools_roh", "gatk3_hc", "gatk3_ug", - "gatk4_hc_gvcf", "gatk4_hc_joint", + "gatk4_hc_gvcf", "jannovar_stats", "write_pedigree", ] @@ -516,6 +523,15 @@ def test_variant_calling_workflow(variant_calling_workflow): "output/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1/log/" "{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1.{step}.{ext}" ) + + stats_steps = {"jannovar_stats", "bcftools_roh"} - {"bcftools_stats"} # causes weird behaviour + stats_steps = { + step + for step in stats_steps + if ((s := variant_calling_workflow.config.get(step)) and s.enabled) + } + stats_steps_run = {f"{step}_run" for step in stats_steps} + expected += [ base_out.format(i=i, ext=ext, mapper=mapper, var_caller=var_caller, step=step) for i in (1, 4) # only for indices @@ -537,11 +553,7 @@ def test_variant_calling_workflow(variant_calling_workflow): "gatk3_hc", "gatk3_ug", ) - for step in ( - f"{var_caller}_run", - "jannovar_stats_run", - "bcftools_roh_run", - ) + for step in ({f"{var_caller}_run"} | stats_steps_run) ] base_out = ( "output/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1/log/" @@ -616,24 +628,28 @@ def test_variant_calling_workflow(variant_calling_workflow): ) for ext in ("bw", "bw.md5") ] - tpl = ( - "output/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1/report/" - "roh/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1.{ext}" - ) - expected += [ - tpl.format(mapper=mapper, var_caller=var_caller, i=i, ext=ext) - for i in (1, 4) - for mapper in ("bwa",) - for var_caller in ( - "bcftools_call", - "gatk3_hc", - "gatk3_ug", + + if "bcftools_roh" in stats_steps: + tpl = ( + "output/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1/report/" + "roh/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1.{ext}" ) - for ext in ("txt", "txt.md5") - ] - expected = list(sorted(expected)) - actual = list(sorted(variant_calling_workflow.get_result_files())) - assert actual == expected + expected += [ + tpl.format(mapper=mapper, var_caller=var_caller, i=i, ext=ext) + for i in (1, 4) + for mapper in ("bwa",) + for var_caller in ( + "bcftools_call", + "gatk3_hc", + "gatk3_ug", + ) + for ext in ("txt", "txt.md5") + ] + expected = set(sorted(expected)) + actual = set(sorted(variant_calling_workflow.get_result_files())) + assert ( + actual == expected + ), f"Missing from actual: {expected - actual}\nMissing from expected: {actual - expected}" def 
test_variant_calling_custom_pedigree_field( @@ -643,6 +659,7 @@ def test_variant_calling_custom_pedigree_field( work_dir, config_paths, germline_trio_plus_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Tests VariantCallingWorkflow object pre-configured with germline trio plus sheet @@ -662,6 +679,7 @@ def test_variant_calling_custom_pedigree_field( create_missing_dirs=True, ) patch_module_fs("snappy_pipeline.workflows.abstract", germline_trio_plus_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs( "snappy_pipeline.workflows.variant_calling", germline_trio_plus_sheet_fake_fs, mocker ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_checking.py b/tests/snappy_pipeline/workflows/test_workflows_variant_checking.py index bef215723..3231f2756 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_checking.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_checking.py @@ -26,14 +26,13 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa variant_calling: tools: - gatk3_hc + gatk3_hc: {} variant_checking: tools_ngs_mapping: ['bwa'] # optional, copied from ngs mapping config @@ -140,7 +139,7 @@ def test_peddy_step_part_get_resource_usage(variant_checking_workflow): # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_checking_workflow.get_resource("peddy", "run", resource) + actual = variant_checking_workflow.get_resource("peddy", "run", resource)() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_annotation.py b/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_annotation.py index 19a682bc1..85433ac3e 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_annotation.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_annotation.py @@ -33,14 +33,13 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa variant_calling: tools: - gatk3_hc + gatk3_hc: {} variant_denovo_filtration: path_variant_annotation: ../variant_annotation @@ -65,6 +64,7 @@ def variant_de_novo_filtration_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return VariantCallingWorkflow object pre-configured with germline sheet""" @@ -75,6 +75,7 @@ def variant_de_novo_filtration_workflow( create_missing_dirs=True, ) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_calling", germline_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_annotation", germline_sheet_fake_fs, mocker) patch_module_fs( @@ -159,7 +160,9 @@ def test_filter_de_novo_from_variant_annotation_step_part_get_resource_usage( # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = variant_de_novo_filtration_workflow.get_resource("filter_denovo", "run", resource) + actual = variant_de_novo_filtration_workflow.get_resource( + "filter_denovo", "run", resource + )() assert actual == expected, msg_error @@ -227,7 +230,7 @@ def test_filter_de_novo_from_variant_annotationhard_step_part_get_resource_usage msg_error = f"Assertion error for resource '{resource}'." actual = variant_de_novo_filtration_workflow.get_resource( "filter_denovo_hard", "run", resource - ) + )() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_calling.py b/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_calling.py index 1c6a97eaa..ab8848433 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_calling.py @@ -33,14 +33,13 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa variant_calling: tools: - gatk3_hc + gatk3_hc: {} variant_denovo_filtration: path_variant_calling: ../variant_calling @@ -65,6 +64,7 @@ def variant_de_novo_filtration_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return VariantCallingWorkflow object pre-configured with germline sheet""" @@ -75,6 +75,7 @@ def variant_de_novo_filtration_workflow( create_missing_dirs=True, ) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_calling", germline_sheet_fake_fs, mocker) patch_module_fs( "snappy_pipeline.workflows.variant_denovo_filtration", germline_sheet_fake_fs, mocker @@ -156,7 +157,9 @@ def test_filter_de_novo_from_variant_calling_step_part_get_resource( # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_de_novo_filtration_workflow.get_resource("filter_denovo", "run", resource) + actual = variant_de_novo_filtration_workflow.get_resource( + "filter_denovo", "run", resource + )() assert actual == expected, msg_error @@ -222,7 +225,9 @@ def test_filter_de_novo_from_variant_annotationhard_step_part_get_resource( # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
- actual = variant_de_novo_filtration_workflow.get_resource("filter_denovo", "run", resource) + actual = variant_de_novo_filtration_workflow.get_resource( + "filter_denovo", "run", resource + )() assert actual == expected, msg_error diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_phasing.py b/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_phasing.py index 438578e4e..449c28f71 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_phasing.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_denovo_filtration_from_variant_phasing.py @@ -33,14 +33,13 @@ def minimal_config(): ngs_mapping: tools: dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed bwa: path_index: /path/to/bwa/index.fa variant_calling: tools: - gatk3_hc + gatk3_hc: {} variant_denovo_filtration: path_variant_phasing: ../variant_phasing @@ -65,6 +64,7 @@ def variant_de_novo_filtration_workflow( work_dir, config_paths, germline_sheet_fake_fs, + aligner_indices_fake_fs, mocker, ): """Return VariantCallingWorkflow object pre-configured with germline sheet""" @@ -75,6 +75,7 @@ def variant_de_novo_filtration_workflow( create_missing_dirs=True, ) patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_calling", germline_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_annotation", germline_sheet_fake_fs, mocker) patch_module_fs("snappy_pipeline.workflows.variant_phasing", germline_sheet_fake_fs, mocker) @@ -166,7 +167,9 @@ def test_filter_de_novo_from_variant_phasing_step_part_get_resource( # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." - actual = variant_de_novo_filtration_workflow.get_resource("filter_denovo", "run", resource) + actual = variant_de_novo_filtration_workflow.get_resource( + "filter_denovo", "run", resource + )() assert actual == expected, msg_error @@ -237,7 +240,9 @@ def test_filter_de_novo_from_variant_annotationhard_step_part_get_resource( # Evaluate for resource, expected in expected_dict.items(): msg_error = f"Assertion error for resource '{resource}'." 
-        actual = variant_de_novo_filtration_workflow.get_resource("filter_denovo", "run", resource)
+        actual = variant_de_novo_filtration_workflow.get_resource(
+            "filter_denovo", "run", resource
+        )()
         assert actual == expected, msg_error
diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_export_external.py b/tests/snappy_pipeline/workflows/test_workflows_variant_export_external.py
index 9879b6668..c0c2bc310 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_variant_export_external.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_variant_export_external.py
@@ -3,11 +3,11 @@
 from copy import deepcopy
 import textwrap
 
+from pydantic import ValidationError
 import pytest
 import ruamel.yaml as ruamel_yaml
 from snakemake.io import Wildcards
 
-from snappy_pipeline.base import MissingConfiguration
 from snappy_pipeline.workflows.variant_export_external import VariantExportExternalWorkflow
 
 from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict
@@ -73,6 +73,7 @@ def variant_export_external_workflow(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.variant_export_external", germline_sheet_fake_fs, mocker
@@ -100,12 +101,13 @@ def test_workflow_check_config_invalid_annotator_files(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.variant_export_external", germline_sheet_fake_fs, mocker
     )
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         VariantExportExternalWorkflow(
             dummy_workflow,
             minimal_config,
@@ -113,9 +115,15 @@ def test_workflow_check_config_invalid_annotator_files(
             config_paths,
             work_dir,
         )
-    assert "path_refseq_ser" in exec_info.value.args[0]
-    assert "path_ensembl_ser" in exec_info.value.args[0]
-    assert "path_db" in exec_info.value.args[0]
+    errors = exec_info.value.errors()
+    assert len(errors) == 3
+    assert (
+        len(
+            {"path_refseq_ser", "path_ensembl_ser", "path_db"}
+            & set(s for e in errors for s in e["loc"])
+        )
+        == 3
+    )
 
 
 def test_workflow_check_config_invalid_search_directory(
@@ -136,12 +144,13 @@ def test_workflow_check_config_invalid_search_directory(
         create_missing_dirs=True,
     )
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.variant_export_external", germline_sheet_fake_fs, mocker
     )
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         VariantExportExternalWorkflow(
             dummy_workflow,
             minimal_config,
@@ -149,7 +158,11 @@ def test_workflow_check_config_invalid_search_directory(
             config_paths,
             work_dir,
         )
-    assert " is not a directory: /search_path" in exec_info.value.args[0]
+
+    errors = exec_info.value.errors()
+    assert len(errors) == 1
+
+    assert "path_not_directory" in {e["type"] for e in errors}
 
 
 def test_workflow_check_config_invalid_search_pattern(
@@ -172,6 +185,7 @@ def test_workflow_check_config_invalid_search_pattern(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.variant_export_external", germline_sheet_fake_fs, mocker
@@ -183,7 +197,7 @@
         "*/*.vcf.gz",
     ]
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         VariantExportExternalWorkflow(
             dummy_workflow,
             modified_config,
@@ -191,7 +205,25 @@
             config_paths,
             work_dir,
         )
-    assert "Value in 'search_patterns' is not a dictionary" in exec_info.value.args[0]
+
+    errors = exec_info.value.errors()
+
+    # there is 1 incorrectly defined search_patterns entry
+    # which is incorrectly defined as a list instead of a dict/key-value pairs,
+    # so pydantic tries to parse *2* dicts from the list and fails
+    assert len(errors) == 2
+
+    expected_errors = [
+        {
+            "input": input_str,
+            "loc": ("step_config", "variant_export_external", "search_patterns", i),
+            "msg": "Input should be a valid dictionary",
+            "type": "dict_type",
+            "url": "https://errors.pydantic.dev/2.7/v/dict_type",
+        }
+        for i, input_str in enumerate(["vcf", "*/*.vcf.gz"])
+    ]
+    assert expected_errors == errors
 
 
 # Tests for BamReportsExternalStepPart (bam_qc) --------------------------------------------------
@@ -343,7 +375,7 @@ def test_varfish_annotator_step_part_get_resource_usage_gvcf_to_vcf(
         msg_error = f"Assertion error for resource '{resource}' for action 'gvcf_to_vcf'."
         actual = variant_export_external_workflow.get_resource(
             "varfish_annotator_external", "gvcf_to_vcf", resource
-        )
+        )()
         assert actual == expected, msg_error
@@ -408,7 +440,7 @@ def test_varfish_annotator_step_part_get_resource_usage_merge_vcf(variant_export
         msg_error = f"Assertion error for resource '{resource}' for action 'merge_vcf'."
         actual = variant_export_external_workflow.get_resource(
             "varfish_annotator_external", "merge_vcf", resource
-        )
+        )()
         assert actual == expected, msg_error
@@ -507,7 +539,7 @@ def test_varfish_annotator_step_part_get_resource_usage_annotate(variant_export_
         msg_error = f"Assertion error for resource '{resource}' for action 'annotate'."
         actual = variant_export_external_workflow.get_resource(
             "varfish_annotator_external", "annotate", resource
-        )
+        )()
         assert actual == expected, msg_error
@@ -603,7 +635,7 @@ def test_varfish_annotator_step_part_get_resource_usage_bam_qc(variant_export_ex
         msg_error = f"Assertion error for resource '{resource}' for action 'bam_qc'."
         actual = variant_export_external_workflow.get_resource(
             "varfish_annotator_external", "bam_qc", resource
-        )
+        )()
         assert actual == expected, msg_error
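Note on the `search_patterns` hunks above (the same change recurs for the wgs_cnv and wgs_sv export steps further down): when a field expects a list of mappings but receives a flat list of two strings, pydantic v2 tries to coerce each element into a dict and reports one `dict_type` error per element. A minimal, self-contained sketch of that behaviour follows; the `SearchPatternsModel` name and field are illustrative stand-ins, not snappy_pipeline's actual configuration model.

# Illustrative sketch only -- model and field names are assumptions.
from pydantic import BaseModel, ValidationError


class SearchPatternsModel(BaseModel):
    search_patterns: list[dict[str, str]]


try:
    # A flat list of strings instead of a list of key/value mappings:
    SearchPatternsModel(search_patterns=["vcf", "*/*.vcf.gz"])
except ValidationError as exc:
    errors = exc.errors()
    # One "dict_type" error per list element that could not be parsed as a dict.
    assert len(errors) == 2
    assert all(err["type"] == "dict_type" for err in errors)
    assert errors[0]["loc"] == ("search_patterns", 0)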
diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_filtration.py b/tests/snappy_pipeline/workflows/test_workflows_variant_filtration.py
index c267ab668..4008d1473 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_variant_filtration.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_variant_filtration.py
@@ -37,14 +37,13 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
 
           variant_calling:
             tools:
               - gatk3_hc
+            gatk3_hc: {}
 
           variant_filtration:
             path_variant_annotation: ../variant_annotation
@@ -53,7 +52,7 @@
             # Testing 1 out 40+ possible combinations:
             # {thresholds}.{inherit}.{freq}.{region}.{score}.{het_comp}
             filter_combinations:
-              - conservative.dominant.dominant_freq.all_genes.coding.passthrough
+              - conservative.dominant.dominant_freq.whole_genome.coding.passthrough
 
         data_sets:
           first_batch:
@@ -158,7 +157,7 @@ def test_filter_quality_step_part_get_resource_usage(variant_filtration_workflow
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = variant_filtration_workflow.get_resource("filter_quality", "run", resource)
+        actual = variant_filtration_workflow.get_resource("filter_quality", "run", resource)()
         assert actual == expected, msg_error
@@ -218,7 +217,7 @@ def test_filter_inheritance_step_part_get_resource_usage(variant_filtration_work
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = variant_filtration_workflow.get_resource("filter_inheritance", "run", resource)
+        actual = variant_filtration_workflow.get_resource("filter_inheritance", "run", resource)()
         assert actual == expected, msg_error
@@ -280,7 +279,7 @@ def test_filter_frequency_step_part_get_resource_usage(variant_filtration_workfl
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = variant_filtration_workflow.get_resource("filter_frequency", "run", resource)
+        actual = variant_filtration_workflow.get_resource("filter_frequency", "run", resource)()
         assert actual == expected, msg_error
@@ -345,7 +344,7 @@ def test_filter_regions_step_part_get_resource_usage(variant_filtration_workflow
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = variant_filtration_workflow.get_resource("filter_regions", "run", resource)
+        actual = variant_filtration_workflow.get_resource("filter_regions", "run", resource)()
         assert actual == expected, msg_error
@@ -362,14 +361,14 @@ def test_filter_scores_step_part_get_input_files(variant_filtration_workflow):
             "thresholds": "conservative",
             "inheritance": "dominant",
             "frequency": "af_dominant",
-            "regions": "all_genes",
+            "regions": "whole_genome",
         }
     )
     # Define expected
     base_name = (
         "work/bwa.gatk3_hc.jannovar_annotate_vcf.filtered.P001-N1-DNA1-WGS1.conservative."
-        "dominant.af_dominant.all_genes/out/bwa.gatk3_hc.jannovar_annotate_vcf.filtered."
-        "P001-N1-DNA1-WGS1.conservative.dominant.af_dominant.all_genes"
+        "dominant.af_dominant.whole_genome/out/bwa.gatk3_hc.jannovar_annotate_vcf.filtered."
+        "P001-N1-DNA1-WGS1.conservative.dominant.af_dominant.whole_genome"
     )
     pedigree_dict = {"ped": "/work/write_pedigree.P001-N1-DNA1-WGS1/out/P001-N1-DNA1-WGS1.ped"}
     var_filtration_dict = get_expected_output_vcf_files_dict(base_out=base_name)
@@ -411,7 +410,7 @@ def test_filter_scores_step_part_get_resource_usage(variant_filtration_workflow)
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = variant_filtration_workflow.get_resource("filter_scores", "run", resource)
+        actual = variant_filtration_workflow.get_resource("filter_scores", "run", resource)()
         assert actual == expected, msg_error
@@ -428,15 +427,15 @@ def test_filter_het_comp_step_part_get_input_files(variant_filtration_workflow):
             "thresholds": "conservative",
             "inheritance": "dominant",
             "frequency": "af_dominant",
-            "regions": "all_genes",
+            "regions": "whole_genome",
             "scores": "coding",
         }
     )
     # Define expected
     base_name = (
         "work/bwa.gatk3_hc.jannovar_annotate_vcf.filtered.P001-N1-DNA1-WGS1.conservative."
-        "dominant.af_dominant.all_genes.coding/out/bwa.gatk3_hc.jannovar_annotate_vcf."
-        "filtered.P001-N1-DNA1-WGS1.conservative.dominant.af_dominant.all_genes.coding"
+        "dominant.af_dominant.whole_genome.coding/out/bwa.gatk3_hc.jannovar_annotate_vcf."
+        "filtered.P001-N1-DNA1-WGS1.conservative.dominant.af_dominant.whole_genome.coding"
     )
     pedigree_dict = {"ped": "/work/write_pedigree.P001-N1-DNA1-WGS1/out/P001-N1-DNA1-WGS1.ped"}
     var_filtration_dict = get_expected_output_vcf_files_dict(base_out=base_name)
@@ -480,7 +479,7 @@ def test_filter_het_comp_step_part_get_resource_usage(variant_filtration_workflo
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = variant_filtration_workflow.get_resource("filter_het_comp", "run", resource)
+        actual = variant_filtration_workflow.get_resource("filter_het_comp", "run", resource)()
         assert actual == expected, msg_error
@@ -506,9 +505,9 @@ def test_variant_filtration_workflow(variant_filtration_workflow):
     # Check result file construction
     tpl = (
         "output/bwa.gatk3_hc.jannovar_annotate_vcf.filtered.P00{i}-N1-DNA1-WGS1.conservative."
-        "dominant.dominant_freq.all_genes.coding.passthrough/out/"
+        "dominant.dominant_freq.whole_genome.coding.passthrough/out/"
         "bwa.gatk3_hc.jannovar_annotate_vcf.filtered.P00{i}-N1-DNA1-WGS1.conservative."
-        "dominant.dominant_freq.all_genes.coding.passthrough.{ext}"
+        "dominant.dominant_freq.whole_genome.coding.passthrough.{ext}"
     )
     expected = [
         tpl.format(i=i, ext=ext)
diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_phasing.py b/tests/snappy_pipeline/workflows/test_workflows_variant_phasing.py
index a6a9ab8f7..b98e3b7ca 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_variant_phasing.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_variant_phasing.py
@@ -33,18 +33,20 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
 
           variant_calling:
             tools:
-            - bcftools_call
-            - gatk3_hc
-            - gatk3_ug
+              - bcftools_call
+              - gatk3_hc
+              - gatk3_ug
+            bcftools_call: {}
+            gatk3_hc: {}
+            gatk3_ug: {}
           variant_annotation:
-            path_jannovar_ser: /path/to/jannovar.ser
+            tools: ['vep']
+            vep: {}
 
           variant_phasing:
             path_variant_annotation: ../variant_annotation
@@ -69,6 +71,7 @@ def variant_phasing_workflow(
     work_dir,
     config_paths,
     germline_sheet_fake_fs,
+    aligner_indices_fake_fs,
     mocker,
 ):
     """Return VariantPhasingWorkflow object pre-configured with germline sheet"""
@@ -79,6 +82,7 @@
         create_missing_dirs=True,
     )
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
+    patch_module_fs("snappy_pipeline.workflows.ngs_mapping", aligner_indices_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.variant_phasing", germline_sheet_fake_fs, mocker)
     # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we
     # can obtain paths from the function as if we really had a NGSMappingPipelineStep there
@@ -161,7 +165,7 @@ def test_write_trio_pedigree_step_part_get_resource_usage(variant_phasing_workfl
     # Evaluate
     for resource, expected in expected_dict.items():
         msg_error = f"Assertion error for resource '{resource}'."
-        actual = variant_phasing_workflow.get_resource("write_trio_pedigree", "run", resource)
+        actual = variant_phasing_workflow.get_resource("write_trio_pedigree", "run", resource)()
         assert actual == expected, msg_error
@@ -223,7 +227,7 @@ def test_gatk_phase_by_transmission_step_part_get_resource_usage(variant_phasing
         msg_error = f"Assertion error for resource '{resource}'."
         actual = variant_phasing_workflow.get_resource(
             "gatk_phase_by_transmission", "run", resource
-        )
+        )()
         assert actual == expected, msg_error
@@ -285,7 +289,7 @@ def test_gatk_read_backed_phasing_only_step_part_get_resource_usage(variant_phas
         msg_error = f"Assertion error for resource '{resource}'."
         actual = variant_phasing_workflow.get_resource(
             "gatk_read_backed_phasing_only", "run", resource
-        )
+        )()
         assert actual == expected, msg_error
@@ -347,7 +351,7 @@ def test_gatk_read_backed_phasing_also_step_part_get_resource_usage(variant_phas
         msg_error = f"Assertion error for resource '{resource}'."
         actual = variant_phasing_workflow.get_resource(
             "gatk_read_backed_phasing_also", "run", resource
-        )
+        )()
         assert actual == expected, msg_error
diff --git a/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_export_external.py b/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_export_external.py
index 284d289dc..4301ff915 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_export_external.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_wgs_cnv_export_external.py
@@ -3,6 +3,7 @@
 from copy import deepcopy
 import textwrap
 
+from pydantic import ValidationError
 import pytest
 import ruamel.yaml as ruamel_yaml
 from snakemake.io import Wildcards
@@ -72,6 +73,7 @@ def wgs_cnv_export_external_workflow(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_cnv_export_external", germline_sheet_fake_fs, mocker
@@ -99,12 +101,13 @@ def test_workflow_check_config_invalid_annotator_files(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_cnv_export_external", germline_sheet_fake_fs, mocker
     )
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         WgsCnvExportExternalWorkflow(
             dummy_workflow,
             minimal_config,
@@ -112,9 +115,15 @@
             config_paths,
             work_dir,
         )
-    assert "path_refseq_ser" in exec_info.value.args[0]
-    assert "path_ensembl_ser" in exec_info.value.args[0]
-    assert "path_db" in exec_info.value.args[0]
+    errors = exec_info.value.errors()
+    assert len(errors) == 3
+    assert (
+        len(
+            {"path_refseq_ser", "path_ensembl_ser", "path_db"}
+            & set(s for e in errors for s in e["loc"])
+        )
+        == 3
+    )
 
 
 def test_workflow_check_config_invalid_search_directory(
@@ -135,12 +144,13 @@
         create_missing_dirs=True,
     )
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_cnv_export_external", germline_sheet_fake_fs, mocker
     )
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         WgsCnvExportExternalWorkflow(
             dummy_workflow,
             minimal_config,
@@ -148,7 +158,11 @@
             config_paths,
             work_dir,
         )
-    assert " is not a directory: /search_path" in exec_info.value.args[0]
+
+    errors = exec_info.value.errors()
+    assert len(errors) == 1
+
+    assert "path_not_directory" in {e["type"] for e in errors}
 
 
 def test_workflow_check_config_invalid_search_pattern(
@@ -171,6 +185,7 @@
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_cnv_export_external", germline_sheet_fake_fs, mocker
@@ -182,7 +197,7 @@
         "*/*.vcf.gz",
     ]
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         WgsCnvExportExternalWorkflow(
             dummy_workflow,
             modified_config,
@@ -190,7 +205,25 @@
             config_paths,
             work_dir,
         )
-    assert "Value in 'search_patterns' is not a dictionary" in exec_info.value.args[0]
+
+    errors = exec_info.value.errors()
+
+    # there is 1 incorrectly defined search_patterns entry
+    # which is incorrectly defined as a list instead of a dict/key-value pairs,
+    # so pydantic tries to parse *2* dicts from the list and fails
+    assert len(errors) == 2
+
+    expected_errors = [
+        {
+            "input": input_str,
+            "loc": ("step_config", "wgs_cnv_export_external", "search_patterns", i),
+            "msg": "Input should be a valid dictionary",
+            "type": "dict_type",
+            "url": "https://errors.pydantic.dev/2.7/v/dict_type",
+        }
+        for i, input_str in enumerate(["vcf", "*/*.vcf.gz"])
+    ]
+    assert expected_errors == errors
 
 
 # Tests for VarfishAnnotatorExternalStepPart (merge_vcf) -----------------------------------------
@@ -256,7 +289,7 @@ def test_varfish_annotator_step_part_get_resource_usage_merge_vcf(wgs_cnv_export
         msg_error = f"Assertion error for resource '{resource}' for action 'merge_vcf'."
         actual = wgs_cnv_export_external_workflow.get_resource(
             "varfish_annotator_external", "merge_vcf", resource
-        )
+        )()
         assert actual == expected, msg_error
@@ -332,7 +365,7 @@ def test_varfish_annotator_step_part_get_resource_usage_annotate(wgs_cnv_export_
         msg_error = f"Assertion error for resource '{resource}' for action 'annotate'."
         actual = wgs_cnv_export_external_workflow.get_resource(
             "varfish_annotator_external", "annotate", resource
-        )
+        )()
         assert actual == expected, msg_error
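Note on the `path_not_directory` assertions in the search-directory tests above: they match on a pydantic error *type* rather than on a message substring. A hedged sketch of how such a type can surface in `ValidationError.errors()` follows; the model, field, and validator are illustrative, and snappy_pipeline's real configuration model may be implemented differently.

# Illustrative sketch only -- the real configuration model may differ.
import os

from pydantic import BaseModel, ValidationError, field_validator
from pydantic_core import PydanticCustomError


class SearchPathsModel(BaseModel):
    search_paths: list[str]

    @field_validator("search_paths")
    @classmethod
    def paths_must_be_directories(cls, values: list[str]) -> list[str]:
        for value in values:
            if not os.path.isdir(value):
                # The first argument becomes errors()[i]["type"].
                raise PydanticCustomError(
                    "path_not_directory", "not a directory: {path}", {"path": value}
                )
        return values


try:
    SearchPathsModel(search_paths=["/does/not/exist"])
except ValidationError as exc:
    assert "path_not_directory" in {err["type"] for err in exc.errors()}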
diff --git a/tests/snappy_pipeline/workflows/test_workflows_wgs_sv_export_external.py b/tests/snappy_pipeline/workflows/test_workflows_wgs_sv_export_external.py
index 149c28901..42f1f126f 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_wgs_sv_export_external.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_wgs_sv_export_external.py
@@ -3,11 +3,11 @@
 from copy import deepcopy
 import textwrap
 
+from pydantic import ValidationError
 import pytest
 import ruamel.yaml as ruamel_yaml
 from snakemake.io import Wildcards
 
-from snappy_pipeline.base import MissingConfiguration
 from snappy_pipeline.workflows.wgs_sv_export_external import WgsSvExportExternalWorkflow
 
 from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict
@@ -72,6 +72,7 @@ def wgs_sv_export_external_workflow(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_sv_export_external", germline_sheet_fake_fs, mocker
@@ -99,12 +100,13 @@ def test_workflow_check_config_invalid_annotator_files(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_sv_export_external", germline_sheet_fake_fs, mocker
     )
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         WgsSvExportExternalWorkflow(
             dummy_workflow,
             minimal_config,
@@ -112,9 +114,15 @@
             config_paths,
             work_dir,
         )
-    assert "path_refseq_ser" in exec_info.value.args[0]
-    assert "path_ensembl_ser" in exec_info.value.args[0]
-    assert "path_db" in exec_info.value.args[0]
+    errors = exec_info.value.errors()
+    assert len(errors) == 3
+    assert (
+        len(
+            {"path_refseq_ser", "path_ensembl_ser", "path_db"}
+            & set(s for e in errors for s in e["loc"])
+        )
+        == 3
+    )
 
 
 def test_workflow_check_config_invalid_search_directory(
@@ -135,12 +143,13 @@
         create_missing_dirs=True,
     )
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_sv_export_external", germline_sheet_fake_fs, mocker
     )
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         WgsSvExportExternalWorkflow(
             dummy_workflow,
             minimal_config,
@@ -148,7 +157,11 @@
             config_paths,
             work_dir,
         )
-    assert " is not a directory: /search_path" in exec_info.value.args[0]
+
+    errors = exec_info.value.errors()
+    assert len(errors) == 1
+
+    assert "path_not_directory" in {e["type"] for e in errors}
 
 
 def test_workflow_check_config_invalid_search_pattern(
@@ -171,6 +184,7 @@ def test_workflow_check_config_invalid_search_pattern(
     # Create search path
     germline_sheet_fake_fs.fs.makedirs("/search_path")
     # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("pathlib", germline_sheet_fake_fs, mocker)
     patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker)
     patch_module_fs(
         "snappy_pipeline.workflows.wgs_sv_export_external", germline_sheet_fake_fs, mocker
@@ -182,7 +196,7 @@
         "*/*.vcf.gz",
     ]
     # Construct the workflow object
-    with pytest.raises(MissingConfiguration) as exec_info:
+    with pytest.raises(ValidationError) as exec_info:
         WgsSvExportExternalWorkflow(
             dummy_workflow,
             modified_config,
@@ -190,7 +204,25 @@
             config_paths,
             work_dir,
         )
-    assert "Value in 'search_patterns' is not a dictionary" in exec_info.value.args[0]
+
+    errors = exec_info.value.errors()
+
+    # there is 1 incorrectly defined search_patterns entry
+    # which is incorrectly defined as a list instead of a dict/key-value pairs,
+    # so pydantic tries to parse *2* dicts from the list and fails
+    assert len(errors) == 2
+
+    expected_errors = [
+        {
+            "input": input_str,
+            "loc": ("step_config", "wgs_sv_export_external", "search_patterns", i),
+            "msg": "Input should be a valid dictionary",
+            "type": "dict_type",
+            "url": "https://errors.pydantic.dev/2.7/v/dict_type",
+        }
+        for i, input_str in enumerate(["vcf", "*/*.vcf.gz"])
+    ]
+    assert expected_errors == errors
 
 
 # Tests for VarfishAnnotatorExternalStepPart (merge_vcf) -----------------------------------------
@@ -254,7 +286,7 @@ def test_varfish_annotator_step_part_get_resource_usage_merge_vcf(wgs_sv_export_
         msg_error = f"Assertion error for resource '{resource}' for action 'merge_vcf'."
         actual = wgs_sv_export_external_workflow.get_resource(
             "varfish_annotator_external", "merge_vcf", resource
-        )
+        )()
         assert actual == expected, msg_error
@@ -332,7 +364,7 @@ def test_varfish_annotator_step_part_get_resource_usage_annotate(wgs_sv_export_e
         msg_error = f"Assertion error for resource '{resource}' for action 'annotate'."
         actual = wgs_sv_export_external_workflow.get_resource(
             "varfish_annotator_external", "annotate", resource
-        )
+        )()
         assert actual == expected, msg_error
diff --git a/tests/snappy_wrappers/wrappers/conftest.py b/tests/snappy_wrappers/wrappers/conftest.py
index 913a5b514..719cc17a0 100644
--- a/tests/snappy_wrappers/wrappers/conftest.py
+++ b/tests/snappy_wrappers/wrappers/conftest.py
@@ -9,7 +9,6 @@
 from pyfakefs import fake_filesystem
 import pytest
-import yaml
 
 FORCE_RUN = os.environ.get("FORCE_RUN", "false") == "true"
 DIFF_MASTER = os.environ.get("DIFF_MASTER", "false") == "true"
diff --git a/tests/snappy_wrappers/wrappers/test_eb_filter_par.py b/tests/snappy_wrappers/wrappers/test_eb_filter_par.py
index af00f9463..d83dc29be 100644
--- a/tests/snappy_wrappers/wrappers/test_eb_filter_par.py
+++ b/tests/snappy_wrappers/wrappers/test_eb_filter_par.py
@@ -31,8 +31,6 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
diff --git a/tests/snappy_wrappers/wrappers/test_gatk_hc_par_run.py b/tests/snappy_wrappers/wrappers/test_gatk_hc_par_run.py
index aa3e02e03..099e22281 100644
--- a/tests/snappy_wrappers/wrappers/test_gatk_hc_par_run.py
+++ b/tests/snappy_wrappers/wrappers/test_gatk_hc_par_run.py
@@ -28,8 +28,6 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
diff --git a/tests/snappy_wrappers/wrappers/test_gatk_read_backed_phasing_par.py b/tests/snappy_wrappers/wrappers/test_gatk_read_backed_phasing_par.py
index 0be6676d5..17251f2c0 100644
--- a/tests/snappy_wrappers/wrappers/test_gatk_read_backed_phasing_par.py
+++ b/tests/snappy_wrappers/wrappers/test_gatk_read_backed_phasing_par.py
@@ -30,8 +30,6 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
diff --git a/tests/snappy_wrappers/wrappers/test_gatk_ug_par.py b/tests/snappy_wrappers/wrappers/test_gatk_ug_par.py
index 69befe5bf..885adc79e 100644
--- a/tests/snappy_wrappers/wrappers/test_gatk_ug_par.py
+++ b/tests/snappy_wrappers/wrappers/test_gatk_ug_par.py
@@ -28,8 +28,6 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
diff --git a/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_somatic_vcf.py b/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_somatic_vcf.py
index 9a8d1264b..489a1771c 100644
--- a/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_somatic_vcf.py
+++ b/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_somatic_vcf.py
@@ -30,8 +30,6 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
diff --git a/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_vcf.py b/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_vcf.py
index c8ed657a7..7e7b18490 100644
--- a/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_vcf.py
+++ b/tests/snappy_wrappers/wrappers/test_jannovar_par_annotate_vcf.py
@@ -30,8 +30,6 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
diff --git a/tests/snappy_wrappers/wrappers/test_mutect2_par_prepare_panel.py b/tests/snappy_wrappers/wrappers/test_mutect2_par_prepare_panel.py
index fb2b864cb..a2bd9fb96 100644
--- a/tests/snappy_wrappers/wrappers/test_mutect2_par_prepare_panel.py
+++ b/tests/snappy_wrappers/wrappers/test_mutect2_par_prepare_panel.py
@@ -17,7 +17,7 @@
     ParallelMutect2Wrapper,
 )
 
-from .conftest import mock_settings_env_vars, patch_module_fs
+from .conftest import patch_module_fs
 
 
 @pytest.fixture(scope="module")  # otherwise: performance issues
@@ -38,8 +38,6 @@ def minimal_config():
           ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
diff --git a/tests/snappy_wrappers/wrappers/test_mutect2_par_run.py b/tests/snappy_wrappers/wrappers/test_mutect2_par_run.py
index 8c4eec0c3..504e7026e 100644
--- a/tests/snappy_wrappers/wrappers/test_mutect2_par_run.py
+++ b/tests/snappy_wrappers/wrappers/test_mutect2_par_run.py
@@ -36,8 +36,6 @@ def minimal_config():
          ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
           somatic_variant_calling:
diff --git a/tests/snappy_wrappers/wrappers/test_mutect_par_run.py b/tests/snappy_wrappers/wrappers/test_mutect_par_run.py
index d41a84acd..84cceaa31 100644
--- a/tests/snappy_wrappers/wrappers/test_mutect_par_run.py
+++ b/tests/snappy_wrappers/wrappers/test_mutect_par_run.py
@@ -31,8 +31,6 @@ def minimal_config():
          ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
           somatic_variant_calling:
diff --git a/tests/snappy_wrappers/wrappers/test_varscan_par_call_joint.py b/tests/snappy_wrappers/wrappers/test_varscan_par_call_joint.py
index f9c3b963f..92ff41783 100644
--- a/tests/snappy_wrappers/wrappers/test_varscan_par_call_joint.py
+++ b/tests/snappy_wrappers/wrappers/test_varscan_par_call_joint.py
@@ -30,8 +30,6 @@ def minimal_config():
          ngs_mapping:
             tools:
               dna: ['bwa']
-            compute_coverage_bed: true
-            path_target_regions: /path/to/regions.bed
             bwa:
               path_index: /path/to/bwa/index.fa
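A recurring mechanical change throughout the test hunks above is the trailing `()` appended to `get_resource(...)`: the tests now treat the return value as a callable to be invoked rather than as the resource value itself. This reading is inferred from the diff; the actual snappy_pipeline signature is not shown here. A small stub illustrating just the asserted call shape, with invented step, action, resource names and values:

# Illustrative stub only -- step names, resource names, and values are invented.
from typing import Callable


class StubWorkflow:
    """Mimics the call shape used in the updated tests."""

    def get_resource(self, step: str, action: str, resource: str) -> Callable[[], str]:
        values = {("filter_quality", "run", "memory"): "14336M"}

        def resolve() -> str:
            # Deferred lookup: the value is only produced when the callable is invoked.
            return values[(step, action, resource)]

        return resolve


workflow = StubWorkflow()
# Note the extra () at the end -- the same pattern applied across the test files above.
assert workflow.get_resource("filter_quality", "run", "memory")() == "14336M"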