From d6b2c4df936dcc5ddb0b50dfb4fa93d36874c50b Mon Sep 17 00:00:00 2001
From: Till Hartmann
Date: Fri, 5 Jul 2024 14:57:27 +0200
Subject: [PATCH] add initial layout and config for automatically retrieving (and caching) references (and annotations)

---
Review note: the file contents below were amended during review (missing
imports, dict iteration, invalid default config, copy-pasted step names), so
the blob ids on the `index` lines no longer match the contents and are kept
only as placeholders.

 snappy_pipeline/workflows/reference/Snakefile |  43 +++++++
 .../workflows/reference/__init__.py           | 111 ++++++++++++++++++
 snappy_pipeline/workflows/reference/model.py  |  86 ++++++++++++++
 3 files changed, 240 insertions(+)
 create mode 100644 snappy_pipeline/workflows/reference/Snakefile
 create mode 100644 snappy_pipeline/workflows/reference/__init__.py
 create mode 100644 snappy_pipeline/workflows/reference/model.py

diff --git a/snappy_pipeline/workflows/reference/Snakefile b/snappy_pipeline/workflows/reference/Snakefile
new file mode 100644
index 000000000..c53f36e93
--- /dev/null
+++ b/snappy_pipeline/workflows/reference/Snakefile
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+"""CUBI Pipeline reference step Snakefile"""
+
+import os
+
+from snappy_pipeline import expand_ref
+from snappy_pipeline.workflows.reference import ReferenceWorkflow
+
+__author__ = "Till Hartmann "
+
+
+# Configuration ===============================================================
+
+
+configfile: "config.yaml"
+
+
+# Expand "$ref" JSON pointers in configuration (also works for YAML)
+config, lookup_paths, config_paths = expand_ref("config.yaml", config)
+
+# WorkflowImpl Object Setup ===================================================
+
+wf = ReferenceWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())
+
+# Rules =======================================================================
+
+
+rule reference_all:
+    input:
+        wf.get_result_files(),
+    default_target: True
+
+
+rule reference_retrieve_fasta_run:
+    output:
+        fasta=protected("work/reference/{reference}/reference.fasta"),
+    params:
+        reference=lambda wildcards: wildcards.reference,
+    cache: "omit-software"
+    shell:
+        """
+        touch {output.fasta}
+        """
diff --git a/snappy_pipeline/workflows/reference/__init__.py b/snappy_pipeline/workflows/reference/__init__.py
new file mode 100644
index 000000000..ff6d67204
--- /dev/null
+++ b/snappy_pipeline/workflows/reference/__init__.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+"""Implementation of the ``reference`` step
+
+=====================
+Default Configuration
+=====================
+
+The default configuration is as follows.
+
+.. include:: DEFAULT_CONFIG_reference.rst
+
+"""
+
+from biomedsheets.shortcuts import GenericSampleSheet
+
+from snappy_pipeline.utils import dictify, listify
+from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkInStep
+
+from .model import ReferenceModel as ReferenceConfigModel
+
+#: Default configuration for the reference
+DEFAULT_CONFIG = ReferenceConfigModel.default_config_yaml_string()
+
+
+class ReferenceStepPart(BaseStepPart):
+    """Reference retrieval common features"""
+
+    #: Step name
+    name = ""
+
+    #: Class available actions
+    actions = ("run",)
+
+    def __init__(self, parent):
+        super().__init__(parent)
+        # Same ``reference_name`` wildcard as in the log paths of ``_get_log_file``.
+        self.base_path_out = "work/{source}.{{reference_name}}"
+
+    @dictify
+    def get_output_files(self, action):
+        """Return mapping with the ``out_done`` marker path for ``action``"""
+        # Validate action
+        self._validate_action(action)
+        return (("out_done", self.base_path_out.format(source=self.name) + "/out/.done"),)
+
+    @dictify
+    def _get_log_file(self, action):
+        """Return dict of log files.
+
+        Yields the ``done`` marker plus, for each of ``log``, ``conda_info`` and
+        ``conda_list``, the file path and its ``.md5`` companion.
+        """
+        # Validate action
+        self._validate_action(action)
+        prefix = "work/{source}/log/{source}.{{reference_name}}".format(source=self.name)
+        key_ext = (
+            ("log", ".log"),
+            ("conda_info", ".conda_info.txt"),
+            ("conda_list", ".conda_list.txt"),
+        )
+        yield (
+            "done",
+            "work/{source}.{{reference_name}}/log/.done".format(source=self.name),
+        )
+        for key, ext in key_ext:
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
+    def get_args(self, action):
+        """Return function that maps wildcards to dict for input files"""
+
+        def args_function(wildcards):
+            return {}
+
+        # Validate action
+        self._validate_action(action)
+        return args_function
+
+
+class ReferenceWorkflow(BaseStep):
+    """Automatically retrieve reference data"""
+
+    #: Step name
+    name = "reference"
+
+    #: Default biomed sheet class
+    sheet_shortcut_class = GenericSampleSheet
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, config_model_class=ReferenceConfigModel)
+        self.register_sub_step_classes((LinkInStep,))
+
+    @classmethod
+    def default_config_yaml(cls):
+        """Return default config YAML, to be overwritten by project-specific one"""
+        return DEFAULT_CONFIG
+
+    @listify
+    def get_result_files(self):
+        """Return list of result files for the reference workflow
+
+        One ``.done`` marker is requested per entry of the ``references``
+        configuration mapping.
+        """
+        # NOTE(review): no rule in this step's Snakefile produces these
+        # ``output/...`` paths yet -- confirm the intended layout.
+        tpls = ("output/{source}/{reference_name}/out/.done",)
+        # ``references`` maps name -> reference; iterate the items, not the keys.
+        for name, reference in self.config["references"].items():
+            for tpl in tpls:
+                yield tpl.format(source=reference.source, reference_name=name)
diff --git a/snappy_pipeline/workflows/reference/model.py b/snappy_pipeline/workflows/reference/model.py
new file mode 100644
index 000000000..30d2c89d0
--- /dev/null
+++ b/snappy_pipeline/workflows/reference/model.py
@@ -0,0 +1,86 @@
+from enum import StrEnum
+from typing import Annotated
+
+from pydantic import Field
+
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel
+
+
+class Source(StrEnum):
+    Ensembl = "Ensembl"
+    NCBI = "NCBI"
+    Custom = "Custom"
+
+
+class DataType(StrEnum):
+    dna = "dna"
+    cds = "cds"
+    cdna = "cdna"
+    ncrna = "ncrna"
+    pep = "pep"
+
+
+class Region(SnappyModel):
+    name: str
+    start: int | None
+    end: int | None
+
+
+class Annotation(SnappyModel):
+    reference: list[str] | None = None
+
+
+class Reference(SnappyModel):
+    description: str
+    """Description of the reference."""
+
+    source: Annotated[Source, EnumField(Source)]
+    """Source of the reference."""
+
+    custom_url: str | None = Field(
+        None, examples=["file:///path/to/reference.fa", "http://example.com/reference.fa"]
+    )
+    """URL to custom reference. Only used when source is 'Custom'."""
+
+    species: str = Field(examples=["Homo Sapiens"])
+    """Species name."""
+
+    taxon_id: str | int = Field(examples=[9606])
+    """Taxon ID."""
+
+    datatype: Annotated[DataType, EnumField(DataType)]
+    """Data type of the reference."""
+
+    release: str | int = Field(examples=[112])
+    """Release of the reference."""
+
+    build: str | None = Field(None, examples=["GRCh37", "GRCh38"])
+    """Build of the reference."""
+
+    branch: str | None = Field(None, examples=["grch37"])
+    """Branch of the reference."""
+
+    exclude_contigs: str | None = None
+    """Regular expression to exclude contigs with"""
+
+    regions: list[Region] | None = None
+    """Regions of the reference."""
+
+    additional_sequences: list[str] | None = None
+    """List of local fasta files to add to the reference"""
+
+    annotations: dict[str, Annotation] = {}
+
+
+class ReferenceModel(SnappyStepModel):
+    references: dict[str, Reference] = {
+        "GRCh38-foo": Reference(
+            # ``description`` has no default, so the example entry must set it.
+            description="Example reference: Homo Sapiens DNA from Ensembl release 112",
+            source="Ensembl",
+            species="Homo Sapiens",
+            taxon_id=9606,
+            datatype="dna",
+            release=112,
+        )
+    }