From d6b2c4df936dcc5ddb0b50dfb4fa93d36874c50b Mon Sep 17 00:00:00 2001
From: Till Hartmann <till.hartmann@bih-charite.de>
Date: Fri, 5 Jul 2024 14:57:27 +0200
Subject: [PATCH] add initial layout and config for automatically retrieving
 (and caching) references (and annotations)

---
 snappy_pipeline/workflows/reference/Snakefile | 43 ++++++++
 .../workflows/reference/__init__.py           | 97 +++++++++++++++++++
 snappy_pipeline/workflows/reference/model.py  | 80 +++++++++++++++
 3 files changed, 220 insertions(+)
 create mode 100644 snappy_pipeline/workflows/reference/Snakefile
 create mode 100644 snappy_pipeline/workflows/reference/__init__.py
 create mode 100644 snappy_pipeline/workflows/reference/model.py

diff --git a/snappy_pipeline/workflows/reference/Snakefile b/snappy_pipeline/workflows/reference/Snakefile
new file mode 100644
index 000000000..c53f36e93
--- /dev/null
+++ b/snappy_pipeline/workflows/reference/Snakefile
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+"""CUBI Pipeline reference step Snakefile"""
+
+import os
+
+from snappy_pipeline import expand_ref
+from snappy_pipeline.workflows.reference import ReferenceWorkflow
+
+__author__ = "Till Hartmann <till.hartmann@bih-charite.de>"
+
+
+# Configuration ===============================================================
+
+
+configfile: "config.yaml"
+
+
+# Expand "$ref" JSON pointers in configuration (also works for YAML)
+config, lookup_paths, config_paths = expand_ref("config.yaml", config)
+
+# WorkflowImpl Object Setup ===================================================
+
+wf = ReferenceWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())
+
+# Rules =======================================================================
+
+
+rule reference_all:
+    input:
+        wf.get_result_files(),  # NOTE(review): no rule in this file produces these paths yet
+    default_target: True  # built when Snakemake is invoked without an explicit target
+
+
+rule reference_retrieve_fasta_run:
+    output:
+        fasta=protected("work/reference/{reference}/reference.fasta"),  # write-protected result
+    params:
+        reference=lambda wildcards: wildcards.reference,  # unused so far: shell only touches output
+    cache: "omit-software"
+    shell:
+        """
+        touch {output.fasta}
+        """
diff --git a/snappy_pipeline/workflows/reference/__init__.py b/snappy_pipeline/workflows/reference/__init__.py
new file mode 100644
index 000000000..ff6d67204
--- /dev/null
+++ b/snappy_pipeline/workflows/reference/__init__.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+"""Implementation of the ``reference`` step
+
+=====================
+Default Configuration
+=====================
+
+The default configuration is as follows.
+
+.. include:: DEFAULT_CONFIG_reference.rst
+
+"""
+from biomedsheets.shortcuts import GenericSampleSheet
+from snappy_pipeline.utils import dictify, listify
+from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkInStep
+from .model import ReferenceModel as ReferenceConfigModel
+
+#: Default configuration for the reference
+DEFAULT_CONFIG = ReferenceConfigModel.default_config_yaml_string()
+
+
+class ReferenceStepPart(BaseStepPart):
+    """Common base for the sub steps that retrieve reference data"""
+
+    #: Step name; substituted for ``{source}`` in all path templates below
+    name = ""
+
+    #: Class available actions
+    actions = ("run",)
+
+    def __init__(self, parent):
+        super().__init__(parent)
+        # Use the same wildcard as the log files; Snakemake wants them to agree per rule.
+        self.base_path_out = "work/{source}.{{reference_name}}"
+
+    @dictify
+    def get_output_files(self, action):
+        """Return dict with the ``out_done`` marker path below ``base_path_out``"""
+        self._validate_action(action)
+        return (("out_done", self.base_path_out.format(source=self.name) + "/out/.done"),)
+
+    @dictify
+    def _get_log_file(self, action):
+        """Return dict of log files: a ``done`` marker plus log/conda files and their MD5s"""
+        self._validate_action(action)
+        # NOTE(review): the marker is placed in ``work/<name>.<ref>/log`` but the other
+        # files in ``work/<name>/log`` -- confirm that this split is intended.
+        prefix = "work/{source}/log/{source}.{{reference_name}}".format(source=self.name)
+        key_ext = (
+            ("log", ".log"),
+            ("conda_info", ".conda_info.txt"),
+            ("conda_list", ".conda_list.txt"),
+        )
+        yield (
+            "done",
+            "work/{source}.{{reference_name}}/log/.done".format(source=self.name),
+        )
+        for key, ext in key_ext:
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
+    def get_args(self, action):
+        """Return function that maps wildcards to the (currently empty) dict of arguments"""
+
+        def args_function(wildcards):
+            return {}
+
+        # Validate action
+        self._validate_action(action)
+        return args_function
+
+
+class ReferenceWorkflow(BaseStep):
+    """Step that automatically retrieves reference data, configured via ``ReferenceConfigModel``"""
+
+    #: Step name
+    name = "reference"
+
+    #: Default biomed sheet class
+    sheet_shortcut_class = GenericSampleSheet
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, config_model_class=ReferenceConfigModel)
+        self.register_sub_step_classes((LinkInStep,))  # NOTE(review): ReferenceStepPart not registered
+
+    @classmethod
+    def default_config_yaml(cls):
+        """Return default config YAML, to be overwritten by project-specific one"""
+        return DEFAULT_CONFIG
+
+    @listify
+    def get_result_files(self):
+        """Return one ``.done`` marker path below ``output/`` per configured reference"""
+        tpls = ("output/{source}/{reference_name}/out/.done",)
+        # ``references`` is a mapping; iterating it directly would only yield the names.
+        for name, reference in self.config["references"].items():
+            yield from (tpl.format(source=reference.source, reference_name=name) for tpl in tpls)
diff --git a/snappy_pipeline/workflows/reference/model.py b/snappy_pipeline/workflows/reference/model.py
new file mode 100644
index 000000000..30d2c89d0
--- /dev/null
+++ b/snappy_pipeline/workflows/reference/model.py
@@ -0,0 +1,80 @@
+from enum import StrEnum
+from typing import Annotated
+
+from pydantic import Field
+
+from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel
+
+
+class Source(StrEnum):  # where a reference comes from (type of ``Reference.source``)
+    Ensembl = "Ensembl"
+    NCBI = "NCBI"
+    Custom = "Custom"  # ``Reference.custom_url`` is only used with this source
+
+
+class DataType(StrEnum):  # data type of a reference (type of ``Reference.datatype``)
+    dna = "dna"
+    cds = "cds"
+    cdna = "cdna"
+    ncrna = "ncrna"
+    pep = "pep"
+
+
+class Region(SnappyModel):  # named region; item type of ``Reference.regions``
+    name: str
+    start: int | None = None  # optional, like every other nullable field in this module
+    end: int | None = None
+
+
+class Annotation(SnappyModel):  # value type of ``Reference.annotations``
+    reference: list[str] | None = None  # NOTE(review): meaning not evident here -- please document
+
+
+class Reference(SnappyModel):  # one entry of ``ReferenceModel.references``
+    description: str
+    """Description of the reference."""
+
+    source: Annotated[Source, EnumField(Source)]
+    """Source of the reference."""
+
+    custom_url: str | None = Field(
+        None, examples=["file:///path/to/reference.fa", "http://example.com/reference.fa"]
+    )
+    """URL to custom reference. Only used when source is 'Custom'."""
+
+    species: str = Field(examples=["Homo Sapiens"])
+    """Species name."""
+
+    taxon_id: str | int = Field(examples=[9606])
+    """Taxon ID."""
+
+    datatype: Annotated[DataType, EnumField(DataType)]
+    """Data type of the reference."""
+
+    release: str | int = Field(examples=[112])
+    """Release of the reference."""
+
+    build: str | None = Field(None, examples=["GRCh37", "GRCh38"])
+    """Build of the reference."""
+
+    branch: str | None = Field(None, examples=["grch37"])
+    """Branch of the reference."""
+
+    exclude_contigs: str | None = None
+    """Regular expression to exclude contigs with"""
+
+    regions: list[Region] | None = None
+    """Regions of the reference."""
+
+    additional_sequences: list[str] | None = None
+    """List of local fasta files to add to the reference"""
+
+    annotations: dict[str, Annotation] = {}  # NOTE(review): only field without a description string
+
+
+class ReferenceModel(SnappyStepModel):  # configuration model of the ``reference`` step
+    references: dict[str, Reference] = {  # reference name -> settings
+        "GRCh38-foo": Reference(description="Example reference (Ensembl release 112)",
+                                source="Ensembl", species="Homo Sapiens", taxon_id=9606,
+                                datatype="dna", release=112),  # ``description`` is required
+    }