Skip to content

Commit

Permalink
add initial layout and config for automatically retrieving (and cachi…
Browse files Browse the repository at this point in the history
…ng) references (and annotations)
  • Loading branch information
tedil committed Jul 5, 2024
1 parent 17c1a87 commit d6b2c4d
Show file tree
Hide file tree
Showing 3 changed files with 220 additions and 0 deletions.
43 changes: 43 additions & 0 deletions snappy_pipeline/workflows/reference/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""CUBI Pipeline reference step Snakefile"""

import os

from snappy_pipeline import expand_ref
from snappy_pipeline.workflows.reference import ReferenceWorkflow

__author__ = "Till Hartmann <[email protected]>"


# Configuration ===============================================================


# Snakemake directive: load "config.yaml" into the global ``config`` object.
configfile: "config.yaml"


# Expand "$ref" JSON pointers in configuration (also works for YAML)
# ``expand_ref`` returns the expanded config together with the lookup paths and
# config paths that are handed on to the workflow object below.
config, lookup_paths, config_paths = expand_ref("config.yaml", config)

# WorkflowImpl Object Setup ===================================================

# The current working directory is passed as the last constructor argument.
wf = ReferenceWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())

# Rules =======================================================================


rule reference_all:
    # Default target rule: requests every path returned by
    # ``wf.get_result_files()``.
    input:
        wf.get_result_files(),
    default_target: True


rule reference_retrieve_fasta_run:
    # NOTE(review): placeholder implementation -- the shell command below only
    # touches the output file; nothing is downloaded yet.
    output:
        # One FASTA per ``{reference}`` wildcard value; ``protected`` asks
        # Snakemake to write-protect the file once it has been created.
        fasta=protected("work/reference/{reference}/reference.fasta"),
    params:
        # Exposes the wildcard value as a param; not referenced by the shell
        # command below.
        reference=lambda wildcards: wildcards.reference,
    # Between-workflow caching of the output; "omit-software" presumably keeps
    # the software environment out of the cache key -- TODO confirm.
    cache: "omit-software"
    shell:
        """
        touch {output.fasta}
        """
97 changes: 97 additions & 0 deletions snappy_pipeline/workflows/reference/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
"""Implementation of the ``reference`` step

=====================
Default Configuration
=====================

The default configuration is as follows.

.. include:: DEFAULT_CONFIG_reference.rst
"""

from snappy_pipeline.utils import dictify, listify
from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkInStep

from .model import ReferenceModel as ReferenceConfigModel

#: Default configuration for the ``reference`` step, obtained from the config
#: model's ``default_config_yaml_string()``
DEFAULT_CONFIG = ReferenceConfigModel.default_config_yaml_string()


class ReferenceStepPart(BaseStepPart):
    """Reference retrieval common features

    All path templates substitute ``name`` for ``{source}`` and keep
    ``{reference_name}`` as a Snakemake wildcard.  ``name`` is empty here;
    presumably concrete sub classes override it.
    """

    #: Step name
    name = ""

    #: Class available actions
    actions = ("run",)

    def __init__(self, parent):
        super().__init__(parent)
        # Use the ``reference_name`` wildcard, exactly as the log file paths
        # do: Snakemake requires the output and log files of a rule to carry
        # the same wildcards.
        self.base_path_out = "work/{source}.{{reference_name}}"

    @dictify
    def get_output_files(self, action):
        """Return output files

        :param action: Action to return the output files for; validated
            against ``actions``.
        :return: Dict with the single key ``"out_done"`` mapping to the path
            template of the ``.done`` marker file.
        """
        # Validate action
        self._validate_action(action)
        yield "out_done", self.base_path_out.format(source=self.name) + "/out/.done"

    @dictify
    def _get_log_file(self, action):
        """Return dict of log files.

        :param action: Action to return the log files for; validated against
            ``actions``.
        :return: Dict with the key ``"done"`` plus, for each of ``log``,
            ``conda_info`` and ``conda_list``, the file itself and its
            ``*_md5`` companion.
        """
        # Validate action
        self._validate_action(action)
        prefix = f"work/{self.name}/log/{self.name}.{{reference_name}}"
        key_ext = (
            ("log", ".log"),
            ("conda_info", ".conda_info.txt"),
            ("conda_list", ".conda_list.txt"),
        )
        yield "done", f"work/{self.name}.{{reference_name}}/log/.done"
        for key, ext in key_ext:
            yield key, prefix + ext
            yield key + "_md5", prefix + ext + ".md5"

    def get_args(self, action):
        """Return function that maps wildcards to dict for input files

        :param action: Action to return the function for; validated against
            ``actions``.
        :return: Callable taking ``wildcards`` and returning an empty dict.
        """
        # Validate action
        self._validate_action(action)

        def args_function(wildcards):
            # No arguments are derived from the wildcards (yet).
            return {}

        return args_function


class ReferenceWorkflow(BaseStep):
    """Automatically retrieve reference data"""

    #: Step name
    name = "reference"

    #: Default biomed sheet class
    sheet_shortcut_class = GenericSampleSheet

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``BaseStep`` and register the sub steps.

        ``ReferenceConfigModel`` is passed on as ``config_model_class``.
        """
        super().__init__(*args, **kwargs, config_model_class=ReferenceConfigModel)
        self.register_sub_step_classes((LinkInStep,))

    @classmethod
    def default_config_yaml(cls):
        """Return default config YAML, to be overwritten by project-specific one"""
        return DEFAULT_CONFIG

    @listify
    def get_result_files(self):
        """Return list of result files for the reference workflow

        One path per configured reference and template is produced, with
        ``{source}`` taken from the reference's ``source`` and
        ``{reference_name}`` from its key in the ``references`` mapping.
        """
        tpls = ("output/{source}/{reference_name}/out/.done",)
        # ``references`` is a mapping of name -> reference settings; iterating
        # the mapping itself would only yield the names, hence ``.items()``.
        for name, reference in self.config["references"].items():
            for tpl in tpls:
                yield tpl.format(source=reference.source, reference_name=name)
80 changes: 80 additions & 0 deletions snappy_pipeline/workflows/reference/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from enum import StrEnum
from typing import Annotated

from pydantic import Field

from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel


class Source(StrEnum):
    """Source a reference is obtained from (see ``Reference.source``)."""

    Ensembl = "Ensembl"
    NCBI = "NCBI"
    # ``Reference.custom_url`` is only used together with this source.
    Custom = "Custom"


class DataType(StrEnum):
    """Data type of a reference (see ``Reference.datatype``)."""

    # NOTE(review): the member names look like Ensembl FTP sequence
    # categories -- confirm.
    dna = "dna"
    cds = "cds"
    cdna = "cdna"
    ncrna = "ncrna"
    pep = "pep"


class Region(SnappyModel):
    """A named region of a reference, optionally bounded by ``start``/``end``."""

    name: str
    """Name of the region."""

    # ``int | None`` without a default is still a *required* field in
    # pydantic; default to ``None`` so both bounds can simply be left out.
    start: int | None = None
    """Start of the region; ``None`` if not given."""

    end: int | None = None
    """End of the region; ``None`` if not given."""


class Annotation(SnappyModel):
    """Settings of a single entry of ``Reference.annotations``."""

    # NOTE(review): the meaning of the list entries is not visible in this
    # module -- confirm against the code consuming it.
    reference: list[str] | None = None


class Reference(SnappyModel):
    """Settings describing a single reference to retrieve.

    Fields declared without a default value (``description``, ``source``,
    ``species``, ``taxon_id``, ``datatype``, ``release``) have to be provided
    explicitly.
    """

    description: str
    """Description of the reference."""

    source: Annotated[Source, EnumField(Source)]
    """Source of the reference."""

    custom_url: str | None = Field(
        None, examples=["file:///path/to/reference.fa", "http://example.com/reference.fa"]
    )
    """URL to custom reference. Only used when source is 'Custom'."""

    species: str = Field(examples=["Homo Sapiens"])
    """Species name."""

    taxon_id: str | int = Field(examples=[9606])
    """Taxon ID."""

    datatype: Annotated[DataType, EnumField(DataType)]
    """Data type of the reference."""

    release: str | int = Field(examples=[112])
    """Release of the reference."""

    build: str | None = Field(None, examples=["GRCh37", "GRCh38"])
    """Build of the reference."""

    branch: str | None = Field(None, examples=["grch37"])
    """Branch of the reference."""

    exclude_contigs: str | None = None
    """Regular expression to exclude contigs with"""

    regions: list[Region] | None = None
    """Regions of the reference."""

    additional_sequences: list[str] | None = None
    """List of local fasta files to add to the reference"""

    # Mapping of a name to its ``Annotation`` settings; empty by default.
    annotations: dict[str, Annotation] = {}


class ReferenceModel(SnappyStepModel):
    """Configuration model of the ``reference`` step."""

    references: dict[str, Reference] = {
        "GRCh38-foo": Reference(
            # ``description`` has no default on ``Reference``; leaving it out
            # makes building this example fail validation at import time.
            description="Example reference (Homo Sapiens DNA, Ensembl release 112)",
            source="Ensembl",
            species="Homo Sapiens",
            taxon_id=9606,
            datatype="dna",
            release=112,
        )
    }
    """References to retrieve, keyed by reference name."""

0 comments on commit d6b2c4d

Please sign in to comment.