From 72c17330fd02c0073a3d8102e0c3d9eb82d4bacd Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 6 Feb 2024 13:54:21 +0000 Subject: [PATCH] Add tx2gene module (formerly local to rnaseq) (#4854) * Add test data link * Add tx2gene * Fix linting, update snapshots * path -> directory * Correct conda setup * Correct conda setup * Update test data reference * Apply minor suggestions from code review Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> * Error out at point of failure in determining transcript attribute --------- Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com> --- .../nf-core/custom/tx2gene/environment.yml | 9 + modules/nf-core/custom/tx2gene/main.nf | 36 ++++ modules/nf-core/custom/tx2gene/meta.yml | 65 +++++++ .../custom/tx2gene/templates/tx2gene.py | 184 ++++++++++++++++++ .../nf-core/custom/tx2gene/tests/main.nf.test | 99 ++++++++++ .../custom/tx2gene/tests/main.nf.test.snap | 70 +++++++ modules/nf-core/custom/tx2gene/tests/tags.yml | 2 + tests/config/test_data.config | 6 + 8 files changed, 471 insertions(+) create mode 100644 modules/nf-core/custom/tx2gene/environment.yml create mode 100644 modules/nf-core/custom/tx2gene/main.nf create mode 100644 modules/nf-core/custom/tx2gene/meta.yml create mode 100755 modules/nf-core/custom/tx2gene/templates/tx2gene.py create mode 100644 modules/nf-core/custom/tx2gene/tests/main.nf.test create mode 100644 modules/nf-core/custom/tx2gene/tests/main.nf.test.snap create mode 100644 modules/nf-core/custom/tx2gene/tests/tags.yml diff --git a/modules/nf-core/custom/tx2gene/environment.yml b/modules/nf-core/custom/tx2gene/environment.yml new file mode 100644 index 000000000000..a859dc881fef --- /dev/null +++ b/modules/nf-core/custom/tx2gene/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "custom_tx2gene" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.9.5 diff --git a/modules/nf-core/custom/tx2gene/main.nf b/modules/nf-core/custom/tx2gene/main.nf new file mode 100644 index 000000000000..dff8f20c97fc --- /dev/null +++ b/modules/nf-core/custom/tx2gene/main.nf @@ -0,0 +1,36 @@ +process CUSTOM_TX2GENE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + tuple val(meta), path(gtf) + tuple val(meta2), path ("quants") + val quant_type + val id + val extra + + output: + tuple val(meta), path("*.tx2gene.tsv"), emit: tx2gene + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'tx2gene.py' + + stub: + """ + touch ${meta.id}.tx2gene.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/tx2gene/meta.yml b/modules/nf-core/custom/tx2gene/meta.yml new file mode 100644 index 000000000000..bea5f571d54e --- /dev/null +++ b/modules/nf-core/custom/tx2gene/meta.yml @@ -0,0 +1,65 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_tx2gene" +description: Make a transcript/gene mapping from a GTF and cross-reference with transcript quantifications. +keywords: + - gene + - gtf + - pseudoalignment + - transcript +tools: + - "custom": + description: | + "Custom module to create a transcript to gene mapping from a GTF and + check it against transcript quantifications" + tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/tx2gene/main.nf" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing reference information related to the GTF file + e.g. `[ id:'yeast' ]` + - gtf: + type: file + description: An annotation file of the reference genome in GTF format + pattern: "*.gtf" + - meta2: + type: map + description: | + Groovy Map containing information related to the experiment as a whole + e.g. `[ id:'SRP123456' ]` + - quants: + type: directory + description: Path to a directory with subdirectories corresponding to + sample-wise runs of Salmon or Kallisto + - quant_type: + type: string + description: Quantification type, 'kallisto' or 'salmon' + - id: + type: string + description: Gene ID attribute in the GTF file (default= gene_id) + - extra: + type: string + description: Extra gene attribute in the GTF file (default= gene_name) + +output: + - meta: + type: map + description: | + Groovy Map containing reference information related to the GTF file + e.g. `[ id:'yeast' ]` + - tx2gene: + type: file + description: A transcript/ gene mapping table in TSV format + pattern: "*.tx2gene.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@pinin4fjords" +maintainers: + - "@pinin4fjords" diff --git a/modules/nf-core/custom/tx2gene/templates/tx2gene.py b/modules/nf-core/custom/tx2gene/templates/tx2gene.py new file mode 100755 index 000000000000..9b1e1ec69cf1 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/templates/tx2gene.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 + +# Written by Lorena Pantano with subsequent reworking by Jonathan Manning. Released under the MIT license. + +import logging +import argparse +import glob +import os +import platform +import re +from collections import Counter, defaultdict, OrderedDict +from collections.abc import Set +from typing import Dict + +# Configure logging +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]: + """ + Read the top 100 transcripts from the quantification file. + + Parameters: + quant_dir (str): Directory where quantification files are located. + file_pattern (str): Pattern to match quantification files. + + Returns: + set: A set containing the top 100 transcripts. + """ + try: + # Find the quantification file within the directory + quant_file_path = glob.glob(os.path.join(quant_dir, "*", file_pattern))[0] + with open(quant_file_path, "r") as file_handle: + # Read the file and extract the top 100 transcripts + return {line.split()[0] for i, line in enumerate(file_handle) if i > 0 and i <= 100} + except IndexError: + # Log an error and raise a FileNotFoundError if the quant file does not exist + logger.error("No quantification files found.") + raise FileNotFoundError("Quantification file not found.") + + +def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str: + """ + Discover the attribute in the GTF that corresponds to transcripts, prioritizing 'transcript_id'. + + Parameters: + gtf_file (str): Path to the GTF file. + transcripts (Set[str]): A set of transcripts to match in the GTF file. + + Returns: + str: The attribute name that corresponds to transcripts in the GTF file. + """ + + votes = Counter() + with open(gtf_file) as inh: + # Read GTF file, skipping header lines + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\\t") + + # Use regular expression to correctly split the attributes string + attributes_str = cols[8] + attributes = dict(re.findall(r'(\\S+) "(.*?)(? Dict[str, str]: + """ + Parse the attributes column of a GTF file. + + :param attributes_text: The attributes column as a string. + :return: A dictionary of the attributes. + """ + # Split the attributes string by semicolon and strip whitespace + attributes = attributes_text.strip().split(";") + attr_dict = OrderedDict() + + # Iterate over each attribute pair + for attribute in attributes: + # Split the attribute into key and value, ensuring there are two parts + parts = attribute.strip().split(" ", 1) + if len(parts) == 2: + key, value = parts + # Remove any double quotes from the value + value = value.replace('"', "") + attr_dict[key] = value + + return attr_dict + + +def map_transcripts_to_gene( + quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str +) -> bool: + """ + Map transcripts to gene names and write the output to a file. + + Parameters: + quant_type (str): The quantification method used (e.g., 'salmon'). + gtf_file (str): Path to the GTF file. + quant_dir (str): Directory where quantification files are located. + gene_id (str): The gene ID attribute in the GTF file. + extra_id_field (str): Additional ID field in the GTF file. + output_file (str): The output file path. + + Returns: + bool: True if the operation was successful, False otherwise. + """ + # Read the top transcripts based on quantification type + transcripts = read_top_transcripts(quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv") + # Discover the attribute that corresponds to transcripts in the GTF + transcript_attribute = discover_transcript_attribute(gtf_file, transcripts) + + # Open GTF and output file to write the mappings + # Initialize the set to track seen combinations + seen = set() + + with open(gtf_file) as inh, open(output_file, "w") as output_handle: + # Parse each line of the GTF, mapping transcripts to genes + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\\t") + attr_dict = parse_attributes(cols[8]) + if gene_id in attr_dict and transcript_attribute in attr_dict: + # Create a unique identifier for the transcript-gene combination + transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id]) + + # Check if the combination has already been seen + if transcript_gene_pair not in seen: + # If it's a new combination, write it to the output and add to the seen set + extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id]) + output_handle.write(f"{attr_dict[transcript_attribute]}\\t{attr_dict[gene_id]}\\t{extra_id}\\n") + seen.add(transcript_gene_pair) + + return True + + +# Main function to parse arguments and call the mapping function +if __name__ == "__main__": + prefix = ( + "$task.ext.prefix" + if "$task.ext.prefix" != "null" + else f"$meta.id" + ) + if not map_transcripts_to_gene('$quant_type', '$gtf', 'quants', '$id', '$extra', f"{prefix}.tx2gene.tsv"): + logger.error("Failed to map transcripts to genes.") + + # Write the versions + versions_this_module = {} + versions_this_module["${task.process}"] = {"python": platform.python_version()} + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions_this_module)) diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test b/modules/nf-core/custom/tx2gene/tests/main.nf.test new file mode 100644 index 000000000000..75ad0917d5c2 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test @@ -0,0 +1,99 @@ +nextflow_process { + + name "Test Process CUSTOM_TX2GENE" + script "../main.nf" + process "CUSTOM_TX2GENE" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/tx2gene" + tag "untar" + + test("saccharomyces_cerevisiae - gtf") { + + setup { + + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true) + ] + input[1] = UNTAR.out.untar + input[2] = 'kallisto' + input[3] = 'gene_id' + input[4] = 'gene_name' + """ + } + } + + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.tx2gene).match('tx2gene') }, + { assert snapshot(process.out.tx2gene).match('versions') } + ) + } + + } + + test("saccharomyces_cerevisiae - gtf - stub") { + + options "-stub" + + setup { + + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true) + ] + input[1] = UNTAR.out.untar + input[2] = 'kallisto' + input[3] = 'gene_id' + input[4] = 'gene_name' + """ + } + } + + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.tx2gene).match('tx2gene - stub') }, + { assert snapshot(process.out.tx2gene).match('versions - stub') } + ) + } + } +} diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap new file mode 100644 index 000000000000..b01abf4c1221 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap @@ -0,0 +1,70 @@ +{ + "versions": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,b367acfc4b7df7fb68fb1b1d2956e27e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-05T22:09:56.931541991" + }, + "tx2gene": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,b367acfc4b7df7fb68fb1b1d2956e27e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-05T22:09:56.920656242" + }, + "tx2gene - stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-05T22:10:14.817718782" + }, + "versions - stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-05T22:10:14.823019212" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/tx2gene/tests/tags.yml b/modules/nf-core/custom/tx2gene/tests/tags.yml new file mode 100644 index 000000000000..493fbc3b1c2b --- /dev/null +++ b/modules/nf-core/custom/tx2gene/tests/tags.yml @@ -0,0 +1,2 @@ +custom/tx2gene: + - "modules/nf-core/custom/tx2gene/**" diff --git a/tests/config/test_data.config b/tests/config/test_data.config index e2755f3db0b4..cfcdd51f009a 100644 --- a/tests/config/test_data.config +++ b/tests/config/test_data.config @@ -624,6 +624,12 @@ params { genome_aln_nwk = "${params.test_data_base}/data/genomics/prokaryotes/haemophilus_influenzae/genome/genome.aln.nwk" } } + 'saccharomyces_cerevisiae' { + 'genome' { + genome_gfp_gtf = "${params.test_data_base}/data/genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf" + kallisto_results = "${params.test_data_base}/data/genomics/eukaryotes/saccharomyces_cerevisiae/kallisto_results.tar.gz" + } + } 'generic' { 'csv' { test_csv = "${params.test_data_base}/data/generic/csv/test.csv"