From 72c17330fd02c0073a3d8102e0c3d9eb82d4bacd Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jonathan.manning@seqera.io>
Date: Tue, 6 Feb 2024 13:54:21 +0000
Subject: [PATCH] Add tx2gene module (formerly local to rnaseq) (#4854)

* Add test data link

* Add tx2gene

* Fix linting, update snapshots

* path -> directory

* Correct conda setup

* Correct conda setup

* Update test data reference

* Apply minor suggestions from code review

Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>

* Error out at point of failure in determining transcript attribute

---------

Co-authored-by: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
---
 .../nf-core/custom/tx2gene/environment.yml    |   9 +
 modules/nf-core/custom/tx2gene/main.nf        |  36 ++++
 modules/nf-core/custom/tx2gene/meta.yml       |  65 +++++++
 .../custom/tx2gene/templates/tx2gene.py       | 184 ++++++++++++++++++
 .../nf-core/custom/tx2gene/tests/main.nf.test |  99 ++++++++++
 .../custom/tx2gene/tests/main.nf.test.snap    |  70 +++++++
 modules/nf-core/custom/tx2gene/tests/tags.yml |   2 +
 tests/config/test_data.config                 |   6 +
 8 files changed, 471 insertions(+)
 create mode 100644 modules/nf-core/custom/tx2gene/environment.yml
 create mode 100644 modules/nf-core/custom/tx2gene/main.nf
 create mode 100644 modules/nf-core/custom/tx2gene/meta.yml
 create mode 100755 modules/nf-core/custom/tx2gene/templates/tx2gene.py
 create mode 100644 modules/nf-core/custom/tx2gene/tests/main.nf.test
 create mode 100644 modules/nf-core/custom/tx2gene/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/custom/tx2gene/tests/tags.yml

diff --git a/modules/nf-core/custom/tx2gene/environment.yml b/modules/nf-core/custom/tx2gene/environment.yml
new file mode 100644
index 000000000000..a859dc881fef
--- /dev/null
+++ b/modules/nf-core/custom/tx2gene/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+name: "custom_tx2gene"
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python=3.9.5
diff --git a/modules/nf-core/custom/tx2gene/main.nf b/modules/nf-core/custom/tx2gene/main.nf
new file mode 100644
index 000000000000..dff8f20c97fc
--- /dev/null
+++ b/modules/nf-core/custom/tx2gene/main.nf
@@ -0,0 +1,36 @@
+process CUSTOM_TX2GENE {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/python:3.9--1' :
+        'biocontainers/python:3.9--1' }"
+
+    input:
+    tuple val(meta), path(gtf)
+    tuple val(meta2), path ("quants")
+    val quant_type
+    val id
+    val extra
+
+    output:
+    tuple val(meta), path("*.tx2gene.tsv"), emit: tx2gene
+    path "versions.yml"                   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    template 'tx2gene.py'
+
+    stub:
+    """
+    touch ${meta.id}.tx2gene.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/custom/tx2gene/meta.yml b/modules/nf-core/custom/tx2gene/meta.yml
new file mode 100644
index 000000000000..bea5f571d54e
--- /dev/null
+++ b/modules/nf-core/custom/tx2gene/meta.yml
@@ -0,0 +1,65 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "custom_tx2gene"
+description: Make a transcript/gene mapping from a GTF and cross-reference with transcript quantifications.
+keywords:
+  - gene
+  - gtf
+  - pseudoalignment
+  - transcript
+tools:
+  - "custom":
+      description: |
+        "Custom module to create a transcript to gene mapping from a GTF and
+        check it against transcript quantifications"
+      tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/tx2gene/main.nf"
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing reference information related to the GTF file
+        e.g. `[ id:'yeast' ]`
+  - gtf:
+      type: file
+      description: An annotation file of the reference genome in GTF format
+      pattern: "*.gtf"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing information related to the experiment as a whole
+        e.g. `[ id:'SRP123456' ]`
+  - quants:
+      type: directory
+      description: Path to a directory with subdirectories corresponding to
+        sample-wise runs of Salmon or Kallisto
+  - quant_type:
+      type: string
+      description: Quantification type, 'kallisto' or 'salmon'
+  - id:
+      type: string
+      description: Gene ID attribute in the GTF file (default= gene_id)
+  - extra:
+      type: string
+      description: Extra gene attribute in the GTF file (default= gene_name)
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing reference information related to the GTF file
+        e.g. `[ id:'yeast' ]`
+  - tx2gene:
+      type: file
+      description: A transcript/ gene mapping table in TSV format
+      pattern: "*.tx2gene.tsv"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@pinin4fjords"
+maintainers:
+  - "@pinin4fjords"
diff --git a/modules/nf-core/custom/tx2gene/templates/tx2gene.py b/modules/nf-core/custom/tx2gene/templates/tx2gene.py
new file mode 100755
index 000000000000..9b1e1ec69cf1
--- /dev/null
+++ b/modules/nf-core/custom/tx2gene/templates/tx2gene.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+
+# Written by Lorena Pantano with subsequent reworking by Jonathan Manning. Released under the MIT license.
+
+import logging
+import argparse
+import glob
+import os
+import platform
+import re
+from collections import Counter, defaultdict, OrderedDict
+from collections.abc import Set
+from typing import Dict
+
+# Configure logging
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+    """Formats a dictionary to a YAML-like string.
+
+    Args:
+        data (dict): The dictionary to format.
+        indent (int): The current indentation level.
+
+    Returns:
+        str: A string formatted as YAML.
+    """
+    yaml_str = ""
+    for key, value in data.items():
+        spaces = "  " * indent
+        if isinstance(value, dict):
+            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+        else:
+            yaml_str += f"{spaces}{key}: {value}\\n"
+    return yaml_str
+
+def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]:
+    """
+    Read the top 100 transcripts from the quantification file.
+
+    Parameters:
+    quant_dir (str): Directory where quantification files are located.
+    file_pattern (str): Pattern to match quantification files.
+
+    Returns:
+    set: A set containing the top 100 transcripts.
+    """
+    try:
+        # Find the quantification file within the directory
+        quant_file_path = glob.glob(os.path.join(quant_dir, "*", file_pattern))[0]
+        with open(quant_file_path, "r") as file_handle:
+            # Read the file and extract the top 100 transcripts
+            return {line.split()[0] for i, line in enumerate(file_handle) if i > 0 and i <= 100}
+    except IndexError:
+        # Log an error and raise a FileNotFoundError if the quant file does not exist
+        logger.error("No quantification files found.")
+        raise FileNotFoundError("Quantification file not found.")
+
+
+def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str:
+    """
+    Discover the attribute in the GTF that corresponds to transcripts, prioritizing 'transcript_id'.
+
+    Parameters:
+    gtf_file (str): Path to the GTF file.
+    transcripts (Set[str]): A set of transcripts to match in the GTF file.
+
+    Returns:
+    str: The attribute name that corresponds to transcripts in the GTF file.
+    """
+
+    votes = Counter()
+    with open(gtf_file) as inh:
+        # Read GTF file, skipping header lines
+        for line in filter(lambda x: not x.startswith("#"), inh):
+            cols = line.split("\\t")
+
+            # Use regular expression to correctly split the attributes string
+            attributes_str = cols[8]
+            attributes = dict(re.findall(r'(\\S+) "(.*?)(?<!\\\\)";', attributes_str))
+
+            votes.update(key for key, value in attributes.items() if value in transcripts)
+
+    if not votes:
+        # Error out if no matching attribute is found
+        logger.error("No attribute in GTF matching transcripts")
+
+    # Check if 'transcript_id' is among the attributes with the highest votes
+    if "transcript_id" in votes and votes["transcript_id"] == max(votes.values()):
+        logger.info("Attribute 'transcript_id' corresponds to transcripts.")
+        return "transcript_id"
+
+    # If 'transcript_id' isn't the highest, determine the most common attribute that matches the transcripts
+    attribute, _ = votes.most_common(1)[0]
+    logger.info(f"Attribute '{attribute}' corresponds to transcripts.")
+    return attribute
+
+
+def parse_attributes(attributes_text: str) -> Dict[str, str]:
+    """
+    Parse the attributes column of a GTF file.
+
+    :param attributes_text: The attributes column as a string.
+    :return: A dictionary of the attributes.
+    """
+    # Split the attributes string by semicolon and strip whitespace
+    attributes = attributes_text.strip().split(";")
+    attr_dict = OrderedDict()
+
+    # Iterate over each attribute pair
+    for attribute in attributes:
+        # Split the attribute into key and value, ensuring there are two parts
+        parts = attribute.strip().split(" ", 1)
+        if len(parts) == 2:
+            key, value = parts
+            # Remove any double quotes from the value
+            value = value.replace('"', "")
+            attr_dict[key] = value
+
+    return attr_dict
+
+
+def map_transcripts_to_gene(
+    quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str
+) -> bool:
+    """
+    Map transcripts to gene names and write the output to a file.
+
+    Parameters:
+    quant_type (str): The quantification method used (e.g., 'salmon').
+    gtf_file (str): Path to the GTF file.
+    quant_dir (str): Directory where quantification files are located.
+    gene_id (str): The gene ID attribute in the GTF file.
+    extra_id_field (str): Additional ID field in the GTF file.
+    output_file (str): The output file path.
+
+    Returns:
+    bool: True if the operation was successful, False otherwise.
+    """
+    # Read the top transcripts based on quantification type
+    transcripts = read_top_transcripts(quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv")
+    # Discover the attribute that corresponds to transcripts in the GTF
+    transcript_attribute = discover_transcript_attribute(gtf_file, transcripts)
+
+    # Open GTF and output file to write the mappings
+    # Initialize the set to track seen combinations
+    seen = set()
+
+    with open(gtf_file) as inh, open(output_file, "w") as output_handle:
+        # Parse each line of the GTF, mapping transcripts to genes
+        for line in filter(lambda x: not x.startswith("#"), inh):
+            cols = line.split("\\t")
+            attr_dict = parse_attributes(cols[8])
+            if gene_id in attr_dict and transcript_attribute in attr_dict:
+                # Create a unique identifier for the transcript-gene combination
+                transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id])
+
+                # Check if the combination has already been seen
+                if transcript_gene_pair not in seen:
+                    # If it's a new combination, write it to the output and add to the seen set
+                    extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id])
+                    output_handle.write(f"{attr_dict[transcript_attribute]}\\t{attr_dict[gene_id]}\\t{extra_id}\\n")
+                    seen.add(transcript_gene_pair)
+
+    return True
+
+
+# Main function to parse arguments and call the mapping function
+if __name__ == "__main__":
+    prefix = (
+        "$task.ext.prefix"
+        if "$task.ext.prefix" != "null"
+        else f"$meta.id"
+    )
+    if not map_transcripts_to_gene('$quant_type', '$gtf', 'quants', '$id', '$extra', f"{prefix}.tx2gene.tsv"):
+        logger.error("Failed to map transcripts to genes.")
+
+    # Write the versions
+    versions_this_module = {}
+    versions_this_module["${task.process}"] = {"python": platform.python_version()}
+    with open("versions.yml", "w") as f:
+        f.write(format_yaml_like(versions_this_module))
diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test b/modules/nf-core/custom/tx2gene/tests/main.nf.test
new file mode 100644
index 000000000000..75ad0917d5c2
--- /dev/null
+++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test
@@ -0,0 +1,99 @@
+nextflow_process {
+
+    name "Test Process CUSTOM_TX2GENE"
+    script "../main.nf"
+    process "CUSTOM_TX2GENE"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "custom"
+    tag "custom/tx2gene"
+    tag "untar"
+
+    test("saccharomyces_cerevisiae - gtf") {
+        
+       setup {
+
+            run("UNTAR") {
+                script "../../../untar/main.nf"
+                process {
+                    """
+                    input[0] = [
+                    [ id:'test'], // meta map
+                    file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true)
+                    ]
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = [
+                [ id:'test'], // meta map
+                file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true)
+                ]
+                input[1] = UNTAR.out.untar
+                input[2] = 'kallisto'
+                input[3] = 'gene_id' 
+                input[4] = 'gene_name'
+                """
+            }
+        }
+
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.tx2gene).match('tx2gene') },
+                { assert snapshot(process.out.tx2gene).match('versions') }
+            )
+        }
+
+    }
+
+    test("saccharomyces_cerevisiae - gtf - stub") {
+
+       options "-stub"
+       
+       setup {
+
+            run("UNTAR") {
+                script "../../../untar/main.nf"
+                process {
+                    """
+                    input[0] = [
+                        [ id:'test'], // meta map
+                        file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true)
+                    ]
+                    """
+                }
+            }
+        }
+        
+        when {
+            process {
+                """
+               input[0] = [
+                    [ id:'test'], // meta map
+                    file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true)
+                ]
+                input[1] = UNTAR.out.untar
+                input[2] = 'kallisto'
+                input[3] = 'gene_id' 
+                input[4] = 'gene_name'
+                """
+            }
+        }
+
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.tx2gene).match('tx2gene - stub') },
+                { assert snapshot(process.out.tx2gene).match('versions - stub') }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap
new file mode 100644
index 000000000000..b01abf4c1221
--- /dev/null
+++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap
@@ -0,0 +1,70 @@
+{
+    "versions": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test.tx2gene.tsv:md5,b367acfc4b7df7fb68fb1b1d2956e27e"
+                ]
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-05T22:09:56.931541991"
+    },
+    "tx2gene": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test.tx2gene.tsv:md5,b367acfc4b7df7fb68fb1b1d2956e27e"
+                ]
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-05T22:09:56.920656242"
+    },
+    "tx2gene - stub": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test.tx2gene.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                ]
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-05T22:10:14.817718782"
+    },
+    "versions - stub": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test.tx2gene.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                ]
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-02-05T22:10:14.823019212"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/custom/tx2gene/tests/tags.yml b/modules/nf-core/custom/tx2gene/tests/tags.yml
new file mode 100644
index 000000000000..493fbc3b1c2b
--- /dev/null
+++ b/modules/nf-core/custom/tx2gene/tests/tags.yml
@@ -0,0 +1,2 @@
+custom/tx2gene:
+  - "modules/nf-core/custom/tx2gene/**"
diff --git a/tests/config/test_data.config b/tests/config/test_data.config
index e2755f3db0b4..cfcdd51f009a 100644
--- a/tests/config/test_data.config
+++ b/tests/config/test_data.config
@@ -624,6 +624,12 @@ params {
                 genome_aln_nwk                  = "${params.test_data_base}/data/genomics/prokaryotes/haemophilus_influenzae/genome/genome.aln.nwk"
             }
         }
+        'saccharomyces_cerevisiae' {
+            'genome' {
+                genome_gfp_gtf                                 = "${params.test_data_base}/data/genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf"
+                kallisto_results                               = "${params.test_data_base}/data/genomics/eukaryotes/saccharomyces_cerevisiae/kallisto_results.tar.gz"
+            }
+        }
         'generic' {
             'csv' {
                 test_csv                        = "${params.test_data_base}/data/generic/csv/test.csv"