forked from nf-core/modules
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add tx2gene module (formerly local to rnaseq) (nf-core#4854)
* Add test data link * Add tx2gene * Fix linting, update snapshots * path -> directory * Correct conda setup * Correct conda setup * Update test data reference * Apply minor suggestions from code review Co-authored-by: Adam Talbot <[email protected]> * Error out at point of failure in determining transcript attribute --------- Co-authored-by: Adam Talbot <[email protected]>
- Loading branch information
1 parent
c46a626
commit 72c1733
Showing
8 changed files
with
471 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
--- | ||
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json | ||
name: "custom_tx2gene" | ||
channels: | ||
- conda-forge | ||
- bioconda | ||
- defaults | ||
dependencies: | ||
- python=3.9.5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
process CUSTOM_TX2GENE { | ||
tag "$meta.id" | ||
label 'process_single' | ||
|
||
conda "${moduleDir}/environment.yml" | ||
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? | ||
'https://depot.galaxyproject.org/singularity/python:3.9--1' : | ||
'biocontainers/python:3.9--1' }" | ||
|
||
input: | ||
tuple val(meta), path(gtf) | ||
tuple val(meta2), path ("quants") | ||
val quant_type | ||
val id | ||
val extra | ||
|
||
output: | ||
tuple val(meta), path("*.tx2gene.tsv"), emit: tx2gene | ||
path "versions.yml" , emit: versions | ||
|
||
when: | ||
task.ext.when == null || task.ext.when | ||
|
||
script: | ||
template 'tx2gene.py' | ||
|
||
stub: | ||
""" | ||
touch ${meta.id}.tx2gene.tsv | ||
cat <<-END_VERSIONS > versions.yml | ||
"${task.process}": | ||
python: \$(python --version | sed 's/Python //g') | ||
END_VERSIONS | ||
""" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
--- | ||
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json | ||
name: "custom_tx2gene" | ||
description: Make a transcript/gene mapping from a GTF and cross-reference with transcript quantifications. | ||
keywords: | ||
- gene | ||
- gtf | ||
- pseudoalignment | ||
- transcript | ||
tools: | ||
- "custom": | ||
description: | | ||
"Custom module to create a transcript to gene mapping from a GTF and | ||
check it against transcript quantifications" | ||
tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/tx2gene/main.nf" | ||
licence: ["MIT"] | ||
|
||
input: | ||
- meta: | ||
type: map | ||
description: | | ||
Groovy Map containing reference information related to the GTF file | ||
e.g. `[ id:'yeast' ]` | ||
- gtf: | ||
type: file | ||
description: An annotation file of the reference genome in GTF format | ||
pattern: "*.gtf" | ||
- meta2: | ||
type: map | ||
description: | | ||
Groovy Map containing information related to the experiment as a whole | ||
e.g. `[ id:'SRP123456' ]` | ||
- quants: | ||
type: directory | ||
description: Path to a directory with subdirectories corresponding to | ||
sample-wise runs of Salmon or Kallisto | ||
- quant_type: | ||
type: string | ||
description: Quantification type, 'kallisto' or 'salmon' | ||
- id: | ||
type: string | ||
description: Gene ID attribute in the GTF file (default= gene_id) | ||
- extra: | ||
type: string | ||
description: Extra gene attribute in the GTF file (default= gene_name) | ||
|
||
output: | ||
- meta: | ||
type: map | ||
description: | | ||
Groovy Map containing reference information related to the GTF file | ||
e.g. `[ id:'yeast' ]` | ||
- tx2gene: | ||
type: file | ||
description: A transcript/ gene mapping table in TSV format | ||
pattern: "*.tx2gene.tsv" | ||
- versions: | ||
type: file | ||
description: File containing software versions | ||
pattern: "versions.yml" | ||
|
||
authors: | ||
- "@pinin4fjords" | ||
maintainers: | ||
- "@pinin4fjords" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Written by Lorena Pantano with subsequent reworking by Jonathan Manning. Released under the MIT license. | ||
|
||
import logging | ||
import argparse | ||
import glob | ||
import os | ||
import platform | ||
import re | ||
from collections import Counter, defaultdict, OrderedDict | ||
from collections.abc import Set | ||
from typing import Dict | ||
|
||
# Configure logging | ||
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") | ||
logger = logging.getLogger(__name__) | ||
logger.setLevel(logging.INFO) | ||
|
||
def format_yaml_like(data: dict, indent: int = 0) -> str: | ||
"""Formats a dictionary to a YAML-like string. | ||
Args: | ||
data (dict): The dictionary to format. | ||
indent (int): The current indentation level. | ||
Returns: | ||
str: A string formatted as YAML. | ||
""" | ||
yaml_str = "" | ||
for key, value in data.items(): | ||
spaces = " " * indent | ||
if isinstance(value, dict): | ||
yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" | ||
else: | ||
yaml_str += f"{spaces}{key}: {value}\\n" | ||
return yaml_str | ||
|
||
def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]: | ||
""" | ||
Read the top 100 transcripts from the quantification file. | ||
Parameters: | ||
quant_dir (str): Directory where quantification files are located. | ||
file_pattern (str): Pattern to match quantification files. | ||
Returns: | ||
set: A set containing the top 100 transcripts. | ||
""" | ||
try: | ||
# Find the quantification file within the directory | ||
quant_file_path = glob.glob(os.path.join(quant_dir, "*", file_pattern))[0] | ||
with open(quant_file_path, "r") as file_handle: | ||
# Read the file and extract the top 100 transcripts | ||
return {line.split()[0] for i, line in enumerate(file_handle) if i > 0 and i <= 100} | ||
except IndexError: | ||
# Log an error and raise a FileNotFoundError if the quant file does not exist | ||
logger.error("No quantification files found.") | ||
raise FileNotFoundError("Quantification file not found.") | ||
|
||
|
||
def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str: | ||
""" | ||
Discover the attribute in the GTF that corresponds to transcripts, prioritizing 'transcript_id'. | ||
Parameters: | ||
gtf_file (str): Path to the GTF file. | ||
transcripts (Set[str]): A set of transcripts to match in the GTF file. | ||
Returns: | ||
str: The attribute name that corresponds to transcripts in the GTF file. | ||
""" | ||
|
||
votes = Counter() | ||
with open(gtf_file) as inh: | ||
# Read GTF file, skipping header lines | ||
for line in filter(lambda x: not x.startswith("#"), inh): | ||
cols = line.split("\\t") | ||
|
||
# Use regular expression to correctly split the attributes string | ||
attributes_str = cols[8] | ||
attributes = dict(re.findall(r'(\\S+) "(.*?)(?<!\\\\)";', attributes_str)) | ||
|
||
votes.update(key for key, value in attributes.items() if value in transcripts) | ||
|
||
if not votes: | ||
# Error out if no matching attribute is found | ||
logger.error("No attribute in GTF matching transcripts") | ||
|
||
# Check if 'transcript_id' is among the attributes with the highest votes | ||
if "transcript_id" in votes and votes["transcript_id"] == max(votes.values()): | ||
logger.info("Attribute 'transcript_id' corresponds to transcripts.") | ||
return "transcript_id" | ||
|
||
# If 'transcript_id' isn't the highest, determine the most common attribute that matches the transcripts | ||
attribute, _ = votes.most_common(1)[0] | ||
logger.info(f"Attribute '{attribute}' corresponds to transcripts.") | ||
return attribute | ||
|
||
|
||
def parse_attributes(attributes_text: str) -> Dict[str, str]: | ||
""" | ||
Parse the attributes column of a GTF file. | ||
:param attributes_text: The attributes column as a string. | ||
:return: A dictionary of the attributes. | ||
""" | ||
# Split the attributes string by semicolon and strip whitespace | ||
attributes = attributes_text.strip().split(";") | ||
attr_dict = OrderedDict() | ||
|
||
# Iterate over each attribute pair | ||
for attribute in attributes: | ||
# Split the attribute into key and value, ensuring there are two parts | ||
parts = attribute.strip().split(" ", 1) | ||
if len(parts) == 2: | ||
key, value = parts | ||
# Remove any double quotes from the value | ||
value = value.replace('"', "") | ||
attr_dict[key] = value | ||
|
||
return attr_dict | ||
|
||
|
||
def map_transcripts_to_gene( | ||
quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str | ||
) -> bool: | ||
""" | ||
Map transcripts to gene names and write the output to a file. | ||
Parameters: | ||
quant_type (str): The quantification method used (e.g., 'salmon'). | ||
gtf_file (str): Path to the GTF file. | ||
quant_dir (str): Directory where quantification files are located. | ||
gene_id (str): The gene ID attribute in the GTF file. | ||
extra_id_field (str): Additional ID field in the GTF file. | ||
output_file (str): The output file path. | ||
Returns: | ||
bool: True if the operation was successful, False otherwise. | ||
""" | ||
# Read the top transcripts based on quantification type | ||
transcripts = read_top_transcripts(quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv") | ||
# Discover the attribute that corresponds to transcripts in the GTF | ||
transcript_attribute = discover_transcript_attribute(gtf_file, transcripts) | ||
|
||
# Open GTF and output file to write the mappings | ||
# Initialize the set to track seen combinations | ||
seen = set() | ||
|
||
with open(gtf_file) as inh, open(output_file, "w") as output_handle: | ||
# Parse each line of the GTF, mapping transcripts to genes | ||
for line in filter(lambda x: not x.startswith("#"), inh): | ||
cols = line.split("\\t") | ||
attr_dict = parse_attributes(cols[8]) | ||
if gene_id in attr_dict and transcript_attribute in attr_dict: | ||
# Create a unique identifier for the transcript-gene combination | ||
transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id]) | ||
|
||
# Check if the combination has already been seen | ||
if transcript_gene_pair not in seen: | ||
# If it's a new combination, write it to the output and add to the seen set | ||
extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id]) | ||
output_handle.write(f"{attr_dict[transcript_attribute]}\\t{attr_dict[gene_id]}\\t{extra_id}\\n") | ||
seen.add(transcript_gene_pair) | ||
|
||
return True | ||
|
||
|
||
# Main function to parse arguments and call the mapping function | ||
if __name__ == "__main__": | ||
prefix = ( | ||
"$task.ext.prefix" | ||
if "$task.ext.prefix" != "null" | ||
else f"$meta.id" | ||
) | ||
if not map_transcripts_to_gene('$quant_type', '$gtf', 'quants', '$id', '$extra', f"{prefix}.tx2gene.tsv"): | ||
logger.error("Failed to map transcripts to genes.") | ||
|
||
# Write the versions | ||
versions_this_module = {} | ||
versions_this_module["${task.process}"] = {"python": platform.python_version()} | ||
with open("versions.yml", "w") as f: | ||
f.write(format_yaml_like(versions_this_module)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
nextflow_process { | ||
|
||
name "Test Process CUSTOM_TX2GENE" | ||
script "../main.nf" | ||
process "CUSTOM_TX2GENE" | ||
|
||
tag "modules" | ||
tag "modules_nfcore" | ||
tag "custom" | ||
tag "custom/tx2gene" | ||
tag "untar" | ||
|
||
test("saccharomyces_cerevisiae - gtf") { | ||
|
||
setup { | ||
|
||
run("UNTAR") { | ||
script "../../../untar/main.nf" | ||
process { | ||
""" | ||
input[0] = [ | ||
[ id:'test'], // meta map | ||
file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true) | ||
] | ||
""" | ||
} | ||
} | ||
} | ||
|
||
when { | ||
process { | ||
""" | ||
input[0] = [ | ||
[ id:'test'], // meta map | ||
file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true) | ||
] | ||
input[1] = UNTAR.out.untar | ||
input[2] = 'kallisto' | ||
input[3] = 'gene_id' | ||
input[4] = 'gene_name' | ||
""" | ||
} | ||
} | ||
|
||
|
||
then { | ||
assertAll( | ||
{ assert process.success }, | ||
{ assert snapshot(process.out.tx2gene).match('tx2gene') }, | ||
{ assert snapshot(process.out.tx2gene).match('versions') } | ||
) | ||
} | ||
|
||
} | ||
|
||
test("saccharomyces_cerevisiae - gtf - stub") { | ||
|
||
options "-stub" | ||
|
||
setup { | ||
|
||
run("UNTAR") { | ||
script "../../../untar/main.nf" | ||
process { | ||
""" | ||
input[0] = [ | ||
[ id:'test'], // meta map | ||
file(params.test_data['saccharomyces_cerevisiae']['genome']['kallisto_results'], checkIfExists: true) | ||
] | ||
""" | ||
} | ||
} | ||
} | ||
|
||
when { | ||
process { | ||
""" | ||
input[0] = [ | ||
[ id:'test'], // meta map | ||
file(params.test_data['saccharomyces_cerevisiae']['genome']['genome_gfp_gtf'], checkIfExists: true) | ||
] | ||
input[1] = UNTAR.out.untar | ||
input[2] = 'kallisto' | ||
input[3] = 'gene_id' | ||
input[4] = 'gene_name' | ||
""" | ||
} | ||
} | ||
|
||
|
||
then { | ||
assertAll( | ||
{ assert process.success }, | ||
{ assert snapshot(process.out.tx2gene).match('tx2gene - stub') }, | ||
{ assert snapshot(process.out.tx2gene).match('versions - stub') } | ||
) | ||
} | ||
} | ||
} |
Oops, something went wrong.