diff --git a/bin/append_GCF.sh b/bin/append_GCF.sh new file mode 100755 index 0000000..3c559ff --- /dev/null +++ b/bin/append_GCF.sh @@ -0,0 +1,92 @@ + +#!/usr/bin/env bash +############################################################################################## +# Copyright 2022 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. +# +# FUNCTIONS +usage() +{ +cat << EOF + +Help message for make_gcf_mapping.sh: + +DESCRIPTION: + Optionally merge assembly refseq on a gcf assembly pull FASTA file. FASTA file must be in format >chr GCF description (3 columns) where first 2 are a space and the third is anything after + +NOTES: + + +OPTIONS: + -h help show this message + -i FILE a fasta file/dir of gcf references pulled. Appends the filename's GCF to the middle of the accession (between chr accession and description) + -d FILE optional. If called, it is a directory of GCF files (*fna, *fasta, *faa, *fa) + -o FILE full path to output file + +USAGE: + +bash append_GCF.sh -i GCF_020735385.1.fasta -o test.fasta -t (looks at all fasta files in dir) +bash append_GCF.sh -i GCF_020735385.1.fasta -o test.fasta (appends base assembly accession to center of contig/chromosome header) + +EOF +} + +DIR="FALSE" + +# ARGUMENTS +# parse args +while getopts "hi:do:" OPTION +do + case $OPTION in + h) usage; exit 1 ;; + i) fasta=$OPTARG ;; + d) DIR="TRUE" ;; + o) output=$OPTARG ;; + ?) usage; exit ;; + esac +done + + +# make a function to append the GCF to the middle of the accession +# if the file is GCF_102310239.1_aSDGM I want only GCF. Split on the _ and get first 2 elements +# now get all lines that start with ">" in the filename, and append the gcf to the middle of the accession between NC_12123 and "djaskdjasdklj" (random description). It should be $1 then $gcf then everything AFTER $1 +# use sed after first space +append_f(){ + filename=$1 + # echo "Name BEGINS with GCF: ${basen}" + # get the basename of the file removing extension + basen=$(basename $filename | sed 's/\.[^.]*$//') + # if the file is GCF_102310239.1_aSDGM I want only GCF. Split on the _ and get first 2 elements + gcf=$(echo $basen | awk -F '_' '{print $1"_"$2}') + # now get all lines that start with ">" in the filename, and append the gcf to the middle of the accession between NC_12123 and "djaskdjasdklj" (random description). 
It should be $1 then $gcf then everything AFTER $1 + # use sed after first space + sed -e "s/ / $gcf /" $filename +} + + +if [[ $DIR == "TRUE" ]]; then + echo "Directory detected" + for filename in $(find $fasta \( -name "*.fasta" -o -name "*fna" -o -name "*faa" -o -name "*fa" \) -type f ); do + basen=$(basename $filename) + if [[ $basen =~ "GCF" ]]; then + append_f $filename + fi + + done + +else + append_f $fasta +fi + diff --git a/bin/determine_priority_assembly.py b/bin/determine_priority_assembly.py new file mode 100644 index 0000000..6184f32 --- /dev/null +++ b/bin/determine_priority_assembly.py @@ -0,0 +1,41 @@ +import re +def determine_priority_assembly(line): + cols = line.strip().split("\t") + if len(cols) > 7: + name = cols[7] + level = cols[4] + if level == "representative genome" : + return 0 + elif cols[4] == "reference genome" : + return 1 + elif cols[11] == "Complete Genome" : + return 2 + else: + return 3 +def format_description(id, description): + delimiters = "[ ]" # Split on underscore or comma + description = description.split(',', 1)[0] + linesplit = re.split(delimiters, description) + id = id.replace(">", "") + # remove all after first comma + + if len(linesplit) > 1: + return id, " ".join(linesplit[1:]) + else: + return id, description +def extract_strain_from_description(description): + delimiters = "[ ]" # Split on underscore or comma + # regex match for strain, get value after space until next space +def parse_fasta_header(header): + parts = header.split() + organism_parts = [] + strain = None + for part in parts: + if part.lower() == "strain": + strain_index = parts.index(part) + 1 + if strain_index < len(parts): + strain = parts[strain_index] + break + organism_parts.append(part) + organism_name = " ".join(organism_parts) + return organism_name, strain diff --git a/bin/download_fastas.py b/bin/download_fastas.py index 0738231..fc8d80f 100755 --- a/bin/download_fastas.py +++ b/bin/download_fastas.py @@ -40,6 +40,8 @@ import shutil import urllib.request as request import ssl +from determine_priority_assembly import determine_priority_assembly, format_description + ctx = ssl.create_default_context() ctx.check_hostname = False ctx.verify_mode = ssl.CERT_NONE @@ -173,8 +175,7 @@ def import_genome_file(filename, kraken2output): # def import_assembly_file(input, filename, matchcol, idx, nameidx, index_ftp): - refs = dict() - seen = dict() + assemblies = dict() first = dict() # matchcol is the column is number of column where you match the top hits to @@ -200,6 +201,7 @@ def get_url(utl, id): formatted_header = namecol.replace(" ", "_") formatted_header = str(formatted_header) if len(linesplit) >= 12 and (matchidx in input): + if namecol not in priorities: priorities[namecol] = dict() seencols[matchidx] = True @@ -251,16 +253,16 @@ def get_url(utl, id): for key, value in priorities.items(): if value.get('0'): - refs[key] = value['0'] + assemblies[key] = value['0'] elif value.get('1'): - refs[key] = value['1'] + assemblies[key] = value['1'] elif value.get('2'): - refs[key] = value['2'] + assemblies[key] = value['2'] elif value.get('3'): - refs[key] = value['3'] + assemblies[key] = value['3'] else: print("No reference genome found for", key) - return refs + return assemblies def get_assembly_summary(id): @@ -270,7 +272,22 @@ def get_assembly_summary(id): return esummary_record -def get_assemblies(refs, outfile, seen, index_ftp, refresh): +def download_fasta(ftp_site): + encoding = guess_type(ftp_site)[1] # uses file extension + _open = partial( + gzip.open, mode='rt') if 
encoding == 'gzip' else open + try: + with closing(request.urlopen(ftp_site, context=ctx)) as r: + with open('file.gz', 'wb') as f: + shutil.copyfileobj(r, f) + f.close() + r.close() + return _open + except Exception as ex: + print("Could not download", ftp_site, ex) + raise ex + +def get_assemblies(assemblies, outfile, GCFs_to_skip, refresh): """Download genbank assemblies for a given search term. Args: term: search term, usually organism name @@ -278,49 +295,60 @@ def get_assemblies(refs, outfile, seen, index_ftp, refresh): path: folder to save to """ # get the value.accession from the refs dict asa list - ids = list(refs.keys()) - if index_ftp and len(ids) > 0: - typee = "a" - if refresh: - typee = "w" - with open(outfile, typee) as w: - for id in ids: - try: - accession = refs[id]['accession'] - if seen and accession in seen: - refs[id]['chrs'] = seen[accession] - print("key already seen:", id, "; skipping") - else: - ftp_site = refs[id]['reference'] - obj = refs[id]['id'] - name = refs[id]['name'] - - encoding = guess_type(ftp_site)[1] # uses file extension - _open = partial( - gzip.open, mode='rt') if encoding == 'gzip' else open - with closing(request.urlopen(ftp_site, context=ctx)) as r: - with open('file.gz', 'wb') as f: - shutil.copyfileobj(r, f) - f.close() - r.close() - with _open('file.gz') as uncompressed: - for record in SeqIO.parse(uncompressed, "fasta"): - # if (len(record.seq) > 1000): - pattern = f"{record.id}\s*" - record.description = re.sub(pattern, "", record.description) - # record.description = record.description.replace(" ", "_") - newobj = f"{record.id} {accession} {record.description}" - refs[id]['chrs'].append(str(record.id)) - record.id = newobj - record.description = "" - SeqIO.write(record, w, "fasta") - uncompressed.close() - except Exception as ex: - print(ex) - pass - w.close() - return + ids = list(assemblies.keys()) + accessions = [assemblies[id]['accession'] for id in ids] + # filter accessions on those present in GCFS_to_skip + accessions = [x for x in accessions if x not in GCFs_to_skip] + caught_ncs = [ ] + new_mapping = dict() + seen = dict() + typee = "a" + if refresh: + typee = "w" + + with open(outfile, typee) as w: + for id in ids: + try: + accession = assemblies[id]['accession'] + if accession in GCFs_to_skip: + assemblies[id]['chrs'] = GCFs_to_skip[accession] + print("key already seen:", id, "; skipping") + else: + ftp_site = assemblies[id]['reference'] + obj = assemblies[id]['id'] + name = assemblies[id]['name'] + _open = download_fasta(ftp_site) + print(f"Downloading {ftp_site} to for {accession}: {id}") + with _open('file.gz') as uncompressed: + for record in SeqIO.parse(uncompressed, "fasta"): + # pattern = f"{record.id}\s*" + # record.description = re.sub(pattern, "", record.description) + # record.description = record.description.replace(" ", "_") + # newobj = f"{record.id} {accession} {record.description}" + if record.id not in caught_ncs: + caught_ncs.append(record.id) + if (not refresh and record.id not in new_mapping) or refresh: + print("writing", record.id, "to file") + + SeqIO.write(record, w, "fasta") + elif not refresh and record.id in new_mapping: + print("already seen", record.id, "skipping") + new_mapping[record.id] = dict(accession=accession, name=obj) + if record.id not in assemblies[id]['chrs']: + fid, description = format_description(record.id, record.description) + assemblies[id]['chrs'].append(dict(acc=record.id, name=description) ) + uncompressed.close() + try: + os.remove('file.gz') + except Exception as ex: + 
print(f"Could not remove file.gz {ex}") + pass + except Exception as ex: + print(ex) + pass + w.close() + return new_mapping def download(refs, db, outfile, seen): # if refs is not empty @@ -378,24 +406,38 @@ def get_hits_from_file(filename, colnumber): def main(argv=None): """Coordinate argument parsing and program execution.""" args = parse_args(argv) + + + gcf_mapping = dict() + + if args.gcf_map: + if os.path.exists(args.gcf_map): + #import the file, and save them to a dict of column 2 as key, column 1 as value + with open(args.gcf_map, "r") as f: + lines = f.readlines() + for line in lines: + line = line.strip() + linesplit = line.split("\t") + if len(linesplit) > 1: + key = linesplit[0] + value = linesplit[1] + gcf_mapping[key] = value + f.close() # logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") if args.type == 'file': #colnumber file hits is the column from the input top hits file you want to match to the args.assembly_names - taxids = get_hits_from_file(args.input, args.colnumber_file_hits) - + seen_in_tops = get_hits_from_file(args.input, args.colnumber_file_hits) else: - taxids = args.input + seen_in_tops = args.input.split(" ") + + assemblies = import_assembly_file( + seen_in_tops, args.assembly_refseq_file, args.assembly_names, args.assembly_map_idx, args.name_col_assembly, args.ftp_path + ) - if (args.assembly_refseq_file): - refs = import_assembly_file( - # what column to match input to , col match accession (GCF accession), col match refrence name - taxids, args.assembly_refseq_file, args.assembly_names, args.assembly_map_idx, args.name_col_assembly, args.ftp_path) - else: - refs = import_genome_file(taxids, args.kraken2output) - seen = dict() i = 0 + seen_in_fasta = dict() if os.path.exists(args.file_out): for seq_record in SeqIO.parse(args.file_out, "fasta"): line = str(seq_record.id) @@ -408,45 +450,46 @@ def main(argv=None): # Splitting on space or pipe delimiters = "[ ]" # Split on underscore or comma linesplit = re.split(delimiters, desc) - if (args.kraken2output): - # use regex to split on " | " and get the second element - acc = linesplit[0] - desc = linesplit[1:] - else: - acc = linesplit[0] - desc = linesplit[1:] - header = desc[0] - desc = " ".join(desc) + acc, desc = format_description(line, desc) if not args.refresh: - if header not in seen: - seen[header] = [[acc, desc]] - else: - seen[header].append([acc, desc]) - # Find the key where header == value.accession if it exists - for key, value in refs.items(): - if value['accession'] == header: - refs[key]['chrs'].append(acc) - + seen_in_fasta[acc] = desc except Exception as ex: print(ex) pass + GCFs_to_skip = dict() + if not args.refresh: + for key, value in seen_in_fasta.items(): + if key in gcf_mapping: + if gcf_mapping[key] not in GCFs_to_skip: + GCFs_to_skip[gcf_mapping[key]] = [dict(acc=key, name=value)] + print("GCF found in mapping file, skipping", key, "from", gcf_mapping[key]) + else: + GCFs_to_skip[gcf_mapping[key]].append(dict(acc=key, name=value)) + + else: + print("No mapping for chr/contig to GCF. 
Will need to redownload to get gcf mapping for: ", key) + ## Now, check what contigs/chrs in seen_in_fasta are present in the gcf_mapping if args.email: Entrez.email = args.email if not (args.refresh): - print(len(seen.keys()), "already seen reference ids") - if (not args.assembly_refseq_file): - print("downloading refseq file") - download(refs, args.db, args.file_out, seen) - else: - print("get assemblies") - get_assemblies(refs, args.file_out, seen, args.ftp_path, args.refresh) + print(len(seen_in_fasta.keys()), "already seen reference ids") + # Now use the assembly refseq file to get the ftp path and download the fasta files + print("Get assemblies now") + + + get_assemblies( + assemblies, #this is the top hits from the input file you want to retrieve + args.file_out, # this is the file you want to write to + GCFs_to_skip, # this is the list of GCFs you want to skip + args.refresh # this is the boolean to check if you want to refresh the file + ) if args.gcf_map: with open(args.gcf_map, "w") as w: - for key, value in refs.items(): + for key, value in assemblies.items(): for chr in value['chrs']: - outstring = f"{chr}\t{key}\t{value['accession']}" + outstring = f"{chr['acc']}\t{value['accession']}\t{key}\t{chr['name']}" w.write(f"{outstring}\n") - w.close() + w.close() if __name__ == "__main__": sys.exit(main()) diff --git a/bin/download_features.py b/bin/download_features.py new file mode 100755 index 0000000..a0fbd2f --- /dev/null +++ b/bin/download_features.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +############################################################################################## +# Copyright 2022 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. 
+# + +"""Provide a command line tool to fetch a list of refseq genome ids to a single file, useful for kraken2 database building or alignment purposes""" +from xmlrpc.client import Boolean +import sys +import os +import gzip +import argparse +import logging +from pathlib import Path +import urllib.request as request +import ssl + +ctx = ssl.create_default_context() +ctx.check_hostname = False +ctx.verify_mode = ssl.CERT_NONE + + +# import requests +logger = logging.getLogger() + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", + ) + parser.add_argument( + "-i", + "--input", + metavar="INPUT", + help="List of taxIDs or names", + ) + parser.add_argument( + "-a", + "--assembly_file", + type=Path, + help="Assembly refseq file to download from, with taxid and accession in the header", + ) + parser.add_argument( + "-d", + "--decompress", + action='store_true', + default=False, + help="Assembly refseq file to download from, with taxid and accession in the header", + ) + parser.add_argument( + "-o", + "--dir_out", + metavar="DIR_OUT", + type=Path, + help="Name of output directory for feature table files", + ) + + return parser.parse_args(argv) + +import shutil +from contextlib import closing +from mimetypes import guess_type +from functools import partial + +def download_feature_file(dir_out: str, url: str, outfile: str): + """Download the assembly file from NCBI""" + try: + response = request.urlopen(url, context=ctx) + encoding = guess_type(url)[1] # uses file extension + _open = partial( + gzip.open, mode='rt') if encoding == 'gzip' else open + outpath = os.path.join(dir_out, outfile) + with closing(request.urlopen(url, context=ctx)) as r: + with open(outpath, 'wb') as f: + shutil.copyfileobj(r, f) + f.close() + r.close() + if encoding == 'gzip': + # decompress the outpath to dir_out, write to file + outpathdecompress = outpath.replace(".gz", "") + with gzip.open(outpath, 'rt') as f: + #write contents to outpath + contents = f.readlines() + # remove the gz extension + with open(outpathdecompress, 'w') as r: + # filter lines with CDS in beginning of line only + for line in contents: + if line.startswith("CDS"): + r.write(line) + r.close() + # remove the gz file + os.remove(outpath) + f.close() + except Exception as e: + logger.error(f"Failed to download assembly file from {url} with error {e}") + return None +def main(): + args = parse_args() + logger.setLevel(logging.INFO) + inputfile = args.input + assembly_file = args.assembly_file + dir_out = args.dir_out + if not dir_out.exists(): + dir_out.mkdir() + with open(inputfile, 'r') as f: + # read in lines, remove newline and get rid of empty lines + gcfs = [line.strip() for line in f if line.strip()] + f.close() + assembly_urls = dict() + with open(assembly_file, 'r') as f: + for line in f: + if line.startswith("#"): + continue + line = line.strip().split("\t") + gcf = line[0] + url = line[19] + assembly_urls[gcf] = url + f.close() + for gcf in gcfs: + if gcf in assembly_urls: + try: + url = assembly_urls[gcf] + baseurl = os.path.basename(url) + url = f"{url}/{baseurl}_feature_table.txt.gz" + download_feature_file(dir_out, url, f"{gcf}_feature_table.txt.gz") + print(f"Downloaded feature file for {gcf} at {url} to {dir_out}/{gcf}_feature_table.txt") + except Exception as e: + print(f"Failed to write feature file for {gcf} 
at {url} with error {e}") + +if __name__ == "__main__": + sys.exit(main()) + + diff --git a/bin/fuzzy_match_assembly.py b/bin/fuzzy_match_assembly.py new file mode 100755 index 0000000..3594f3f --- /dev/null +++ b/bin/fuzzy_match_assembly.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +############################################################################################## +# Copyright 2022 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. +############################################################################################## + +import argparse +import time +import re +from Bio import SeqIO +import os +# import function determine_priority_assembly from fil in same dir +from determine_priority_assembly import determine_priority_assembly, format_description, parse_fasta_header + +def parse_args(): + parser = argparse.ArgumentParser(description="Fetch assembly accessions for a list of nuccore accessions.") + parser.add_argument("-i", "--input", required=True, help="Path to a file containing nuccore accessions, one per line.") + parser.add_argument("-a", "--assembly", required=True, help="Path to a file of assembly refseq information") + parser.add_argument("-o", "--output", required=True, help="Output File for GCFs") + return parser.parse_args() + +def search_in_dicts(organism_name, strain, assemblies, gcfs): + if organism_name in assemblies: + if strain and strain in assemblies[organism_name]: + # print(f"Match found for {organism_name} and strain {strain}") + # You can return or process the matched data here + return organism_name, gcfs[organism_name][strain] + else: + # print(f"Organism {organism_name} found, but strain {strain} is not present.") + return None, None + else: + # print(f"Organism {organism_name} not found.") + return None, None + +def long_match_names(organism_name, names): + split_name = organism_name.split(" ") + match = False + # iterate through split_name combining index each time until full combine, for each combined iteration check if it is names. 
If so , print and return + # move backwards from the full length to only first word until first match + for i in range(len(split_name), 0, -1): + name = " ".join(split_name[:i]) + if name in names: + match = name + break + return match + +def extract_after_colon(text): + # Regular expression to match everything after ': ' + match = re.search(r':\s*(.*)', text) + if match: + return match.group(1) # Return the matched group, which is everything after ': ' + else: + return text # Return the original text if no ': ' is found + +def main(): + args = parse_args() + assemblies = dict() + gcfs = dict() + with open(args.assembly, 'r') as assembly_file: + lines = assembly_file.readlines() + for line in lines: + if line.startswith("#"): + pass + cols = line.split("\t") + if len(cols) > 7: + name = cols[7] + strain = cols[8] + strain = strain.split("=")[1] if "=" in strain else strain + if strain == "na": + strain = "None" + ## set 2d list for [name] in the assemblies dict + if name not in gcfs: + gcfs[name] = dict() + if name in assemblies and assemblies[name] == 0: + continue + priority = determine_priority_assembly(line) + if name not in assemblies or (name in assemblies and priority < assemblies[name]): + assemblies[name] = priority + # extract value of strain=value. If strain=value is not present, set to just the same value for strain + gcfs[name] = cols[0] + assembly_file.close() + accessions = dict() + total = 0 + count = 0 + final_list = [] + missing_elements = [] + if os.path.exists(args.input): + for seq_record in SeqIO.parse(args.input, "fasta"): + id, desc = format_description(seq_record.id, seq_record.description) + desc = desc.replace("TPA_inf: ", "") + desc = desc.replace("MAG: ", "") + desc = desc.replace("MAGL: ", "") + # if there is something like MAG: or TPA_inf: in the description, remove it but use fuzzy match so that if it is up : and a space then remove it, dont hardcode the values. 
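+            # (hedged sketch, not part of the original patch) one generic way is re.sub(r'^\w+:\s+', '', desc), which strips any leading "WORD: " tag using re (already imported above); alternatively reuse the extract_after_colon() helper defined earlier, as in the commented-out call below: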
+ # desc = extract_after_colon(desc) + total +=1 + name = long_match_names(desc, assemblies.keys()) + # organism, gcf = search_in_dicts(organism_name, explicit_strain, assemblies, gcfs) + if name: + count+=1 + # print(f"{seq_record.id}\t{gcfs[name]}\t{name}\t{desc}") + final_list.append(f"{seq_record.id}\t{gcfs[name]}\t{name}\t{desc}") + else: + print(f"No match found for {seq_record.id}\t{desc}") + missing_elements.append(f"{seq_record.id}\t{desc}") + if total > 0: + print(f"Found {count} out of {total} FASTA accessions ({count/total*100:.2f}%) in the assembly file.") + if len(final_list) == 0: + print("No assemblies found") + else: + with open(args.output, 'w') as infile: + for line in final_list: + infile.write(line + "\n") + infile.close() + if len(missing_elements) > 0: + dirname = os.path.dirname(args.output) + with open(os.path.join(dirname, "missing_assemblies.txt"), 'w') as missing_file: + for line in missing_elements: + missing_file.write(line + "\n") + missing_file.close() + + + + +if __name__ == "__main__": + main() diff --git a/bin/make_gcf_mapping.awk b/bin/make_gcf_mapping.awk new file mode 100755 index 0000000..35a8d17 --- /dev/null +++ b/bin/make_gcf_mapping.awk @@ -0,0 +1,17 @@ +#!/usr/bin/awk -f + +# Check if the line starts with ">" +/^>/ { + line = substr($0, 2); + + # Find the first two spaces to separate chromosome and assembly from description + firstSpace = index(line, " "); + secondSpace = index(substr(line, firstSpace + 1), " ") + firstSpace; + + # Extract chromosome, assembly, and description + chromosome = substr(line, 1, firstSpace-1); + assembly = substr(line, firstSpace+1, secondSpace-firstSpace-1); + description = substr(line, secondSpace+1); + + print chromosome "\t" assembly "\t" description; +} diff --git a/bin/make_gcf_mapping.sh b/bin/make_gcf_mapping.sh new file mode 100755 index 0000000..b966931 --- /dev/null +++ b/bin/make_gcf_mapping.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +############################################################################################## +# Copyright 2022 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. +# +# FUNCTIONS +usage() +{ +cat << EOF + +Help message for make_gcf_mapping.sh: + +DESCRIPTION: + Optionally merge assembly refseq on a gcf assembly pull FASTA file. FASTA file must be in format >chr GCF description (3 columns) where first 2 are a space and the third is anything after + +NOTES: + + +OPTIONS: + -h help show this message + -i FILE a fasta file of gcf references pulled. should be in chr \s gcf \s description format + -a FILE a file of the ncbi assembly refseq. Optional. 
If omitted, the 3rd column of the mapping file is duplicated into column 4 + -o DIR full path to output file + +USAGE: + +bash make_gcf_mapping.sh -i ~/Downloads/combined_genomes.fasta -o test_output/test.tsv -a test_output/get/assembly_summary_refseq.txt + +EOF +} + + + + +# ARGUMENTS +# parse args +while getopts "hi:a:o:" OPTION +do + case $OPTION in + h) usage; exit 1 ;; + i) fasta=$OPTARG ;; + a) assembly=$OPTARG ;; + o) output=$OPTARG ;; + ?) usage; exit ;; + esac +done + + +# check args +if [[ -z $fasta ]] || [[ -z $output ]] +then + usage + exit 1 +fi +# warn if no assembly file was provided +if [[ -z $assembly ]] +then + echo "No assembly file provided. Will only output gcf \t chr \t description \t description (repeat)" +fi + + +grep '^>' $fasta | awk -F ' ' '{ + # Initialize the description variable + description = ""; + + # Concatenate fields to form the description, ignoring fields after the first comma + for (i = 2; i <= NF; i++) { + if (index($i, ",") != 0) { + # If there is a comma in the current field, split it and take the part before the comma + split($i, parts, ","); + description = description (description == "" ? "" : " ") parts[1]; + break; # Stop processing further fields after encountering the first comma + } else { + description = description (description == "" ? "" : " ") $i; + } + } + # Remove the ">" from the first column + gsub("^>", "", $1); + print $1 "\t" description; + +}' > $output.tmp + +echo "Name extraction from the FASTA headers is complete. Adding the organism name column" + +if [[ -z $assembly ]] +then + paste $output.tmp <(awk -F '\t' '{print $3}' $output.tmp) > $output +else + # use awk to read both output.tmp and the assembly summary, then print the existing columns plus the mapped organism name + awk -F '\t' 'BEGIN {FS=OFS="\t"} + NR==FNR {map[$1]=$8; next} + { + if ($2 == "") { + print $0, $3 # If assembly is empty, duplicate the third column + } else { + print $0, (map[$2] != "" ? map[$2] : $3) # Otherwise, use mapped value if present, else default to third column + } + }' $assembly $output.tmp > $output +fi + + + + + diff --git a/bin/map_nuccore_gcf.py b/bin/map_nuccore_gcf.py new file mode 100755 index 0000000..27da037 --- /dev/null +++ b/bin/map_nuccore_gcf.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +from Bio import Entrez +import argparse +import time + +def parse_args(): + parser = argparse.ArgumentParser(description="Fetch assembly accessions for a list of nuccore accessions.") + parser.add_argument("-i", "--input", required=True, help="Path to a file containing nuccore accessions, one per line.") + parser.add_argument("-r", "--restart", action='store_true', help="Overwrite the output file instead of appending to it") + parser.add_argument("-o", "--file_out", required=True, help="Output file to append the assembly accessions.") + parser.add_argument("-e", "--email", required=False, help="Email for entrez") + parser.add_argument("-b", "--batch_count", required=False, default=100, help="Batch count for fetching assembly accessions. 
Default is 100.") + parser.add_argument("-f", "--failed", default="failed_accessions.txt", help="File to save failed accessions.") + return parser.parse_args() + +def read_processed_accessions(output_file): + processed = set() + try: + with open(output_file, 'r') as f: + for line in f: + parts = line.strip().split("\t") + if parts: + processed.add(parts[0]) + except FileNotFoundError: + pass # File doesn't exist, which is fine for the first run + return processed + +def fetch_assembly_accession(nuccore_id, retries=2): + try: + handle = Entrez.elink(dbfrom="nuccore", db="assembly", id=nuccore_id, retmax=1) + link_record = Entrez.read(handle) + handle.close() + if link_record[0]["LinkSetDb"]: + assembly_id = link_record[0]["LinkSetDb"][0]["Link"][0]["Id"] + handle = Entrez.esummary(db="assembly", id=assembly_id) + summary_record = Entrez.read(handle) + handle.close() + return summary_record["DocumentSummarySet"]["DocumentSummary"][0]["AssemblyAccession"] + except Exception as e: + if retries > 0: + print(f"Failed to fetch assembly accession for {nuccore_id}. Retrying {retries} more times.") + time.sleep(1) # Wait for 5 seconds before retrying + return fetch_assembly_accession(nuccore_id, retries-1) + return None + +def main(): + args = parse_args() + if args.email: + Entrez.email = args.email + processed_accessions = read_processed_accessions(args.file_out) + failed_accessions = [] + + with open(args.input, 'r') as infile: + for line in infile: + nuccore_id = line.strip() + print(nuccore_id) + if nuccore_id and nuccore_id not in processed_accessions: + assembly_accession = fetch_assembly_accession(nuccore_id) + print(assembly_accession) + if assembly_accession: + mode="w" if args.restart else "a" + with open(args.file_out, mode) as outfile: + print(f"{nuccore_id}\t{assembly_accession}\n") + outfile.write(f"{nuccore_id}\t{assembly_accession}\n") + else: + failed_accessions.append(nuccore_id) + if failed_accessions: + with open(args.failed, 'w') as failed_file: + for acc in failed_accessions: + failed_file.write(acc + "\n") + +if __name__ == "__main__": + main() diff --git a/bin/merge_assemblies_conf.py b/bin/merge_assemblies_conf.py index 5586475..032a05c 100755 --- a/bin/merge_assemblies_conf.py +++ b/bin/merge_assemblies_conf.py @@ -44,11 +44,12 @@ def calculate_stdev(depths, ref_size, weighted_mean_depth): return math.sqrt(variance) def coverage_thresholds(depths, thresholds, ref_size): - coverage_counts = {threshold: 0 for threshold in thresholds} + coverage_counts = [0] * len(thresholds) + print(depths, thresholds, ref_size) for depth, size in zip(depths, ref_size): - for threshold in thresholds: + for i, threshold in enumerate(thresholds): if depth >= threshold: - coverage_counts[threshold] += size + coverage_counts[i] += size return coverage_counts def main(argv=None): @@ -64,12 +65,15 @@ def main(argv=None): file.seek(0) # Reset file pointer to the beginning reader = csv.DictReader(file, delimiter='\t') # Reinitialize reader for row in reader: - name = row['Name'] + name = row['Organism'] data[name]['Mean Depth'].append(float(row['Mean Depth'])) data[name]['Average Coverage'].append(float(row['Average Coverage'])) data[name]['Ref Size'].append(float(row['Ref Size'])) data[name]['Total Reads Aligned'] += float(row['Total Reads Aligned']) # data[name]['Individual Reads Aligned'].append(float(row['Individual Reads Aligned'])) + # split 1:10:50:100:300X Cov. 
column into list on colons, setting thresholds and making it as a dict + data[name]['Coverage Thresholds'] = {int(t): float(p) for t, p in zip([1, 10, 50, 100, 300], row['1:10:50:100:300X Cov.'].split(':'))} + file.close() # Process data aggregated = [] @@ -82,18 +86,22 @@ def main(argv=None): abu_total_aligned = 0 else: abu_total_aligned = values['Total Reads Aligned'] / total_reads_aligned - coverage_counts = coverage_thresholds(values['Mean Depth'], thresholds, values['Ref Size']) - coverage_percentages = [coverage_counts[t] / sum(values['Ref Size']) * 100 for t in thresholds] + # use the Coverage Thresholds column to find weighted threhsolds for merged row + cov_thresholds = values['Coverage Thresholds'] + # coverage_counts = coverage_thresholds(cov_thresholds, thresholds, values['Ref Size']) + # print(coverage_counts) + # exit() aggregated.append( { - 'Name': name, + 'Organism': name, 'Weighted Mean Depth': mean_depth, 'Weighted Avg Coverage': avg_coverage, 'Total Ref Size': sum(values['Ref Size']), 'Total Reads Aligned': values['Total Reads Aligned'], '% Reads Aligned': abu_total_aligned * 100, 'Stdev': stdev, - '1X:10x:50x:100x:300x Cov.': f"{coverage_percentages[0]:.2f}:{coverage_percentages[1]:.2f}:{coverage_percentages[2]:.2f}:{coverage_percentages[3]:.2f}:{coverage_percentages[4]:.2f}", + "1X:10x:50x:100x:300x Cov.": "Disabled" + # '1X:10x:50x:100x:300x Cov.': f"{coverage_percentages[0]:.2f}:{coverage_percentages[1]:.2f}:{coverage_percentages[2]:.2f}:{coverage_percentages[3]:.2f}:{coverage_percentages[4]:.2f}", }) # Output the aggregated data diff --git a/bin/merge_assembly.awk b/bin/merge_assembly.awk new file mode 100644 index 0000000..797d70f --- /dev/null +++ b/bin/merge_assembly.awk @@ -0,0 +1,12 @@ +#!/usr/bin/awk -f + +# Check if the line starts with ">" +BEGIN {FS=OFS="\t"} +NR==FNR {map[$1]=$2; next} +{ + if ($2 == "") { + print $0, $3 # If assembly is empty, duplicate the third column + } else { + print $0, (map[$2] != "" ? map[$2] : $3) # Otherwise, use mapped value if present, else default to third column + } +} diff --git a/bin/reheader.py b/bin/reheader.py new file mode 100755 index 0000000..e95e4e6 --- /dev/null +++ b/bin/reheader.py @@ -0,0 +1,75 @@ + + +#!/usr/bin/env python3 + +############################################################################################## +# Copyright 2022 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. 
+# +import argparse +import sys +import pysam # Ensure pysam is installed + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Rewrite the reference names in a BAM header using a tab-separated mapping file.", + epilog="Example: python reheader.py -i input.bam -m mapping.tsv", + ) + parser.add_argument( + "-i", + "--input", + metavar="INPUT", + help="Input BAM file", + ) + parser.add_argument( + "-m", + "--mapping", + metavar="MAP", + help="Tab-separated mapping file; column 1 is the current reference name and column 3 is the replacement name", + ) + return parser.parse_args(argv) + +def main(): + args = parse_args() + input_bam = args.input + mapping_tsv = args.mapping + + # Load the mapping from the TSV file + mapping = {} + with open(mapping_tsv, 'r') as f: + for line in f: + old, new, chr = line.strip().split('\t') + mapping[old] = chr + + # Read the current header from the BAM file + bam = pysam.AlignmentFile(input_bam, "rb") + header_dict = bam.header.to_dict() + + # Update the reference names in the header + for ref in header_dict['SQ']: + old_name = ref['SN'] + if old_name in mapping: + ref['SN'] = mapping[old_name] + bam.close() + header = pysam.AlignmentHeader.from_dict(header_dict) + print(header) + # # Write the new header to a SAM file + # with open(new_header_sam, 'w') as f: + # f.write(pysam.AlignmentHeader.from_dict(header_dict).to_string()) + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/reheader_fasta.sh b/bin/reheader_fasta.sh new file mode 100755 index 0000000..f2ff8a2 --- /dev/null +++ b/bin/reheader_fasta.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +############################################################################################## +# Copyright 2022 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. +# + +usage() { + cat << EOF + +Help message for reheader_fasta.sh: + +DESCRIPTION: + Processes a FASTA file to modify its headers. In each header: + - Replaces the first space with a "|" + - Replaces all subsequent spaces with "_" + - Ignores everything after the first comma + +NOTES: + - This script is designed to work with FASTA files that have descriptive headers. + +OPTIONS: + -h Show this help message + -i FILE Input FASTA file + -o FILE Output FASTA file where the processed headers will be written + +USAGE: + bash reheader_fasta.sh -i INPUT_FASTA -o OUTPUT_FASTA + Example: bash reheader_fasta.sh -i input.fasta -o output.fasta + +EOF +} + +while getopts "hi:o:" OPTION +do + case $OPTION in + h) usage; exit 0 ;; + i) INPUT_FASTA="$OPTARG" ;; + o) OUTPUT_FASTA="$OPTARG" ;; + ?) 
usage; exit 1 ;; + esac +done + +if [ -z "$INPUT_FASTA" ] || [ -z "$OUTPUT_FASTA" ]; then + echo "Both input and output files must be specified." + usage + exit 1 +fi + +awk -F, 'BEGIN{OFS=""} /^>/{split($1, a, " "); gsub(" ", "_", $1); print a[1]"|"substr($1, length(a[1])+2)} !/^>/{print}' "$INPUT_FASTA" > "$OUTPUT_FASTA" + +echo "Processed FASTA file saved to $OUTPUT_FASTA" diff --git a/bin/sam_to_confidence.py b/bin/sam_to_confidence.py index 76d1247..e55b7ca 100755 --- a/bin/sam_to_confidence.py +++ b/bin/sam_to_confidence.py @@ -54,6 +54,15 @@ def parse_args(argv=None): required=False, help="Output accession to name tsv. 2 columns minimum. must be in 1 col for accession second is name", ) + parser.add_argument( + "-s", + "--splitidx", + metavar="SPLITIDX", + default=None, + required=False, + type=int, + help="If using -m mapping, split the index of the chr and get element. Split on | by default. 0 based index.", + ) parser.add_argument( "-o", "--file_out", @@ -150,9 +159,10 @@ def fm(x): # Calculate statistics and output -def output_statistics(depth, total_positions, coverage, count, sumsq, ireads, ilen, total_reads_aligned, FILE_OUT, mapping): +def output_statistics(depth, total_positions, coverage, count, sumsq, ireads, ilen, total_reads_aligned, FILE_OUT, mapping, splitidx): headers = ["Accession", - "Name", + "Organism", + "Full Name", "Mean Depth", "Average Coverage", "Ref Size", @@ -181,11 +191,16 @@ def output_statistics(depth, total_positions, coverage, count, sumsq, ireads, il stdev = math.sqrt( abs(sumsq[ref] / count[ref] - (total_depth / count[ref]) ** 2)) abu_total_aligned = ireads[ref] / total_reads_aligned - if ref in mapping: - name = mapping[ref] + check_idx = ref + if splitidx or splitidx == 0: + check_idx = check_idx.split("|")[splitidx] + if check_idx in mapping: + organism = mapping[check_idx]['organism'] + fullname = mapping[check_idx]['fullname'] else: - name = ref - outstring = f"{ref}\t{name}\t{fm(avgDepth)}\t{fm(avg_coverage)}\t{ilen[ref]}\t{ireads[ref]}\t{fm(100*(ireads[ref]/total_reads_aligned))}\t{fm(stdev)}\t{fm(abu_total_aligned)}\t{':'.join([fm(x) for x in xCov])}" + organism = "N/A" + fullname = "N/A" + outstring = f"{check_idx}\t{organism}\t{fullname}\t{fm(avgDepth)}\t{fm(avg_coverage)}\t{ilen[ref]}\t{ireads[ref]}\t{fm(100*(ireads[ref]/total_reads_aligned))}\t{fm(stdev)}\t{fm(abu_total_aligned)}\t{':'.join([fm(x) for x in xCov])}" f.write(outstring+"\n") @@ -202,7 +217,15 @@ def main(argv=None): with open(args.mapping, 'r') as f: for line in f: cols = line.strip().split("\t") - mapping[cols[0]] = cols[1] + idx = cols[0] + organism = "N/A" + fullname = "N/A" + obj = dict(organism=organism, fullname=fullname) + if len(cols) > 2: + obj['organism'] = cols[2] + if len(cols) > 3: + obj['fullname'] = cols[3] + mapping[idx] = obj f.close() depth, total_positions = load_depth_file(DEPTHFILE) @@ -215,7 +238,7 @@ def ensure_bam_index(bam_path): coverage, count, sumsq, ireads, ilen, total_reads_aligned = process_bam_file( BAMFILE, depth, total_positions) output_statistics(depth, total_positions, coverage, count, - sumsq, ireads, ilen, total_reads_aligned, args.file_out, mapping) + sumsq, ireads, ilen, total_reads_aligned, args.file_out, mapping, args.splitidx) if __name__ == "__main__": diff --git a/bin/samtools_coverage.sh b/bin/samtools_coverage.sh new file mode 100755 index 0000000..181effa --- /dev/null +++ b/bin/samtools_coverage.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +############################################################################################## 
+# Copyright 2022 The Johns Hopkins University Applied Physics Laboratory LLC +# All rights reserved. +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +# OR OTHER DEALINGS IN THE SOFTWARE. +# + + +usage() { + cat << EOF + +Usage: $0 [options] + +This script runs samtools coverage on a BAM file and replaces reference names +in the output according to a provided mapping file. + +OPTIONS: + -h Show this help message + -b FILE Input BAM file + -m FILE Reference name mapping file (original and new names, tab-separated) + -a ARGS Extra args to attach to samtools coverage + -o FILE Output file for the modified samtools coverage results + +The mapping file should have two columns: +1. Original reference name (as in the BAM file) +2. New reference name + +Each line in the mapping file corresponds to one reference name mapping. + +Example: +bash $0 -b input.bam -m mapping.txt -o coverage_output.txt + +EOF +} + +# Default values for arguments +bam_file="" +mapping_file="" +output_file="" +args="" +# Parse command-line arguments +while getopts "hb:m:o:a:" opt; do + case ${opt} in + h ) + usage + exit 0 + ;; + b ) + bam_file=${OPTARG} + ;; + m ) + mapping_file=${OPTARG} + ;; + o ) + output_file=${OPTARG} + ;; + a ) + args=${OPTARG} + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Check for mandatory options +if [ -z "${bam_file}" ] || [ -z "${mapping_file}" ] || [ -z "${output_file}" ]; then + echo "Error: Missing required arguments." + usage + exit 1 +fi + +# Check if samtools is installed +if ! command -v samtools &> /dev/null; then + echo "Error: samtools could not be found. Please install samtools and try again." 
+ exit 1 +fi +# Run samtools coverage and process the output +samtools coverage $args "${bam_file}" | awk -v mapFile="${mapping_file}" ' + BEGIN { + FS=OFS="\t" + # Load the mapping from the mapping file + while ((getline < mapFile) > 0) { + map[$1] = $2; + # print $1,$2 + } + } + /^[^>]/ { + # Replace the reference name if it exists in the mapping + # split on space, get first index + split($1, a, " "); + + if (a[1] in map) { + $1 =a[1]" "map[a[1]]; + } + } {print} +' > "${output_file}" + +echo "Modified samtools coverage output saved to ${output_file}" diff --git a/conf/modules.config b/conf/modules.config index 5e1deec..3313bdf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -72,9 +72,7 @@ process { withName: SAMTOOLS_VIEW{ ext.prefix = { "${meta.id}.remove" } } - withName: SAMTOOLS_SORT{ - ext.prefix = { "${meta.id}.sort" } - } + withName: BCFTOOLS_CONSENSUS{ errorStrategy = 'ignore' ext.prefix = { "${meta.id}.consensus" } diff --git a/nextflow.config b/nextflow.config index 608c0c2..41f0b60 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,14 +25,19 @@ params { remoteblast = null filter = null remove_taxids = null - skip_features = null + get_features = null fuzzy = null refresh_download = null skip_krona = null k2_minimum_hit_groups = null reference_fasta = null + organisms = null + organisms_file = null skip_kraken2 = null - minmapq = null + get_variants = null + reference_assembly = null + minmapq = 12 + denovo_assembly = null skip_realignment = null skip_stats = null skip_multiqc = null diff --git a/nextflow_schema.json b/nextflow_schema.json index d9fa306..6028121 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -150,17 +150,37 @@ "help_text": "", "fa_icon": "fas fa-file-csv" }, - "refresh_download": { + "organisms": { + "type": "string", + "hint": "Options: Comma-separated taxids or organism names to match to ncbi assembly info. See --assembly for more info. If using organism names, make sure to set it to fuzzy-matching with --fuzzy ", + "default": null, + "description": "Either a string of taxids separated by a space, or organisms names with '' enclosure, separated by space for each enclosed name with ''", + "fa_icon": "fas fa-file-signature" + }, + "organisms_file": { + "type": "string", + "hint": "Options: [string or single column list of taxids or organism names]. If using organism names, make sure to set it to fuzzy-matching with --fuzzy ", + "default": null, + "description": "Either a file or string of taxids separated by a space, or organisms names with '' enclosure", + "fa_icon": "fas fa-file-signature" + }, + "fuzzy": { "type": "boolean", - "description": "Redownload all references from NCBI", + "description": "Fuzzy matching for organism names", "fa_icon": "fas fa-file-signature", "default": false }, - "default_download": { + "minmapq": { + "type": "number", + "default": 5, + "description": "Minimum MapQ Score for alignment", + "fa_icon": "fas fa-file-signature" + }, + "refresh_download": { "type": "boolean", - "description": "Download the default assembly file from NCBI using ncbi-genome-download tool", + "description": "Redownload all references from NCBI", "fa_icon": "fas fa-file-signature", - "default": true + "default": false }, "genome": { "type": "string", @@ -223,21 +243,38 @@ "fa_icon": "fas fa-file-signature", "default": false }, + "skip_kraken2": { + "type": "boolean", + "description": "Skip Kraken2. Will not work with \"unknown taxa\" mode. 
You must provide a reference FASTA if this is toggled on.", + "fa_icon": "fas fa-file-signature", + "default": false + }, "skip_krona": { "type": "boolean", "description": "Skip Krona Plots", "fa_icon": "fas fa-file-signature", "default": false }, - "skip_assembly": { + "denovo_assembly": { "type": "boolean", - "description": "Skip De Novo Assembly", + "description": "Perform De Novo Assembly", "fa_icon": "fas fa-file-signature", "default": true }, - "skip_features": { + "reference_assembly": { + "type": "boolean", + "description": "Perform Reference-Based Assembly. Performs variant analysis by default (mandatory)", + "fa_icon": "fas fa-file-signature", + "default": true + }, + "get_features": { + "type": "boolean", + "description": "Start generating annotations from alignments that take place. Requires LOTS of RAM for large data files/genomes", + "fa_icon": "fas fa-file-signature" + }, + "get_variants": { "type": "boolean", - "description": "Skip generating annotations from alignments that take place", + "description": "Start generating variant call analysis relative to aligned reads to references", "fa_icon": "fas fa-file-signature" }, "skip_multiqc": { diff --git a/params.json b/params.json index e8587f4..3bbcca7 100644 --- a/params.json +++ b/params.json @@ -6,22 +6,23 @@ "top_hits_count": 3, "demux": true, "assembly": "examples/assembly_summary_refseq.txt", - "skip_assembly": true, + "reference_assembly": false, "max_memory": "10GB", "max_cpus": 3, "low_memory": false, "taxtab": "default", "remove_taxids": "9606", "profile": "docker", - "top_per_taxa": "10239:20:S 2:20:S", "blastdb": null, "remoteblast": false, "skip_fastp": false, - "skip_features": false, + "get_features": false, + "get_variants": false, "fuzzy": false, + "minmapq": 5, "refresh_download": false, "skip_stats": false, - "skip_plots": true, + "skip_plots": false, "skip_consensus": false, "skip_variants": false, "subsample": null diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf index d1283b6..1909743 100644 --- a/subworkflows/local/alignment.nf +++ b/subworkflows/local/alignment.nf @@ -11,7 +11,7 @@ include { SAMTOOLS_DEPTH } from '../../modules/nf-core/samtools/depth/main' include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' include { SAMTOOLS_COVERAGE } from '../../modules/nf-core/samtools/coverage/main' -include { SAMTOOLS_COVERAGE as SAMTOOLS_HIST_COVERAGE } from '../../modules/nf-core/samtools/coverage/main' +include { SAMTOOLS_HIST_COVERAGE } from '../../modules/local/samtools_hist_coverage' include { BCFTOOLS_CONSENSUS } from '../../modules/nf-core/bcftools/consensus/main' include { BCFTOOLS_MPILEUP as BCFTOOLS_MPILEUP_ILLUMINA } from '../../modules/nf-core/bcftools/mpileup/main' include { BCFTOOLS_INDEX } from '../../modules/nf-core/bcftools/index/main' @@ -23,7 +23,6 @@ include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build/main' include { RSEQC_BAMSTAT } from '../../modules/nf-core/rseqc/bamstat/main' include { BEDTOOLS_DEPTHCOVERAGE } from '../../modules/local/bedtools_coverage' -include { SAMTOOLS_SORT } from '../../modules/local/samtools_sort' workflow ALIGNMENT { take: @@ -55,18 +54,18 @@ workflow ALIGNMENT { SAMTOOLS_FAIDX( - fastq_reads.map{ m, report, fastq, fasta, bed -> return [ m, fasta ] }, - fastq_reads.map{ m, report, fastq, fasta, bed -> return [ m, [] ] }, + fastq_reads.map{ m, 
fastq, fasta, bed, map -> return [ m, fasta ] }, + fastq_reads.map{ m, fastq, fasta, bed, map -> return [ m, [] ] }, ) BOWTIE2_BUILD( - ch_aligners.shortreads.map{ m, report, fastq, fasta, bed -> + ch_aligners.shortreads.map{ m, fastq, fasta, bed, map -> return [ m, fasta ] } ) BOWTIE2_ALIGN( - ch_aligners.shortreads.map{ m, report, fastq, fasta, bed -> + ch_aligners.shortreads.map{ m, fastq, fasta, bed, map -> return [ m, fastq ] }, BOWTIE2_BUILD.out.index, @@ -78,8 +77,8 @@ workflow ALIGNMENT { ) MINIMAP2_ALIGN( - ch_aligners.longreads.map{ m, report, fastq, fasta, bed -> [ m, fastq ] }, - ch_aligners.longreads.map{ meta, report, fastq, fasta, bed -> fasta }, + ch_aligners.longreads.map{ m, fastq, fasta, bed, map -> [ m, fastq ] }, + ch_aligners.longreads.map{ m, fastq, fasta, bed, map -> fasta }, true, true, true @@ -89,53 +88,54 @@ workflow ALIGNMENT { ) nanopore_alignments.mix(illumina_alignments).map{ - m, report, fastq, fasta, bed, bam -> [ m, bam ] + m, fastq, fasta, bed, map, bam -> [ m, bam ] }.set { collected_bams } - SAMTOOLS_SORT( - collected_bams - ) + SAMTOOLS_INDEX( - SAMTOOLS_SORT.out.bam + collected_bams ) - sorted_bams = SAMTOOLS_SORT.out.bam + sorted_bams = collected_bams + SAMTOOLS_DEPTH( sorted_bams ) - SAMTOOLS_SORT.out.bam.join(SAMTOOLS_INDEX.out.bai).set{ sorted_bams_with_index } + collected_bams.join(SAMTOOLS_INDEX.out.bai).set{ sorted_bams_with_index } SAMTOOLS_COVERAGE( sorted_bams_with_index ) + gcf_with_bam = sorted_bams.join(fastq_reads.map{ m, fastq, fasta, bed, map -> return [m, map] }) SAMTOOLS_HIST_COVERAGE ( - sorted_bams_with_index + gcf_with_bam ) - sorted_bams_with_index.join(fastq_reads.map{ m, report, fastq, fasta, bed -> return [m, fasta] }).set{ sorted_bams_with_index_fasta } + sorted_bams_with_index.join(fastq_reads.map{ m, fastq, fasta, bed, map -> return [m, fasta] }).set{ sorted_bams_with_index_fasta } - if (!params.skip_features){ + if (params.get_features){ ch_merged_bed = sorted_bams.join( - fastq_reads.map{ m, report, fastq, fasta, bed -> return [m, bed] } + fastq_reads.map{ m, fastq, fasta, bed, map -> return [m, bed] } ).map{ meta, bam, bed -> [meta, bed, bam] + }.filter{ + it[1] != null } BEDTOOLS_DEPTHCOVERAGE( ch_merged_bed ) - // ch_coverage = BEDTOOLS_DEPTHCOVERAGE.out.coverage } // branch out the samtools_sort output to nanopore and illumina - ch_sorted_bam_split = sorted_bams.join(fastq_reads.map{ m, report, fastq, fasta, bed -> return [m, fasta] }) + ch_sorted_bam_split = sorted_bams.join(fastq_reads.map{ m, fastq, fasta, bed, map -> return [m, fasta] }) ch_sorted_bam_split.branch{ longreads: it[0].platform =~ 'OXFORD' @@ -143,7 +143,7 @@ workflow ALIGNMENT { }.set { ch_sorted_bam_split } - if (!params.skip_variants){ + if (params.get_variants || params.reference_assembly){ BCFTOOLS_MPILEUP_OXFORD( ch_sorted_bam_split.longreads.map{ m, bam, fasta -> [m, bam] }, ch_sorted_bam_split.longreads.map{ m, bam, fasta -> fasta }, @@ -194,14 +194,13 @@ workflow ALIGNMENT { // ) // ch_stats = BCFTOOLS_STATS.out.stats // } - if (!params.skip_consensus){ + if (params.reference_assembly){ BCFTOOLS_CONSENSUS( ch_merged_mpileup ) ch_fasta = BCFTOOLS_CONSENSUS.out.fasta } } - RSEQC_BAMSTAT( sorted_bams ) diff --git a/workflows/taxtriage.nf b/workflows/taxtriage.nf index 55b5416..bec6882 100644 --- a/workflows/taxtriage.nf +++ b/workflows/taxtriage.nf @@ -76,12 +76,24 @@ if (params.assembly && ch_assembly_txt.isEmpty()) { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// // // // +// // // // MODULE: 
MultiQC +// // // // +workflow_summary = WorkflowTaxtriage.paramsSummaryMultiqc(workflow, summary_params) +ch_workflow_summary = Channel.value(workflow_summary) ch_multiqc_config = file("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_css = file("$projectDir/assets/mqc.css", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) : Channel.empty() + +ch_multiqc_files = Channel.empty() +ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) +ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) +ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) +ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_css)) ch_merged_table_config = Channel.fromPath("$projectDir/assets/table_explanation_mqc.yml", checkIfExists: true) -ch_alignment_stats = Channel.empty() +ch_multiqc_files = ch_multiqc_files.mix(ch_merged_table_config.collect().ifEmpty([])) + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS @@ -145,6 +157,10 @@ include { DOWNLOAD_ASSEMBLY } from '../modules/local/download_assembly' include { NCBIGENOMEDOWNLOAD_FEATURES } from '../modules/local/get_feature_tables' include { FEATURES_TO_BED } from '../modules/local/convert_features_to_bed' include { CONFIDENCE_MERGE } from '../modules/local/merge_confidence_contigs' +include { MAP_GCF } from '../modules/local/map_gcfs' +include { FEATURES_DOWNLOAD } from '../modules/local/download_features' +include { REFERENCE_REHEADER } from '../modules/local/reheader' +include { MAP_LOCAL_ASSEMBLY_TO_FASTA } from '../modules/local/map_assembly_to_fasta' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -222,17 +238,9 @@ workflow TAXTRIAGE { ch_accession_mapping = Channel.empty() ch_empty_file = file("$projectDir/assets/NO_FILE") - ch_hit_to_kraken_report = Channel.empty() - // // // // - // // // // MODULE: MultiQC - // // // // - workflow_summary = WorkflowTaxtriage.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_css)) + + + // // // // // // // // // // // // @@ -270,21 +278,28 @@ workflow TAXTRIAGE { // // // // MODULE: Run FastQC or Porechop, Trimgalore // // // ch_porechop_out = Channel.empty() + ch_fastp_reads = Channel.empty() + ch_fastp_html = Channel.empty() + if (params.trim) { nontrimmed_reads = ch_reads.filter { !it[0].trim } TRIMGALORE( ch_reads.filter { it[0].platform == 'ILLUMINA' && it[0].trim } ) + + ch_multiqc_files = ch_multiqc_files.mix(TRIMGALORE.out.reads.collect { it[1] }.ifEmpty([])) + PORECHOP( ch_reads.filter { it[0].platform == 'OXFORD' && it[0].trim } ) ch_porechop_out = PORECHOP.out.reads - trimmed_reads = TRIMGALORE.out.reads.mix(PORECHOP.out.reads) ch_reads = nontrimmed_reads.mix(trimmed_reads) + ch_multiqc_files = 
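+            // Collect trimming reports for MultiQC as soon as they are produced, rather than
+            // in one aggregation block at the end of the workflow.
+            // NOTE: ch_fastp_html appears to still be empty at this point; FASTP only
+            // populates it further down, so its report may not reach MultiQC from here.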
ch_multiqc_files.mix(ch_porechop_out.collect { it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_fastp_html.collect { it[1] }.ifEmpty([])) + } - ch_fastp_reads = Channel.empty() - ch_fastp_html = Channel.empty() + if (!params.skip_fastp) { FASTP( ch_reads, @@ -312,141 +327,194 @@ workflow TAXTRIAGE { NANOPLOT( ch_reads.filter { it[0].platform =~ /(?i)OXFORD/ } ) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect { it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(NANOPLOT.out.txt.collect { it[1] }.ifEmpty([])) } + ch_filtered_reads = ch_reads - // // // // // // - // // // // // // MODULE: Run Kraken2 - // // // // // // - KRAKEN2_KRAKEN2( - ch_reads, - ch_db, - true, - true - ) + if (!params.skip_kraken2){ + // // // // // // + // // // // // // MODULE: Run Kraken2 + // // // // // // + KRAKEN2_KRAKEN2( + ch_reads, + ch_db, + true, + true + ) - ch_kraken2_report = KRAKEN2_KRAKEN2.out.report + ch_kraken2_report = KRAKEN2_KRAKEN2.out.report - KREPORT_TO_KRONATXT( - ch_kraken2_report - ) - ch_krona_txt = KREPORT_TO_KRONATXT.out.txt - ch_combined = ch_krona_txt - .map{ it[1] } // Get the file path - .collect() // Collect all file parts into a list - .map { files -> - // Join the files with single quotes and space - String joinedFiles = files.collect { "'$it'" }.join(' ') - [[id:'combined_krona_kreports'], joinedFiles] // Combine with new ID - } - - KRONA_KTIMPORTTEXT( - ch_combined - ) + KREPORT_TO_KRONATXT( + ch_kraken2_report + ) + ch_krona_txt = KREPORT_TO_KRONATXT.out.txt + ch_combined = ch_krona_txt + .map{ it[1] } // Get the file path + .collect() // Collect all file parts into a list + .map { files -> + // Join the files with single quotes and space + String joinedFiles = files.collect { "'$it'" }.join(' ') + [[id:'combined_krona_kreports'], joinedFiles] // Combine with new ID + } + + KRONA_KTIMPORTTEXT( + ch_combined + ) - if (params.remove_taxids) { - remove_input = ch_kraken2_report.map { - meta, report -> [ - meta, report, params.remove_taxids - ] + if (params.remove_taxids) { + remove_input = ch_kraken2_report.map { + meta, report -> [ + meta, report, params.remove_taxids + ] + } + REMOVETAXIDSCLASSIFICATION( + remove_input + ) + ch_kraken2_report = REMOVETAXIDSCLASSIFICATION.out.report } - REMOVETAXIDSCLASSIFICATION( - remove_input + + TOP_HITS( + ch_kraken2_report + ) + MERGEDKRAKENREPORT( + TOP_HITS.out.krakenreport.map { meta, file -> file }.collect() + ) + + FILTERKRAKEN( + MERGEDKRAKENREPORT.out.krakenreport ) - ch_kraken2_report = REMOVETAXIDSCLASSIFICATION.out.report + ch_filtered_reads = KRAKEN2_KRAKEN2.out.classified_reads_fastq.map { m, r-> [m, r.findAll { it =~ /.*\.classified.*(fq|fastq)(\.gz)?/ }] } + + if (params.fuzzy){ + ch_organisms = TOP_HITS.out.names + } else { + ch_organisms = TOP_HITS.out.taxids + } + + ch_multiqc_files = ch_multiqc_files.mix(MERGEDKRAKENREPORT.out.krakenreport.collect().ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_kraken2_report.collect { it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FILTERKRAKEN.out.reports.collect().ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TOP_HITS.out.krakenreport.collect { it[1] }.ifEmpty([])) + + } else if (params.organisms_file){ + // check if params.organisms is a file or a string + ch_organisms = Channel.fromPath(params.organisms_file, checkIfExists: true) + } else if (params.organisms) { + ch_organisms = Channel.from(params.organisms) } - TOP_HITS( - ch_kraken2_report - ) - MERGEDKRAKENREPORT( - TOP_HITS.out.krakenreport.map { 
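+                // drop the per-sample meta map and collect every top-hits kraken report
+                // so MERGEDKRAKENREPORT can build a single combined table across samples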
meta, file -> file }.collect()
-    )
-    FILTERKRAKEN(
-        MERGEDKRAKENREPORT.out.krakenreport
-    )
-    ch_filtered_reads = KRAKEN2_KRAKEN2.out.classified_reads_fastq.map { m, r-> [m, r.findAll { it =~ /.*\.classified.*(fq|fastq)(\.gz)?/ }] }
-    ch_hit_to_kraken_report = TOP_HITS.out.tops.join(ch_filtered_reads)
    ch_accessions = Channel.empty()
    ch_bedfiles = Channel.empty()
-    ch_hit_to_kraken_report_merged = Channel.empty()
    ch_bedfiles_or_default = Channel.empty()
+    ch_alignment_stats = Channel.empty()
+    ch_mapped_assemblies = Channel.empty()
+
+
    if (params.reference_fasta) {
        // // format of the FASTA file MUST be "kraken:taxid|" in each reference accession
        ch_reference_fasta = params.reference_fasta ? Channel.fromPath(params.reference_fasta, checkIfExists: true) : Channel.empty()
-        ch_hit_to_kraken_report = ch_hit_to_kraken_report.combine(
+
+        // merge ch_reference_fasta on all of the krakenreports. single channel merged to multiple
+        ch_filtered_reads = ch_filtered_reads.combine(
            ch_reference_fasta
        )
+
+        MAP_LOCAL_ASSEMBLY_TO_FASTA(
+            ch_filtered_reads.map {
+                meta, readsclass, fasta -> return [ meta, fasta ]
+            },
+            ch_assembly_txt
+        )
+
+        ch_mapped_assemblies = MAP_LOCAL_ASSEMBLY_TO_FASTA.out.map
+        ch_accessions = MAP_LOCAL_ASSEMBLY_TO_FASTA.out.accessions
    } else {
+
+        if (params.organisms || params.organisms_file){
+            ch_pre_download = ch_filtered_reads.combine(ch_organisms)
+        } else {
+            ch_pre_download = ch_filtered_reads.map {
+                meta, readsclass -> return [ meta, readsclass ]
+            }.join(ch_organisms)
+        }
+
        DOWNLOAD_ASSEMBLY(
-            ch_hit_to_kraken_report.map {
-                meta, report, readsclass -> [ meta, report ]
+            ch_pre_download.map {
+                meta, readsclass, report -> return [ meta, report ]
            },
            ch_assembly_txt
        )
-        ch_hit_to_kraken_report = ch_hit_to_kraken_report.join(
-            DOWNLOAD_ASSEMBLY.out.fasta
-        )
+        ch_filtered_reads = ch_filtered_reads.join(DOWNLOAD_ASSEMBLY.out.fasta)
+
+        // MAP_GCF(
+        //     DOWNLOAD_ASSEMBLY.out.fasta.map({ meta, fasta -> return [ meta, fasta ] }),
+        //     ch_assembly_txt
+        // )
        ch_accessions = DOWNLOAD_ASSEMBLY.out.accessions
-        ch_accession_mapping = DOWNLOAD_ASSEMBLY.out.mappings
+        ch_mapped_assemblies = DOWNLOAD_ASSEMBLY.out.mappings
-        if (!params.skip_features){
-            NCBIGENOMEDOWNLOAD_FEATURES(
-                ch_accessions
-            )
+    }
+    if (params.get_features){
-            FEATURES_TO_BED(
-                NCBIGENOMEDOWNLOAD_FEATURES.out.features
-            )
+        FEATURES_DOWNLOAD(
+            ch_accessions,
+            ch_assembly_txt
+        )
-            ch_bedfiles = FEATURES_TO_BED.out.bed
-        }
-    }
+        FEATURES_TO_BED(
+            FEATURES_DOWNLOAD.out.features
+        )
+        ch_bedfiles = FEATURES_TO_BED.out.bed
+    }
    if (!params.skip_realignment) {
-        // Combine ch_hit_to_kraken_report with ch_bedfiles_or_default
-        if (params.skip_features){
-            ch_hit_to_kraken_report = ch_hit_to_kraken_report.map {
-                meta, report, readsclass, fasta -> [ meta, report, readsclass, fasta, null ]
-            }
+        if (params.get_features){
+            ch_reads_to_align = ch_filtered_reads.join(ch_bedfiles, remainder: true)
        } else {
-            ch_hit_to_kraken_report = ch_hit_to_kraken_report.join(ch_bedfiles)
+            ch_reads_to_align = ch_filtered_reads.map {
+                meta, reads, fasta -> [ meta, reads, fasta, null ]
+            }
        }
+        ch_reads_to_align = ch_reads_to_align.join(ch_mapped_assemblies, remainder: true)
+
        ALIGNMENT(
-            ch_hit_to_kraken_report
+            ch_reads_to_align
        )
        ch_alignment_stats = ALIGNMENT.out.stats
+        ch_multiqc_files = ch_multiqc_files.mix(ch_alignment_stats.collect { it[1] }.ifEmpty([]))
+
        ch_bamstats = ALIGNMENT.out.bamstats
        ch_depth = ALIGNMENT.out.depth
        ch_alignment_outmerg = ALIGNMENT.out.bams.join(ALIGNMENT.out.depth)
-        // ch_alignment_outmerg = ch_alignment_outmerg.join(Channel.empty(), remainder: true)
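+        // Each BAM is paired with its depth table here; the accession-to-assembly mapping
+        // is attached next (ch_empty_file stands in when no mapping exists) before the
+        // confidence metrics are computed.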
        ch_combined = ch_alignment_outmerg
-            .join(ch_accession_mapping, by: 0, remainder: true)
+            .join(ch_mapped_assemblies, by: 0, remainder: true)
            .map { meta, bam, depth, mapping ->
                // If mapping is not present, replace it with null or an empty placeholder
                return [meta, bam, depth, mapping ?: ch_empty_file]
            }
-        CONFIDENCE_METRIC(
-            ch_combined
-        )
-        CONFIDENCE_MERGE(
-            CONFIDENCE_METRIC.out.tsv
-        )
+        if (!params.skip_confidence) {
+            CONFIDENCE_METRIC(
+                ch_combined
+            )
+
+            CONFIDENCE_MERGE(
+                CONFIDENCE_METRIC.out.tsv
+            )
-        ch_joined_confidence_report = KRAKEN2_KRAKEN2.out.report.join(
-            CONFIDENCE_MERGE.out.confidence
-        )
-        if (!params.skip_confidence) {
            CONVERT_CONFIDENCE(
-                ch_joined_confidence_report
+                CONFIDENCE_MERGE.out.confidence
            )
            MERGE_CONFIDENCE(
@@ -454,11 +522,13 @@ workflow TAXTRIAGE {
            )
            ch_mergedtsv = MERGE_CONFIDENCE.out.confidence_report
+            ch_multiqc_files = ch_multiqc_files.mix(ch_mergedtsv.collect().ifEmpty([]))
+
        }
    }
-    if (!params.skip_assembly) {
+    if (params.denovo_assembly) {
-        illumina_reads = ch_hit_to_kraken_report.filter {
+        illumina_reads = ch_filtered_reads.filter {
            it[0].platform == 'ILLUMINA'
        }.map {
            meta, reads -> [meta, reads, [], []]
@@ -467,9 +537,8 @@ workflow TAXTRIAGE {
            illumina_reads
        )
-        println('____________________________')
-        nanopore_reads = ch_hit_to_kraken_report.filter {
+        nanopore_reads = ch_filtered_reads.filter {
            it[0].platform == 'OXFORD'
        }.map {
            meta, reads -> [meta, [], [], [], reads]
@@ -487,24 +556,6 @@ workflow TAXTRIAGE {
    // // //
    // // // MODULE: MultiQC Pt 2
    // // //
-
-    ch_multiqc_files = ch_multiqc_files.mix(MERGEDKRAKENREPORT.out.krakenreport.collect().ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(FILTERKRAKEN.out.reports.collect().ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(TOP_HITS.out.krakenreport.collect { it[1] }.ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_alignment_stats.collect { it[1] }.ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_porechop_out.collect { it[1] }.ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_merged_table_config.collect().ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_fastp_html.collect { it[1] }.ifEmpty([]))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_kraken2_report.collect { it[1] }.ifEmpty([]))
-    if (params.trim) {
-        ch_multiqc_files = ch_multiqc_files.mix(TRIMGALORE.out.reads.collect { it[1] }.ifEmpty([]))
-    }
-    if (!params.skip_plots) {
-        ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect { it[1] }.ifEmpty([]))
-        ch_multiqc_files = ch_multiqc_files.mix(NANOPLOT.out.txt.collect { it[1] }.ifEmpty([]))
-    }
-    ch_multiqc_files = ch_multiqc_files.mix(ch_mergedtsv.collect().ifEmpty([]))
-
    // Unused or Incomplete
    // if (params.blastdb && !params.remoteblast){
    //     ch_multiqc_files = ch_multiqc_files.mix(BLAST_BLASTN.out.txt.collect{it[1]}.ifEmpty([]))
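+    // Example invocation using the renamed toggles (a sketch: the --input/--db names and the
+    // paths are illustrative placeholders; only the assembly/feature/variant flags and the
+    // docker profile come from this changeset):
+    //   nextflow run . -profile docker --input samplesheet.csv --db /path/to/kraken2_db \
+    //       --reference_assembly --get_variants --get_features --denovo_assembly false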