diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e0d2429..932a6d86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,22 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## 2.5.4 - [2024-02-12]
+
+### `Added`
+
+### `Changed`
+
+- [#581](https://github.com/nf-core/mag/pull/581) - Added explicit licence text to headers of all custom scripts (reported by @FriederikeHanssen and @maxibor, fix by @jfy133)
+
+### `Fixed`
+
+- [#583](https://github.com/nf-core/mag/pull/583) - Fix GTDB database input when directory supplied (fix by @jfy133)
+
+### `Dependencies`
+
+### `Deprecated`
+
## 2.5.3 - [2024-02-05]
### `Added`
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 234d1d8d..ff9fc6fb 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,7 +1,7 @@
report_comment: >
- This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.3" target="_blank">nf-core/mag</a>
+ This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.4" target="_blank">nf-core/mag</a>
analysis pipeline. For information about how to interpret these results, please see the
- <a href="https://nf-co.re/mag/2.5.3/docs/output" target="_blank">documentation</a>.
+ <a href="https://nf-co.re/mag/2.5.4/docs/output" target="_blank">documentation</a>.
report_section_order:
"nf-core-mag-methods-description":
order: -1000
diff --git a/bin/combine_tables.py b/bin/combine_tables.py
index b867ed73..a2dcf986 100755
--- a/bin/combine_tables.py
+++ b/bin/combine_tables.py
@@ -1,5 +1,9 @@
#!/usr/bin/env python
+## Originally written by Daniel Straub and Sabrina Krakau and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
+
import sys
import argparse
import os.path
@@ -8,11 +12,25 @@
def parse_args(args=None):
parser = argparse.ArgumentParser()
- parser.add_argument("-d", "--depths_summary", required=True, metavar="FILE", help="Bin depths summary file.")
- parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.")
- parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.")
- parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
- parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")
+ parser.add_argument(
+ "-d",
+ "--depths_summary",
+ required=True,
+ metavar="FILE",
+ help="Bin depths summary file.",
+ )
+ parser.add_argument(
+ "-b", "--busco_summary", metavar="FILE", help="BUSCO summary file."
+ )
+ parser.add_argument(
+ "-c", "--checkm_summary", metavar="FILE", help="CheckM summary file."
+ )
+ parser.add_argument(
+ "-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file."
+ )
+ parser.add_argument(
+ "-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file."
+ )
parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
parser.add_argument(
"-o",
@@ -45,7 +63,14 @@ def parse_cat_table(cat_table):
for line in f:
maxcol = max(maxcol, len(line.split("\t")))
- header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"]
+ header = [
+ "bin",
+ "classification",
+ "reason",
+ "lineage",
+ "lineage scores",
+ "full lineage names",
+ ]
df = pd.read_table(
cat_table,
@@ -55,7 +80,11 @@ def parse_cat_table(cat_table):
skiprows=1,
)
# merge all rank columns into a single column
- df["CAT_rank"] = df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip()
+ df["CAT_rank"] = (
+ df.filter(regex="rank_\d+")
+ .apply(lambda x: ";".join(x.dropna()), axis=1)
+ .str.lstrip()
+ )
# remove rank_* columns
df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True)
@@ -65,21 +94,34 @@ def parse_cat_table(cat_table):
def main(args=None):
args = parse_args(args)
- if not args.busco_summary and not args.checkm_summary and not args.quast_summary and not args.gtdbtk_summary:
- sys.exit("No summary specified! Please specify at least BUSCO, CheckM or QUAST summary.")
+ if (
+ not args.busco_summary
+ and not args.checkm_summary
+ and not args.quast_summary
+ and not args.gtdbtk_summary
+ ):
+ sys.exit(
+ "No summary specified! Please specify at least BUSCO, CheckM or QUAST summary."
+ )
# GTDB-Tk can only be run in combination with BUSCO or CheckM
if args.gtdbtk_summary and not (args.busco_summary or args.checkm_summary):
- sys.exit("Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM summary!")
+ sys.exit(
+ "Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM summary!"
+ )
# handle bin depths
results = pd.read_csv(args.depths_summary, sep="\t")
- results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns]
+ results.columns = [
+ "Depth " + str(col) if col != "bin" else col for col in results.columns
+ ]
bins = results["bin"].sort_values().reset_index(drop=True)
if args.busco_summary:
busco_results = pd.read_csv(args.busco_summary, sep="\t")
- if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)):
+ if not bins.equals(
+ busco_results["GenomeBin"].sort_values().reset_index(drop=True)
+ ):
sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!")
results = pd.merge(
results, busco_results, left_on="bin", right_on="GenomeBin", how="outer"
@@ -107,7 +149,9 @@ def main(args=None):
]
checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t")
checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa"
- if not bins.equals(checkm_results["Bin Id"].sort_values().reset_index(drop=True)):
+ if not bins.equals(
+ checkm_results["Bin Id"].sort_values().reset_index(drop=True)
+ ):
sys.exit("Bins in CheckM summary do not match bins in bin depths summary!")
results = pd.merge(
results, checkm_results, left_on="bin", right_on="Bin Id", how="outer"
@@ -116,7 +160,9 @@ def main(args=None):
if args.quast_summary:
quast_results = pd.read_csv(args.quast_summary, sep="\t")
- if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)):
+ if not bins.equals(
+ quast_results["Assembly"].sort_values().reset_index(drop=True)
+ ):
sys.exit("Bins in QUAST summary do not match bins in bin depths summary!")
results = pd.merge(
results, quast_results, left_on="bin", right_on="Assembly", how="outer"
@@ -134,7 +180,13 @@ def main(args=None):
cat_results = parse_cat_table(args.cat_summary)
if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0:
sys.exit("Bins in CAT summary do not match bins in bin depths summary!")
- results = pd.merge(results, cat_results[["bin", "CAT_rank"]], left_on="bin", right_on="bin", how="outer")
+ results = pd.merge(
+ results,
+ cat_results[["bin", "CAT_rank"]],
+ left_on="bin",
+ right_on="bin",
+ how="outer",
+ )
results.to_csv(args.out, sep="\t")
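The reflowed chain in `parse_cat_table` is the one functional subtlety in this file: CAT writes a variable number of `rank_*` columns, and the script collapses them into a single semicolon-joined `CAT_rank` column before dropping the originals. A minimal sketch of the same pandas pattern, on invented data rather than a real CAT table:

```python
import pandas as pd

# Toy frame standing in for a parsed CAT table: a variable number of
# rank_<n> columns with some missing values (invented, not real CAT output).
df = pd.DataFrame(
    {
        "bin": ["bin.1.fa", "bin.2.fa"],
        "rank_1": ["Bacteria", "Bacteria"],
        "rank_2": ["Firmicutes", None],
    }
)

# Same pattern as the diff: select the rank_* columns by regex, join the
# non-null values of each row with ';', and strip leading whitespace.
df["CAT_rank"] = (
    df.filter(regex=r"rank_\d+")
    .apply(lambda x: ";".join(x.dropna()), axis=1)
    .str.lstrip()
)

# Drop the now-redundant per-rank columns, as the script does.
df.drop(df.filter(regex=r"rank_\d+").columns, axis=1, inplace=True)
print(df)  # bin.1.fa -> "Bacteria;Firmicutes", bin.2.fa -> "Bacteria"
```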
diff --git a/bin/domain_classification.R b/bin/domain_classification.R
index eb64b312..33530ca5 100755
--- a/bin/domain_classification.R
+++ b/bin/domain_classification.R
@@ -1,7 +1,7 @@
#!/usr/bin/env Rscript
-# Written by Jim Downie and released under the MIT license.
-# See git repository (https://github.com/nf-core/mag) for full license text.
+## Written by Jim Downie and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
library(optparse)
library(tidyverse)
diff --git a/bin/filter_ssu.py b/bin/filter_ssu.py
index 7e89989b..5e4675e4 100755
--- a/bin/filter_ssu.py
+++ b/bin/filter_ssu.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python
+## Originally written by Hadrien Gourlé and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
from __future__ import print_function
import os
@@ -28,10 +31,16 @@ def filter(args):
def main():
- parser = argparse.ArgumentParser(prog="filter_ssu.py", usage="filter ssu hits from refinem")
+ parser = argparse.ArgumentParser(
+ prog="filter_ssu.py", usage="filter ssu hits from refinem"
+ )
parser.add_argument("--evalue", help="evalue threshold")
- parser.add_argument("ssu", metavar="ssu.tsv", help="ssu tsv file generated by refinem")
- parser.add_argument("output", metavar="output.tsv", default="output.tsv", help="output file name")
+ parser.add_argument(
+ "ssu", metavar="ssu.tsv", help="ssu tsv file generated by refinem"
+ )
+ parser.add_argument(
+ "output", metavar="output.tsv", default="output.tsv", help="output file name"
+ )
parser.set_defaults(func=filter)
args = parser.parse_args()
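One thing the reformatting makes easier to see (the diff itself does not change the behaviour): the `default="output.tsv"` on the `output` positional has no effect as written, because argparse only applies a default to a positional that `nargs="?"` has made optional. A small sketch of that distinction, reusing the script's argument names:

```python
import argparse

parser = argparse.ArgumentParser(prog="filter_ssu.py")
# With nargs="?" the positional becomes optional and the default applies;
# without it, argparse requires the argument and silently ignores the default.
parser.add_argument(
    "output",
    metavar="output.tsv",
    nargs="?",
    default="output.tsv",
    help="output file name",
)

print(parser.parse_args([]).output)          # -> output.tsv (default used)
print(parser.parse_args(["my.tsv"]).output)  # -> my.tsv
```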
diff --git a/bin/get_mag_depths.py b/bin/get_mag_depths.py
index 55d73ac4..43ce3539 100755
--- a/bin/get_mag_depths.py
+++ b/bin/get_mag_depths.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python
+## Originally written by Sabrina Krakau and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
import sys
import argparse
import os.path
@@ -14,7 +17,12 @@
def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument(
- "-b", "--bins", required=True, nargs="+", metavar="FILE", help="Bins: FASTA containing all contigs."
+ "-b",
+ "--bins",
+ required=True,
+ nargs="+",
+ metavar="FILE",
+ help="Bins: FASTA containing all contigs.",
)
parser.add_argument(
"-d",
@@ -23,9 +31,15 @@ def parse_args(args=None):
metavar="FILE",
help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].",
)
- parser.add_argument("-a", "--assembler", required=True, type=str, help="Assembler name.")
- parser.add_argument("-i", "--id", required=True, type=str, help="Sample or group id.")
- parser.add_argument("-m", "--binner", required=True, type=str, help="Binning method.")
+ parser.add_argument(
+ "-a", "--assembler", required=True, type=str, help="Assembler name."
+ )
+ parser.add_argument(
+ "-i", "--id", required=True, type=str, help="Sample or group id."
+ )
+ parser.add_argument(
+ "-m", "--binner", required=True, type=str, help="Binning method."
+ )
return parser.parse_args(args)
@@ -56,7 +70,9 @@ def main(args=None):
# Initialize output files
n_samples = len(sample_names)
- with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w") as outfile:
+ with open(
+ args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w"
+ ) as outfile:
print("bin", "\t".join(sample_names), sep="\t", file=outfile)
# for each bin, access contig depths and compute mean bin depth (for all samples)
@@ -77,10 +93,15 @@ def main(args=None):
all_depths[sample].append(contig_depths[sample])
binname = os.path.basename(file)
- with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a") as outfile:
+ with open(
+ args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a"
+ ) as outfile:
print(
binname,
- "\t".join(str(statistics.median(sample_depths)) for sample_depths in all_depths),
+ "\t".join(
+ str(statistics.median(sample_depths))
+ for sample_depths in all_depths
+ ),
sep="\t",
file=outfile,
)
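The rewrapped `print` call is the heart of this script: a bin's depth in each sample is the median of its contigs' depths, so one unusually covered contig cannot skew the bin estimate. A toy run of the same computation, with invented depth values:

```python
import statistics

# Hypothetical per-sample contig depths for one bin, mirroring the
# all_depths structure the script builds up contig by contig.
all_depths = [
    [10.0, 12.0, 11.0],  # sample 1
    [3.0, 4.0, 100.0],   # sample 2: the median damps the outlier contig
]

# Same output pattern as the diff: one median per sample, tab-separated.
line = "\t".join(str(statistics.median(d)) for d in all_depths)
print("bin.1.fa", line, sep="\t")  # -> bin.1.fa  11.0  4.0
```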
diff --git a/bin/get_mag_depths_summary.py b/bin/get_mag_depths_summary.py
index 6dbc6f75..69433371 100755
--- a/bin/get_mag_depths_summary.py
+++ b/bin/get_mag_depths_summary.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python
+## Originally written by Sabrina Krakau and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
import sys
import argparse
import pandas as pd
diff --git a/bin/multiqc_to_custom_tsv.py b/bin/multiqc_to_custom_tsv.py
index 6488e31d..4388fb26 100755
--- a/bin/multiqc_to_custom_tsv.py
+++ b/bin/multiqc_to_custom_tsv.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python
-# copied from nf-core/viralrecon and adjusted
+
+## Copied from nf-core/viralrecon and adjusted
+## See git repository (https://github.com/nf-core/viralrecon) for full license text.
+
import os
import sys
@@ -9,9 +12,7 @@
def parse_args(args=None):
- Description = (
- "Create custom spreadsheet for pertinent MultiQC bowtie 2 metrics generated by the nf-core/mag pipeline."
- )
+ Description = "Create custom spreadsheet for pertinent MultiQC bowtie 2 metrics generated by the nf-core/mag pipeline."
Epilog = "Example usage: python multiqc_to_custom_tsv.py"
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument(
@@ -86,7 +87,9 @@ def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se):
for yamlFile, mappingList in FileFieldList:
yamlFile = os.path.join(MultiQCDataDir, yamlFile)
if os.path.exists(yamlFile):
- MetricsDict = yaml_fields_to_dict(YAMLFile=yamlFile, AppendDict=MetricsDict, FieldMappingList=mappingList)
+ MetricsDict = yaml_fields_to_dict(
+ YAMLFile=yamlFile, AppendDict=MetricsDict, FieldMappingList=mappingList
+ )
FieldList += [x[0] for x in mappingList]
else:
print("WARNING: File does not exist: {}".format(yamlFile))
@@ -96,7 +99,15 @@ def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se):
with open(OutFile, "w") as fout:
if se:
fout.write(
- "{}\n".format("\t".join(["Sample", "SE reads not mapped (kept)", "SE reads mapped (discarded)"]))
+ "{}\n".format(
+ "\t".join(
+ [
+ "Sample",
+ "SE reads not mapped (kept)",
+ "SE reads mapped (discarded)",
+ ]
+ )
+ )
)
else:
fout.write(
@@ -118,7 +129,10 @@ def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se):
[
k,
str(MetricsDict[k][FieldList[0]]),
- str(MetricsDict[k][FieldList[1]] + MetricsDict[k][FieldList[2]]),
+ str(
+ MetricsDict[k][FieldList[1]]
+ + MetricsDict[k][FieldList[2]]
+ ),
]
)
)
diff --git a/bin/plot_mag_depths.py b/bin/plot_mag_depths.py
index aab38473..d3782845 100755
--- a/bin/plot_mag_depths.py
+++ b/bin/plot_mag_depths.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python
+# Originally written by Sabrina Krakau and released under the MIT license.
+# See git repository (https://github.com/nf-core/mag) for full license text.
+
import sys
import argparse
import os.path
@@ -26,7 +29,9 @@ def parse_args(args=None):
metavar="FILE",
help="File in TSV format containing group information for samples: sample, group",
)
- parser.add_argument("-o", "--out", required=True, metavar="FILE", type=str, help="Output file.")
+ parser.add_argument(
+ "-o", "--out", required=True, metavar="FILE", type=str, help="Output file."
+ )
return parser.parse_args(args)
@@ -43,12 +48,19 @@ def main(args=None):
# compute centered log-ratios
# divide df by sample-wise geometric means
gmeans = stats.gmean(df, axis=0) # apply on axis=0: 'index'
- df = np.log(df.div(gmeans, axis="columns")) # divide column-wise (axis=1|'columns'), take natural logorithm
+ df = np.log(
+ df.div(gmeans, axis="columns")
+ ) # divide column-wise (axis=1|'columns'), take natural logarithm
df.index.name = "MAGs"
df.columns.name = "Samples"
# prepare colors for group information
- color_map = dict(zip(groups["group"].unique(), sns.color_palette(n_colors=len(groups["group"].unique()))))
+ color_map = dict(
+ zip(
+ groups["group"].unique(),
+ sns.color_palette(n_colors=len(groups["group"].unique())),
+ )
+ )
# plot
plt.figure()
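The reflowed lines implement a centered log-ratio (CLR) transform: each sample column is divided by its geometric mean before taking the natural log, which puts samples of different sequencing depth on a comparable scale. A worked example of the same two lines, on an invented two-MAG matrix:

```python
import numpy as np
import pandas as pd
from scipy import stats

# Toy MAG x sample depth matrix (values invented).
df = pd.DataFrame({"s1": [2.0, 8.0], "s2": [1.0, 4.0]}, index=["mag1", "mag2"])

# As in the diff: per-sample geometric means, then column-wise division
# and the natural log.
gmeans = stats.gmean(df, axis=0)              # [4.0, 2.0]
clr = np.log(df.div(gmeans, axis="columns"))

# Each CLR column sums to zero: ln(2/4) + ln(8/4) = ln(1/2) + ln(2) = 0.
print(clr)
```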
diff --git a/bin/run_busco.sh b/bin/run_busco.sh
index 9e022e87..4f8d6c86 100755
--- a/bin/run_busco.sh
+++ b/bin/run_busco.sh
@@ -1,5 +1,9 @@
#! /usr/bin/env bash
+# Originally written by Sabrina Krakau and James Fellows Yates and released
+# under the MIT license.
+# See git repository (https://github.com/nf-core/mag) for full license text.
+
p=$1
cp_augustus_config=$2
db=$3
@@ -148,7 +152,7 @@ if [ -f BUSCO/logs/prodigal_out.log ]; then
fi
# output value of most_spec_db
-echo ${most_spec_db} > info_most_spec_db.txt
+echo ${most_spec_db} >info_most_spec_db.txt
# if needed delete temporary BUSCO files
if [ ${busco_clean} = "Y" ]; then
diff --git a/bin/split_fasta.py b/bin/split_fasta.py
index 87cb9dfa..c5fb6e87 100755
--- a/bin/split_fasta.py
+++ b/bin/split_fasta.py
@@ -1,5 +1,9 @@
#!/usr/bin/env python
+## Originally written by Daniel Straub and Sabrina Krakau and released
+## under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
# USAGE: ./split_fasta.py <*.unbinned.fa(.gz)>
import pandas as pd
@@ -45,10 +49,14 @@
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
- pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+ pooled.append(
+ SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+ )
# remaining sequences
else:
- remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+ remaining.append(
+ SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+ )
else:
with open(input_file) as f:
fasta_sequences = SeqIO.parse(f, "fasta")
@@ -64,10 +72,14 @@
)
# contigs to retain and pool
elif length >= min_length_to_retain_contig:
- pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+ pooled.append(
+ SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+ )
# remaining sequences
else:
- remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+ remaining.append(
+ SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+ )
# Sort sequences above threshold by length
df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True)
@@ -80,7 +92,9 @@
out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta")
else:
- pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description=""))
+ pooled.append(
+ SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
+ )
print("write " + out_base + ".pooled.fa")
SeqIO.write(pooled, out_base + ".pooled.fa", "fasta")
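A version caveat the rewrapping leaves untouched: these calls still pass `generic_dna`, an alphabet that Biopython removed in release 1.78, so the script presumably runs against the older Biopython pinned in the pipeline's container. On a current Biopython the same record construction simply drops the alphabet argument, roughly:

```python
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Biopython >= 1.78 removed Bio.Alphabet (and with it generic_dna); a record
# is built from the bare sequence string. Sequence and id are illustrative.
pooled = [SeqRecord(Seq("ACGTACGT"), id="contig_1", description="")]
SeqIO.write(pooled, "pooled.fa", "fasta")
```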
diff --git a/bin/summary_busco.py b/bin/summary_busco.py
index b4a8c99b..9701783b 100755
--- a/bin/summary_busco.py
+++ b/bin/summary_busco.py
@@ -1,6 +1,10 @@
#!/usr/bin/env python
-# USAGE: ./summary.busco.py -sd -ss -f
+## Originally written by Daniel Straub, Sabrina Krakau, and Hadrien Gourlé
+## and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
+## USAGE: ./summary_busco.py -sd -ss -f
import re
import sys
@@ -12,10 +16,18 @@
def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument(
- "-a", "--auto", default=False, action="store_true", help="BUSCO run in auto lineage selection mode."
+ "-a",
+ "--auto",
+ default=False,
+ action="store_true",
+ help="BUSCO run in auto lineage selection mode.",
)
parser.add_argument(
- "-sd", "--summaries_domain", nargs="+", metavar="FILE", help="List of BUSCO summary files for domains."
+ "-sd",
+ "--summaries_domain",
+ nargs="+",
+ metavar="FILE",
+ help="List of BUSCO summary files for domains.",
)
parser.add_argument(
"-ss",
@@ -45,8 +57,14 @@ def parse_args(args=None):
def main(args=None):
args = parse_args(args)
- if not args.summaries_domain and not args.summaries_specific and not args.failed_bins:
- sys.exit("Either --summaries_domain, --summaries_specific or --failed_bins must be specified!")
+ if (
+ not args.summaries_domain
+ and not args.summaries_specific
+ and not args.failed_bins
+ ):
+ sys.exit(
+ "Either --summaries_domain, --summaries_specific or --failed_bins must be specified!"
+ )
# "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa"
# " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148"
@@ -173,15 +191,30 @@ def main(args=None):
pd.NA,
]
else:
- results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA]
+ results = [
+ failed_bin,
+ pd.NA,
+ "0.0",
+ "0.0",
+ "0.0",
+ "0.0",
+ "100.0",
+ pd.NA,
+ ]
failed.append(results)
df_failed = pd.DataFrame(failed, columns=columns)
# merge results
if args.auto:
- df_final = df_domain.merge(df_specific, on="GenomeBin", how="outer").append(df_failed)
+ df_final = df_domain.merge(df_specific, on="GenomeBin", how="outer").append(
+ df_failed
+ )
# check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses'
- df_final.loc[pd.isna(df_final["Domain"]) & pd.notna(df_final["Specific lineage dataset"]), "Domain"] = "Viruses"
+ df_final.loc[
+ pd.isna(df_final["Domain"])
+ & pd.notna(df_final["Specific lineage dataset"]),
+ "Domain",
+ ] = "Viruses"
else:
df_final = df_specific.append(df_failed)
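A similar caveat applies here (and in `summary_gtdbtk.py` below), untouched by this reformat-only diff: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so these scripts depend on the pandas version pinned in the container. On current pandas the final merge step would look roughly like this:

```python
import pandas as pd

# Invented stand-ins for the script's frames.
df_specific = pd.DataFrame({"GenomeBin": ["a.fa"], "Domain": ["Bacteria"]})
df_failed = pd.DataFrame({"GenomeBin": ["b.fa"], "Domain": [pd.NA]})

# pd.concat is the replacement for the removed DataFrame.append:
df_final = pd.concat([df_specific, df_failed], ignore_index=True)
print(df_final)
```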
diff --git a/bin/summary_gtdbtk.py b/bin/summary_gtdbtk.py
index 44bb7d1d..7ae43a09 100755
--- a/bin/summary_gtdbtk.py
+++ b/bin/summary_gtdbtk.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python
+# Originally written by Sabrina Krakau and released under the MIT license.
+# See git repository (https://github.com/nf-core/mag) for full license text.
+
import re
import sys
import argparse
@@ -16,7 +19,13 @@ def parse_args(args=None):
type=str,
help="File extension passed to GTDB-TK and substracted by GTDB-Tk from bin names in results files.",
)
- parser.add_argument("-s", "--summaries", nargs="+", metavar="FILE", help="List of GTDB-tk summary files.")
+ parser.add_argument(
+ "-s",
+ "--summaries",
+ nargs="+",
+ metavar="FILE",
+ help="List of GTDB-tk summary files.",
+ )
parser.add_argument(
"-fi",
"--filtered_bins",
@@ -54,8 +63,15 @@ def parse_args(args=None):
def main(args=None):
args = parse_args(args)
- if not args.summaries and not args.filtered_bins and not args.failed_bins and not args.qc_discarded_bins:
- sys.exit("Either --summaries, --filtered_bins, --failed_bins or --qc_discarded_bins must be specified!")
+ if (
+ not args.summaries
+ and not args.filtered_bins
+ and not args.failed_bins
+ and not args.qc_discarded_bins
+ ):
+ sys.exit(
+ "Either --summaries, --filtered_bins, --failed_bins or --qc_discarded_bins must be specified!"
+ )
columns = [
"user_genome",
@@ -117,7 +133,9 @@ def main(args=None):
for file in args.summaries:
df_summary = pd.read_csv(file, sep="\t")[columns]
# add by GTDB-Tk substracted file extension again to bin names (at least until changed consistently in rest of pipeline)
- df_summary["user_genome"] = df_summary["user_genome"].astype(str) + "." + args.extension
+ df_summary["user_genome"] = (
+ df_summary["user_genome"].astype(str) + "." + args.extension
+ )
df_summary.set_index("user_genome", inplace=True)
df_final = df_final.append(df_summary, verify_integrity=True)
@@ -153,7 +171,9 @@ def main(args=None):
filtered.append(bin_results)
df_filtered = pd.DataFrame(filtered, columns=columns)
- df_filtered["user_genome"] = df_filtered["user_genome"].astype(str) + "." + args.extension
+ df_filtered["user_genome"] = (
+ df_filtered["user_genome"].astype(str) + "." + args.extension
+ )
df_filtered.set_index("user_genome", inplace=True)
df_final = df_final.append(df_filtered, verify_integrity=True)
@@ -189,12 +209,16 @@ def main(args=None):
failed.append(bin_results)
df_failed = pd.DataFrame(failed, columns=columns)
- df_failed["user_genome"] = df_failed["user_genome"].astype(str) + "." + args.extension
+ df_failed["user_genome"] = (
+ df_failed["user_genome"].astype(str) + "." + args.extension
+ )
df_failed.set_index("user_genome", inplace=True)
df_final = df_final.append(df_failed, verify_integrity=True)
# write output
- df_final.reset_index().rename(columns={"index": "user_genome"}).to_csv(args.out, sep="\t", index=False)
+ df_final.reset_index().rename(columns={"index": "user_genome"}).to_csv(
+ args.out, sep="\t", index=False
+ )
if __name__ == "__main__":
diff --git a/conf/test.config b/conf/test.config
index 43a7f18a..28984d13 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,5 +29,6 @@ params {
busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
busco_clean = true
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
skip_concoct = true
}
diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config
index dc00d319..ca9fed10 100644
--- a/conf/test_adapterremoval.config
+++ b/conf/test_adapterremoval.config
@@ -29,6 +29,7 @@ params {
max_unbinned_contigs = 2
busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
clip_tool = 'adapterremoval'
skip_concoct = true
bin_domain_classification = true
diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config
index bc441be8..5e935321 100644
--- a/conf/test_ancient_dna.config
+++ b/conf/test_ancient_dna.config
@@ -28,6 +28,7 @@ params {
max_unbinned_contigs = 2
busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
ancient_dna = true
binning_map_mode = 'own'
skip_spades = false
diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config
index 02e764c3..a31f6b7b 100644
--- a/conf/test_bbnorm.config
+++ b/conf/test_bbnorm.config
@@ -35,6 +35,7 @@ params {
busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
busco_clean = true
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
bbnorm = true
coassemble_group = true
}
diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config
index 79105ec4..54144244 100644
--- a/conf/test_binrefinement.config
+++ b/conf/test_binrefinement.config
@@ -29,6 +29,7 @@ params {
max_unbinned_contigs = 2
busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
refine_bins_dastool = true
refine_bins_dastool_threshold = 0
// TODO not using 'both' until #489 merged
diff --git a/conf/test_busco_auto.config b/conf/test_busco_auto.config
index 6479012f..48f6b7b5 100644
--- a/conf/test_busco_auto.config
+++ b/conf/test_busco_auto.config
@@ -25,6 +25,7 @@ params {
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
skip_prokka = true
skip_prodigal = true
skip_quast = true
diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config
index 30cae576..afd4e687 100644
--- a/conf/test_host_rm.config
+++ b/conf/test_host_rm.config
@@ -26,5 +26,6 @@ params {
max_unbinned_contigs = 2
busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
skip_concoct = true
}
diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config
index 3ca0608e..a9f7ee07 100644
--- a/conf/test_hybrid.config
+++ b/conf/test_hybrid.config
@@ -25,5 +25,6 @@ params {
max_unbinned_contigs = 2
busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
skip_concoct = true
}
diff --git a/conf/test_hybrid_host_rm.config b/conf/test_hybrid_host_rm.config
index 7a0e4a15..531a89d3 100644
--- a/conf/test_hybrid_host_rm.config
+++ b/conf/test_hybrid_host_rm.config
@@ -27,4 +27,5 @@ params {
skip_binqc = true
skip_concoct = true
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
}
diff --git a/conf/test_nothing.config b/conf/test_nothing.config
index 53df219f..cd3f6311 100644
--- a/conf/test_nothing.config
+++ b/conf/test_nothing.config
@@ -39,5 +39,6 @@ params {
skip_prokka = true
skip_binqc = true
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
skip_concoct = true
}
diff --git a/conf/test_virus_identification.config b/conf/test_virus_identification.config
index e15fab7d..dba55db9 100644
--- a/conf/test_virus_identification.config
+++ b/conf/test_virus_identification.config
@@ -28,6 +28,7 @@ params {
reads_minlength = 150
coassemble_group = true
skip_gtdbtk = true
+ gtdbtk_min_completeness = 0
skip_binning = true
skip_prokka = true
skip_spades = true
diff --git a/nextflow.config b/nextflow.config
index 6a2da3b5..ed9d4e27 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -379,7 +379,7 @@ manifest {
description = """Assembly, binning and annotation of metagenomes"""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
- version = '2.5.3'
+ version = '2.5.4'
doi = '10.1093/nargab/lqac007'
}
diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf
index a201e370..95e343c8 100644
--- a/subworkflows/local/gtdbtk.nf
+++ b/subworkflows/local/gtdbtk.nf
@@ -66,13 +66,16 @@ workflow GTDBTK {
// Expects to be tar.gz!
ch_db_for_gtdbtk = GTDBTK_DB_PREPARATION ( gtdb ).db
} else if ( gtdb.isDirectory() ) {
- // Make up meta id to match expected channel cardinality for GTDBTK
+ // The classifywf module expects a list of the _contents_ of the GTDB
+ // database directory, not just the directory itself (reason unclear).
+ // So we list the contents first, emit each entry into a channel, and
+ // group them back together before passing the result to the module,
+ // making up a meta id to match the expected channel cardinality for GTDBTK.
+ gtdb_dir = gtdb.listFiles()
ch_db_for_gtdbtk = Channel
- .of(gtdb)
- .map{
- [ it.toString().split('/').last(), it ]
- }
- .collect()
+ .of(gtdb_dir)
+ .map{['gtdb', it]}
+ .groupTuple()
} else {
error("Unsupported object given to --gtdb, database must be supplied as either a directory or a .tar.gz file!")
}