From be4730981743653c6c725a678fec87b6266c0ccf Mon Sep 17 00:00:00 2001
From: jasmezz <jasmin.frangenberg@hki-jena.de>
Date: Wed, 24 Jul 2024 17:49:43 +0200
Subject: [PATCH 1/2] Fix comBGC: parsing multiple antismash results

---
 bin/comBGC.py | 300 +++++++++++++++++++++++++-------------------------
 1 file changed, 150 insertions(+), 150 deletions(-)

diff --git a/bin/comBGC.py b/bin/comBGC.py
index 94fc674b..dccece69 100755
--- a/bin/comBGC.py
+++ b/bin/comBGC.py
@@ -35,7 +35,7 @@
 SOFTWARE.
 """
 
-tool_version = "0.6.2"
+tool_version = "0.6.3"
 welcome = """\
                 ........................
                     * comBGC v.{version} *
@@ -118,7 +118,7 @@
 # Assign input files to respective tools
 if input:
     for path in input:
-        if path.endswith(".gbk"):
+        if path.endswith(".gbk") and not re.search("region\d\d\d\.gbk$", path): # Make sure to only fetch relevant GBK files, i.e. those containing all collective antiSMASH BGCs
             with open(path) as infile:
                 for line in infile:
                     if re.search("##GECCO-Data-START##", line):
@@ -225,155 +225,155 @@ def antismash_workflow(antismash_paths):
         else:
             gbk_path = path
 
-    kcb_files = []
-    if kcb_path:
-        kcb_files = [
-            file
-            for file in os.listdir(kcb_path)
-            if file.startswith("c") and file.endswith(".txt")
-        ]
-
-    # Aggregate information
-    Sample_ID = gbk_path.split("/")[-1].split(".gbk")[
-        -2
-    ]  # Assuming file name equals sample name
-    if verbose:
-        print("\nParsing antiSMASH file(s): " + Sample_ID + "\n... ", end="")
-
-    with open(gbk_path) as gbk:
-        for record in SeqIO.parse(
-            gbk, "genbank"
-        ):  # GBK records are contigs in this case
-            # Initiate variables per contig
-            cluster_num = 1
-            antismash_out_line = {}
-            Contig_ID = record.id
-            Product_class = ""
-            BGC_complete = ""
-            BGC_start = ""
-            BGC_end = ""
-            BGC_length = ""
-            PFAM_domains = []
-            MIBiG_ID = "NA"
-
-            for feature in record.features:
-                # Extract relevant infos from the first protocluster feature from the contig record
-                if feature.type == "protocluster":
-                    if (
-                        antismash_out_line
-                    ):  # If there is more than 1 BGC per contig, reset the output line for new BGC. Assuming that BGCs do not overlap.
-                        if not CDS_ID:
-                            CDS_ID = ["NA"]
-                        antismash_out_line = {  # Create dictionary of BGC info
-                            "Sample_ID": Sample_ID,
-                            "Prediction_tool": "antiSMASH",
-                            "Contig_ID": Contig_ID,
-                            "Product_class": ";".join(Product_class),
-                            "BGC_probability": "NA",
-                            "BGC_complete": BGC_complete,
-                            "BGC_start": BGC_start,
-                            "BGC_end": BGC_end,
-                            "BGC_length": BGC_length,
-                            "CDS_ID": ";".join(CDS_ID),
-                            "CDS_count": CDS_count,
-                            "PFAM_domains": ";".join(PFAM_domains),
-                            "MIBiG_ID": MIBiG_ID,
-                            "InterPro_ID": "NA",
-                        }
-                        antismash_out_line = pd.DataFrame([antismash_out_line])
-                        antismash_out = pd.concat(
-                            [antismash_out, antismash_out_line], ignore_index=True
-                        )
-                        antismash_out_line = {}
-
-                        # Reset variables per BGC
-                        CDS_ID = []
-                        CDS_count = 0
-                        PFAM_domains = []
-
-                    # Extract all the BGC info
-                    Product_class = feature.qualifiers["product"]
-                    for i in range(len(Product_class)):
-                        Product_class[i] = (
-                            Product_class[i][0].upper() + Product_class[i][1:]
-                        )  # Make first letters uppercase, e.g. lassopeptide -> Lassopeptide
-
-                    if feature.qualifiers["contig_edge"] == ["True"]:
-                        BGC_complete = "No"
-                    elif feature.qualifiers["contig_edge"] == ["False"]:
-                        BGC_complete = "Yes"
-
-                    BGC_start = (
-                        feature.location.start + 1
-                    )  # +1 because zero-based start position
-                    BGC_end = feature.location.end
-                    BGC_length = feature.location.end - feature.location.start
-
-                    # If there are knownclusterblast files for the BGC, get MIBiG IDs of their homologs
-                    if kcb_files:
-                        print(kcb_files)
-                        kcb_file = "{}_c{}.txt".format(
-                            record.id, str(cluster_num)
-                        )  # Check if this filename is among the knownclusterblast files
-                        if kcb_file in kcb_files:
-                            MIBiG_IDs = ";".join(
-                                parse_knownclusterblast(
-                                    os.path.join(kcb_path, kcb_file)
-                                )
-                            )
-                            if MIBiG_IDs != "":
-                                MIBiG_ID = MIBiG_IDs
-                            cluster_num += 1
-
-                # Count functional CDSs (no pseudogenes) and get the PFAM annotation
-                elif (
-                    feature.type == "CDS"
-                    and "translation" in feature.qualifiers.keys()
-                    and BGC_start != ""
-                ):  # Make sure not to count pseudogenes (which would have no "translation tag") and count no CDSs before first BGC
-                    if (
-                        feature.location.end <= BGC_end
-                    ):  # Make sure CDS is within the current BGC region
-                        if "locus_tag" in feature.qualifiers:
-                            CDS_ID.append(feature.qualifiers["locus_tag"][0])
-                        CDS_count += 1
-                        if "sec_met_domain" in feature.qualifiers.keys():
-                            for PFAM_domain in feature.qualifiers["sec_met_domain"]:
-                                PFAM_domain_name = re.search(
-                                    "(.+) \(E-value", PFAM_domain
-                                ).group(1)
-                                PFAM_domains.append(PFAM_domain_name)
-
-            # Create dictionary of BGC info
-            if not CDS_ID:
-                CDS_ID = ["NA"]
-            antismash_out_line = {
-                "Sample_ID": Sample_ID,
-                "Prediction_tool": "antiSMASH",
-                "Contig_ID": Contig_ID,
-                "Product_class": ";".join(Product_class),
-                "BGC_probability": "NA",
-                "BGC_complete": BGC_complete,
-                "BGC_start": BGC_start,
-                "BGC_end": BGC_end,
-                "BGC_length": BGC_length,
-                "CDS_ID": ";".join(CDS_ID),
-                "CDS_count": CDS_count,
-                "PFAM_domains": ";".join(PFAM_domains),
-                "MIBiG_ID": MIBiG_ID,
-                "InterPro_ID": "NA",
-            }
-
-            if BGC_start != "":  # Only keep records with BGCs
-                antismash_out_line = pd.DataFrame([antismash_out_line])
-                antismash_out = pd.concat(
-                    [antismash_out, antismash_out_line], ignore_index=True
-                )
-
-                # Reset variables per BGC
-                CDS_ID = []
-                CDS_count = 0
+        kcb_files = []
+        if kcb_path:
+            kcb_files = [
+                file
+                for file in os.listdir(kcb_path)
+                if file.startswith("c") and file.endswith(".txt")
+            ]
+
+        # Aggregate information
+        Sample_ID = gbk_path.split("/")[-1].split(".gbk")[
+            -2
+        ]  # Assuming file name equals sample name
+        if verbose:
+            print("\nParsing antiSMASH file(s): " + Sample_ID + "\n... ", end="")
+
+        with open(gbk_path) as gbk:
+            for record in SeqIO.parse(
+                gbk, "genbank"
+            ):  # GBK records are contigs in this case
+                # Initiate variables per contig
+                cluster_num = 1
+                antismash_out_line = {}
+                Contig_ID = record.id
+                Product_class = ""
+                BGC_complete = ""
+                BGC_start = ""
+                BGC_end = ""
+                BGC_length = ""
                 PFAM_domains = []
+                MIBiG_ID = "NA"
+
+                for feature in record.features:
+                    # Extract relevant infos from the first protocluster feature from the contig record
+                    if feature.type == "protocluster":
+                        if (
+                            antismash_out_line
+                        ):  # If there is more than 1 BGC per contig, reset the output line for new BGC. Assuming that BGCs do not overlap.
+                            if not CDS_ID:
+                                CDS_ID = ["NA"]
+                            antismash_out_line = {  # Create dictionary of BGC info
+                                "Sample_ID": Sample_ID,
+                                "Prediction_tool": "antiSMASH",
+                                "Contig_ID": Contig_ID,
+                                "Product_class": ";".join(Product_class),
+                                "BGC_probability": "NA",
+                                "BGC_complete": BGC_complete,
+                                "BGC_start": BGC_start,
+                                "BGC_end": BGC_end,
+                                "BGC_length": BGC_length,
+                                "CDS_ID": ";".join(CDS_ID),
+                                "CDS_count": CDS_count,
+                                "PFAM_domains": ";".join(PFAM_domains),
+                                "MIBiG_ID": MIBiG_ID,
+                                "InterPro_ID": "NA",
+                            }
+                            antismash_out_line = pd.DataFrame([antismash_out_line])
+                            antismash_out = pd.concat(
+                                [antismash_out, antismash_out_line], ignore_index=True
+                            )
+                            antismash_out_line = {}
+
+                            # Reset variables per BGC
+                            CDS_ID = []
+                            CDS_count = 0
+                            PFAM_domains = []
+
+                        # Extract all the BGC info
+                        Product_class = feature.qualifiers["product"]
+                        for i in range(len(Product_class)):
+                            Product_class[i] = (
+                                Product_class[i][0].upper() + Product_class[i][1:]
+                            )  # Make first letters uppercase, e.g. lassopeptide -> Lassopeptide
+
+                        if feature.qualifiers["contig_edge"] == ["True"]:
+                            BGC_complete = "No"
+                        elif feature.qualifiers["contig_edge"] == ["False"]:
+                            BGC_complete = "Yes"
+
+                        BGC_start = (
+                            feature.location.start + 1
+                        )  # +1 because zero-based start position
+                        BGC_end = feature.location.end
+                        BGC_length = feature.location.end - feature.location.start
+
+                        # If there are knownclusterblast files for the BGC, get MIBiG IDs of their homologs
+                        if kcb_files:
+                            print(kcb_files)
+                            kcb_file = "{}_c{}.txt".format(
+                                record.id, str(cluster_num)
+                            )  # Check if this filename is among the knownclusterblast files
+                            if kcb_file in kcb_files:
+                                MIBiG_IDs = ";".join(
+                                    parse_knownclusterblast(
+                                        os.path.join(kcb_path, kcb_file)
+                                    )
+                                )
+                                if MIBiG_IDs != "":
+                                    MIBiG_ID = MIBiG_IDs
+                                cluster_num += 1
+
+                    # Count functional CDSs (no pseudogenes) and get the PFAM annotation
+                    elif (
+                        feature.type == "CDS"
+                        and "translation" in feature.qualifiers.keys()
+                        and BGC_start != ""
+                    ):  # Make sure not to count pseudogenes (which would have no "translation tag") and count no CDSs before first BGC
+                        if (
+                            feature.location.end <= BGC_end
+                        ):  # Make sure CDS is within the current BGC region
+                            if "locus_tag" in feature.qualifiers:
+                                CDS_ID.append(feature.qualifiers["locus_tag"][0])
+                            CDS_count += 1
+                            if "sec_met_domain" in feature.qualifiers.keys():
+                                for PFAM_domain in feature.qualifiers["sec_met_domain"]:
+                                    PFAM_domain_name = re.search(
+                                        "(.+) \(E-value", PFAM_domain
+                                    ).group(1)
+                                    PFAM_domains.append(PFAM_domain_name)
+
+                # Create dictionary of BGC info
+                if not CDS_ID:
+                    CDS_ID = ["NA"]
+                antismash_out_line = {
+                    "Sample_ID": Sample_ID,
+                    "Prediction_tool": "antiSMASH",
+                    "Contig_ID": Contig_ID,
+                    "Product_class": ";".join(Product_class),
+                    "BGC_probability": "NA",
+                    "BGC_complete": BGC_complete,
+                    "BGC_start": BGC_start,
+                    "BGC_end": BGC_end,
+                    "BGC_length": BGC_length,
+                    "CDS_ID": ";".join(CDS_ID),
+                    "CDS_count": CDS_count,
+                    "PFAM_domains": ";".join(PFAM_domains),
+                    "MIBiG_ID": MIBiG_ID,
+                    "InterPro_ID": "NA",
+                }
+
+                if BGC_start != "":  # Only keep records with BGCs
+                    antismash_out_line = pd.DataFrame([antismash_out_line])
+                    antismash_out = pd.concat(
+                        [antismash_out, antismash_out_line], ignore_index=True
+                    )
+
+                    # Reset variables per BGC
+                    CDS_ID = []
+                    CDS_count = 0
+                    PFAM_domains = []
 
     if verbose:
         print("Done.")

From 4f0ca0afbeefc46e17a2023a19804bfc7eb547b8 Mon Sep 17 00:00:00 2001
From: jasmezz <jasmin.frangenberg@hki-jena.de>
Date: Wed, 24 Jul 2024 18:21:57 +0200
Subject: [PATCH 2/2] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d95d7df5..53b7484d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,6 +36,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#397](https://github.com/nf-core/funcscan/pull/397) Removed deprecated AMPcombi module, fixed variable name in BGC workflow, updated minor parts in docs (usage, parameter schema). (by @jasmezz)
 - [#402](https://github.com/nf-core/funcscan/pull/402) Fixed BGC length calculation for antiSMASH hits by comBGC. (by @jasmezz)
 - [#406](https://github.com/nf-core/funcscan/pull/406) Fixed prediction tools not being executed if annotation workflow skipped. (by @jasmezz)
+- [#407](https://github.com/nf-core/funcscan/pull/407) Fixed comBGC bug when parsing multiple antiSMASH files. (by @jasmezz)
 
 ### `Dependencies`