Merge pull request #439 from chuan-wang/master

Fix missing settings for BCL Convert
SciLifeLab · Oct 31, 2024 · 3b312b0 · 3b312b0
2 parents 15ed188 + da60ac2
commit 3b312b0
Show file tree

Hide file tree

Showing 5 changed files with 264 additions and 5 deletions.
diff --git a/VERSIONLOG.md b/VERSIONLOG.md
@@ -1,5 +1,9 @@
 # TACA Version Log
 
+## 20241029.3
+
+Fix missing settings for BCL Convert
+
 ## 20241029.2
 
 Fix issue that index 2 is automatically converted to RC by BCL Convert

diff --git a/taca/illumina/NextSeq_Runs.py b/taca/illumina/NextSeq_Runs.py
@@ -1,5 +1,10 @@
+import os
+import re
+
 from taca.illumina.Standard_Runs import Standard_Run
 
+IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")
+
 
 class NextSeq_Run(Standard_Run):
     def __init__(self, run_dir, software, configuration):
@@ -16,3 +21,113 @@ def _set_sequencer_type(self):
 
     def _set_run_type(self):
         self.run_type = "NGI-RUN"
+
+    def _revcomp(self, seq: str) -> str:
+        return seq.translate(str.maketrans("ACGT", "TGCA"))[::-1]
+
+    def _generate_samplesheet_subset(
+        self,
+        ssparser,
+        samples_to_include,
+        runSetup,
+        software,
+        sample_type,
+        index1_size,
+        index2_size,
+        base_mask,
+        CONFIG,
+    ):
+        output = ""
+        # Prepare index cycles
+        index_cycles = [0, 0]
+        for read in runSetup:
+            if read["IsIndexedRead"] == "Y":
+                if int(read["Number"]) == 2:
+                    index_cycles[0] = int(read["NumCycles"])
+                else:
+                    index_cycles[1] = int(read["NumCycles"])
+        # Header
+        output += f"[Header]{os.linesep}"
+        for field in sorted(ssparser.header):
+            output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
+            output += os.linesep
+        # Settings for BCL Convert
+        if software == "bclconvert":
+            output += f"[Settings]{os.linesep}"
+            # For NextSeq, NovaSeq and NovaSeqXPlus, the order of the segments in the index 2 mask also needs to be reversed
+            if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]):
+                base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1])
+            output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep)
+            if any("U" in bm for bm in base_mask):
+                output += f"TrimUMI,0{os.linesep}"
+
+            if CONFIG.get("bclconvert"):
+                if CONFIG["bclconvert"].get("settings"):
+                    # Put common settings
+                    if CONFIG["bclconvert"]["settings"].get("common"):
+                        for setting in CONFIG["bclconvert"]["settings"]["common"]:
+                            for k, v in setting.items():
+                                output += f"{k},{v}{os.linesep}"
+                    # Put special settings:
+                    if sample_type in CONFIG["bclconvert"]["settings"].keys():
+                        for setting in CONFIG["bclconvert"]["settings"][sample_type]:
+                            for k, v in setting.items():
+                                if (
+                                    (
+                                        k == "BarcodeMismatchesIndex1"
+                                        and index1_size != 0
+                                    )
+                                    or (
+                                        k == "BarcodeMismatchesIndex2"
+                                        and index2_size != 0
+                                    )
+                                    or "BarcodeMismatchesIndex" not in k
+                                ):
+                                    output += f"{k},{v}{os.linesep}"
+        # Data
+        output += f"[Data]{os.linesep}"
+        datafields = []
+        for field in ssparser.datafields:
+            datafields.append(field)
+        output += ",".join(datafields)
+        output += os.linesep
+        for line in ssparser.data:
+            sample_name = line.get("Sample_Name") or line.get("SampleName")
+            lane = line["Lane"]
+            noindex_flag = False
+            if lane in samples_to_include.keys():
+                if sample_name in samples_to_include.get(lane):
+                    line_ar = []
+                    for field in datafields:
+                        # Case with NoIndex
+                        if field == "index" and "NOINDEX" in line["index"].upper():
+                            line[field] = (
+                                "T" * index_cycles[0] if index_cycles[0] != 0 else ""
+                            )
+                            noindex_flag = True
+                        if field == "index2" and noindex_flag:
+                            if software == "bclconvert":
+                                line[field] = (
+                                    "T" * index_cycles[1]
+                                    if index_cycles[1] != 0
+                                    else ""
+                                )
+                            else:
+                                line[field] = (
+                                    "A" * index_cycles[1]
+                                    if index_cycles[1] != 0
+                                    else ""
+                                )
+                            noindex_flag = False
+                        # Case of IDT UMI
+                        if (
+                            field == "index" or field == "index2"
+                        ) and IDT_UMI_PAT.findall(line[field]):
+                            line[field] = line[field].replace("N", "")
+                        # Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert
+                        if field == "index2" and software == "bclconvert":
+                            line[field] = self._revcomp(line[field])
+                        line_ar.append(line[field])
+                    output += ",".join(line_ar)
+                    output += os.linesep
+        return output
diff --git a/taca/illumina/NovaSeqXPlus_Runs.py b/taca/illumina/NovaSeqXPlus_Runs.py
@@ -56,6 +56,9 @@ def _generate_samplesheet_subset(
         # Settings for BCL Convert
         if software == "bclconvert":
             output += f"[Settings]{os.linesep}"
+            # For NextSeq, NovaSeq and NovaSeqXPlus, the order of the segments in the index 2 mask also needs to be reversed
+            if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]):
+                base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1])
             output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep)
             if any("U" in bm for bm in base_mask):
                 output += f"TrimUMI,0{os.linesep}"
@@ -105,17 +108,26 @@ def _generate_samplesheet_subset(
                             )
                             noindex_flag = True
                         if field == "index2" and noindex_flag:
-                            line[field] = (
-                                "T" * index_cycles[1] if index_cycles[1] != 0 else ""
-                            )
+                            if software == "bclconvert":
+                                line[field] = (
+                                    "T" * index_cycles[1]
+                                    if index_cycles[1] != 0
+                                    else ""
+                                )
+                            else:
+                                line[field] = (
+                                    "A" * index_cycles[1]
+                                    if index_cycles[1] != 0
+                                    else ""
+                                )
                             noindex_flag = False
                         # Case of IDT UMI
                         if (
                             field == "index" or field == "index2"
                         ) and IDT_UMI_PAT.findall(line[field]):
                             line[field] = line[field].replace("N", "")
-                        # Convert Index 2 into RC for NovaSeqXPlus
-                        if field == "index2":
+                        # Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert
+                        if field == "index2" and software == "bclconvert":
                             line[field] = self._revcomp(line[field])
                         line_ar.append(line[field])
                     output += ",".join(line_ar)

diff --git a/taca/illumina/NovaSeq_Runs.py b/taca/illumina/NovaSeq_Runs.py
@@ -1,5 +1,10 @@
+import os
+import re
+
 from taca.illumina.Standard_Runs import Standard_Run
 
+IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")
+
 
 class NovaSeq_Run(Standard_Run):
     def __init__(self, run_dir, software, configuration):
@@ -13,3 +18,113 @@ def _set_sequencer_type(self):
 
     def _set_run_type(self):
         self.run_type = "NGI-RUN"
+
+    def _revcomp(self, seq: str) -> str:
+        return seq.translate(str.maketrans("ACGT", "TGCA"))[::-1]
+
+    def _generate_samplesheet_subset(
+        self,
+        ssparser,
+        samples_to_include,
+        runSetup,
+        software,
+        sample_type,
+        index1_size,
+        index2_size,
+        base_mask,
+        CONFIG,
+    ):
+        output = ""
+        # Prepare index cycles
+        index_cycles = [0, 0]
+        for read in runSetup:
+            if read["IsIndexedRead"] == "Y":
+                if int(read["Number"]) == 2:
+                    index_cycles[0] = int(read["NumCycles"])
+                else:
+                    index_cycles[1] = int(read["NumCycles"])
+        # Header
+        output += f"[Header]{os.linesep}"
+        for field in sorted(ssparser.header):
+            output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
+            output += os.linesep
+        # Settings for BCL Convert
+        if software == "bclconvert":
+            output += f"[Settings]{os.linesep}"
+            # For NextSeq, NovaSeq and NovaSeqXPlus, the order of the segments in the index 2 mask also needs to be reversed
+            if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]):
+                base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1])
+            output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep)
+            if any("U" in bm for bm in base_mask):
+                output += f"TrimUMI,0{os.linesep}"
+
+            if CONFIG.get("bclconvert"):
+                if CONFIG["bclconvert"].get("settings"):
+                    # Put common settings
+                    if CONFIG["bclconvert"]["settings"].get("common"):
+                        for setting in CONFIG["bclconvert"]["settings"]["common"]:
+                            for k, v in setting.items():
+                                output += f"{k},{v}{os.linesep}"
+                    # Put special settings:
+                    if sample_type in CONFIG["bclconvert"]["settings"].keys():
+                        for setting in CONFIG["bclconvert"]["settings"][sample_type]:
+                            for k, v in setting.items():
+                                if (
+                                    (
+                                        k == "BarcodeMismatchesIndex1"
+                                        and index1_size != 0
+                                    )
+                                    or (
+                                        k == "BarcodeMismatchesIndex2"
+                                        and index2_size != 0
+                                    )
+                                    or "BarcodeMismatchesIndex" not in k
+                                ):
+                                    output += f"{k},{v}{os.linesep}"
+        # Data
+        output += f"[Data]{os.linesep}"
+        datafields = []
+        for field in ssparser.datafields:
+            datafields.append(field)
+        output += ",".join(datafields)
+        output += os.linesep
+        for line in ssparser.data:
+            sample_name = line.get("Sample_Name") or line.get("SampleName")
+            lane = line["Lane"]
+            noindex_flag = False
+            if lane in samples_to_include.keys():
+                if sample_name in samples_to_include.get(lane):
+                    line_ar = []
+                    for field in datafields:
+                        # Case with NoIndex
+                        if field == "index" and "NOINDEX" in line["index"].upper():
+                            line[field] = (
+                                "T" * index_cycles[0] if index_cycles[0] != 0 else ""
+                            )
+                            noindex_flag = True
+                        if field == "index2" and noindex_flag:
+                            if software == "bclconvert":
+                                line[field] = (
+                                    "T" * index_cycles[1]
+                                    if index_cycles[1] != 0
+                                    else ""
+                                )
+                            else:
+                                line[field] = (
+                                    "A" * index_cycles[1]
+                                    if index_cycles[1] != 0
+                                    else ""
+                                )
+                            noindex_flag = False
+                        # Case of IDT UMI
+                        if (
+                            field == "index" or field == "index2"
+                        ) and IDT_UMI_PAT.findall(line[field]):
+                            line[field] = line[field].replace("N", "")
+                        # Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert
+                        if field == "index2" and software == "bclconvert":
+                            line[field] = self._revcomp(line[field])
+                        line_ar.append(line[field])
+                    output += ",".join(line_ar)
+                    output += os.linesep
+        return output
diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py
@@ -853,6 +853,11 @@ def _fix_html_reports_for_complex_lanes(
                 ):
                     html_report_laneBarcode_parser.sample_data.remove(entry)
 
+        # Remove the trailing "_SX" suffix from sample names for BCL Convert when it handles SmartSeq3 libraries
+        for entry in html_report_laneBarcode_parser.sample_data:
+            if "_S" in entry["Sample"]:
+                entry["Sample"] = "_".join(entry["Sample"].split("_")[:2])
+
         # Sort sample_data: first by lane then by sample ID
         html_report_laneBarcode_parser.sample_data = sorted(
             html_report_laneBarcode_parser.sample_data,
@@ -1317,6 +1322,14 @@ def _process_demux_with_complex_lanes(
                     "Recipe",
                     "Operator",
                     "Sample_Project",
+                    "[Settings]",
+                    "OverrideCycles",
+                    "MinimumTrimmedReadLength",
+                    "MaskShortReads",
+                    "CreateFastqForIndexReads",
+                    "BarcodeMismatchesIndex1",
+                    "BarcodeMismatchesIndex2",
+                    "TrimUMI",
                 ]
                 with open(samplesheet) as sub_samplesheet_file:
                     sub_samplesheet_reader = csv.reader(sub_samplesheet_file)