diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 891500d3..0525fd42 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 20241029.3 + +Fix missing settings for BCL Convert + ## 20241029.2 Fix issue that index 2 is automatically converted to RC by BCL Convert diff --git a/taca/illumina/NextSeq_Runs.py b/taca/illumina/NextSeq_Runs.py index 5785542c..48bf3c28 100755 --- a/taca/illumina/NextSeq_Runs.py +++ b/taca/illumina/NextSeq_Runs.py @@ -1,5 +1,10 @@ +import os +import re + from taca.illumina.Standard_Runs import Standard_Run +IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)") + class NextSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): @@ -16,3 +21,113 @@ def _set_sequencer_type(self): def _set_run_type(self): self.run_type = "NGI-RUN" + + def _revcomp(self, seq: str) -> str: + return seq.translate(str.maketrans("ACGT", "TGCA"))[::-1] + + def _generate_samplesheet_subset( + self, + ssparser, + samples_to_include, + runSetup, + software, + sample_type, + index1_size, + index2_size, + base_mask, + CONFIG, + ): + output = "" + # Prepare index cycles + index_cycles = [0, 0] + for read in runSetup: + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) + else: + index_cycles[1] = int(read["NumCycles"]) + # Header + output += f"[Header]{os.linesep}" + for field in sorted(ssparser.header): + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" + output += os.linesep + # Settings for BCL Convert + if software == "bclconvert": + output += f"[Settings]{os.linesep}" + # For NextSeq, NovaSeq and NovaSeqXPlus, the orders of index 2 masks also need to be reversed + if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]): + base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1]) + output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep) + if any("U" in bm for bm in base_mask): + output += f"TrimUMI,0{os.linesep}" + + if CONFIG.get("bclconvert"): + if CONFIG["bclconvert"].get("settings"): + # Put common settings + if CONFIG["bclconvert"]["settings"].get("common"): + for setting in CONFIG["bclconvert"]["settings"]["common"]: + for k, v in setting.items(): + output += f"{k},{v}{os.linesep}" + # Put special settings: + if sample_type in CONFIG["bclconvert"]["settings"].keys(): + for setting in CONFIG["bclconvert"]["settings"][sample_type]: + for k, v in setting.items(): + if ( + ( + k == "BarcodeMismatchesIndex1" + and index1_size != 0 + ) + or ( + k == "BarcodeMismatchesIndex2" + and index2_size != 0 + ) + or "BarcodeMismatchesIndex" not in k + ): + output += f"{k},{v}{os.linesep}" + # Data + output += f"[Data]{os.linesep}" + datafields = [] + for field in ssparser.datafields: + datafields.append(field) + output += ",".join(datafields) + output += os.linesep + for line in ssparser.data: + sample_name = line.get("Sample_Name") or line.get("SampleName") + lane = line["Lane"] + noindex_flag = False + if lane in samples_to_include.keys(): + if sample_name in samples_to_include.get(lane): + line_ar = [] + for field in datafields: + # Case with NoIndex + if field == "index" and "NOINDEX" in line["index"].upper(): + line[field] = ( + "T" * index_cycles[0] if index_cycles[0] != 0 else "" + ) + noindex_flag = True + if field == "index2" and noindex_flag: + if software == "bclconvert": + line[field] = ( + "T" * index_cycles[1] + if index_cycles[1] != 0 + else "" + ) + else: + line[field] = ( + "A" * index_cycles[1] + if index_cycles[1] != 0 + else "" + ) + noindex_flag = False + # Case of IDT UMI + if ( + field == "index" or field == "index2" + ) and IDT_UMI_PAT.findall(line[field]): + line[field] = line[field].replace("N", "") + # Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert + if field == "index2" and software == "bclconvert": + line[field] = self._revcomp(line[field]) + line_ar.append(line[field]) + output += ",".join(line_ar) + output += os.linesep + return output diff --git a/taca/illumina/NovaSeqXPlus_Runs.py b/taca/illumina/NovaSeqXPlus_Runs.py index 41cb4094..00e85465 100644 --- a/taca/illumina/NovaSeqXPlus_Runs.py +++ b/taca/illumina/NovaSeqXPlus_Runs.py @@ -56,6 +56,9 @@ def _generate_samplesheet_subset( # Settings for BCL Convert if software == "bclconvert": output += f"[Settings]{os.linesep}" + # For NextSeq, NovaSeq and NovaSeqXPlus, the orders of index 2 masks also need to be reversed + if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]): + base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1]) output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep) if any("U" in bm for bm in base_mask): output += f"TrimUMI,0{os.linesep}" @@ -105,17 +108,26 @@ def _generate_samplesheet_subset( ) noindex_flag = True if field == "index2" and noindex_flag: - line[field] = ( - "T" * index_cycles[1] if index_cycles[1] != 0 else "" - ) + if software == "bclconvert": + line[field] = ( + "T" * index_cycles[1] + if index_cycles[1] != 0 + else "" + ) + else: + line[field] = ( + "A" * index_cycles[1] + if index_cycles[1] != 0 + else "" + ) noindex_flag = False # Case of IDT UMI if ( field == "index" or field == "index2" ) and IDT_UMI_PAT.findall(line[field]): line[field] = line[field].replace("N", "") - # Convert Index 2 into RC for NovaSeqXPlus - if field == "index2": + # Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert + if field == "index2" and software == "bclconvert": line[field] = self._revcomp(line[field]) line_ar.append(line[field]) output += ",".join(line_ar) diff --git a/taca/illumina/NovaSeq_Runs.py b/taca/illumina/NovaSeq_Runs.py index 52a7e162..0d31db1e 100644 --- a/taca/illumina/NovaSeq_Runs.py +++ b/taca/illumina/NovaSeq_Runs.py @@ -1,5 +1,10 @@ +import os +import re + from taca.illumina.Standard_Runs import Standard_Run +IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)") + class NovaSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): @@ -13,3 +18,113 @@ def _set_sequencer_type(self): def _set_run_type(self): self.run_type = "NGI-RUN" + + def _revcomp(self, seq: str) -> str: + return seq.translate(str.maketrans("ACGT", "TGCA"))[::-1] + + def _generate_samplesheet_subset( + self, + ssparser, + samples_to_include, + runSetup, + software, + sample_type, + index1_size, + index2_size, + base_mask, + CONFIG, + ): + output = "" + # Prepare index cycles + index_cycles = [0, 0] + for read in runSetup: + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) + else: + index_cycles[1] = int(read["NumCycles"]) + # Header + output += f"[Header]{os.linesep}" + for field in sorted(ssparser.header): + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" + output += os.linesep + # Settings for BCL Convert + if software == "bclconvert": + output += f"[Settings]{os.linesep}" + # For NextSeq, NovaSeq and NovaSeqXPlus, the orders of index 2 masks also need to be reversed + if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]): + base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1]) + output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep) + if any("U" in bm for bm in base_mask): + output += f"TrimUMI,0{os.linesep}" + + if CONFIG.get("bclconvert"): + if CONFIG["bclconvert"].get("settings"): + # Put common settings + if CONFIG["bclconvert"]["settings"].get("common"): + for setting in CONFIG["bclconvert"]["settings"]["common"]: + for k, v in setting.items(): + output += f"{k},{v}{os.linesep}" + # Put special settings: + if sample_type in CONFIG["bclconvert"]["settings"].keys(): + for setting in CONFIG["bclconvert"]["settings"][sample_type]: + for k, v in setting.items(): + if ( + ( + k == "BarcodeMismatchesIndex1" + and index1_size != 0 + ) + or ( + k == "BarcodeMismatchesIndex2" + and index2_size != 0 + ) + or "BarcodeMismatchesIndex" not in k + ): + output += f"{k},{v}{os.linesep}" + # Data + output += f"[Data]{os.linesep}" + datafields = [] + for field in ssparser.datafields: + datafields.append(field) + output += ",".join(datafields) + output += os.linesep + for line in ssparser.data: + sample_name = line.get("Sample_Name") or line.get("SampleName") + lane = line["Lane"] + noindex_flag = False + if lane in samples_to_include.keys(): + if sample_name in samples_to_include.get(lane): + line_ar = [] + for field in datafields: + # Case with NoIndex + if field == "index" and "NOINDEX" in line["index"].upper(): + line[field] = ( + "T" * index_cycles[0] if index_cycles[0] != 0 else "" + ) + noindex_flag = True + if field == "index2" and noindex_flag: + if software == "bclconvert": + line[field] = ( + "T" * index_cycles[1] + if index_cycles[1] != 0 + else "" + ) + else: + line[field] = ( + "A" * index_cycles[1] + if index_cycles[1] != 0 + else "" + ) + noindex_flag = False + # Case of IDT UMI + if ( + field == "index" or field == "index2" + ) and IDT_UMI_PAT.findall(line[field]): + line[field] = line[field].replace("N", "") + # Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert + if field == "index2" and software == "bclconvert": + line[field] = self._revcomp(line[field]) + line_ar.append(line[field]) + output += ",".join(line_ar) + output += os.linesep + return output diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index 9b34f594..07d46098 100644 --- a/taca/illumina/Runs.py +++ b/taca/illumina/Runs.py @@ -853,6 +853,11 @@ def _fix_html_reports_for_complex_lanes( ): html_report_laneBarcode_parser.sample_data.remove(entry) + # Remove the trailing "_SX" postfix from samples names for BCL Convert when it handles SmartSeq3 libraries + for entry in html_report_laneBarcode_parser.sample_data: + if "_S" in entry["Sample"]: + entry["Sample"] = "_".join(entry["Sample"].split("_")[:2]) + # Sort sample_data: first by lane then by sample ID html_report_laneBarcode_parser.sample_data = sorted( html_report_laneBarcode_parser.sample_data, @@ -1317,6 +1322,14 @@ def _process_demux_with_complex_lanes( "Recipe", "Operator", "Sample_Project", + "[Settings]", + "OverrideCycles", + "MinimumTrimmedReadLength", + "MaskShortReads", + "CreateFastqForIndexReads", + "BarcodeMismatchesIndex1", + "BarcodeMismatchesIndex2", + "TrimUMI", ] with open(samplesheet) as sub_samplesheet_file: sub_samplesheet_reader = csv.reader(sub_samplesheet_file)