Skip to content

Commit

Permalink
Merge pull request #439 from chuan-wang/master
Browse files Browse the repository at this point in the history
Fix missing settings for BCL Convert
  • Loading branch information
chuan-wang authored Oct 31, 2024
2 parents 15ed188 + da60ac2 commit 3b312b0
Show file tree
Hide file tree
Showing 5 changed files with 264 additions and 5 deletions.
4 changes: 4 additions & 0 deletions VERSIONLOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# TACA Version Log

## 20241029.3

Fix missing settings for BCL Convert

## 20241029.2

Fix issue that index 2 is automatically converted to RC by BCL Convert
Expand Down
115 changes: 115 additions & 0 deletions taca/illumina/NextSeq_Runs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import os
import re

from taca.illumina.Standard_Runs import Standard_Run

IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")


class NextSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
Expand All @@ -16,3 +21,113 @@ def _set_sequencer_type(self):

def _set_run_type(self):
    """Set ``self.run_type`` to the fixed value ``"NGI-RUN"``."""
    self.run_type = "NGI-RUN"

def _revcomp(self, seq: str) -> str:
return seq.translate(str.maketrans("ACGT", "TGCA"))[::-1]

def _generate_samplesheet_subset(
self,
ssparser,
samples_to_include,
runSetup,
software,
sample_type,
index1_size,
index2_size,
base_mask,
CONFIG,
):
output = ""
# Prepare index cycles
index_cycles = [0, 0]
for read in runSetup:
if read["IsIndexedRead"] == "Y":
if int(read["Number"]) == 2:
index_cycles[0] = int(read["NumCycles"])
else:
index_cycles[1] = int(read["NumCycles"])
# Header
output += f"[Header]{os.linesep}"
for field in sorted(ssparser.header):
output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
output += os.linesep
# Settings for BCL Convert
if software == "bclconvert":
output += f"[Settings]{os.linesep}"
# For NextSeq, NovaSeq and NovaSeqXPlus, the orders of index 2 masks also need to be reversed
if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]):
base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1])
output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep)
if any("U" in bm for bm in base_mask):
output += f"TrimUMI,0{os.linesep}"

if CONFIG.get("bclconvert"):
if CONFIG["bclconvert"].get("settings"):
# Put common settings
if CONFIG["bclconvert"]["settings"].get("common"):
for setting in CONFIG["bclconvert"]["settings"]["common"]:
for k, v in setting.items():
output += f"{k},{v}{os.linesep}"
# Put special settings:
if sample_type in CONFIG["bclconvert"]["settings"].keys():
for setting in CONFIG["bclconvert"]["settings"][sample_type]:
for k, v in setting.items():
if (
(
k == "BarcodeMismatchesIndex1"
and index1_size != 0
)
or (
k == "BarcodeMismatchesIndex2"
and index2_size != 0
)
or "BarcodeMismatchesIndex" not in k
):
output += f"{k},{v}{os.linesep}"
# Data
output += f"[Data]{os.linesep}"
datafields = []
for field in ssparser.datafields:
datafields.append(field)
output += ",".join(datafields)
output += os.linesep
for line in ssparser.data:
sample_name = line.get("Sample_Name") or line.get("SampleName")
lane = line["Lane"]
noindex_flag = False
if lane in samples_to_include.keys():
if sample_name in samples_to_include.get(lane):
line_ar = []
for field in datafields:
# Case with NoIndex
if field == "index" and "NOINDEX" in line["index"].upper():
line[field] = (
"T" * index_cycles[0] if index_cycles[0] != 0 else ""
)
noindex_flag = True
if field == "index2" and noindex_flag:
if software == "bclconvert":
line[field] = (
"T" * index_cycles[1]
if index_cycles[1] != 0
else ""
)
else:
line[field] = (
"A" * index_cycles[1]
if index_cycles[1] != 0
else ""
)
noindex_flag = False
# Case of IDT UMI
if (
field == "index" or field == "index2"
) and IDT_UMI_PAT.findall(line[field]):
line[field] = line[field].replace("N", "")
# Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert
if field == "index2" and software == "bclconvert":
line[field] = self._revcomp(line[field])
line_ar.append(line[field])
output += ",".join(line_ar)
output += os.linesep
return output
22 changes: 17 additions & 5 deletions taca/illumina/NovaSeqXPlus_Runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ def _generate_samplesheet_subset(
# Settings for BCL Convert
if software == "bclconvert":
output += f"[Settings]{os.linesep}"
# For NextSeq, NovaSeq and NovaSeqXPlus, the order of the index 2 mask segments also needs to be reversed
if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]):
base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1])
output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep)
if any("U" in bm for bm in base_mask):
output += f"TrimUMI,0{os.linesep}"
Expand Down Expand Up @@ -105,17 +108,26 @@ def _generate_samplesheet_subset(
)
noindex_flag = True
if field == "index2" and noindex_flag:
line[field] = (
"T" * index_cycles[1] if index_cycles[1] != 0 else ""
)
if software == "bclconvert":
line[field] = (
"T" * index_cycles[1]
if index_cycles[1] != 0
else ""
)
else:
line[field] = (
"A" * index_cycles[1]
if index_cycles[1] != 0
else ""
)
noindex_flag = False
# Case of IDT UMI
if (
field == "index" or field == "index2"
) and IDT_UMI_PAT.findall(line[field]):
line[field] = line[field].replace("N", "")
# Convert Index 2 into RC for NovaSeqXPlus
if field == "index2":
# Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert
if field == "index2" and software == "bclconvert":
line[field] = self._revcomp(line[field])
line_ar.append(line[field])
output += ",".join(line_ar)
Expand Down
115 changes: 115 additions & 0 deletions taca/illumina/NovaSeq_Runs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import os
import re

from taca.illumina.Standard_Runs import Standard_Run

IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")


class NovaSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
Expand All @@ -13,3 +18,113 @@ def _set_sequencer_type(self):

def _set_run_type(self):
    """Set ``self.run_type`` to the fixed value ``"NGI-RUN"``."""
    self.run_type = "NGI-RUN"

def _revcomp(self, seq: str) -> str:
return seq.translate(str.maketrans("ACGT", "TGCA"))[::-1]

def _generate_samplesheet_subset(
self,
ssparser,
samples_to_include,
runSetup,
software,
sample_type,
index1_size,
index2_size,
base_mask,
CONFIG,
):
output = ""
# Prepare index cycles
index_cycles = [0, 0]
for read in runSetup:
if read["IsIndexedRead"] == "Y":
if int(read["Number"]) == 2:
index_cycles[0] = int(read["NumCycles"])
else:
index_cycles[1] = int(read["NumCycles"])
# Header
output += f"[Header]{os.linesep}"
for field in sorted(ssparser.header):
output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
output += os.linesep
# Settings for BCL Convert
if software == "bclconvert":
output += f"[Settings]{os.linesep}"
# For NextSeq, NovaSeq and NovaSeqXPlus, the orders of index 2 masks also need to be reversed
if len(base_mask) == 4 or (len(base_mask) == 3 and "Y" not in base_mask[2]):
base_mask[2] = "".join(re.findall(r"[A-Z]\d+", base_mask[2])[::-1])
output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep)
if any("U" in bm for bm in base_mask):
output += f"TrimUMI,0{os.linesep}"

if CONFIG.get("bclconvert"):
if CONFIG["bclconvert"].get("settings"):
# Put common settings
if CONFIG["bclconvert"]["settings"].get("common"):
for setting in CONFIG["bclconvert"]["settings"]["common"]:
for k, v in setting.items():
output += f"{k},{v}{os.linesep}"
# Put special settings:
if sample_type in CONFIG["bclconvert"]["settings"].keys():
for setting in CONFIG["bclconvert"]["settings"][sample_type]:
for k, v in setting.items():
if (
(
k == "BarcodeMismatchesIndex1"
and index1_size != 0
)
or (
k == "BarcodeMismatchesIndex2"
and index2_size != 0
)
or "BarcodeMismatchesIndex" not in k
):
output += f"{k},{v}{os.linesep}"
# Data
output += f"[Data]{os.linesep}"
datafields = []
for field in ssparser.datafields:
datafields.append(field)
output += ",".join(datafields)
output += os.linesep
for line in ssparser.data:
sample_name = line.get("Sample_Name") or line.get("SampleName")
lane = line["Lane"]
noindex_flag = False
if lane in samples_to_include.keys():
if sample_name in samples_to_include.get(lane):
line_ar = []
for field in datafields:
# Case with NoIndex
if field == "index" and "NOINDEX" in line["index"].upper():
line[field] = (
"T" * index_cycles[0] if index_cycles[0] != 0 else ""
)
noindex_flag = True
if field == "index2" and noindex_flag:
if software == "bclconvert":
line[field] = (
"T" * index_cycles[1]
if index_cycles[1] != 0
else ""
)
else:
line[field] = (
"A" * index_cycles[1]
if index_cycles[1] != 0
else ""
)
noindex_flag = False
# Case of IDT UMI
if (
field == "index" or field == "index2"
) and IDT_UMI_PAT.findall(line[field]):
line[field] = line[field].replace("N", "")
# Convert Index 2 into RC for NextSeq, NovaSeq and NovaSeqXPlus for BCL Convert
if field == "index2" and software == "bclconvert":
line[field] = self._revcomp(line[field])
line_ar.append(line[field])
output += ",".join(line_ar)
output += os.linesep
return output
13 changes: 13 additions & 0 deletions taca/illumina/Runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,11 @@ def _fix_html_reports_for_complex_lanes(
):
html_report_laneBarcode_parser.sample_data.remove(entry)

# Remove the trailing "_SX" suffix from sample names for BCL Convert when it handles SmartSeq3 libraries
for entry in html_report_laneBarcode_parser.sample_data:
if "_S" in entry["Sample"]:
entry["Sample"] = "_".join(entry["Sample"].split("_")[:2])

# Sort sample_data: first by lane then by sample ID
html_report_laneBarcode_parser.sample_data = sorted(
html_report_laneBarcode_parser.sample_data,
Expand Down Expand Up @@ -1317,6 +1322,14 @@ def _process_demux_with_complex_lanes(
"Recipe",
"Operator",
"Sample_Project",
"[Settings]",
"OverrideCycles",
"MinimumTrimmedReadLength",
"MaskShortReads",
"CreateFastqForIndexReads",
"BarcodeMismatchesIndex1",
"BarcodeMismatchesIndex2",
"TrimUMI",
]
with open(samplesheet) as sub_samplesheet_file:
sub_samplesheet_reader = csv.reader(sub_samplesheet_file)
Expand Down

0 comments on commit 3b312b0

Please sign in to comment.