Skip to content

Commit

Permalink
Merge pull request #448 from kedhammar/toulligqc
Browse files Browse the repository at this point in the history
ONT automated QC reports with ToulligQC
  • Loading branch information
kedhammar authored Dec 9, 2024
2 parents 661726f + 9eaeeae commit 7b3cb97
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 33 deletions.
4 changes: 4 additions & 0 deletions VERSIONLOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# TACA Version Log

## 20241204.2

Add automated QC reports with ToulligQC for ONT.

## 20241204.1

Add support for staging ONT data on Miarka

## 20241128.1
Expand Down
10 changes: 10 additions & 0 deletions taca/analysis/analysis_nanopore.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ def process_user_run(ont_user_run: ONT_user_run):
logger.info(f"{ont_user_run.run_name}: Putting HTML report on GenStat...")
ont_user_run.copy_html_report()

# Generate and publish TouliggQC report
logger.info(
f"{ont_user_run.run_name}: Generating and publishing ToulligQC report..."
)
ont_user_run.toulligqc_report()

# Copy metadata
logger.info(f"{ont_user_run.run_name}: Copying metadata...")
ont_user_run.copy_metadata()
Expand Down Expand Up @@ -166,6 +172,10 @@ def process_qc_run(ont_qc_run: ONT_qc_run):
logger.info(f"{ont_qc_run.run_name}: Putting HTML report on GenStat...")
ont_qc_run.copy_html_report()

# Generate and publish TouliggQC report
logger.info(f"{ont_qc_run.run_name}: Generating and publishing ToulligQC report...")
ont_qc_run.toulligqc_report()

# Look at seq data
if not ont_qc_run.has_raw_seq_output():
logger.info(f"{ont_qc_run.run_name}: Run has no sequencing output, continuing")
Expand Down
121 changes: 120 additions & 1 deletion taca/nanopore/ONT_run_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ def __init__(self, run_abspath: str):

# Get attributes from config
self.minknow_reports_dir = CONFIG["nanopore_analysis"]["minknow_reports_dir"]
self.toulligqc_reports_dir = CONFIG["nanopore_analysis"][
"toulligqc_reports_dir"
]
self.toulligqc_executable = CONFIG["nanopore_analysis"]["toulligqc_executable"]
self.analysis_server = CONFIG["nanopore_analysis"].get("analysis_server", None)
self.rsync_options = CONFIG["nanopore_analysis"]["rsync_options"]
for k, v in self.rsync_options.items():
Expand Down Expand Up @@ -106,6 +110,14 @@ def assert_contents(self):
assert self.has_file("/.sync_finished")
assert self.has_file("/final_summary*.txt")

# Raw seq files
assert any(
[
dir in os.listdir(self.run_abspath)
for dir in ["pod5", "pod5_pass", "fast5", "fast5_pass"]
]
)

# NGI files from instrument
assert self.has_file("/pore_count_history.csv")
assert self.has_file("/run_path.txt")
Expand Down Expand Up @@ -338,6 +350,113 @@ def copy_html_report(self):
logger.error(msg)
raise RsyncError(msg)

def toulligqc_report(self):
"""Generate a QC report for the run using ToulligQC and publish it to GenStat."""

# Get sequencing summary file
glob_summary = glob.glob(f"{self.run_abspath}/sequencing_summary*.txt")
assert len(glob_summary) == 1, f"Found {len(glob_summary)} summary files"
summary = glob_summary[0]

# Determine the format of the raw sequencing data, sorted by preference
raw_data_dir_options = [
"pod5_pass",
"pod5",
"fast5_pass",
"fast5",
]
raw_data_dir = None
for raw_data_dir_option in raw_data_dir_options:
if os.path.exists(f"{self.run_abspath}/{raw_data_dir_option}"):
raw_data_dir = raw_data_dir_option
break
if raw_data_dir is None:
raise AssertionError(f"No seq data found in {self.run_abspath}")
raw_data_format = "pod5" if "pod5" in raw_data_dir else "fast5"

# Load samplesheet, if any
ss_glob = glob.glob(f"{self.run_abspath}/sample_sheet*.csv")
if len(ss_glob) == 0:
samplesheet = None
elif len(ss_glob) > 1:
# If multiple samplesheet, use latest one
samplesheet = ss_glob.sort()[-1]
else:
samplesheet = ss_glob[0]

# Run has barcode subdirs
if len(glob.glob(f"{self.run_abspath}/fastq_pass/barcode*")) > 0:
barcode_dirs = True
else:
barcode_dirs = False

# Determine barcodes
if samplesheet:
ss_df = pd.read_csv(samplesheet)
if "barcode" in ss_df.columns:
ss_barcodes = list(ss_df["barcode"].unique())
ss_barcodes.sort()
barcode_nums = [int(bc[-2:]) for bc in ss_barcodes]
# If barcodes are numbered sequentially, write arg as range
if barcode_nums == list(range(1, len(barcode_nums) + 1)):
barcodes_arg = f"{ss_barcodes[0]}:{ss_barcodes[-1]}"
else:
barcodes_arg = ":".join(ss_barcodes)
else:
ss_barcodes = None

command_args = {
"--sequencing-summary-source": summary,
f"--{raw_data_format}-source": f"{self.run_abspath}/{raw_data_dir}",
"--output-directory": self.run_abspath,
"--report-name": "toulligqc_report",
}
if barcode_dirs:
command_args["--barcoding"] = ""
if samplesheet and ss_barcodes:
command_args["--samplesheet"] = samplesheet
command_args["--barcodes"] = barcodes_arg

# Build command list
command_list = [self.toulligqc_executable]
for k, v in command_args.items():
command_list.append(k)
if v:
command_list.append(v)

# Run the command
# Small enough to wait for, should be done in 1-5 minutes
process = subprocess.run(command_list)

# Check if the command was successful
if process.returncode == 0:
logging.info(f"{self.run_name}: ToulligQC report generated successfully.")
else:
raise subprocess.CalledProcessError(process.returncode, command_list)

# Copy the report to GenStat

logger.info(
f"{self.run_name}: Transferring ToulligQC report to ngi-internal..."
)
# Transfer the ToulligQC .html report file to ngi-internal, renaming it to the full run ID. Requires password-free SSH access.
report_src_path = self.get_file("/toulligqc_report/report.html")
report_dest_path = os.path.join(
self.toulligqc_reports_dir,
f"ToulligQC_{self.run_name}.html",
)
transfer_object = RsyncAgent(
src_path=report_src_path,
dest_path=report_dest_path,
validate=False,
)
try:
transfer_object.transfer()
except RsyncError:
msg = f"{self.run_name}: An error occurred while attempting to transfer the report {report_src_path} to {report_dest_path}."
logger.error(msg)
raise RsyncError(msg)

# Transfer run

def transfer_run(self):
Expand Down Expand Up @@ -499,7 +618,7 @@ def has_fastq_output(self) -> bool:
def has_raw_seq_output(self) -> bool:
"""Check whether run has sequencing data output."""

raw_seq_dirs = ["pod5", "fast5_pass"]
raw_seq_dirs = ["pod5", "pod5_pass", "fast5", "fast5_pass"]

for dir in raw_seq_dirs:
if os.path.exists(os.path.join(self.run_abspath, dir)):
Expand Down
67 changes: 42 additions & 25 deletions tests/analysis/test_analysis_nanopore.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import importlib
import logging
import os
import subprocess
from io import StringIO
from unittest.mock import patch
Expand All @@ -21,28 +22,25 @@ def parametrize_testruns():
"""

parameter_string_table = """
desc instrument qc run_finished sync_finished raw_dirs fastq_dirs barcode_dirs anglerfish_samplesheets anglerfish_ongoing anglerfish_exit
prom_ongoing promethion False False False False False False False False NA
prom_done promethion False True False False False False False False NA
prom_synced promethion False True True False False False False False NA
prom_reads promethion False True True True False False False False NA
prom_fastq promethion False True True True True False False False NA
prom_bcs promethion False True True True True True False False NA
min_ongoing minion False False False False False False False False NA
min_done minion False True False False False False False False NA
min_synced minion False True True False False False False False NA
min_reads minion False True True True False False False False NA
min_fastq minion False True True True True False False False NA
min_bcs minion False True True True True True False False NA
min_qc_ongoing minion True False False False False False False False NA
min_qc_done minion True True False False False False False False NA
min_qc_synced minion True True True False False False False False NA
min_qc_reads minion True True True True False False False False NA
min_qc_fastq minion True True True True True False False False NA
min_qc_bcs minion True True True True True True False False NA
min_qc_ang_ss minion True True True True True True True False NA
min_qc_ang_run minion True True True True True True True True NA
min_qc_ang_done minion True True True True True True True False 0
desc instrument qc run_finished sync_finished fastq_dirs barcode_dirs anglerfish_samplesheets anglerfish_ongoing anglerfish_exit
prom_ongoing promethion False False False False False False False NA
prom_done promethion False True False False False False False NA
prom_synced promethion False True True False False False False NA
prom_fastq promethion False True True True False False False NA
prom_bcs promethion False True True True True False False NA
min_ongoing minion False False False False False False False NA
min_done minion False True False False False False False NA
min_synced minion False True True False False False False NA
min_fastq minion False True True True False False False NA
min_bcs minion False True True True True False False NA
min_qc_ongoing minion True False False False False False False NA
min_qc_done minion True True False False False False False NA
min_qc_synced minion True True True False False False False NA
min_qc_fastq minion True True True True False False False NA
min_qc_bcs minion True True True True True False False NA
min_qc_ang_ss minion True True True True True True False NA
min_qc_ang_run minion True True True True True True True NA
min_qc_ang_done minion True True True True True True False 0
"""

# Turn string table to datastream
Expand All @@ -51,6 +49,9 @@ def parametrize_testruns():
# Read data, trimming whitespace
df = pd.read_csv(data, sep=r"\s+")

# Fix data types
df.anglerfish_exit = df.anglerfish_exit[df.anglerfish_exit.notna()].astype("Int64")

# Replace nan(s) with None(s)
df = df.replace(np.nan, None)

Expand Down Expand Up @@ -100,17 +101,34 @@ def test_ont_transfer(create_dirs, run_properties, caplog):
# Mock subprocess.Popen ONLY for Anglerfish
original_popen = subprocess.Popen

def side_effect(*args, **kwargs):
def mock_Popen_side_effect(*args, **kwargs):
if "anglerfish" in args[0]:
return mock_Popen
else:
return original_popen(*args, **kwargs)

mock_Popen = patch(
"taca.nanopore.ONT_run_classes.subprocess.Popen", side_effect=side_effect
"taca.nanopore.ONT_run_classes.subprocess.Popen",
side_effect=mock_Popen_side_effect,
).start()
mock_Popen.pid = 1337 # Nice

# Mock subprocess.run ONLY for ToulligQC
original_run = subprocess.run

def mock_run_side_effect(*args, **kwargs):
if "toulligqc" in args[0]:
os.mkdir(f"{args[0][6]}/toulligqc_report")
open(f"{args[0][6]}/toulligqc_report/report.html", "w").close()
return mock_run
else:
return original_run(*args, **kwargs)

mock_run = patch(
"taca.nanopore.ONT_run_classes.subprocess.run", side_effect=mock_run_side_effect
).start()
mock_run.returncode = 0

# Reload module to implement mocks
importlib.reload(analysis_nanopore)

Expand All @@ -122,7 +140,6 @@ def side_effect(*args, **kwargs):
script_files=True,
run_finished=run_properties.pop("run_finished"),
sync_finished=run_properties.pop("sync_finished"),
raw_dirs=run_properties.pop("raw_dirs"),
fastq_dirs=run_properties.pop("fastq_dirs"),
barcode_dirs=run_properties.pop("barcode_dirs"),
anglerfish_samplesheets=run_properties.pop("anglerfish_samplesheets"),
Expand Down
9 changes: 7 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ def create_dirs():
│ ├── minion
│ │ └── qc
│ └── promethion
├── minknow_reports
├── ngi-internal
│ ├── minknow_reports
│ └── toulligqc_reports
├── ngi-nas-ns
│ ├── Aviti_data
│ ├── NextSeq_data
Expand Down Expand Up @@ -91,8 +93,11 @@ def create_dirs():
os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/NovaSeqXPlus")
os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti")

# GenStat
os.makedirs(f"{tmp.name}/ngi-internal/minknow_reports")
os.makedirs(f"{tmp.name}/ngi-internal/toulligqc_reports")

# Misc. ONT dirs/files
os.makedirs(f"{tmp.name}/minknow_reports")
os.makedirs(f"{tmp.name}/log")
open(f"{tmp.name}/log/transfer_promethion.tsv", "w").close()
open(f"{tmp.name}/log/transfer_minion.tsv", "w").close()
Expand Down
14 changes: 9 additions & 5 deletions tests/nanopore/test_ONT_run_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ def make_ONT_test_config(tmp: tempfile.TemporaryDirectory) -> dict:
anglerfish:
anglerfish_samplesheets_dir: {tmp.name}/ngi-nas-ns/samplesheets/anglerfish
anglerfish_path: mock
minknow_reports_dir: {tmp.name}/minknow_reports/
minknow_reports_dir: {tmp.name}/ngi-internal/minknow_reports/
toulligqc_reports_dir: {tmp.name}/ngi-internal/toulligqc_reports/
toulligqc_executable: toulligqc
rsync_options:
'-Lav': None
'-r': None
Expand Down Expand Up @@ -90,7 +92,6 @@ def create_ONT_run_dir(
script_files: bool = False,
run_finished: bool = False,
sync_finished: bool = False,
raw_dirs: bool = False,
fastq_dirs: bool = False,
barcode_dirs: bool = False,
anglerfish_samplesheets: bool = False,
Expand Down Expand Up @@ -132,9 +133,15 @@ def create_ONT_run_dir(
write_pore_count_history(run_path, flowcell_id, instrument_position)

if run_finished:
# Raw seq data
os.mkdir(f"{run_path}/pod5_pass")

# Run summary .txt
open(f"{run_path}/final_summary_{run_name}.txt", "w").close()

# Sequencing summary .txt
open(f"{run_path}/sequencing_summary_{run_name}.txt", "w").close()

# Run report .html
open(f"{run_path}/report_{run_name}.html", "w").close()

Expand Down Expand Up @@ -188,9 +195,6 @@ def create_ONT_run_dir(
with open(f"{run_path}/.anglerfish_done", "w") as f:
f.write(str(anglerfish_exit))

if raw_dirs:
os.mkdir(f"{run_path}/pod5_pass")

if fastq_dirs:
os.mkdir(f"{run_path}/fastq_pass")

Expand Down

0 comments on commit 7b3cb97

Please sign in to comment.