From 22bbbcf3dd244b6de0e5f2648cdf946f6ec0a03d Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 29 Aug 2024 10:19:04 +0200 Subject: [PATCH 001/187] Initial setup for element aviti --- taca/analysis/analysis_element.py | 65 +++++++++++++++++++++++++++++++ taca/analysis/cli.py | 23 +++++++++++ taca/element/Aviti_Runs.py | 7 ++++ taca/element/Element_Runs.py | 18 +++++++++ taca/element/__init__.py | 3 ++ 5 files changed, 116 insertions(+) create mode 100755 taca/analysis/analysis_element.py create mode 100644 taca/element/Aviti_Runs.py create mode 100644 taca/element/Element_Runs.py create mode 100644 taca/element/__init__.py diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py new file mode 100755 index 00000000..1099ffc9 --- /dev/null +++ b/taca/analysis/analysis_element.py @@ -0,0 +1,65 @@ +"""Analysis methods for sequencing runs produced by Element instruments.""" + +import glob +import logging +import os + +from taca.element.Element_Runs import Aviti_Run +from taca.utils.config import CONFIG + +logger = logging.getLogger(__name__) + + +def run_preprocessing(given_run): + """Run demultiplexing in all data directories. + + :param str given_run: Process a particular run instead of looking for runs + """ + + def _process(run): + """Process a run/flowcell and transfer to analysis server. + + :param taca.element.Run run: Run to be processed and transferred + """ + # Check if sequencing is finished. (is the final file there and was it completed OK) + # if sequencing is not done + # Update statusdb? + # return + # else If sequencing finished and demux not started + # Update statusdb + # Get/generate sample sheet + # Start demux + # else if sequencing finished and demux ongoing + # do nothing + # Else if sequencing started and demux finished + # check if run is transferred or transfer is ongoing + # if run has not been transferred and transfer is not ongoing + # make a hidden file to indicate that transfer has started + # transfer run to miarka + # remove hidden file if transfer was successful + # Update transfer log + # archive run to nosync + # elif run is being transferred (hidden file exists) + # return + # elif run is already transferred (in transfer log) + # warn that transferred run has not been archived + + + + if given_run: + run = Aviti_Run(run) #TODO: Needs to change if more Element machines are aquired in the future + _process(runObj) + else: + data_dirs = CONFIG.get("element_analysis").get("data_dirs") #TODO: add to config + for data_dir in data_dirs: + # Run folder looks like DATE_*_*_*, the last section is the FC name. 
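+            # e.g. a name like 240829_AV242106_0001_ABCDEFGHI (hypothetical example;
+            # the exact Aviti folder naming is still TBD, hence the TODO below)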
+ runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*")) #TODO: adapt to aviti format + for run in runs: + runObj = Aviti_Run(run) + try: + _process(runObj) + except: #TODO: chatch error message and print it + # This function might throw and exception, + # it is better to continue processing other runs + logger.warning(f"There was an error processing the run {run}") + pass diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index 342a8b1c..13250f61 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -4,6 +4,7 @@ from taca.analysis import analysis as an from taca.analysis import analysis_nanopore +from taca.analysis import analysis_element @click.group() @@ -71,6 +72,28 @@ def updatedb(rundir, software): """Save the run to statusdb.""" an.upload_to_statusdb(rundir, software) +# Element analysis subcommands + + +@analysis.command() +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + default=None, + help="Demultiplex only a particular run", +) +def demultiplex_element(run): + """Demultiplex and transfer all runs present in the data directories.""" + analysis_element.run_preprocessing(run) + + +@analysis.command() +@click.argument("run") +def element_updatedb(run): + """Save the run to statusdb.""" + analysis_element.upload_to_statusdb(run) + # Nanopore analysis subcommands diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py new file mode 100644 index 00000000..ad162ac4 --- /dev/null +++ b/taca/element/Aviti_Runs.py @@ -0,0 +1,7 @@ +from taca.element.Element_Runs import Run + + +class Aviti_Run(Run): + def __init__(self, run_dir, configuration): + super().__init__(run_dir, configuration) + self.sequencer_type = "Aviti" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py new file mode 100644 index 00000000..8bd18273 --- /dev/null +++ b/taca/element/Element_Runs.py @@ -0,0 +1,18 @@ +import logging +import os + +logger = logging.getLogger(__name__) + + +class Run: + """Defines an Element run""" + + def __init__(self, run_dir, configuration): + if not os.path.exists(run_dir): + raise RuntimeError(f"Could not locate run directory {run_dir}") + self.run_dir = os.path.abspath(run_dir) + self.CONFIG = configuration + self.demux_dir = "Demultiplexing" + + def is_transferred(self, transfer_file): + pass \ No newline at end of file diff --git a/taca/element/__init__.py b/taca/element/__init__.py new file mode 100644 index 00000000..75a569ff --- /dev/null +++ b/taca/element/__init__.py @@ -0,0 +1,3 @@ +""" +Classes to parse and work with Element data +""" From 36dbafd21734ad0b0b7aeea9c470390f475476c8 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 29 Aug 2024 12:50:07 +0200 Subject: [PATCH 002/187] Updated outline for Aviti --- taca/analysis/analysis_element.py | 18 ++++++++++++++---- taca/element/Element_Runs.py | 3 +++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 1099ffc9..7d9648b7 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -21,27 +21,37 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ + # Fetch statusdb document for run + + # Get previous status of run from statusdb document # Check if sequencing is finished. (is the final file there and was it completed OK) # if sequencing is not done - # Update statusdb? 
+ # compare previous status with current status and update statusdb document if different # return - # else If sequencing finished and demux not started - # Update statusdb + # else if sequencing finished and demux not started # Get/generate sample sheet # Start demux + # compare previous status with current status and update statusdb document if different # else if sequencing finished and demux ongoing - # do nothing + # compare previous status with current status and update statusdb document if different + # return # Else if sequencing started and demux finished # check if run is transferred or transfer is ongoing # if run has not been transferred and transfer is not ongoing # make a hidden file to indicate that transfer has started + # compare previous status with current status and update statusdb document if different + # Also update statusdb with a timestamp of when the transfer started # transfer run to miarka # remove hidden file if transfer was successful # Update transfer log + # update statusdb document # archive run to nosync + # update statusdb document # elif run is being transferred (hidden file exists) + # compare previous status with current status and update statusdb document if different # return # elif run is already transferred (in transfer log) + # compare previous status with current status and update statusdb document if different # warn that transferred run has not been archived diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 8bd18273..e718cc85 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -15,4 +15,7 @@ def __init__(self, run_dir, configuration): self.demux_dir = "Demultiplexing" def is_transferred(self, transfer_file): + pass + + def parse_rundir(self): pass \ No newline at end of file From d0054466ec098701d551f6203d8883409c182e30 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 29 Aug 2024 16:22:20 +0200 Subject: [PATCH 003/187] Process aviti data from sequencing to demux --- taca/analysis/analysis_element.py | 43 +++++++++++++++++--------- taca/element/Element_Runs.py | 51 ++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 7d9648b7..6e6a395f 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -6,9 +6,17 @@ from taca.element.Element_Runs import Aviti_Run from taca.utils.config import CONFIG +from taca.utils import statusdb + logger = logging.getLogger(__name__) +def _upload_to_statusdb(run): + """Triggers the upload to statusdb. + + :param Run run: the object run + """ + pass def run_preprocessing(given_run): """Run demultiplexing in all data directories. @@ -21,21 +29,26 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - # Fetch statusdb document for run - - # Get previous status of run from statusdb document - # Check if sequencing is finished. 
(is the final file there and was it completed OK) - # if sequencing is not done - # compare previous status with current status and update statusdb document if different - # return - # else if sequencing finished and demux not started - # Get/generate sample sheet + #TODO: Fetch statusdb document for run + #TODO: Get previous status of run from statusdb document + sequencing_done = run.check_sequencing_status() + demultiplexing_status = run.get_demultiplexing_status() + if not sequencing_done: + #TODO: compare previous status with current status and update statusdb document if different + return + elif sequencing_done and demultiplexing_status == "not started": + if not run.manifest_exists(): # Assumes that we use the same manifest as for sequencing. TODO: demux settings need to be added to the original manifest by lims + #TODO: email operator that manifest is missing + return # Start demux - # compare previous status with current status and update statusdb document if different - # else if sequencing finished and demux ongoing - # compare previous status with current status and update statusdb document if different - # return - # Else if sequencing started and demux finished + run.start_demux() + #TODO: compare previous status with current status and update statusdb document if different + return + elif sequencing_done and demultiplexing_status == "ongoing": + #TODO: compare previous status with current status and update statusdb document if different + return + elif sequencing_done and demultiplexing_status == "finished": + # Sync metadata to ngi-data-ns # check if run is transferred or transfer is ongoing # if run has not been transferred and transfer is not ongoing # make a hidden file to indicate that transfer has started @@ -53,7 +66,7 @@ def _process(run): # elif run is already transferred (in transfer log) # compare previous status with current status and update statusdb document if different # warn that transferred run has not been archived - + pass if given_run: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e718cc85..f3007dff 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,5 +1,10 @@ import logging import os +import json +from datetime import datetime + +from taca.utils import misc +from taca.utils.filesystem import chdir logger = logging.getLogger(__name__) @@ -12,7 +17,51 @@ def __init__(self, run_dir, configuration): raise RuntimeError(f"Could not locate run directory {run_dir}") self.run_dir = os.path.abspath(run_dir) self.CONFIG = configuration - self.demux_dir = "Demultiplexing" + self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") + self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") + self.demux_stats_file = os.path.join(self.demux_dir, "RunStats.json") #TODO: How to handle SideA/SideB? 
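+        # Assumption: bases2fastq only writes RunStats.json once demultiplexing has
+        # completed, so the presence of this file is used as the demux-finished marker.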
+ self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") + + def check_sequencing_status(self): + if os.path.exists(self.final_sequencing_file): + with open(self.final_sequencing_file) as json_file: + sequencing_outcome = json.load(json_file).get("outcome") + if sequencing_outcome != "OutcomeCompleted": + return False + else: + return True + else: + return False + + def get_demultiplexing_status(self): + if not os.path.exists(self.demux_dir): + return "not started" + elif os.path.exists(self.demux_dir) and not os.path.isfile(self.demux_stats_file): + return "ongoing" + elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): + return "finished" + + def manifest_exists(self): + return os.path.isfile(self.run_manifest_file) + + def generate_demux_command(self): + command = [self.CONFIG.get(self.software)["bin"], #TODO add path to bases2fastq executable to config + self.run_dir, + self.demux_dir, #TODO: how to handle SideA/SideB? + "-p 12" + ] + return command + + def start_demux(self): + with chdir(self.run_dir): + cmd = self.generate_demux_command() + misc.call_external_command_detached( + cmd, with_log_files=True, prefix=f"demux_" + ) + logger.info( + "Bases2Fastq conversion and demultiplexing " + f"started for run {os.path.basename(self.id)} on {datetime.now()}" + ) def is_transferred(self, transfer_file): pass From 6b9e885c728daeab5516986099223b80e0901953 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 11:59:33 +0200 Subject: [PATCH 004/187] add aviti stuff to test conf tempdir --- tests/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 171a5667..d699945a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,9 +31,11 @@ def create_dirs(): │ ├── minion_data │ ├── miseq_data │ ├── promethion_data + │ ├── Aviti_data │ └── samplesheets │ ├── NovaSeqXPlus │ └── anglerfish + │ └── Aviti └── ngi_data └── sequencing ├── MiSeq @@ -49,6 +51,8 @@ def create_dirs(): │ └── qc │ └── nosync └── promethion + │ └── nosync + └── AV242106 └── nosync --> Return the the temporary directory object @@ -65,6 +69,8 @@ def create_dirs(): os.makedirs(f"{tmp.name}/ngi_data/sequencing/promethion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/qc/nosync") + ## AVITI + os.makedirs(f"{tmp.name}/ngi_data/sequencing/AV242106/nosync") # Sequencing metadata ## Illumina @@ -75,10 +81,13 @@ def create_dirs(): ## ONT os.makedirs(f"{tmp.name}/ngi-nas-ns/promethion_data") os.makedirs(f"{tmp.name}/ngi-nas-ns/minion_data") + ## AVITI + os.makedirs(f"{tmp.name}/ngi-nas-ns/Aviti_data") # Samplesheets os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/anglerfish") os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/NovaSeqXPlus") + os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti") # Misc. 
ONT dirs/files os.makedirs(f"{tmp.name}/minknow_reports") From 36fc1c64faa6945b891c94a28270e57993165839 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 12:36:53 +0200 Subject: [PATCH 005/187] modularize --- tests/element/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/element/__init__.py diff --git a/tests/element/__init__.py b/tests/element/__init__.py new file mode 100644 index 00000000..e69de29b From bafbf6a7ef978b44761fba9559cd77491db03598 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 12:37:03 +0200 Subject: [PATCH 006/187] add test class --- tests/element/test_Element_Runs.py | 66 ++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/element/test_Element_Runs.py diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py new file mode 100644 index 00000000..9f421400 --- /dev/null +++ b/tests/element/test_Element_Runs.py @@ -0,0 +1,66 @@ +import os +import tempfile + +import pytest + +from taca.element import Element_Runs as to_test + + +def create_aviti_run_dir( + tmp: tempfile.TemporaryDirectory, + run_name: str = "20240716_AV242106_testrun", + nosync: bool = False, + run_finished: bool = True, + sync_finished: bool = True, +) -> str: + # Create run dir + if nosync: + run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/nosync/{run_name}" + else: + run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" + os.mkdir(run_path) + + # Create files + if run_finished: + open(f"{run_path}/AvitiRunStats.json", "w").close() + open(f"{run_path}/RunManifest.csv", "w").close() + open(f"{run_path}/RunManifest.json", "w").close() + open(f"{run_path}/RunParameters.json", "w").close() + open(f"{run_path}/RunUploaded.json", "w").close() + + if sync_finished: + open(f"{run_path}/.sync_finished", "w").close() + + return run_path + + +class TestRun: + @pytest.fixture(autouse=True) + def setup(self, create_dirs: pytest.fixture): + self.tmp: tempfile.TemporaryDirectory = create_dirs + self.run_path = create_aviti_run_dir(self.tmp) + self.run = to_test.Run(self.run_path, {}) + + def test_init(self): + assert self.run.run_dir == self.run_path + + def test_check_sequencing_status(self): + assert False + + def test_get_demultiplexing_status(self): + assert False + + def test_manifest_exists(self): + assert False + + def test_generate_demux_command(self): + assert False + + def test_start_demux(self): + assert False + + def test_is_transferred(self): + assert False + + def test_parse_rundir(self): + assert False From 7ff7a26f5a234094bda3028e596ec89f79fb06cb Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:05:02 +0200 Subject: [PATCH 007/187] add tests --- tests/element/test_Element_Runs.py | 116 +++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 22 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 9f421400..33050200 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -1,3 +1,4 @@ +import json import os import tempfile @@ -12,6 +13,9 @@ def create_aviti_run_dir( nosync: bool = False, run_finished: bool = True, sync_finished: bool = True, + demux_dir: bool = True, + demux_done: bool = True, + outcome_completed: bool = True, ) -> str: # Create run dir if nosync: @@ -20,47 +24,115 @@ def create_aviti_run_dir( run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" os.mkdir(run_path) - # Create files + # Populate run dir with files and folders if 
run_finished: open(f"{run_path}/AvitiRunStats.json", "w").close() open(f"{run_path}/RunManifest.csv", "w").close() open(f"{run_path}/RunManifest.json", "w").close() open(f"{run_path}/RunParameters.json", "w").close() - open(f"{run_path}/RunUploaded.json", "w").close() + with open(f"{run_path}/RunUploaded.json", "w") as f: + outcome = "OutcomeCompleted" if outcome_completed else "OutcomeFailed" + f.write(json.dumps({"outcome": outcome})) if sync_finished: open(f"{run_path}/.sync_finished", "w").close() - return run_path - - -class TestRun: - @pytest.fixture(autouse=True) - def setup(self, create_dirs: pytest.fixture): - self.tmp: tempfile.TemporaryDirectory = create_dirs - self.run_path = create_aviti_run_dir(self.tmp) - self.run = to_test.Run(self.run_path, {}) - - def test_init(self): - assert self.run.run_dir == self.run_path + if demux_dir: + os.mkdir(os.path.join(run_path, "Demultiplexing")) - def test_check_sequencing_status(self): - assert False + if demux_done: + open(os.path.join(run_path, "Demultiplexing", "RunStats.json"), "w").close() - def test_get_demultiplexing_status(self): - assert False + return run_path - def test_manifest_exists(self): - assert False +class TestRun: + def test_init(self, create_dirs: pytest.fixture): + tmp: tempfile.TemporaryDirectory = create_dirs + run_dir = create_aviti_run_dir(tmp) + run = to_test.Run(run_dir, {}) + assert run.run_dir == run_dir + + @pytest.mark.parametrize( + "p", + [ + {"run_finished": True, "outcome_completed": True, "expected": True}, + {"run_finished": True, "outcome_completed": False, "expected": False}, + {"run_finished": False, "outcome_completed": False, "expected": False}, + ], + ids=["success", "failure", "ongoing"], + ) + def test_check_sequencing_status( + self, p: pytest.fixture, create_dirs: pytest.fixture + ): + tmp: tempfile.TemporaryDirectory = create_dirs + + run = to_test.Run( + create_aviti_run_dir( + tmp, + run_finished=p["run_finished"], + outcome_completed=p["outcome_completed"], + ), + {}, + ) + assert run.check_sequencing_status() is p["expected"] + + @pytest.mark.parametrize( + "p", + [ + {"demux_dir": False, "demux_done": False, "expected": "not started"}, + {"demux_dir": True, "demux_done": False, "expected": "ongoing"}, + {"demux_dir": True, "demux_done": True, "expected": "finished"}, + ], + ids=["not started", "ongoing", "finished"], + ) + def test_get_demultiplexing_status( + self, p: pytest.fixture, create_dirs: pytest.fixture + ): + tmp: tempfile.TemporaryDirectory = create_dirs + + run = to_test.Run( + create_aviti_run_dir( + tmp, + demux_dir=p["demux_dir"], + demux_done=p["demux_done"], + ), + {}, + ) + assert run.get_demultiplexing_status() == p["expected"] + + @pytest.mark.parametrize( + "p", + [ + {"run_finished": True, "expected": True}, + {"run_finished": False, "expected": False}, + ], + ids=["exists", "does not exist"], + ) + def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): + tmp: tempfile.TemporaryDirectory = create_dirs + + run = to_test.Run( + create_aviti_run_dir( + tmp, + run_finished=p["run_finished"], + ), + {}, + ) + assert run.manifest_exists() == p["expected"] + + @pytest.mark.skip def test_generate_demux_command(self): assert False + @pytest.mark.skip def test_start_demux(self): assert False + @pytest.mark.skip def test_is_transferred(self): - assert False + pass + @pytest.mark.skip def test_parse_rundir(self): - assert False + pass From 781ef95e9d45cd84194dcc382bc79fc0dfcc3f91 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:09:22 
+0200 Subject: [PATCH 008/187] docs --- tests/element/test_Element_Runs.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 33050200..2f491441 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -17,6 +17,21 @@ def create_aviti_run_dir( demux_done: bool = True, outcome_completed: bool = True, ) -> str: + """ + Conditionally build a file tree for an Aviti run. + + . + ├── AvitiRunStats.json + ├── RunManifest.csv + ├── RunManifest.json + ├── RunParameters.json + ├── RunUploaded.json + ├── .sync_finished + └── Demultiplexing + └── RunStats.json + + """ + # Create run dir if nosync: run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/nosync/{run_name}" From d32b7de8b776a132eacdf7de4c10b4c1572afd71 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:54:37 +0200 Subject: [PATCH 009/187] bugfix --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f3007dff..c850446c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -60,7 +60,7 @@ def start_demux(self): ) logger.info( "Bases2Fastq conversion and demultiplexing " - f"started for run {os.path.basename(self.id)} on {datetime.now()}" + f"started for run {os.path.basename(self.run_dir)} on {datetime.now()}" ) def is_transferred(self, transfer_file): From 1d3408b8beb4ca1ad79122131ce776b0bc172abc Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:55:13 +0200 Subject: [PATCH 010/187] add tests --- tests/element/test_Element_Runs.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 2f491441..cfeb013f 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -1,6 +1,7 @@ import json import os import tempfile +from unittest.mock import patch import pytest @@ -137,17 +138,27 @@ def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): assert run.manifest_exists() == p["expected"] @pytest.mark.skip - def test_generate_demux_command(self): - assert False + def test_generate_demux_command(self, create_dirs): + pass - @pytest.mark.skip - def test_start_demux(self): - assert False + def test_start_demux(self, create_dirs): + with patch( + "taca.utils.misc.call_external_command_detached" + ) as mock_call, patch( + "taca.element.Element_Runs.Run.generate_demux_command" + ) as mock_command: + mock_command.return_value = "test command" + run = to_test.Run(create_aviti_run_dir(create_dirs), {}) + run.start_demux() + mock_command.assert_called_once() + mock_call.assert_called_once_with( + "test command", with_log_files=True, prefix="demux_" + ) @pytest.mark.skip - def test_is_transferred(self): + def test_is_transferred(self, create_dirs): pass @pytest.mark.skip - def test_parse_rundir(self): + def test_parse_rundir(self, create_dirs): pass From b5d307c1c1b5ce20a76327fb44af5a3da4d58d76 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:59:36 +0200 Subject: [PATCH 011/187] differentiate element/aviti --- tests/conftest.py | 4 ++-- tests/element/test_Element_Runs.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d699945a..c26d4c03 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,7 +69,7 @@ 
def create_dirs(): os.makedirs(f"{tmp.name}/ngi_data/sequencing/promethion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/qc/nosync") - ## AVITI + ## Element os.makedirs(f"{tmp.name}/ngi_data/sequencing/AV242106/nosync") # Sequencing metadata @@ -81,7 +81,7 @@ def create_dirs(): ## ONT os.makedirs(f"{tmp.name}/ngi-nas-ns/promethion_data") os.makedirs(f"{tmp.name}/ngi-nas-ns/minion_data") - ## AVITI + ## Element os.makedirs(f"{tmp.name}/ngi-nas-ns/Aviti_data") # Samplesheets diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index cfeb013f..7377742b 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -8,7 +8,7 @@ from taca.element import Element_Runs as to_test -def create_aviti_run_dir( +def create_element_run_dir( tmp: tempfile.TemporaryDirectory, run_name: str = "20240716_AV242106_testrun", nosync: bool = False, @@ -19,10 +19,9 @@ def create_aviti_run_dir( outcome_completed: bool = True, ) -> str: """ - Conditionally build a file tree for an Aviti run. + Conditionally build a file tree for an Element run. . - ├── AvitiRunStats.json ├── RunManifest.csv ├── RunManifest.json ├── RunParameters.json @@ -42,7 +41,6 @@ def create_aviti_run_dir( # Populate run dir with files and folders if run_finished: - open(f"{run_path}/AvitiRunStats.json", "w").close() open(f"{run_path}/RunManifest.csv", "w").close() open(f"{run_path}/RunManifest.json", "w").close() open(f"{run_path}/RunParameters.json", "w").close() @@ -65,7 +63,7 @@ def create_aviti_run_dir( class TestRun: def test_init(self, create_dirs: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs - run_dir = create_aviti_run_dir(tmp) + run_dir = create_element_run_dir(tmp) run = to_test.Run(run_dir, {}) assert run.run_dir == run_dir @@ -84,7 +82,7 @@ def test_check_sequencing_status( tmp: tempfile.TemporaryDirectory = create_dirs run = to_test.Run( - create_aviti_run_dir( + create_element_run_dir( tmp, run_finished=p["run_finished"], outcome_completed=p["outcome_completed"], @@ -108,7 +106,7 @@ def test_get_demultiplexing_status( tmp: tempfile.TemporaryDirectory = create_dirs run = to_test.Run( - create_aviti_run_dir( + create_element_run_dir( tmp, demux_dir=p["demux_dir"], demux_done=p["demux_done"], @@ -129,7 +127,7 @@ def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs run = to_test.Run( - create_aviti_run_dir( + create_element_run_dir( tmp, run_finished=p["run_finished"], ), @@ -148,7 +146,7 @@ def test_start_demux(self, create_dirs): "taca.element.Element_Runs.Run.generate_demux_command" ) as mock_command: mock_command.return_value = "test command" - run = to_test.Run(create_aviti_run_dir(create_dirs), {}) + run = to_test.Run(create_element_run_dir(create_dirs), {}) run.start_demux() mock_command.assert_called_once() mock_call.assert_called_once_with( From 047ac2222907d14cc26c963ac5288ea9a7eed8aa Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:09:34 +0200 Subject: [PATCH 012/187] add skip reasons --- tests/element/test_Element_Runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7377742b..a6eca20a 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -135,8 +135,8 @@ def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): ) 
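        # create_element_run_dir only writes RunManifest.csv when run_finished=True,
        # which is exactly the file that manifest_exists() checks for.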
assert run.manifest_exists() == p["expected"] - @pytest.mark.skip - def test_generate_demux_command(self, create_dirs): + @pytest.mark.skip(reason="Not implemented yet") + def test_generate_demux_command(self): pass def test_start_demux(self, create_dirs): @@ -153,10 +153,10 @@ def test_start_demux(self, create_dirs): "test command", with_log_files=True, prefix="demux_" ) - @pytest.mark.skip + @pytest.mark.skip(reason="Not implemented yet") def test_is_transferred(self, create_dirs): pass - @pytest.mark.skip + @pytest.mark.skip(reason="Not implemented yet") def test_parse_rundir(self, create_dirs): pass From fa473cd1e3c867f88430679a54c219b29eb44de8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:09:38 +0200 Subject: [PATCH 013/187] add test --- tests/element/test_Aviti_Runs.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/element/test_Aviti_Runs.py diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py new file mode 100644 index 00000000..0ff3e7db --- /dev/null +++ b/tests/element/test_Aviti_Runs.py @@ -0,0 +1,15 @@ +import tempfile + +import pytest + +from taca.element import Aviti_Runs as to_test +from tests.element.test_Element_Runs import create_element_run_dir + + +class TestAviti_Run: + def test_init(self, create_dirs: pytest.fixture): + tmp: tempfile.TemporaryDirectory = create_dirs + run_dir = create_element_run_dir(tmp) + run = to_test.Aviti_Run(run_dir, {}) + assert run.run_dir == run_dir + assert run.sequencer_type == "Aviti" From 7e84ac16b0677adb54b9f96e83bff1f419d606e2 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:13:35 +0200 Subject: [PATCH 014/187] ruff format --- taca/analysis/analysis_element.py | 57 +++++++++++++++++-------------- taca/analysis/cli.py | 1 + taca/element/Element_Runs.py | 37 ++++++++++++-------- 3 files changed, 55 insertions(+), 40 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 6e6a395f..20bc0c80 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -11,6 +11,7 @@ logger = logging.getLogger(__name__) + def _upload_to_statusdb(run): """Triggers the upload to statusdb. @@ -18,6 +19,7 @@ def _upload_to_statusdb(run): """ pass + def run_preprocessing(given_run): """Run demultiplexing in all data directories. @@ -29,59 +31,64 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - #TODO: Fetch statusdb document for run - #TODO: Get previous status of run from statusdb document + # TODO: Fetch statusdb document for run + # TODO: Get previous status of run from statusdb document sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() if not sequencing_done: - #TODO: compare previous status with current status and update statusdb document if different + # TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "not started": - if not run.manifest_exists(): # Assumes that we use the same manifest as for sequencing. TODO: demux settings need to be added to the original manifest by lims - #TODO: email operator that manifest is missing + if not run.manifest_exists(): # Assumes that we use the same manifest as for sequencing. 
TODO: demux settings need to be added to the original manifest by lims + # TODO: email operator that manifest is missing return # Start demux run.start_demux() - #TODO: compare previous status with current status and update statusdb document if different + # TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "ongoing": - #TODO: compare previous status with current status and update statusdb document if different + # TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "finished": # Sync metadata to ngi-data-ns # check if run is transferred or transfer is ongoing # if run has not been transferred and transfer is not ongoing - # make a hidden file to indicate that transfer has started - # compare previous status with current status and update statusdb document if different - # Also update statusdb with a timestamp of when the transfer started - # transfer run to miarka - # remove hidden file if transfer was successful - # Update transfer log - # update statusdb document - # archive run to nosync - # update statusdb document + # make a hidden file to indicate that transfer has started + # compare previous status with current status and update statusdb document if different + # Also update statusdb with a timestamp of when the transfer started + # transfer run to miarka + # remove hidden file if transfer was successful + # Update transfer log + # update statusdb document + # archive run to nosync + # update statusdb document # elif run is being transferred (hidden file exists) - # compare previous status with current status and update statusdb document if different - # return + # compare previous status with current status and update statusdb document if different + # return # elif run is already transferred (in transfer log) - # compare previous status with current status and update statusdb document if different - # warn that transferred run has not been archived + # compare previous status with current status and update statusdb document if different + # warn that transferred run has not been archived pass - if given_run: - run = Aviti_Run(run) #TODO: Needs to change if more Element machines are aquired in the future + run = Aviti_Run( + run + ) # TODO: Needs to change if more Element machines are aquired in the future _process(runObj) else: - data_dirs = CONFIG.get("element_analysis").get("data_dirs") #TODO: add to config + data_dirs = CONFIG.get("element_analysis").get( + "data_dirs" + ) # TODO: add to config for data_dir in data_dirs: # Run folder looks like DATE_*_*_*, the last section is the FC name. 
- runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*")) #TODO: adapt to aviti format + runs = glob.glob( + os.path.join(data_dir, "[1-9]*_*_*_*") + ) # TODO: adapt to aviti format for run in runs: runObj = Aviti_Run(run) try: _process(runObj) - except: #TODO: chatch error message and print it + except: # TODO: chatch error message and print it # This function might throw and exception, # it is better to continue processing other runs logger.warning(f"There was an error processing the run {run}") diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index 13250f61..2e433a4c 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -72,6 +72,7 @@ def updatedb(rundir, software): """Save the run to statusdb.""" an.upload_to_statusdb(rundir, software) + # Element analysis subcommands diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index c850446c..a86e7d8e 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -19,9 +19,11 @@ def __init__(self, run_dir, configuration): self.CONFIG = configuration self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") - self.demux_stats_file = os.path.join(self.demux_dir, "RunStats.json") #TODO: How to handle SideA/SideB? + self.demux_stats_file = os.path.join( + self.demux_dir, "RunStats.json" + ) # TODO: How to handle SideA/SideB? self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - + def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): with open(self.final_sequencing_file) as json_file: @@ -32,26 +34,31 @@ def check_sequencing_status(self): return True else: return False - + def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" - elif os.path.exists(self.demux_dir) and not os.path.isfile(self.demux_stats_file): + elif os.path.exists(self.demux_dir) and not os.path.isfile( + self.demux_stats_file + ): return "ongoing" elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): return "finished" - + def manifest_exists(self): return os.path.isfile(self.run_manifest_file) - + def generate_demux_command(self): - command = [self.CONFIG.get(self.software)["bin"], #TODO add path to bases2fastq executable to config - self.run_dir, - self.demux_dir, #TODO: how to handle SideA/SideB? - "-p 12" - ] + command = [ + self.CONFIG.get(self.software)[ + "bin" + ], # TODO add path to bases2fastq executable to config + self.run_dir, + self.demux_dir, # TODO: how to handle SideA/SideB? 
+ "-p 12", + ] return command - + def start_demux(self): with chdir(self.run_dir): cmd = self.generate_demux_command() @@ -62,9 +69,9 @@ def start_demux(self): "Bases2Fastq conversion and demultiplexing " f"started for run {os.path.basename(self.run_dir)} on {datetime.now()}" ) - + def is_transferred(self, transfer_file): pass - + def parse_rundir(self): - pass \ No newline at end of file + pass From 9052caaf41a545ef866a22623cf2f90ebd52e28b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:20:02 +0200 Subject: [PATCH 015/187] ruff check --- taca/analysis/analysis_element.py | 2 -- taca/analysis/cli.py | 3 +-- taca/element/Element_Runs.py | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 20bc0c80..189cf01d 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -6,8 +6,6 @@ from taca.element.Element_Runs import Aviti_Run from taca.utils.config import CONFIG -from taca.utils import statusdb - logger = logging.getLogger(__name__) diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index 2e433a4c..131e78e0 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -3,8 +3,7 @@ import click from taca.analysis import analysis as an -from taca.analysis import analysis_nanopore -from taca.analysis import analysis_element +from taca.analysis import analysis_element, analysis_nanopore @click.group() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index a86e7d8e..b3888c55 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,6 +1,6 @@ +import json import logging import os -import json from datetime import datetime from taca.utils import misc @@ -63,7 +63,7 @@ def start_demux(self): with chdir(self.run_dir): cmd = self.generate_demux_command() misc.call_external_command_detached( - cmd, with_log_files=True, prefix=f"demux_" + cmd, with_log_files=True, prefix="demux_" ) logger.info( "Bases2Fastq conversion and demultiplexing " From b63cd4d6544c5f1c2f55d88a1b99121c0146c5d7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:20:19 +0200 Subject: [PATCH 016/187] bugfix --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 189cf01d..8f2b29d9 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -4,7 +4,7 @@ import logging import os -from taca.element.Element_Runs import Aviti_Run +from taca.element.Aviti_Runs import Aviti_Run from taca.utils.config import CONFIG logger = logging.getLogger(__name__) From 5ddd2f082520d0ee0d2b12bb688bbe557b89f62d Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 4 Sep 2024 13:00:48 +0200 Subject: [PATCH 017/187] Add funktions for aviti processing --- taca/analysis/analysis_element.py | 62 +++++++++++++++++++------------ taca/element/Element_Runs.py | 41 ++++++++++++++++++++ 2 files changed, 79 insertions(+), 24 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 6e6a395f..55fc6b42 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -37,37 +37,51 @@ def _process(run): #TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "not started": - if not run.manifest_exists(): # Assumes that we use the same manifest as 
for sequencing. TODO: demux settings need to be added to the original manifest by lims - #TODO: email operator that manifest is missing + if not run.manifest_exists(): + #TODO: email operator warning return - # Start demux - run.start_demux() - #TODO: compare previous status with current status and update statusdb document if different - return + elif run.manifest_exists(): + # Get sample info from manifest + sample_info = run.get_sample_info() + sample_types = run.get_sample_types(sample_info) + if len(sample_types) == 1: + run.start_demux() + elif len(sample_types) > 1: + for sample_type in sample_types: + run.make_manifest(sample_info, sample_type) + run.start_demux() + else: + #TODO: warn that no samples were found in the run manifest + return + #TODO: compare previous status with current status and update statusdb document if different elif sequencing_done and demultiplexing_status == "ongoing": #TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "finished": - # Sync metadata to ngi-data-ns - # check if run is transferred or transfer is ongoing - # if run has not been transferred and transfer is not ongoing - # make a hidden file to indicate that transfer has started - # compare previous status with current status and update statusdb document if different + transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') + if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): + run.sync_metadata() + run.make_transfer_indicator() + #TODO: compare previous status with current status and update statusdb document if different # Also update statusdb with a timestamp of when the transfer started - # transfer run to miarka - # remove hidden file if transfer was successful - # Update transfer log - # update statusdb document - # archive run to nosync - # update statusdb document - # elif run is being transferred (hidden file exists) - # compare previous status with current status and update statusdb document if different - # return - # elif run is already transferred (in transfer log) - # compare previous status with current status and update statusdb document if different + run.transfer() + run.remove_transfer_indicator() + run.update_transfer_log(transfer_file) + #TODO: update statusdb document + run.archive() + elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): + #TODO: compare previous status with current status and update statusdb document if different + logger.info("Run is being transferred. Skipping.") + return + elif run.is_transferred(transfer_file): + #TODO: compare previous status with current status and update statusdb document if different # warn that transferred run has not been archived - pass - + logger.warn("The run has already been transferred but has not been archived. Please investigate") + return + else: + logger.warn("Unknown transfer status. 
Please investigate") + + if given_run: run = Aviti_Run(run) #TODO: Needs to change if more Element machines are aquired in the future diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f3007dff..2cb113e3 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -44,6 +44,18 @@ def get_demultiplexing_status(self): def manifest_exists(self): return os.path.isfile(self.run_manifest_file) + def get_sample_info(self): + sample_info = {} #TODO: populate + return sample_info + + def get_sample_types(self, sample_info): + sample_types = () #TODO: populate + return sample_types + + def make_manifest(self, sample_info, sample_type): + #TODO: make a manifest for a sample_type based on sample_info + return + def generate_demux_command(self): command = [self.CONFIG.get(self.software)["bin"], #TODO add path to bases2fastq executable to config self.run_dir, @@ -64,6 +76,35 @@ def start_demux(self): ) def is_transferred(self, transfer_file): + #TODO: return true if run in transfer log, else false + pass + + def transfer_ongoing(self): + #TODO: return true if hidden transfer file marker exists, else false + pass + + def sync_metadata(self): + #TODO: copy metadata from demuxed run to ngi-nas-ns + pass + + def make_transfer_indicator(self): + #TODO: touch a hidden file in the run directory + pass + + def transfer(self): + #TODO: rsync run to analysis cluster + pass + + def remove_transfer_indicator(self): + #TODO: remove hidden file in run directory + pass + + def update_transfer_log(self, transfer_file): + #TODO: update the transfer log + pass + + def archive(self): + #TODO: move run dir to nosync pass def parse_rundir(self): From 1a4320750e8b13294cdc996658f3f36d3a649db1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 10 Sep 2024 15:20:00 +0200 Subject: [PATCH 018/187] Add status of a run --- taca/analysis/analysis_element.py | 66 ++++++++++++++++--------------- taca/element/Element_Runs.py | 25 ++++++++---- 2 files changed, 52 insertions(+), 39 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 50f95cef..433b2483 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -10,14 +10,6 @@ logger = logging.getLogger(__name__) -def _upload_to_statusdb(run): - """Triggers the upload to statusdb. - - :param Run run: the object run - """ - pass - - def run_preprocessing(given_run): """Run demultiplexing in all data directories. @@ -29,20 +21,19 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - # TODO: Fetch statusdb document for run - # TODO: Get previous status of run from statusdb document sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() - if not sequencing_done: - # TODO: compare previous status with current status and update statusdb document if different - return - elif sequencing_done and demultiplexing_status == "not started": + if not sequencing_done: # Sequencing ongoing + current_run_status = 'sequencing' + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) #TODO: what info needs to be gathered and uploaded? + elif sequencing_done and demultiplexing_status == "not started": # Sequencing done. 
Start demux if not run.manifest_exists(): + logger.warn(f"Run manifest is missing for {run.flowcell_id}") #TODO: email operator warning return elif run.manifest_exists(): - # Get sample info from manifest - sample_info = run.get_sample_info() + sample_info = run.get_sample_info_from_manifest() sample_types = run.get_sample_types(sample_info) if len(sample_types) == 1: run.start_demux() @@ -51,48 +42,59 @@ def _process(run): run.make_manifest(sample_info, sample_type) run.start_demux() else: - #TODO: warn that no samples were found in the run manifest + logger.warn(f"No samples were found in the sample manifest for run {run.flowcell_id}.") + #TODO: email operator warning return - #TODO: compare previous status with current status and update statusdb document if different + current_run_status = "demultiplexing" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) elif sequencing_done and demultiplexing_status == "ongoing": - # TODO: compare previous status with current status and update statusdb document if different + current_run_status = "demultiplexing" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) return elif sequencing_done and demultiplexing_status == "finished": transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): run.sync_metadata() run.make_transfer_indicator() - #TODO: compare previous status with current status and update statusdb document if different - # Also update statusdb with a timestamp of when the transfer started + current_run_status = "transferring" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) + #TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() run.remove_transfer_indicator() run.update_transfer_log(transfer_file) - #TODO: update statusdb document + current_run_status = "transferred" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) run.archive() + current_run_status = "archived" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): - #TODO: compare previous status with current status and update statusdb document if different - logger.info("Run is being transferred. Skipping.") + current_run_status = "transferring" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) + logger.info(f"Run {run.flowcell_id} is being transferred. Skipping.") return elif run.is_transferred(transfer_file): - #TODO: compare previous status with current status and update statusdb document if different - # warn that transferred run has not been archived - logger.warn("The run has already been transferred but has not been archived. Please investigate") + logger.warn(f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate") + #TODO: email operator warning return else: - logger.warn("Unknown transfer status. Please investigate") + logger.warn(f"Unknown transfer status of run {run.flowcell_id}. 
Please investigate") if given_run: - run = Aviti_Run( - run - ) # TODO: Needs to change if more Element machines are aquired in the future + run = Aviti_Run(given_run) # TODO: Needs to change if more Element machines are aquired in the future _process(run) else: data_dirs = CONFIG.get("element_analysis").get( "data_dirs" ) # TODO: add to config - for data_dir in data_dirs: + for data_dir in data_dirs: #TODO: make sure to look in both side A and B # Run folder looks like DATE_*_*_*, the last section is the FC name. runs = glob.glob( os.path.join(data_dir, "[1-9]*_*_*_*") diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index b8c273b8..813e2ea4 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -15,15 +15,16 @@ class Run: def __init__(self, run_dir, configuration): if not os.path.exists(run_dir): raise RuntimeError(f"Could not locate run directory {run_dir}") - self.run_dir = os.path.abspath(run_dir) + self.flowcell_id = run_dir #TODO: get flowcell id from json instead + self.run_dir = os.path.abspath(run_dir) # TODO: How to handle SideA/SideB? self.CONFIG = configuration self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") self.demux_stats_file = os.path.join( - self.demux_dir, "RunStats.json" - ) # TODO: How to handle SideA/SideB? + self.demux_dir, "RunStats.json" # Assumes demux is finished when this file is created + ) self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - + def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): with open(self.final_sequencing_file) as json_file: @@ -34,7 +35,7 @@ def check_sequencing_status(self): return True else: return False - + def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" @@ -44,12 +45,22 @@ def get_demultiplexing_status(self): return "ongoing" elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): return "finished" + else: + return "unknown" + + def status_changed(self, current_run_status): + #TODO: get document from statusdb, check status field, return true if status of run changed + pass + + def update_statusdb(self, current_run_status): + #TODO: Get document from statusdb. Gather data about run and update the statusdb document, then upload to statusdb + pass def manifest_exists(self): return os.path.isfile(self.run_manifest_file) - def get_sample_info(self): - sample_info = {} #TODO: populate + def get_sample_info_from_manifest(self): + sample_info = {} #TODO: populate with sample info from manifest return sample_info def get_sample_types(self, sample_info): From 38b3276001e388303f241555472df03f042c2b9b Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 11 Sep 2024 14:17:23 +0200 Subject: [PATCH 019/187] Get run manifests from lims instead --- taca/analysis/analysis_element.py | 30 +++++++++++++++++------------- taca/element/Element_Runs.py | 18 +++++------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 433b2483..91902d05 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -28,23 +28,25 @@ def _process(run): if run.status_changed(current_run_status): run.update_statusdb(current_run_status) #TODO: what info needs to be gathered and uploaded? elif sequencing_done and demultiplexing_status == "not started": # Sequencing done. 
Start demux - if not run.manifest_exists(): + if not run.manifest_exists(): #TODO: this should check for the zip file in lims output location logger.warn(f"Run manifest is missing for {run.flowcell_id}") #TODO: email operator warning return elif run.manifest_exists(): - sample_info = run.get_sample_info_from_manifest() - sample_types = run.get_sample_types(sample_info) - if len(sample_types) == 1: - run.start_demux() - elif len(sample_types) > 1: - for sample_type in sample_types: - run.make_manifest(sample_info, sample_type) - run.start_demux() - else: - logger.warn(f"No samples were found in the sample manifest for run {run.flowcell_id}.") - #TODO: email operator warning - return + os.mkdir(run.demux_dir) + run.copy_manifests() + run_manifests = glob.glob( + os.path.join(run.run_dir, "RunManifest_*.csv") + ) # TODO: is this filename right? + sub_demux_count = 0 + for run_manifest in run_manifests.sort(): + if len(run_manifests) == 1: + demux_dir = run.demux_dir + elif len(run_manifests) > 1: + demux_dir = f"Demultiplexing_{sub_demux_count}" + os.mkdir(demux_dir) + run.start_demux(run_manifest, demux_dir) + sub_demux_count += 1 current_run_status = "demultiplexing" if run.status_changed(current_run_status): run.update_statusdb(current_run_status) @@ -56,6 +58,8 @@ def _process(run): elif sequencing_done and demultiplexing_status == "finished": transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): + #TODO: if multiple demux dirs, aggregate the results into Demultiplexing? + run.aggregate_demux_results run.sync_metadata() run.make_transfer_indicator() current_run_status = "transferring" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 813e2ea4..560eb4da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -57,19 +57,11 @@ def update_statusdb(self, current_run_status): pass def manifest_exists(self): - return os.path.isfile(self.run_manifest_file) + return os.path.isfile(self.run_manifest_file) #TODO: still true? 
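+        # Presumably this check will move to the zipped manifest delivered by LIMS
+        # once copy_manifests() is fleshed out (hence the TODO above).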
- def get_sample_info_from_manifest(self): - sample_info = {} #TODO: populate with sample info from manifest - return sample_info - - def get_sample_types(self, sample_info): - sample_types = () #TODO: populate - return sample_types - - def make_manifest(self, sample_info, sample_type): - #TODO: make a manifest for a sample_type based on sample_info - return + def copy_manifests(): + #TODO: copy manifest zip file from lims location and unzip + pass def generate_demux_command(self): command = [ @@ -82,7 +74,7 @@ def generate_demux_command(self): ] return command - def start_demux(self): + def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command() misc.call_external_command_detached( From ad657179a501acdf316bbdbcd7f847fc80a04e73 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 12 Sep 2024 12:03:19 +0200 Subject: [PATCH 020/187] Draft for Element upload to statusdb --- taca/element/Aviti_Runs.py | 25 +++++++++++++++++++++++++ taca/utils/statusdb.py | 27 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py index ad162ac4..3f672d5a 100644 --- a/taca/element/Aviti_Runs.py +++ b/taca/element/Aviti_Runs.py @@ -1,7 +1,32 @@ from taca.element.Element_Runs import Run +from taca.utils.config import CONFIG +from taca.utils.statusdb import ElementRunsConnection class Aviti_Run(Run): def __init__(self, run_dir, configuration): super().__init__(run_dir, configuration) self.sequencer_type = "Aviti" + self.db = ElementRunsConnection(CONFIG["statusdb"], dbname="element_runs") + + def update_statusdb(self): + doc_obj = self.to_doc_obj() + self.db.upload_to_statusdb(doc_obj) + + def construct_NGI_run_id(self): + pass + + def to_doc_obj(self): + doc_obj = { + "name": self.run_name, + "run_status": self.run_status, + "run_id": self.run_id, + "run_dir": self.run_dir, + "run_type": self.run_type, + "sequencer_type": self.sequencer_type, + "samples": self.samples, + "demux": self.demux, + "date": self.date, + "flowcell": self.flowcell, + } + return doc_obj diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 939e0606..47620ea2 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -166,6 +166,33 @@ def finish_ongoing_run(self, ont_run, dict_json: dict): self.db[doc.id] = doc +class ElementRunsConnection(StatusdbSession): + def __init__(self, config, dbname="element_runs"): + super().__init__(config) + self.db = self.connection[dbname] + + def get_db_entry(self, run_id): + view_run_id = self.db.view("info/id") + try: + return view_run_id[run_id].rows[0] + except IndexError: + return None + + def check_if_run_exists(self, run_id) -> bool: + return self.get_db_entry(run_id) is not None + + def check_db_run_status(self, run_name) -> str: + view_status = self.db.view("info/status") + try: + status = view_status[run_name].rows[0].value + except IndexError: # No rows found + return "Unknown" + return status + + def upload_to_statusdb(self, run_obj: dict): + update_doc(self.db, run_obj) + + def update_doc(db, obj, over_write_db_entry=False): view = db.view("info/name") if len(view[obj["name"]].rows) == 1: From dcb450448208042c84bb0e07066d39f1e2250357 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 12 Sep 2024 14:16:54 +0200 Subject: [PATCH 021/187] more functions --- taca/analysis/analysis_element.py | 4 ++-- taca/element/Element_Runs.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 91902d05..e33b655b 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -36,8 +36,8 @@ def _process(run): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( - os.path.join(run.run_dir, "RunManifest_*.csv") - ) # TODO: is this filename right? + os.path.join(run.run_dir, "RunManifest_*.csv") # TODO: is this filename right? + ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): if len(run_manifests) == 1: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 560eb4da..d2287659 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,6 +1,7 @@ import json import logging import os +import shutil from datetime import datetime from taca.utils import misc @@ -23,7 +24,8 @@ def __init__(self, run_dir, configuration): self.demux_stats_file = os.path.join( self.demux_dir, "RunStats.json" # Assumes demux is finished when this file is created ) - self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") + self.run_manifest_zip_file = os.path.join(self.CONFIG.get('Aviti').get('manifest_zip_location'), + self.flowcell_id + '.tar.gz') #TODO: change and add to taca.yaml def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): @@ -57,26 +59,24 @@ def update_statusdb(self, current_run_status): pass def manifest_exists(self): - return os.path.isfile(self.run_manifest_file) #TODO: still true? + return os.path.isfile(self.run_manifest_zip_file) - def copy_manifests(): - #TODO: copy manifest zip file from lims location and unzip - pass + def copy_manifests(self): + shutil.copy(self.run_manifest_zip_file, self.run_dir) + #TODO: unzip - def generate_demux_command(self): + def generate_demux_command(self, run_manifest, demux_dir): command = [ - self.CONFIG.get(self.software)[ - "bin" - ], # TODO add path to bases2fastq executable to config + self.CONFIG.get(self.software)["bin"], # TODO add path to bases2fastq executable to config self.run_dir, - self.demux_dir, # TODO: how to handle SideA/SideB? 
+ demux_dir, "-p 12", ] return command def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): - cmd = self.generate_demux_command() + cmd = self.generate_demux_command(run_manifest, demux_dir) misc.call_external_command_detached( cmd, with_log_files=True, prefix="demux_" ) From d23be69672c260546df0c187f0b00b0a3dbb0323 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 12 Sep 2024 14:44:20 +0200 Subject: [PATCH 022/187] RunParameters parsing --- taca/analysis/analysis_element.py | 48 ++++++++----- taca/element/Aviti_Runs.py | 25 ------- taca/element/Element_Runs.py | 112 ++++++++++++++++++++++-------- 3 files changed, 111 insertions(+), 74 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 91902d05..d632c2f0 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -21,23 +21,28 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ + run.parse_run_parameters() sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() - if not sequencing_done: # Sequencing ongoing - current_run_status = 'sequencing' + if not sequencing_done: # Sequencing ongoing + current_run_status = "sequencing" if run.status_changed(current_run_status): - run.update_statusdb(current_run_status) #TODO: what info needs to be gathered and uploaded? - elif sequencing_done and demultiplexing_status == "not started": # Sequencing done. Start demux - if not run.manifest_exists(): #TODO: this should check for the zip file in lims output location - logger.warn(f"Run manifest is missing for {run.flowcell_id}") - #TODO: email operator warning + run.update_statusdb() + elif ( + sequencing_done and demultiplexing_status == "not started" + ): # Sequencing done. Start demux + if ( + not run.manifest_exists() + ): # TODO: this should check for the zip file in lims output location + logger.warn(f"Run manifest is missing for {run}") + # TODO: email operator warning return elif run.manifest_exists(): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( os.path.join(run.run_dir, "RunManifest_*.csv") - ) # TODO: is this filename right? + ) # TODO: is this filename right? sub_demux_count = 0 for run_manifest in run_manifests.sort(): if len(run_manifests) == 1: @@ -56,16 +61,16 @@ def _process(run): run.update_statusdb(current_run_status) return elif sequencing_done and demultiplexing_status == "finished": - transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') + transfer_file = CONFIG.get("Element").get("Aviti").get("transfer_log") if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): - #TODO: if multiple demux dirs, aggregate the results into Demultiplexing? + # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.aggregate_demux_results run.sync_metadata() run.make_transfer_indicator() current_run_status = "transferring" if run.status_changed(current_run_status): run.update_statusdb(current_run_status) - #TODO: Also update statusdb with a timestamp of when the transfer started + # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() run.remove_transfer_indicator() run.update_transfer_log(transfer_file) @@ -80,25 +85,29 @@ def _process(run): current_run_status = "transferring" if run.status_changed(current_run_status): run.update_statusdb(current_run_status) - logger.info(f"Run {run.flowcell_id} is being transferred. 
Skipping.") + logger.info(f"{run} is being transferred. Skipping.") return elif run.is_transferred(transfer_file): - logger.warn(f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate") - #TODO: email operator warning + logger.warn( + f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate" + ) + # TODO: email operator warning return else: - logger.warn(f"Unknown transfer status of run {run.flowcell_id}. Please investigate") - - + logger.warn( + f"Unknown transfer status of run {run.flowcell_id}. Please investigate" + ) if given_run: - run = Aviti_Run(given_run) # TODO: Needs to change if more Element machines are aquired in the future + run = Aviti_Run(given_run) + # TODO: Needs to change if more types of Element machines are aquired in the future + _process(run) else: data_dirs = CONFIG.get("element_analysis").get( "data_dirs" ) # TODO: add to config - for data_dir in data_dirs: #TODO: make sure to look in both side A and B + for data_dir in data_dirs: # TODO: make sure to look in both side A and B # Run folder looks like DATE_*_*_*, the last section is the FC name. runs = glob.glob( os.path.join(data_dir, "[1-9]*_*_*_*") @@ -111,4 +120,5 @@ def _process(run): # This function might throw and exception, # it is better to continue processing other runs logger.warning(f"There was an error processing the run {run}") + # TODO: Think about how to avoid silent errors (email?) pass diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py index 3f672d5a..ad162ac4 100644 --- a/taca/element/Aviti_Runs.py +++ b/taca/element/Aviti_Runs.py @@ -1,32 +1,7 @@ from taca.element.Element_Runs import Run -from taca.utils.config import CONFIG -from taca.utils.statusdb import ElementRunsConnection class Aviti_Run(Run): def __init__(self, run_dir, configuration): super().__init__(run_dir, configuration) self.sequencer_type = "Aviti" - self.db = ElementRunsConnection(CONFIG["statusdb"], dbname="element_runs") - - def update_statusdb(self): - doc_obj = self.to_doc_obj() - self.db.upload_to_statusdb(doc_obj) - - def construct_NGI_run_id(self): - pass - - def to_doc_obj(self): - doc_obj = { - "name": self.run_name, - "run_status": self.run_status, - "run_id": self.run_id, - "run_dir": self.run_dir, - "run_type": self.run_type, - "sequencer_type": self.sequencer_type, - "samples": self.samples, - "demux": self.demux, - "date": self.date, - "flowcell": self.flowcell, - } - return doc_obj diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 560eb4da..2b745f8b 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -5,6 +5,7 @@ from taca.utils import misc from taca.utils.filesystem import chdir +from taca.utils.statusdb import ElementRunsConnection logger = logging.getLogger(__name__) @@ -15,16 +16,67 @@ class Run: def __init__(self, run_dir, configuration): if not os.path.exists(run_dir): raise RuntimeError(f"Could not locate run directory {run_dir}") - self.flowcell_id = run_dir #TODO: get flowcell id from json instead - self.run_dir = os.path.abspath(run_dir) # TODO: How to handle SideA/SideB? + self.run_parameters_parsed = False + + self.run_dir = os.path.abspath(run_dir) # TODO: How to handle SideA/SideB? 
self.CONFIG = configuration + self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") + self.demux_stats_file = os.path.join( - self.demux_dir, "RunStats.json" # Assumes demux is finished when this file is created + self.demux_dir, + "RunStats.json", # Assumes demux is finished when this file is created ) self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - + + # Instrument generated files + self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") + self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_manifest_file_from_instrument = os.path.join( + self.run_dir, "RunManifest.json" + ) + self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json") + + self.db = ElementRunsConnection(self.CONFIG["statusdb"], dbname="element_runs") + + def __str__(self) -> str: + if self.run_parameters_parsed: + return f"ElementRun({self.NGI_run_id})" + else: + return f"ElementRun({self.run_dir})" + + @property + def NGI_run_id(self): + if self.run_parameters_parsed: + return f"{self.date}_{self.instrument_name}_{self.side_letter}{self.flowcell_id}" + else: + raise RuntimeError(f"Run parameters not parsed for run {self.run_dir}") + + def parse_run_parameters(self) -> None: + with open(self.run_parameters_file) as json_file: + run_parameters = json.load(json_file) + + # Manually entered, but should be side and flowcell id + self.run_name = run_parameters.get("RunName") + + self.run_id = run_parameters.get( + "runID" + ) # Unique hash that we don't really use + self.side = run_parameters.get("Side") # SideA or SideB + self.side_letter = self.side[-1] # A or B + self.run_type = run_parameters.get( + "RunType" + ) # Sequencing, wash or prime I believe? + self.flowcell_id = run_parameters.get("FlowcellID") + self.instrument_name = run_parameters.get("InstrumentName") + self.date = run_parameters.get("Date") + self.operator_name = run_parameters.get("OperatorName") + + def to_doc_obj(self): + # TODO + pass + def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): with open(self.final_sequencing_file) as json_file: @@ -35,7 +87,7 @@ def check_sequencing_status(self): return True else: return False - + def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" @@ -47,22 +99,22 @@ def get_demultiplexing_status(self): return "finished" else: return "unknown" - + def status_changed(self, current_run_status): - #TODO: get document from statusdb, check status field, return true if status of run changed + # TODO: get document from statusdb, check status field, return true if status of run changed pass - def update_statusdb(self, current_run_status): - #TODO: Get document from statusdb. Gather data about run and update the statusdb document, then upload to statusdb - pass + def update_statusdb(self): + doc_obj = self.to_doc_obj() + self.db.upload_to_statusdb(doc_obj) def manifest_exists(self): - return os.path.isfile(self.run_manifest_file) #TODO: still true? - - def copy_manifests(): - #TODO: copy manifest zip file from lims location and unzip + return os.path.isfile(self.run_manifest_file) # TODO: still true? 
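# --- Editorial sketch (not part of the original patch): a minimal example of
# the RunParameters.json fields that parse_run_parameters() above relies on,
# with invented values, and the NGI run ID derived from them. The field names
# come from the code; this is not a complete Element AVITI schema.
example_run_parameters = {
    "RunName": "A2349523513",   # manually entered; should be side letter + flowcell ID
    "runID": "d8f2a7b1",        # unique hash, not used downstream
    "Side": "SideA",
    "RunType": "Sequencing",
    "FlowcellID": "2349523513",
    "InstrumentName": "AV242106",
    "Date": "2024-09-12",
    "OperatorName": "NN",
}
# NGI_run_id = f"{date}_{instrument_name}_{side_letter}{flowcell_id}"
#            = "2024-09-12_AV242106_A2349523513"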
+ + def copy_manifests(self): + # TODO: copy manifest zip file from lims location and unzip pass - + def generate_demux_command(self): command = [ self.CONFIG.get(self.software)[ @@ -86,35 +138,35 @@ def start_demux(self, run_manifest, demux_dir): ) def is_transferred(self, transfer_file): - #TODO: return true if run in transfer log, else false + # TODO: return true if run in transfer log, else false pass - + def transfer_ongoing(self): - #TODO: return true if hidden transfer file marker exists, else false + # TODO: return true if hidden transfer file marker exists, else false pass - + def sync_metadata(self): - #TODO: copy metadata from demuxed run to ngi-nas-ns + # TODO: copy metadata from demuxed run to ngi-nas-ns pass - + def make_transfer_indicator(self): - #TODO: touch a hidden file in the run directory + # TODO: touch a hidden file in the run directory pass - + def transfer(self): - #TODO: rsync run to analysis cluster + # TODO: rsync run to analysis cluster pass - + def remove_transfer_indicator(self): - #TODO: remove hidden file in run directory + # TODO: remove hidden file in run directory pass - + def update_transfer_log(self, transfer_file): - #TODO: update the transfer log + # TODO: update the transfer log pass - + def archive(self): - #TODO: move run dir to nosync + # TODO: move run dir to nosync pass def parse_rundir(self): From 6ac2bee3d7a8c0a5ec850da6e157186c276128a3 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 13 Sep 2024 08:33:13 +0200 Subject: [PATCH 023/187] No need to give the status again --- taca/analysis/analysis_element.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index e10230b3..1987260f 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -22,6 +22,8 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ run.parse_run_parameters() + # TODO Should we just abort if the run parameters is not found? We cannot assign the run id without it. 
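# --- Editorial sketch (not part of the original patch): from this commit on,
# _process() tests `run.status_changed` without parentheses. If status_changed
# remains a plain method, that expression is always truthy and the statusdb
# update always fires. Declaring it a property, as in this hypothetical
# sketch, makes the attribute-style access behave as the calls below expect.
class _RunSketch:
    def __init__(self, db, ngi_run_id: str):
        self.db = db  # e.g. an ElementRunsConnection
        self.NGI_run_id = ngi_run_id
        self.status = None

    @property
    def status_changed(self) -> bool:
        # True when the in-memory status differs from the last recorded one.
        return self.db.check_db_run_status(self.NGI_run_id) != self.status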
+ sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() if not sequencing_done: # Sequencing ongoing @@ -34,7 +36,9 @@ def _process(run): if ( not run.manifest_exists() ): # TODO: this should check for the zip file in lims output location - logger.warn(f"Run manifest is missing for {run}") + logger.warn( + f"Run manifest is missing for {run}, demultiplexing aborted" + ) # TODO: email operator warning return elif run.manifest_exists(): @@ -56,11 +60,11 @@ def _process(run): sub_demux_count += 1 run.status = "demultiplexing" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() elif sequencing_done and demultiplexing_status == "ongoing": run.status = "demultiplexing" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": transfer_file = CONFIG.get("Element").get("Aviti").get("transfer_log") @@ -71,22 +75,22 @@ def _process(run): run.make_transfer_indicator() run.status = "transferring" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() run.remove_transfer_indicator() run.update_transfer_log(transfer_file) run.status = "transferred" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() run.archive() run.status = "archived" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): run.status = "transferring" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return elif run.is_transferred(transfer_file): From 539bbee825f25780db05120c0e7e511499e95ef1 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 13 Sep 2024 11:22:09 +0200 Subject: [PATCH 024/187] Use run for printing --- taca/analysis/analysis_element.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 1987260f..c7230a90 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -67,7 +67,9 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - transfer_file = CONFIG.get("Element").get("Aviti").get("transfer_log") + transfer_file = ( + CONFIG.get("Element").get(run.sequencer_type).get("transfer_log") + ) if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.aggregate_demux_results @@ -77,9 +79,10 @@ def _process(run): if run.status_changed: run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started - run.transfer() + run.transfer() # I think this should be a detached command as well run.remove_transfer_indicator() run.update_transfer_log(transfer_file) + run.status = "transferred" if run.status_changed: run.update_statusdb() @@ -95,14 +98,12 @@ def _process(run): return elif run.is_transferred(transfer_file): logger.warn( - f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate" + f"The run {run} has already been transferred but has not been archived. 
Please investigate" ) # TODO: email operator warning return else: - logger.warn( - f"Unknown transfer status of run {run.flowcell_id}. Please investigate" - ) + logger.warn(f"Unknown transfer status of run {run}. Please investigate") if given_run: run = Aviti_Run(given_run) From 4c15b67ea99307a5d936c45a46a5d5c7f1d732e9 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 13 Sep 2024 15:47:35 +0200 Subject: [PATCH 025/187] Read instrument generated files to doc obj and implemented the status_changed method --- taca/element/Element_Runs.py | 38 +++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 36a7ef9b..78f6763e 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -73,6 +73,7 @@ def NGI_run_id(self): raise RuntimeError(f"Run parameters not parsed for run {self.run_dir}") def parse_run_parameters(self) -> None: + """Parse run-information from the RunParameters.json file""" try: with open(self.run_parameters_file) as json_file: run_parameters = json.load(json_file) @@ -100,12 +101,31 @@ def parse_run_parameters(self) -> None: self.run_parameters_parsed = True def to_doc_obj(self): - # TODO + # TODO, are we sure what we should do when the RunParameters.json file is missing? + + # Read in all instrument generated files + instrument_generated_files = {} + for file in [ + self.run_parameters_file, + self.run_stats_file, + self.run_manifest_file_from_instrument, + self.run_uploaded_file, + ]: + if os.path.exists(file): + with open(file) as json_file: + instrument_generated_files[os.path.basename(file)] = json.load( + json_file + ) + else: + instrument_generated_files[os.path.basename(file)] = None + doc_obj = { "run_path": self.run_dir, "run_status": self.status, - "pore_count_history": [], + "NGI_run_id": self.NGI_run_id, + "instrument_generated_files": instrument_generated_files, } + return doc_obj def check_sequencing_status(self): @@ -131,9 +151,13 @@ def get_demultiplexing_status(self): else: return "unknown" - def status_changed(self, current_run_status): - # TODO: get document from statusdb, check status field, return true if status of run changed - pass + def status_changed(self): + if not self.run_parameters_parsed: + raise RuntimeError( + f"Run parameters not parsed for run {self.run_dir}, cannot check status" + ) + db_run_status = self.db.check_db_run_status(self.NGI_run_id) + return db_run_status != self.status def update_statusdb(self): doc_obj = self.to_doc_obj() @@ -199,3 +223,7 @@ def update_transfer_log(self, transfer_file): def archive(self): # TODO: move run dir to nosync pass + + def aggregate_demux_results(self): + # TODO: aggregate demux results + pass From bf101872345eb8f46400265e4a455f85f455a1bc Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 16 Sep 2024 15:20:06 +0200 Subject: [PATCH 026/187] Abort processing if RunParameters.json is not found --- taca/analysis/analysis_element.py | 9 +++++++-- taca/element/Element_Runs.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index c7230a90..4877218e 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -21,8 +21,13 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - run.parse_run_parameters() - # TODO Should we just abort if the run parameters is not found? We cannot assign the run id without it. 
+ try: + run.parse_run_parameters() + except FileNotFoundError: + logger.warn( + f"Cannot reliably set NGI_run_id for {run} due to missing RunParameters.json. Aborting run processing" + ) + raise sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 78f6763e..83eb6334 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -81,7 +81,7 @@ def parse_run_parameters(self) -> None: logger.warning( f"Run parameters file not found for {self}, might not be ready yet" ) - return + raise # Manually entered, but should be side and flowcell id self.run_name = run_parameters.get("RunName") From cd857d7161bb7e03224ab2a5e6a6b579c1be25b1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 16 Sep 2024 15:29:25 +0200 Subject: [PATCH 027/187] Restructure transfer logic --- taca/analysis/analysis_element.py | 41 +++++++++++++------------- taca/element/Element_Runs.py | 48 ++++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 33 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index c7230a90..20ec17d2 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -67,12 +67,8 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - transfer_file = ( - CONFIG.get("Element").get(run.sequencer_type).get("transfer_log") - ) - if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): - # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? - run.aggregate_demux_results + if not run.is_transferred() and not run.transfer_ongoing() and not run.rsync_complete(): + run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.sync_metadata() run.make_transfer_indicator() run.status = "transferring" @@ -80,30 +76,35 @@ def _process(run): run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() # I think this should be a detached command as well - run.remove_transfer_indicator() - run.update_transfer_log(transfer_file) - - run.status = "transferred" - if run.status_changed: - run.update_statusdb() - run.archive() - run.status = "archived" - if run.status_changed: - run.update_statusdb() - elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): + elif run.transfer_ongoing() and not run.rsync_complete(): run.status = "transferring" if run.status_changed: run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif run.is_transferred(transfer_file): - logger.warn( + elif run.rsync_complete() and not run.is_transferred(): + if run.rsync_success(): + run.remove_transfer_indicator() + run.update_transfer_log() + run.status = "transferred" + if run.status_changed: + run.update_statusdb() + run.archive() + run.status = "archived" + if run.status_changed: + run.update_statusdb() + else: + run.status = "transfer failed" + logger.warning(f"An issue occurred while transfering {run} to the analysis cluster." ) + # TODO: email warning to operator + elif run.is_transferred(): + logger.warning( f"The run {run} has already been transferred but has not been archived. Please investigate" ) # TODO: email operator warning return else: - logger.warn(f"Unknown transfer status of run {run}. Please investigate") + logger.warning(f"Unknown transfer status of run {run}. 
Please investigate") if given_run: run = Aviti_Run(given_run) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 78f6763e..4bb65ffe 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -34,6 +34,8 @@ def __init__(self, run_dir, configuration): self.flowcell_id + ".tar.gz", ) # TODO: change and add to taca.yaml # TODO, need to be real careful when using the flowcell_id as it is manually entered and can mean three different things + self.transfer_file = ( + self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log")) # TODO: change and add to taca.yaml # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") @@ -145,9 +147,9 @@ def get_demultiplexing_status(self): elif os.path.exists(self.demux_dir) and not os.path.isfile( self.demux_stats_file ): - return "ongoing" + return "ongoing" # TODO: check for exit status file instead elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): - return "finished" + return "finished" # TODO: check exit status of demux in exit status file else: return "unknown" @@ -177,8 +179,12 @@ def generate_demux_command(self, run_manifest, demux_dir): ], # TODO add path to bases2fastq executable to config self.run_dir, demux_dir, - "-p 12", - ] + "-p 12", # TODO: how many? Considering that we may start several demux runs at once + f"-r {run_manifest}", + "--legacy-fastq", # TODO: except if Smart-seq3 + "--force-index-orientation", + ] # TODO: any other options? + # TODO: write exit status of command to file return command def start_demux(self, run_manifest, demux_dir): @@ -189,17 +195,35 @@ def start_demux(self, run_manifest, demux_dir): ) logger.info( "Bases2Fastq conversion and demultiplexing " - f"started for run {os.path.basename(self.run_dir)} on {datetime.now()}" + f"started for run {self} on {datetime.now()}" ) - def is_transferred(self, transfer_file): - # TODO: return true if run in transfer log, else false - pass - + def is_transferred(self): + with open(self.transfer_file, 'r') as transfer_file: + for row in transfer_file.read(): + if self.NGI_run_id in row: + return True + return False + def transfer_ongoing(self): # TODO: return true if hidden transfer file marker exists, else false + + pass + + def rsync_complete(self): + # TODO: return true if .rsync_exit_status exists + pass + + def get_rsync_exit_status(): + # TODO: return status of rsync from .rsync_exit_status pass + def aggregate_demux_results(self): + # TODO: aggregate demux results + pass + + + def sync_metadata(self): # TODO: copy metadata from demuxed run to ngi-nas-ns pass @@ -216,7 +240,7 @@ def remove_transfer_indicator(self): # TODO: remove hidden file in run directory pass - def update_transfer_log(self, transfer_file): + def update_transfer_log(self): # TODO: update the transfer log pass @@ -224,6 +248,4 @@ def archive(self): # TODO: move run dir to nosync pass - def aggregate_demux_results(self): - # TODO: aggregate demux results - pass + From 3be54840d57a19ee0df91e405840d99a9778d5ce Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 16 Sep 2024 15:35:59 +0200 Subject: [PATCH 028/187] rename is_transferred function --- taca/analysis/analysis_element.py | 6 +++--- taca/element/Element_Runs.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 0c0bf977..42fd3e00 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -72,7 
+72,7 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - if not run.is_transferred() and not run.transfer_ongoing() and not run.rsync_complete(): + if not run.in_transfer_log() and not run.transfer_ongoing() and not run.rsync_complete(): run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.sync_metadata() run.make_transfer_indicator() @@ -87,7 +87,7 @@ def _process(run): run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif run.rsync_complete() and not run.is_transferred(): + elif run.rsync_complete() and not run.in_transfer_log(): if run.rsync_success(): run.remove_transfer_indicator() run.update_transfer_log() @@ -102,7 +102,7 @@ def _process(run): run.status = "transfer failed" logger.warning(f"An issue occurred while transfering {run} to the analysis cluster." ) # TODO: email warning to operator - elif run.is_transferred(): + elif run.in_transfer_log(): logger.warning( f"The run {run} has already been transferred but has not been archived. Please investigate" ) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 541bb319..6b8f03e5 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -198,7 +198,7 @@ def start_demux(self, run_manifest, demux_dir): f"started for run {self} on {datetime.now()}" ) - def is_transferred(self): + def in_transfer_log(self): with open(self.transfer_file, 'r') as transfer_file: for row in transfer_file.read(): if self.NGI_run_id in row: From 4ca441d33aaa87d5404075412985952b8195a084 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 16 Sep 2024 15:43:36 +0200 Subject: [PATCH 029/187] add methods for finding and copying LIMS-generated manifests --- taca/element/Element_Runs.py | 90 ++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 9 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 78f6763e..0a7cab61 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,8 +1,11 @@ import json import logging import os +import re import shutil +import zipfile from datetime import datetime +from glob import glob from taca.utils import misc from taca.utils.filesystem import chdir @@ -28,12 +31,6 @@ def __init__(self, run_dir, configuration): self.demux_dir, "RunStats.json", # Assumes demux is finished when this file is created ) - self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - self.run_manifest_zip_file = os.path.join( - self.CONFIG.get("Aviti").get("manifest_zip_location"), - self.flowcell_id + ".tar.gz", - ) # TODO: change and add to taca.yaml - # TODO, need to be real careful when using the flowcell_id as it is manually entered and can mean three different things # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") @@ -47,6 +44,10 @@ def __init__(self, run_dir, configuration): # Fields to be set by TACA self.status = None + self.lims_step_id = None + self.lims_full_manifest = None + self.lims_start_manifest = None + self.lims_demux_manifests = None # Fields that will be set when parsing run parameters self.run_name = None @@ -166,9 +167,79 @@ def update_statusdb(self): def manifest_exists(self): return os.path.isfile(self.run_manifest_zip_file) - def copy_manifests(self): - shutil.copy(self.run_manifest_zip_file, self.run_dir) - # TODO: unzip + def get_lims_step_id(self) -> str | None: + """If the run was 
started using a LIMS-generated manifest, + the ID of the LIMS step can be extracted from it. + """ + assert self.manifest_exists(), "Run manifest not found" + with open(self.run_manifest_file_from_instrument) as csv_file: + manifest_lines = csv_file.readlines() + for line in manifest_lines: + if "lims_step_id" in line: + lims_step_id = line.split(",")[1] + return lims_step_id + return None + + def copy_manifests(self) -> bool: + """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" + + # Specify dir in which LIMS drop the manifest zip files + dir_to_search = os.path.join( + self.CONFIG.get("Aviti").get( + "manifest_zip_location" + ), # TODO: change and add to taca.yaml + datetime.now().year, + ) + + # Use LIMS step ID if available, else flowcell ID, to make a query pattern + if self.lims_step_id: + logging.info( + f"Using LIMS step ID '{self.lims_step_id}' to find LIMS run manifests." + ) + glob_pattern = f"{dir_to_search}/*{self.lims_step_id}*.zip" + else: + logging.warning( + "LIMS step ID not available, using flowcell ID to find LIMS run manifests." + ) + glob_pattern = f"{dir_to_search}/*{self.flowcell_id}*.zip" + + # Find paths matching the pattern + glob_results = glob(glob_pattern) + if len(glob_results) == 0: + logger.warning( + f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." + ) + return False # TODO determine whether to raise an error here instead + elif len(glob_results) > 1: + logger.warning( + f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." + ) + glob_results.sort() + zip_src_path = glob_results[-1] + else: + zip_src_path = glob_results[0] + + # Make a run subdir named after the zip file and extract manifests there + zip_name = os.path.basename(zip_src_path) + zip_dst_path = os.path.join(self.run_dir, zip_name) + os.mkdir(zip_dst_path) + + with zipfile.ZipFile(zip_src_path, "r") as zip_ref: + zip_ref.extractall(zip_dst_path) + + # Set the paths of the different manifests as attributes + manifests = os.listdir(zip_dst_path) + self.lims_full_manifest = [ + m for m in manifests if re.match(r".*_untrimmed\.csv$", m) + ][0] + self.lims_start_manifest = [ + m for m in manifests if re.match(r".*_trimmed\.csv$", m) + ][0] + self.lims_demux_manifests = [ + m for m in manifests if re.match(r".*_\d+\.csv$", m) + ] + + return True def generate_demux_command(self, run_manifest, demux_dir): command = [ @@ -184,6 +255,7 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) + # TODO handle multiple composite manifests for demux misc.call_external_command_detached( cmd, with_log_files=True, prefix="demux_" ) From c840ea8f749513b4d0fc452ecbff4ebc548cf52c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 16 Sep 2024 16:48:43 +0200 Subject: [PATCH 030/187] add very rushed function for deriving new composite demux manifests --- taca/element/Element_Runs.py | 115 +++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index c0c2c0c2..a338d691 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -7,6 +7,8 @@ from datetime import datetime from glob import glob +import pandas as pd + from taca.utils import misc from taca.utils.filesystem import chdir from taca.utils.statusdb import ElementRunsConnection @@ -174,6 +176,9 @@ def 
get_lims_step_id(self) -> str | None: """If the run was started using a LIMS-generated manifest, the ID of the LIMS step can be extracted from it. """ + + # TODO test me + assert self.manifest_exists(), "Run manifest not found" with open(self.run_manifest_file_from_instrument) as csv_file: manifest_lines = csv_file.readlines() @@ -186,6 +191,8 @@ def get_lims_step_id(self) -> str | None: def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" + # TODO test me + # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( self.CONFIG.get("Aviti").get( @@ -244,6 +251,114 @@ def copy_manifests(self) -> bool: return True + def make_demux_manifests( + self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None + ) -> list[os.PathLike]: + """Derive composite demultiplexing manifests (grouped by index duplicity and lengths) + from a single information-rich manifest. + """ + + # TODO test me + + # Read specified manifest + with open(manifest_to_split) as f: + manifest_contents = f.read() + + # Get '[SAMPLES]' section + split_contents = "[SAMPLES]".split(manifest_contents) + assert ( + len(split_contents) == 2 + ), f"Could not split sample rows out of manifest {manifest_contents}" + sample_section = split_contents[1].split("\n") + + # Split into header and rows + header = sample_section[0] + sample_rows = sample_section[1:] + + # Convert to list of dicts + sample_dicts = [] + for row in sample_rows: + row_dict = dict(zip(header.split(","), row.split(","))) + sample_dicts.append(row_dict) + + # Convert to dataframe + df = pd.DataFrame.from_dict(sample_dicts) + + # Separate samples from controls + df_samples = df[df["Project"] != "Control"].copy() + df_controls = df[df["Project"] == "Control"].copy() + + # Apply default dir path for output + if outdir is None: + outdir = self.run_dir + + ## Build composite manifests + + manifest_root_name = f"{self.NGI_run_id}_demux" + + # Get idx lengths for calculations + df_samples.loc[:, "len_idx1"] = df["Index1"].apply(len) + df_samples.loc[:, "len_idx2"] = df["Index2"].apply(len) + + # Break down by index lengths and lane, creating composite manifests + manifests = [] + n = 0 + for (len_idx1, len_idx2, lane), group in df_samples.groupby( + ["len_idx1", "len_idx2", "Lane"] + ): + file_name = f"{manifest_root_name}_{n}.csv" + runValues_section = "\n".join( + [ + "[RUNVALUES]", + "KeyName, Value", + f'manifest_file, "{file_name}"', + f"manifest_group, {n+1}/{len(df.groupby(['len_idx1', 'len_idx2', 'Lane']))}", + f"grouped_by, len_idx1:{len_idx1} len_idx2:{len_idx2} lane:{lane}", + ] + ) + + settings_section = "\n".join( + [ + "[SETTINGS]", + "SettingName, Value", + ] + ) + + # Add PhiX stratified by index length + if group["phix_loaded"].any(): + # Subset controls by lane + group_controls = df_controls[df_controls["Lane"] == lane].copy() + + # Trim PhiX indexes to match group + group_controls.loc[:, "Index1"] = group_controls.loc[:, "Index1"].apply( + lambda x: x[:len_idx1] + ) + group_controls.loc[:, "Index2"] = group_controls.loc[:, "Index2"].apply( + lambda x: x[:len_idx2] + ) + + # Add PhiX to group + group = pd.concat([group, group_controls], axis=0, ignore_index=True) + + samples_section = ( + f"[SAMPLES]\n{group.iloc[:, 0:6].to_csv(index=None, header=True)}" + ) + + manifest_contents = "\n\n".join( + [runValues_section, settings_section, samples_section] + ) + + file_path = os.path.join(outdir, file_name) + manifests.append((file_path, 
manifest_contents)) + n += 1 + + for manifest_path, manifest_contents in manifests: + with open(os.path.join(outdir, manifest_path), "w") as f: + f.write(manifest_contents) + + manifest_paths = [t[0] for t in manifests] + return manifest_paths + def generate_demux_command(self, run_manifest, demux_dir): command = [ self.CONFIG.get(self.software)[ From eda9f3ff7c5e28d6af1f1fb0d479785ce74bf581 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 08:45:53 +0200 Subject: [PATCH 031/187] Restructure transfer status --- taca/analysis/analysis_element.py | 9 +++++---- taca/element/Element_Runs.py | 12 +++++++++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 42fd3e00..0f20ba8a 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -72,7 +72,8 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - if not run.in_transfer_log() and not run.transfer_ongoing() and not run.rsync_complete(): + transfer_status = run.get_transfer_status() + if transfer_status == "not started": run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.sync_metadata() run.make_transfer_indicator() @@ -81,13 +82,13 @@ def _process(run): run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() # I think this should be a detached command as well - elif run.transfer_ongoing() and not run.rsync_complete(): + elif transfer_status == "ongoing": run.status = "transferring" if run.status_changed: run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif run.rsync_complete() and not run.in_transfer_log(): + elif transfer_status == "finished": if run.rsync_success(): run.remove_transfer_indicator() run.update_transfer_log() @@ -102,7 +103,7 @@ def _process(run): run.status = "transfer failed" logger.warning(f"An issue occurred while transfering {run} to the analysis cluster." ) # TODO: email warning to operator - elif run.in_transfer_log(): + elif transfer_status == "unknown": logger.warning( f"The run {run} has already been transferred but has not been archived. Please investigate" ) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6b8f03e5..2010a933 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -179,7 +179,7 @@ def generate_demux_command(self, run_manifest, demux_dir): ], # TODO add path to bases2fastq executable to config self.run_dir, demux_dir, - "-p 12", # TODO: how many? 
Considering that we may start several demux runs at once + "-p 8", f"-r {run_manifest}", "--legacy-fastq", # TODO: except if Smart-seq3 "--force-index-orientation", @@ -198,6 +198,16 @@ def start_demux(self, run_manifest, demux_dir): f"started for run {self} on {datetime.now()}" ) + def get_transfer_status(self): + if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete(): + return "not started" + elif self.transfer_ongoing() and not self.rsync_complete(): + return "ongoing" + elif self.rsync_complete() and not self.in_transfer_log(): + return "finished" + elif self.in_transfer_log(): + return "unknown" + def in_transfer_log(self): with open(self.transfer_file, 'r') as transfer_file: for row in transfer_file.read(): From 0baa93c933df4a082b359a600cf64c5041493e21 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 09:24:06 +0200 Subject: [PATCH 032/187] refrase --- taca/analysis/analysis_element.py | 2 +- taca/element/Element_Runs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 0f20ba8a..e57052a6 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -88,7 +88,7 @@ def _process(run): run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif transfer_status == "finished": + elif transfer_status == "rsync done": if run.rsync_success(): run.remove_transfer_indicator() run.update_transfer_log() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index bd807b52..5267d6ba 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -392,7 +392,7 @@ def get_transfer_status(self): elif self.transfer_ongoing() and not self.rsync_complete(): return "ongoing" elif self.rsync_complete() and not self.in_transfer_log(): - return "finished" + return "rsync done" elif self.in_transfer_log(): return "unknown" From 1fe015daf1be02f4c4a868dbb96d4a99ea720f85 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 13:28:52 +0200 Subject: [PATCH 033/187] Add command for starting demux --- taca/analysis/analysis_element.py | 2 +- taca/element/Element_Runs.py | 64 ++++++++++++++++--------------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index e57052a6..250fc013 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -89,7 +89,7 @@ def _process(run): logger.info(f"{run} is being transferred. 
Skipping.") return elif transfer_status == "rsync done": - if run.rsync_success(): + if run.rsync_successful(): run.remove_transfer_indicator() run.update_transfer_log() run.status = "transferred" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5267d6ba..7f3e8eae 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -2,9 +2,10 @@ import logging import os import re -import shutil import zipfile +import subprocess from datetime import datetime +from pathlib import Path from glob import glob import pandas as pd @@ -36,6 +37,7 @@ def __init__(self, run_dir, configuration): self.transfer_file = ( self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log") ) # TODO: change and add to taca.yaml + self.rsync_exit_file = os.path.join(self.run_dir, '.rsync_exit_status') # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") @@ -360,31 +362,32 @@ def make_demux_manifests( return manifest_paths def generate_demux_command(self, run_manifest, demux_dir): - command = [ - self.CONFIG.get(self.software)[ - "bin" - ], # TODO add path to bases2fastq executable to config - self.run_dir, - demux_dir, - "-p 8", - f"-r {run_manifest}", - "--legacy-fastq", # TODO: except if Smart-seq3 - "--force-index-orientation", - ] # TODO: any other options? - # TODO: write exit status of command to file + command = (f"{self.CONFIG.get(self.software)["bin"]}" # TODO: add path to bases2fastq executable to config + + f" {self.run_dir}" + + f" {demux_dir}" + + " -p 8" + + f" -r {run_manifest}" + + " --legacy-fastq" # TODO: except if Smart-seq3 + + f" --force-index-orientation; echo $? > {self.rsync_exit_file}" + ) # TODO: any other options? return command def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) # TODO handle multiple composite manifests for demux - misc.call_external_command_detached( - cmd, with_log_files=True, prefix="demux_" - ) - logger.info( - "Bases2Fastq conversion and demultiplexing " - f"started for run {self} on {datetime.now()}" - ) + try: + p_handle = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir) + logger.info( + "Bases2Fastq conversion and demultiplexing " + f"started for run {self} on {datetime.now()}" + ) + except subprocess.CalledProcessError: + logger.warning("An error occurred while starting demultiplexing for " + f"{self} on {datetime.now()}." 
+ ) + return + def get_transfer_status(self): if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete(): @@ -404,17 +407,18 @@ def in_transfer_log(self): return False def transfer_ongoing(self): - # TODO: return true if hidden transfer file marker exists, else false - - pass + return os.path.isfile(os.path.join(self.run_dir, '.rsync_ongoing')) def rsync_complete(self): - # TODO: return true if .rsync_exit_status exists - pass + return os.path.isfile(self.rsync_exit_file) - def get_rsync_exit_status(): - # TODO: return status of rsync from .rsync_exit_status - pass + def rsync_successful(self): + with open(os.path.join(self.run_dir, '.rsync_exit_status')) as rsync_exit_file: + rsync_exit_status = rsync_exit_file.readlines() + if rsync_exit_status[0].strip() == 0: + return True + else: + return False def aggregate_demux_results(self): # TODO: aggregate demux results @@ -425,8 +429,8 @@ def sync_metadata(self): pass def make_transfer_indicator(self): - # TODO: touch a hidden file in the run directory - pass + transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing') + Path(transfer_indicator).touch() def transfer(self): # TODO: rsync run to analysis cluster From 5790c1aacd05e8608c98607a121b69abd12e553c Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 14:24:49 +0200 Subject: [PATCH 034/187] Check all demux dirs if demux is done --- taca/analysis/analysis_element.py | 7 ++++--- taca/element/Element_Runs.py | 28 +++++++++++++++++----------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 250fc013..32439610 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -24,7 +24,7 @@ def _process(run): try: run.parse_run_parameters() except FileNotFoundError: - logger.warn( + logger.warning( f"Cannot reliably set NGI_run_id for {run} due to missing RunParameters.json. Aborting run processing" ) raise @@ -41,7 +41,7 @@ def _process(run): if ( not run.manifest_exists() ): # TODO: this should check for the zip file in lims output location - logger.warn( + logger.warning( f"Run manifest is missing for {run}, demultiplexing aborted" ) # TODO: email operator warning @@ -74,7 +74,8 @@ def _process(run): elif sequencing_done and demultiplexing_status == "finished": transfer_status = run.get_transfer_status() if transfer_status == "not started": - run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? 
+                #TODO: if multiple demux dir exist, move the data dirs into Demultiplexing
+                run.aggregate_demux_results()
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 7f3e8eae..fed22e28 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -30,10 +30,7 @@ def __init__(self, run_dir, configuration):
 
         self.demux_dir = os.path.join(self.run_dir, "Demultiplexing")
         self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json")
-        self.demux_stats_file = os.path.join(
-            self.demux_dir,
-            "RunStats.json",  # Assumes demux is finished when this file is created
-        )
+        self.demux_stats_file = "RunStats.json"  # Assumes demux is finished when this file is created
         self.transfer_file = (
             self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log")
         )  # TODO: change and add to taca.yaml
         self.rsync_exit_file = os.path.join(self.run_dir, '.rsync_exit_status')
@@ -150,12 +147,21 @@ def check_sequencing_status(self):
     def get_demultiplexing_status(self):
         if not os.path.exists(self.demux_dir):
             return "not started"
-        elif os.path.exists(self.demux_dir) and not os.path.isfile(
-            self.demux_stats_file
-        ):
-            return "ongoing"  # TODO: check for exit status file instead
-        elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file):
-            return "finished"  # TODO: check exit status of demux in exit status file
+        demux_dirs = glob(
+            os.path.join(self.run_dir, "Demultiplexing*")
+        )
+        finished_count = 0
+        for demux_dir in demux_dirs:
+            if os.path.exists(demux_dir) and not os.path.isfile(
+                os.path.join(demux_dir, self.demux_stats_file)
+            ):
+                return "ongoing"
+            elif os.path.exists(demux_dir) and os.path.isfile(
+                os.path.join(demux_dir, self.demux_stats_file)
+            ):
+                finished_count += 1  # TODO: check exit status of demux in exit status file
+        if finished_count == len(demux_dirs):
             return "finished"
         else:
             return "unknown"

From 32f812cb72913e6a401c362da1a839e6f4377712 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Tue, 17 Sep 2024 14:53:37 +0200
Subject: [PATCH 035/187] Aggregate demux results if more than one

---
 taca/analysis/analysis_element.py |  7 +++++--
 taca/element/Element_Runs.py      | 14 +++++++++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 32439610..06d4e252 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -74,8 +74,11 @@ def _process(run):
         elif sequencing_done and demultiplexing_status == "finished":
             transfer_status = run.get_transfer_status()
             if transfer_status == "not started":
-                #TODO: if multiple demux dir exist, move the data dirs into Demultiplexing
-                run.aggregate_demux_results()
+                demux_results_dirs = glob.glob(
+                    os.path.join(run.run_dir, "Demultiplexing*")
+                )
+                if len(demux_results_dirs) > 1:
+                    run.aggregate_demux_results(demux_results_dirs)
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index fed22e28..67222304 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -4,6 +4,7 @@ import re
 import zipfile
 import subprocess
+import shutil
 from datetime import datetime
 from pathlib import Path
 from glob import glob
@@ -374,14 +375,14 @@ def generate_demux_command(self, run_manifest, demux_dir):
             + " -p 8"
             + f" -r {run_manifest}"
             + " --legacy-fastq"  # TODO: except if Smart-seq3
-            + f" --force-index-orientation; echo $? > {self.rsync_exit_file}"
+            + " --force-index-orientation"
         )  # TODO: any other options?
         return command
 
     def start_demux(self, run_manifest, demux_dir):
         with chdir(self.run_dir):
             cmd = self.generate_demux_command(run_manifest, demux_dir)
             # TODO: handle multiple composite manifests for demux
             try:
                 p_handle = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir)
@@ -426,9 +427,12 @@ def rsync_successful(self):
         else:
             return False
 
-    def aggregate_demux_results(self):
-        # TODO: aggregate demux results. Move project data dir from each sub demux dir to Demultiplexing
-        pass
+    def aggregate_demux_results(self, demux_results_dirs):
+        for demux_dir in demux_results_dirs:
+            data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
+            for data_dir in data_dirs:
+                if "PhiX" not in data_dir and "Unassigned" not in data_dir:
+                    shutil.move(data_dir, self.demux_dir)
 
     def sync_metadata(self):
         # TODO: copy metadata from demuxed run to ngi-nas-ns
         pass

From 253b9d1ea65a356c7b050e3768b1a29469b202cf Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 18 Sep 2024 08:37:50 +0200
Subject: [PATCH 036/187] rsync function

---
 taca/element/Element_Runs.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 67222304..1cfdf953 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -428,6 +428,7 @@ def rsync_successful(self):
             return False
 
     def aggregate_demux_results(self, demux_results_dirs):
+        # TODO: Correct this based on comments from Chuan
         for demux_dir in demux_results_dirs:
             data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
             for data_dir in data_dirs:
                 if "PhiX" not in data_dir and "Unassigned" not in data_dir:
                     shutil.move(data_dir, self.demux_dir)
 
     def sync_metadata(self):
         # TODO: copy metadata from demuxed run to ngi-nas-ns
         pass
 
     def make_transfer_indicator(self):
         transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing')
         Path(transfer_indicator).touch()
 
     def transfer(self):
-        # TODO: rsync run to analysis cluster
-        pass
+        transfer_details = self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details")  # TODO: Add section to taca.yaml
+        command = ("rsync"
+                   + " -rLav"
+                   + f" --chown={transfer_details.get('owner')}"
+                   + f" --chmod={transfer_details.get('permissions')}"
+                   + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
+                   + " --exclude Alignment"
+                   + f" {self.run_dir}"
+                   + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/"
+                   + "; echo $? > .rsync_exit_status"
+        )  # TODO: any other options?
+        try:
+            p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
+            logger.info(
+                "Transfer to analysis cluster "
+                f"started for run {self} on {datetime.now()}"
+            )
+        except subprocess.CalledProcessError:
+            logger.warning("An error occurred while starting transfer to analysis cluster "
+                           f"for {self} on {datetime.now()}."
Move project data dir from each sub demux dir to Demultiplexing - pass + def aggregate_demux_results(self, demux_results_dirs): + for demux_dir in demux_results_dirs: + data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()] + for data_dir in data_dirs: + if not "PhiX" in data_dir and not "Unassigned" in data_dir: + shutil.move(data_dir, self.demux_dir) def sync_metadata(self): # TODO: copy metadata from demuxed run to ngi-nas-ns From 253b9d1ea65a356c7b050e3768b1a29469b202cf Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 18 Sep 2024 08:37:50 +0200 Subject: [PATCH 036/187] rsync function --- taca/element/Element_Runs.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 67222304..1cfdf953 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -428,6 +428,7 @@ def rsync_successful(self): return False def aggregate_demux_results(self, demux_results_dirs): + # TODO: Correct this based on comments from Chuan for demux_dir in demux_results_dirs: data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()] for data_dir in data_dirs: @@ -443,8 +444,28 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - # TODO: rsync run to analysis cluster - pass + transfer_details = self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details") #TODO: Add section to taca.yaml + command = ("rsync" + + " -rLav" + + f" --chown={transfer_details.get("owner")}" + + f" --chmod={transfer_details.get("permissions")}" + + " --exclude BaseCalls" # TODO: check that we actually want to exclude these + + " --exclude Alignment" + + f" {self.run_dir}" + + f" {transfer_details.get("user")@transfer_details.get("host")}:/" + + "; echo $? > .rsync_exit_status" + ) # TODO: any other options? + try: + p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) + logger.info( + "Transfer to analysis cluster " + f"started for run {self} on {datetime.now()}" + ) + except subprocess.CalledProcessError: + logger.warning("An error occurred while starting transfer to analysis cluster " + f"for {self} on {datetime.now()}." + ) + return def remove_transfer_indicator(self): # TODO: remove hidden file in run directory From 90bc0ed60b5dc9635be420ad4dfd2b7dc53252ae Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 18 Sep 2024 10:40:43 +0200 Subject: [PATCH 037/187] A suggestion for structure for the _process function --- taca/analysis/analysis_element.py | 121 +++++++++++++++++------------- 1 file changed, 69 insertions(+), 52 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 06d4e252..e69e0902 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -29,24 +29,19 @@ def _process(run): ) raise + #### Sequencing status #### sequencing_done = run.check_sequencing_status() - demultiplexing_status = run.get_demultiplexing_status() if not sequencing_done: # Sequencing ongoing run.status = "sequencing" if run.status_changed: run.update_statusdb() - elif ( - sequencing_done and demultiplexing_status == "not started" - ): # Sequencing done. 
From 90bc0ed60b5dc9635be420ad4dfd2b7dc53252ae Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Wed, 18 Sep 2024 10:40:43 +0200
Subject: [PATCH 037/187] A suggestion for structure for the _process function

---
 taca/analysis/analysis_element.py | 121 +++++++++++++++++-------------
 1 file changed, 69 insertions(+), 52 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 06d4e252..e69e0902 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -29,24 +29,19 @@ def _process(run):
             )
             raise
 
+        #### Sequencing status ####
         sequencing_done = run.check_sequencing_status()
-        demultiplexing_status = run.get_demultiplexing_status()
         if not sequencing_done:  # Sequencing ongoing
             run.status = "sequencing"
             if run.status_changed:
                 run.update_statusdb()
-        elif (
-            sequencing_done and demultiplexing_status == "not started"
-        ):  # Sequencing done. Start demux
-            if (
-                not run.manifest_exists()
-            ):  # TODO: this should check for the zip file in lims output location
-                logger.warning(
-                    f"Run manifest is missing for {run}, demultiplexing aborted"
-                )
-                # TODO: email operator warning
-                return
-            elif run.manifest_exists():
+            return
+
+        #### Demultiplexing status ####
+        demultiplexing_status = run.get_demultiplexing_status()
+        if demultiplexing_status == "not started":
+            # Sequencing done. Start demux
+            if run.manifest_exists():
                 os.mkdir(run.demux_dir)
                 run.copy_manifests()
                 run_manifests = glob.glob(
@@ -66,55 +61,77 @@ def _process(run):
                 run.status = "demultiplexing"
                 if run.status_changed:
                     run.update_statusdb()
+                return
+            else:
+                # TODO: this should check for the zip file in lims output location
+                logger.warning(
+                    f"Run manifest is missing for {run}, demultiplexing aborted"
+                )
+                # TODO: email operator warning
+                return
+        elif demultiplexing_status == "ongoing":
             run.status = "demultiplexing"
             if run.status_changed:
                 run.update_statusdb()
             return
-        elif sequencing_done and demultiplexing_status == "ongoing":
-            run.status = "demultiplexing"
-            if run.status_changed:
-                run.update_statusdb()
-            return
-        elif sequencing_done and demultiplexing_status == "finished":
-            transfer_status = run.get_transfer_status()
-            if transfer_status == "not started":
-                demux_results_dirs = glob.glob(
-                    os.path.join(run.run_dir, "Delmultiplexing*")
-                )
-                if len(demux_results_dirs > 1):
-                    run.aggregate_demux_results(demux_results_dirs)
-                run.sync_metadata()
-                run.make_transfer_indicator()
-                run.status = "transferring"
+        elif demultiplexing_status != "finished":
+            logger.warning(
+                f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate"
+            )
+            return
+
+        #### Transfer status ####
+        transfer_status = run.get_transfer_status()
+        if transfer_status == "not started":
+            demux_results_dirs = glob.glob(
+                os.path.join(run.run_dir, "Delmultiplexing*")
+            )
+            if len(demux_results_dirs > 1):
+                run.aggregate_demux_results(demux_results_dirs)
+            run.sync_metadata()
+            run.make_transfer_indicator()
+            run.status = "transferring"
+            if run.status_changed:
+                run.update_statusdb()
+            # TODO: Also update statusdb with a timestamp of when the transfer started
+            run.transfer()  # I think this should be a detached command as well
+            return
+        elif transfer_status == "ongoing":
+            run.status = "transferring"
+            if run.status_changed:
+                run.update_statusdb()
+            logger.info(f"{run} is being transferred. Skipping.")
+            return
+        elif transfer_status == "rsync done":
+            if run.rsync_successful():
+                run.remove_transfer_indicator()
+                run.update_transfer_log()
+                run.status = "transferred"
                 if run.status_changed:
                     run.update_statusdb()
-                # TODO: Also update statusdb with a timestamp of when the transfer started
-                run.transfer()  # I think this should be a detached command as well
-            elif transfer_status == "ongoing":
-                run.status = "transferring"
+                run.archive()
+                run.status = "archived"
                 if run.status_changed:
                     run.update_statusdb()
-                logger.info(f"{run} is being transferred. Skipping.")
-                return
-            elif transfer_status == "rsync done":
-                if run.rsync_successful():
-                    run.remove_transfer_indicator()
-                    run.update_transfer_log()
-                    run.status = "transferred"
-                    if run.status_changed:
-                        run.update_statusdb()
-                    run.archive()
-                    run.status = "archived"
-                    if run.status_changed:
-                        run.update_statusdb()
-                else:
-                    run.status = "transfer failed"
-                    logger.warning(f"An issue occurred while transfering {run} to the analysis cluster."
-                    )
-                    # TODO: email warning to operator
-            elif transfer_status == "unknown":
+            else:
+                run.status = "transfer failed"
                 logger.warning(
-                    f"The run {run} has already been transferred but has not been archived. Please investigate"
+                    f"An issue occurred while transfering {run} to the analysis cluster."
                 )
-                # TODO: email operator warning
-                return
-            else:
-                logger.warning(f"Unknown transfer status of run {run}. Please investigate")
+                # TODO: email warning to operator
+            return
+        elif transfer_status == "unknown":
+            logger.warning(
+                f"The run {run} has already been transferred but has not been archived. Please investigate"
+            )
+            # TODO: email operator warning
+            return
+        else:
+            # TODO Merge with the one above?
+            logger.warning(
+                f"Unknown transfer status {transfer_status} of run {run}. Please investigate"
+            )
+            return
 
     if given_run:
         run = Aviti_Run(given_run)

From 08995cb5a881a5548eccc40765fb259cb3d86757 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 18 Sep 2024 12:54:24 +0200
Subject: [PATCH 038/187] gather demux results and upload to statusdb

---
 taca/analysis/analysis_element.py |  1 +
 taca/element/Element_Runs.py      | 49 +++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 06d4e252..5a21d718 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -79,6 +79,7 @@ def _process(run):
                 )
                 if len(demux_results_dirs > 1):
                     run.aggregate_demux_results(demux_results_dirs)
+                run.upload_demux_results_to_statusdb()
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 1cfdf953..af54aa03 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import re
+import csv
 import zipfile
 import subprocess
 import shutil
@@ -107,7 +108,7 @@ def parse_run_parameters(self) -> None:
         self.run_parameters_parsed = True
 
     def to_doc_obj(self):
-        # TODO, are we sure what we should do when the RunParameters.json file is missing?
+        # TODO: are we sure what we should do when the RunParameters.json file is missing?
 
         # Read in all instrument generated files
         instrument_generated_files = {}
@@ -432,8 +433,50 @@ def aggregate_demux_results(self, demux_results_dirs):
         for demux_dir in demux_results_dirs:
             data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
             for data_dir in data_dirs:
-                if not "PhiX" in data_dir and not "Unassigned" in data_dir:
-                    shutil.move(data_dir, self.demux_dir)
+                if not "PhiX" in data_dir in data_dir:
+                    shutil.move(data_dir, self.demux_dir)
+
+    def upload_demux_results_to_statusdb(self):
+        # TODO: dump contents of IndexAssignment.csv and UnassignedSequences.csv into statusdb document
+        doc_obj = self.db.get_db_entry(self.NGI_run_id)
+        index_assignement_file = os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv")
+        with open(index_assignement_file, 'r') as index_file:
+            reader = csv.DictReader(index_file)
+            index_assignments = [row for row in reader]
+        unassigned_sequences_file = os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv")
+        with open(unassigned_sequences_file, 'r') as unassigned_file:
+            reader = csv.DictReader(unassigned_file)
+            unassigned_sequences = [row for row in reader]
+        project_dirs = [f.path for f in os.scandir(os.path.join(self.run_dir, "Demultiplexing")) if f.is_dir() and not "PhiX" in f]
+        for project_dir in project_dirs:
+            run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json"))
+            with open(run_stats_file) as stats_json:
+                project_sample_stats_raw = json.load(stats_json)
+            collected_sample_stats = {}
+            for sample_stats in project_sample_stats_raw["SampleStats"]:
+                sample_name = sample_stats["SampleName"]
+                percent_q30 = sample_stats["PercentQ30"]
+                quality_score_mean = sample_stats["QualityScoreMean"]
+                percent_mismatch = sample_stats["PercentMismatch"]
+                collected_sample_stats[sample_name] = {
+                    "PercentQ30": percent_q30,
+                    "QualityScoreMean": quality_score_mean,
+                    "PercentMismatch": percent_mismatch
+                }
+        for assignment in index_assignments:
+            sample = assignment.get("SampleName")
+            sample_stats_to_add = collected_sample_stats.get(sample)
+            assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30")
+            assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean")
+            assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch")
+        demultiplex_stats = {
+            "Demultiplex_Stats": {
+                "Index_Assignment": index_assignments,
+                "Unassigned_Sequences": unassigned_sequences
+            }
+        }
+        doc_obj["Aviti": demultiplex_stats]
+        self.db.upload_to_statusdb(doc_obj)
 
     def sync_metadata(self):
         # TODO: copy metadata from demuxed run to ngi-nas-ns
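To illustrate the csv.DictReader pattern this commit leans on, here is a self-contained example with a fabricated two-row IndexAssignment.csv; the column names follow the diff but are assumptions about the Element output format:

    import csv
    import io

    # Fabricated example data, not real instrument output
    csv_text = """SampleName,I1,I2,Lane,Count
    P1_101,ACGT,TTGA,1,123456
    PhiX,GGGG,CCCC,1,2345
    """

    reader = csv.DictReader(io.StringIO(csv_text))
    index_assignments = [row for row in reader]  # each row becomes a dict keyed by header
    print(index_assignments[0]["SampleName"])  # -> P1_101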
From ee38b71f71d7665e5d145e0b90dfdb8342fcbf4b Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 18 Sep 2024 15:39:39 +0200
Subject: [PATCH 039/187] Correct path finding

---
 taca/element/Element_Runs.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index af54aa03..875e2cdd 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -447,8 +447,12 @@ def upload_demux_results_to_statusdb(self):
         with open(unassigned_sequences_file, 'r') as unassigned_file:
             reader = csv.DictReader(unassigned_file)
             unassigned_sequences = [row for row in reader]
-        project_dirs = [f.path for f in os.scandir(os.path.join(self.run_dir, "Demultiplexing")) if f.is_dir() and not "PhiX" in f]
-        for project_dir in project_dirs:
+        dirs = os.scandir("Demultiplexing")
+        project_dirs = []
+        for directory in dirs:
+            if os.path.isdir(directory.path) and not "Unassigned" in directory.path:
"Unassigned" in directory.path: + project_dirs.append(directory.path) + for project_dir in project_dirs: # TODO: remove this block when q30 is added to IndexAssignment.csv by Element run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json")) with open(run_stats_file) as stats_json: project_sample_stats_raw = json.load(stats_json) From c9acfb94c19b4204dde99f21e1c6d029621c7497 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 18 Sep 2024 15:43:48 +0200 Subject: [PATCH 040/187] filtter out phix --- taca/element/Element_Runs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 875e2cdd..dcfb0633 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -469,10 +469,11 @@ def upload_demux_results_to_statusdb(self): } for assignment in index_assignments: sample = assignment.get("SampleName") - sample_stats_to_add = collected_sample_stats.get(sample) - assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30") - assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean") - assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch") + if sample != "PhiX": + sample_stats_to_add = collected_sample_stats.get(sample) + assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30") + assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean") + assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch") demultiplex_stats = { "Demultiplex_Stats": { "Index_Assignment": index_assignments, From faa7f68e73e3be5723f94dd2ca8b790724757cec Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 18 Sep 2024 15:45:11 +0200 Subject: [PATCH 041/187] Fix dictionary --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index dcfb0633..8fd53316 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -480,7 +480,7 @@ def upload_demux_results_to_statusdb(self): "Unassigned_Sequences": unassigned_sequences } } - doc_obj["Aviti": demultiplex_stats] + doc_obj["Aviti"] = demultiplex_stats self.db.upload_to_statusdb(doc_obj) def sync_metadata(self): From 70803cd71a12758d90a64923f057f1e80d0d6c96 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 19 Sep 2024 09:42:25 +0200 Subject: [PATCH 042/187] Remove config file for tests from Docker --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index db69561c..93fd631b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,7 +23,6 @@ COPY requirements-dev.txt requirements-dev.txt RUN python -m pip install -r requirements-dev.txt RUN mkdir /root/.taca/ -COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml FROM base AS testing COPY . 
/taca

From c6667b24130b3487298baad897c6c9d8fe39fea3 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 09:43:18 +0200
Subject: [PATCH 043/187] Change order of init in aviti

---
 taca/element/Aviti_Runs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py
index ad162ac4..18c81eda 100644
--- a/taca/element/Aviti_Runs.py
+++ b/taca/element/Aviti_Runs.py
@@ -3,5 +3,5 @@
 class Aviti_Run(Run):
     def __init__(self, run_dir, configuration):
-        super().__init__(run_dir, configuration)
         self.sequencer_type = "Aviti"
+        super().__init__(run_dir, configuration)

From 0ec6924215aec36f214803628d47317f44ef9fa2 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Thu, 19 Sep 2024 10:08:57 +0200
Subject: [PATCH 044/187] Handle run post transfer

---
 taca/element/Element_Runs.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 8fd53316..d22fced6 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -516,13 +516,22 @@ def transfer(self):
         return
 
     def remove_transfer_indicator(self):
-        # TODO: remove hidden file in run directory
-        pass
+        transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing')
+        Path(transfer_indicator).unlink()
 
     def update_transfer_log(self):
-        # TODO: update the transfer log
-        pass
+        """Update transfer log with run id and date."""
+        try:
+            with open(self.transfer_file, "a") as f:
+                tsv_writer = csv.writer(f, delimiter="\t")
+                tsv_writer.writerow([self.NGI_run_id, str(datetime.now())])
+        except OSError:
+            msg = f"{self}: Could not update the transfer logfile {self.transfer_file}"
+            logger.error(msg)
+            raise OSError(msg)
 
     def archive(self):
-        # TODO: move run dir to nosync
-        pass
+        """Move directory to nosync."""
+        src = self.run_dir
+        dst = os.path.join(self.run_dir, os.pardir, "nosync")
+        shutil.move(src, dst)
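A small standalone sketch of the two post-transfer helpers introduced in PATCH 044, runnable in isolation; the TSV log format and the sibling nosync/ layout are taken from the diff, while the run id value is hypothetical:

    import csv
    import os
    import shutil
    from datetime import datetime

    def update_transfer_log(transfer_file: str, run_id: str) -> None:
        # Append "<run_id>\t<timestamp>" to the shared transfer log
        with open(transfer_file, "a") as f:
            csv.writer(f, delimiter="\t").writerow([run_id, str(datetime.now())])

    def archive(run_dir: str) -> None:
        # Move the finished run directory into the sibling nosync/ directory
        shutil.move(run_dir, os.path.join(run_dir, os.pardir, "nosync"))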
From 0a3de0b5a1e49eff7df6389fe7f14b4ff958ce70 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 10:11:26 +0200
Subject: [PATCH 045/187] Some small bugfixes for Element Runs

---
 taca/element/Element_Runs.py | 138 +++++++++++++++++++++--------------
 1 file changed, 83 insertions(+), 55 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 8fd53316..1a660d82 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -1,18 +1,17 @@
+import csv
 import json
 import logging
 import os
 import re
-import csv
-import zipfile
-import subprocess
 import shutil
+import subprocess
+import zipfile
 from datetime import datetime
-from pathlib import Path
 from glob import glob
+from pathlib import Path
 
 import pandas as pd
 
-from taca.utils import misc
 from taca.utils.filesystem import chdir
 from taca.utils.statusdb import ElementRunsConnection
 
         self.demux_dir = os.path.join(self.run_dir, "Demultiplexing")
         self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json")
-        self.demux_stats_file = "RunStats.json" # Assumes demux is finished when this file is created
+        self.demux_stats_file = (
+            "RunStats.json"  # Assumes demux is finished when this file is created
+        )
         self.transfer_file = (
-            self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log")
+            self.CONFIG.get("Element", {})
+            .get(self.sequencer_type, {})
+            .get("transfer_log")
         )  # TODO: change and add to taca.yaml
-        self.rsync_exit_file = os.path.join(self.run_dir, '.rsync_exit_status')
+        self.rsync_exit_file = os.path.join(self.run_dir, ".rsync_exit_status")
 
         # Instrument generated files
         self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json")
@@ -46,7 +49,9 @@
         )
         self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json")
 
-        self.db = ElementRunsConnection(self.CONFIG["statusdb"], dbname="element_runs")
+        self.db = ElementRunsConnection(
+            self.CONFIG.get("statusdb", {}), dbname="element_runs"
+        )
 
         # Fields to be set by TACA
         self.status = None
@@ -149,19 +154,19 @@ def check_sequencing_status(self):
     def get_demultiplexing_status(self):
         if not os.path.exists(self.demux_dir):
             return "not started"
-        demux_dirs = glob.glob(
-            os.path.join(self.run_dir, "Delmultiplexing*")
-        )
+        demux_dirs = glob.glob(os.path.join(self.run_dir, "Delmultiplexing*"))
         finished_count = 0
         for demux_dir in demux_dirs:
             if os.path.exists(self.demux_dir) and not os.path.isfile(
                 os.path.join(demux_dir, self.demux_stats_file)
-                ):
+            ):
                 return "ongoing"
             elif os.path.exists(self.demux_dir) and os.path.isfile(
                 os.path.join(demux_dir, self.demux_stats_file)
-                ):
-                finished_count += 1 # TODO: check exit status of demux in exit status file
+            ):
+                finished_count += (
+                    1  # TODO: check exit status of demux in exit status file
+                )
         if finished_count == len(demux_dirs):
             return "finished"
         else:
@@ -370,14 +375,15 @@ def make_demux_manifests(
         return manifest_paths
 
     def generate_demux_command(self, run_manifest, demux_dir):
-        command = (f"{self.CONFIG.get(self.software)["bin"]}" # TODO: add path to bases2fastq executable to config
+        command = (
+            f"{self.CONFIG.get(self.software)['bin']}"  # TODO: add path to bases2fastq executable to config
             + f" {self.run_dir}"
             + f" {demux_dir}"
             + " -p 8"
             + f" -r {run_manifest}"
             + " --legacy-fastq"  # TODO: except if Smart-seq3
-            + f" --force-index-orientation"
-        ) # TODO: any other options?
+            + " --force-index-orientation"
+        )  # TODO: any other options?
         return command
 
     def start_demux(self, run_manifest, demux_dir):
@@ -385,20 +391,26 @@ def start_demux(self, run_manifest, demux_dir):
             cmd = self.generate_demux_command(run_manifest, demux_dir)
             # TODO: handle multiple composite manifests for demux
             try:
-                p_handle = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir)
+                p_handle = subprocess.Popen(
+                    cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir
+                )
                 logger.info(
                     "Bases2Fastq conversion and demultiplexing "
                     f"started for run {self} on {datetime.now()}"
                 )
             except subprocess.CalledProcessError:
-                logger.warning("An error occurred while starting demultiplexing for "
-                    f"{self} on {datetime.now()}."
-                )
+                logger.warning(
+                    "An error occurred while starting demultiplexing for "
+                    f"{self} on {datetime.now()}."
+                )
         return
 
     def get_transfer_status(self):
-        if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete():
+        if (
+            not self.in_transfer_log()
+            and not self.transfer_ongoing()
+            and not self.rsync_complete()
+        ):
             return "not started"
         elif self.transfer_ongoing() and not self.rsync_complete():
             return "ongoing"
@@ -406,22 +418,22 @@ def get_transfer_status(self):
             return "rsync done"
         elif self.in_transfer_log():
             return "unknown"
-    
+
     def in_transfer_log(self):
-        with open(self.transfer_file, "r") as transfer_file:
+        with open(self.transfer_file) as transfer_file:
             for row in transfer_file.read():
                 if self.NGI_run_id in row:
                     return True
         return False
 
     def transfer_ongoing(self):
-        return os.path.isfile(os.path.join(self.run_dir, '.rsync_ongoing'))
+        return os.path.isfile(os.path.join(self.run_dir, ".rsync_ongoing"))
 
     def rsync_complete(self):
         return os.path.isfile(self.rsync_exit_file)
 
     def rsync_successful(self):
-        with open(os.path.join(self.run_dir, '.rsync_exit_status')) as rsync_exit_file:
+        with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file:
             rsync_exit_status = rsync_exit_file.readlines()
             if rsync_exit_status[0].strip() == 0:
                 return True
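The four transfer states decided by `get_transfer_status` reduce to a pure function over the three boolean probes; a sketch with the probes passed in rather than read from disk:

    def transfer_status(in_log: bool, ongoing: bool, rsync_done: bool) -> str:
        if not in_log and not ongoing and not rsync_done:
            return "not started"
        if ongoing and not rsync_done:
            return "ongoing"
        if rsync_done and not in_log:
            return "rsync done"
        return "unknown"  # already in the transfer log: transferred but never archived

Note also that `rsync_successful` as committed compares `rsync_exit_status[0].strip()` (a string) against the integer 0, so it always returns False; comparing against "0" or casting with int() would be needed for a true match.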
@@ -431,28 +443,36 @@ def rsync_successful(self):
         else:
             return False
 
     def aggregate_demux_results(self, demux_results_dirs):
         # TODO: Correct this based on comments from Chuan
         for demux_dir in demux_results_dirs:
-            data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
+            data_dirs = [
+                f.path
+                for f in os.scandir(os.path.join(demux_dir, "Samples"))
+                if f.is_dir()
+            ]
             for data_dir in data_dirs:
                 if not "PhiX" in data_dir in data_dir:
                     shutil.move(data_dir, self.demux_dir)
 
     def upload_demux_results_to_statusdb(self):
         doc_obj = self.db.get_db_entry(self.NGI_run_id)
         index_assignement_file = os.path.join(
             self.run_dir, "Demultiplexing", "IndexAssignment.csv"
         )
-        with open(index_assignement_file, 'r') as index_file:
+        with open(index_assignement_file) as index_file:
             reader = csv.DictReader(index_file)
             index_assignments = [row for row in reader]
         unassigned_sequences_file = os.path.join(
             self.run_dir, "Demultiplexing", "UnassignedSequences.csv"
         )
-        with open(unassigned_sequences_file, 'r') as unassigned_file:
+        with open(unassigned_sequences_file) as unassigned_file:
             reader = csv.DictReader(unassigned_file)
             unassigned_sequences = [row for row in reader]
         dirs = os.scandir("Demultiplexing")
         project_dirs = []
         for directory in dirs:
             if os.path.isdir(directory.path) and "Unassigned" not in directory.path:
                 project_dirs.append(directory.path)
-        for project_dir in project_dirs: # TODO: remove this block when q30 is added to IndexAssignment.csv by Element
+        for project_dir in project_dirs:  # TODO: remove this block when q30 is added to IndexAssignment.csv by Element
             run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json"))
             with open(run_stats_file) as stats_json:
                 project_sample_stats_raw = json.load(stats_json)
@@ -465,21 +485,25 @@
                 collected_sample_stats[sample_name] = {
                     "PercentQ30": percent_q30,
                     "QualityScoreMean": quality_score_mean,
-                    "PercentMismatch": percent_mismatch
-                }
+                    "PercentMismatch": percent_mismatch,
+                }
         for assignment in index_assignments:
             sample = assignment.get("SampleName")
             if sample != "PhiX":
                 sample_stats_to_add = collected_sample_stats.get(sample)
                 assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30")
-                assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean")
-                assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch")
+                assignment["QualityScoreMean"] = sample_stats_to_add.get(
+                    "QualityScoreMean"
+                )
+                assignment["PercentMismatch"] = sample_stats_to_add.get(
+                    "PercentMismatch"
+                )
         demultiplex_stats = {
             "Demultiplex_Stats": {
                 "Index_Assignment": index_assignments,
-                "Unassigned_Sequences": unassigned_sequences
-            }
+                "Unassigned_Sequences": unassigned_sequences,
             }
+        }
         doc_obj["Aviti"] = demultiplex_stats
         self.db.upload_to_statusdb(doc_obj)
 
@@ -488,21 +512,24 @@ def sync_metadata(self):
         pass
 
     def make_transfer_indicator(self):
-        transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing')
+        transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing")
         Path(transfer_indicator).touch()
 
     def transfer(self):
-        transfer_details = self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details") #TODO: Add section to taca.yaml
-        command = ("rsync"
-                   + " -rLav"
-                   + f" --chown={transfer_details.get("owner")}"
-                   + f" --chmod={transfer_details.get("permissions")}"
-                   + " --exclude BaseCalls" # TODO: check that we actually want to exclude these
-                   + " --exclude Alignment"
-                   + f" {self.run_dir}"
-                   + f" {transfer_details.get("user")@transfer_details.get("host")}:/"
-                   + "; echo $? > .rsync_exit_status"
-                   ) # TODO: any other options?
+        transfer_details = (
+            self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details")
+        )  # TODO: Add section to taca.yaml
+        command = (
+            "rsync"
+            + " -rLav"
+            + f" --chown={transfer_details.get('owner')}"
+            + f" --chmod={transfer_details.get('permissions')}"
+            + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
+            + " --exclude Alignment"
+            + f" {self.run_dir}"
+            + f" {transfer_details.get('user')@transfer_details.get('host')}:/"
+            + "; echo $? > .rsync_exit_status"
+        )  # TODO: any other options?
         try:
             p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
             logger.info(
                 "Transfer to analysis cluster "
                 f"started for run {self} on {datetime.now()}"
             )
         except subprocess.CalledProcessError:
-            logger.warning("An error occurred while starting transfer to analysis cluster "
-                           f"for {self} on {datetime.now()}."
-            )
+            logger.warning(
+                "An error occurred while starting transfer to analysis cluster "
+                f"for {self} on {datetime.now()}."
+            )
         return

From d9581d561762d18f9da976f269b192a91bc1fb92 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 10:21:38 +0200
Subject: [PATCH 046/187] Trying to use moch patch for element tests

---
 tests/element/test_Aviti_Runs.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py
index 0ff3e7db..7d536547 100644
--- a/tests/element/test_Aviti_Runs.py
+++ b/tests/element/test_Aviti_Runs.py
@@ -1,4 +1,5 @@
 import tempfile
+from unittest.mock import patch
 
 import pytest
 
@@ -10,6 +11,11 @@ class TestAviti_Run:
     def test_init(self, create_dirs: pytest.fixture):
         tmp: tempfile.TemporaryDirectory = create_dirs
         run_dir = create_element_run_dir(tmp)
+
+        # Mock db
+        mock_db = patch("taca.utils.statusdb.ElementRunsConnection")
+        mock_db.start()
+
         run = to_test.Aviti_Run(run_dir, {})
         assert run.run_dir == run_dir
         assert run.sequencer_type == "Aviti"

From b831d429f9c68b8b5212229c6c8e26b0394373e5 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 11:53:08 +0200
Subject: [PATCH 047/187] Added extensions to devcontainer

---
 .devcontainer/devcontainer.json | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 796444ad..0550f01b 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -12,7 +12,12 @@
   "features": {},
   "customizations": {
     "vscode": {
-      "extensions": ["ms-python.python", "eamodio.gitlens"]
+      "extensions": [
+        "ms-python.python",
+        "eamodio.gitlens",
+        "charliermarsh.ruff",
+        "ms-python.mypy-type-checker"
+      ]
     }
   },
   // Features to add to the dev container. More info: https://containers.dev/features.

From 6844ab7036ac2efcf69dab173a229ab11f44f046 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 11:54:05 +0200
Subject: [PATCH 048/187] Fix mocking of statusb for element tests

---
 tests/element/test_Aviti_Runs.py   | 2 +-
 tests/element/test_Element_Runs.py | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py
index 7d536547..bf32089c 100644
--- a/tests/element/test_Aviti_Runs.py
+++ b/tests/element/test_Aviti_Runs.py
@@ -13,7 +13,7 @@ def test_init(self, create_dirs: pytest.fixture):
         run_dir = create_element_run_dir(tmp)
 
         # Mock db
-        mock_db = patch("taca.utils.statusdb.ElementRunsConnection")
+        mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
         mock_db.start()
 
         run = to_test.Aviti_Run(run_dir, {})
diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py
index a6eca20a..3af02a41 100644
--- a/tests/element/test_Element_Runs.py
+++ b/tests/element/test_Element_Runs.py
@@ -64,6 +64,11 @@ class TestRun:
     def test_init(self, create_dirs: pytest.fixture):
         tmp: tempfile.TemporaryDirectory = create_dirs
         run_dir = create_element_run_dir(tmp)
+
+        # Mock db
+        mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
+        mock_db.start()
+
         run = to_test.Run(run_dir, {})
 
         assert run.run_dir == run_dir
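The target change in PATCH 048 follows the standard mock rule of patching a name where it is looked up, not where it is defined; a minimal illustration using the module paths from the diff:

    from unittest import mock

    # Element_Runs.py does `from taca.utils.statusdb import ElementRunsConnection`,
    # so the lookup happens inside taca.element.Element_Runs, and that is the
    # name to patch:
    with mock.patch("taca.element.Element_Runs.ElementRunsConnection") as mock_db:
        # Code that instantiates Run() now receives the mock instead of a real
        # CouchDB connection; return values can be stubbed as needed.
        mock_db.return_value.get_db_entry.return_value = {}

Patching "taca.utils.statusdb.ElementRunsConnection" (the original PATCH 046 form) has no effect on code that already imported the class into its own namespace.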
From 871f7c45c818f9c2e65f792e5355c362eea86ec0 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 09:43:18 +0200
Subject: [PATCH 049/187] For testing, sequencer_type was not defined

---
 taca/element/Element_Runs.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 1a660d82..134788e9 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -22,6 +22,10 @@ class Run:
     """Defines an Element run"""
 
     def __init__(self, run_dir, configuration):
+        if not hasattr(self, "sequencer_type"):
+            # Mostly for testing, since this class is not meant to be instantiated
+            self.sequencer_type = "GenericElement"
+
         if not os.path.exists(run_dir):
             raise RuntimeError(f"Could not locate run directory {run_dir}")

From 7aea5427bac6bd050e2c138da55867712e76fc31 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 13:22:18 +0200
Subject: [PATCH 050/187] Importing glob according to code usage

---
 taca/element/Element_Runs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 134788e9..942b95b0 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -1,4 +1,5 @@
 import csv
+import glob
 import json
 import logging
 import os
@@ -7,7 +8,6 @@
 import subprocess
 import zipfile
 from datetime import datetime
-from glob import glob
 from pathlib import Path
 
 import pandas as pd

From 99c20ea143605bbf0f62f2d17a12293586ea3f53 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 13:23:43 +0200
Subject: [PATCH 051/187] Changed mock behaviour

---
 tests/element/test_Element_Runs.py | 36 +++++++++++++++++-------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py
index 3af02a41..498b4b65 100644
--- a/tests/element/test_Element_Runs.py
+++ b/tests/element/test_Element_Runs.py
@@ -1,7 +1,7 @@
 import json
 import os
 import tempfile
-from unittest.mock import patch
+from unittest import mock
 
 import pytest
 
@@ -60,15 +60,12 @@ def create_element_run_dir(
     return run_path
 
 
+@mock.patch("taca.element.Element_Runs.ElementRunsConnection")
 class TestRun:
-    def test_init(self, create_dirs: pytest.fixture):
+    def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture):
         tmp: tempfile.TemporaryDirectory = create_dirs
         run_dir = create_element_run_dir(tmp)
 
-        # Mock db
-        mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
-        mock_db.start()
-
         run = to_test.Run(run_dir, {})
 
         assert run.run_dir == run_dir
@@ -82,7 +79,10 @@
         ids=["success", "failure", "ongoing"],
     )
     def test_check_sequencing_status(
-        self, p: pytest.fixture, create_dirs: pytest.fixture
+        self,
+        mock_db: mock.Mock,
+        p: pytest.fixture,
+        create_dirs: pytest.fixture,
     ):
         tmp: tempfile.TemporaryDirectory = create_dirs
 
@@ -106,10 +106,14 @@
         ids=["not started", "ongoing", "finished"],
     )
     def test_get_demultiplexing_status(
-        self, p: pytest.fixture, create_dirs: pytest.fixture
+        self, mock_db: mock.Mock, p: pytest.fixture, create_dirs: pytest.fixture
     ):
         tmp: tempfile.TemporaryDirectory = create_dirs
 
+        if p["demux_dir"] and not p["demux_done"]:
+            import pdb
+
+            pdb.set_trace()
         run = to_test.Run(
             create_element_run_dir(
                 tmp,
@@ -128,7 +132,9 @@
         ],
         ids=["exists", "does not exist"],
     )
-    def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture):
+    def test_manifest_exists(
+        self, mock_db: mock.Mock, create_dirs: pytest.fixture, p: pytest.fixture
+    ):
         tmp: tempfile.TemporaryDirectory = create_dirs
 
         run = to_test.Run(
@@ -141,13 +147,13 @@
         assert run.manifest_exists() == p["expected"]
 
     @pytest.mark.skip(reason="Not implemented yet")
-    def test_generate_demux_command(self):
+    def test_generate_demux_command(self, mock_db):
         pass
 
-    def test_start_demux(self, create_dirs):
-        with patch(
+    def test_start_demux(self, mock_db, create_dirs):
+        with mock.patch(
             "taca.utils.misc.call_external_command_detached"
-        ) as mock_call, patch(
+        ) as mock_call, mock.patch(
             "taca.element.Element_Runs.Run.generate_demux_command"
         ) as mock_command:
             mock_command.return_value = "test command"
@@ -159,9 +165,9 @@
             )
 
     @pytest.mark.skip(reason="Not implemented yet")
-    def test_is_transferred(self, create_dirs):
+    def test_is_transferred(self, mock_db, create_dirs):
         pass
 
     @pytest.mark.skip(reason="Not implemented yet")
-    def test_parse_rundir(self, create_dirs):
+    def test_parse_rundir(self, mock_db, create_dirs):
         pass

From a9fc4f7b1f7b465419268a39942228e8ce72b027 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Thu, 19 Sep 2024 14:52:35 +0200
Subject: [PATCH 052/187] Fix check for manifest zip file

---
 taca/analysis/analysis_element.py |  4 ++--
 taca/element/Element_Runs.py      | 18 ++++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 5a21d718..ccc95ee5 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -40,7 +40,7 @@ def _process(run):
         ):  # Sequencing done. Start demux
             if (
                 not run.manifest_exists()
-            ):  # TODO: this should check for the zip file in lims output location
+            ):
                 logger.warning(
                     f"Run manifest is missing for {run}, demultiplexing aborted"
                 )
@@ -86,7 +86,7 @@ def _process(run):
                 if run.status_changed:
                     run.update_statusdb()
                 # TODO: Also update statusdb with a timestamp of when the transfer started
-                run.transfer()  # I think this should be a detached command as well
+                run.transfer()
             elif transfer_status == "ongoing":
                 run.status = "transferring"
                 if run.status_changed:
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index d22fced6..ed956abd 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -180,7 +180,8 @@ def update_statusdb(self):
         self.db.upload_to_statusdb(doc_obj)
 
     def manifest_exists(self):
-        return os.path.isfile(self.run_manifest_zip_file)
+        zip_src_path = self.find_manifest_zip()
+        return os.path.isfile(zip_src_path)
 
     def get_lims_step_id(self) -> str | None:
         """If the run was started using a LIMS-generated manifest,
@@ -197,12 +198,8 @@ def get_lims_step_id(self) -> str | None:
                     lims_step_id = line.split(",")[1]
                     return lims_step_id
         return None
-
-    def copy_manifests(self) -> bool:
-        """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir."""
-
-        # TODO test me
-
+
+    def find_manifest_zip(self):
         # Specify dir in which LIMS drop the manifest zip files
         dir_to_search = os.path.join(
             self.CONFIG.get("Aviti").get(
                 "manifest_zip_location"
             ),  # TODO: change and add to taca.yaml
             datetime.now().year,
         )
@@ -238,7 +235,13 @@
             zip_src_path = glob_results[-1]
         else:
             zip_src_path = glob_results[0]
+        return zip_src_path
+
+    def copy_manifests(self) -> bool:
+        """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir."""
+        # TODO: test me
+        zip_src_path = self.find_manifest_zip()
         # Make a run subdir named after the zip file and extract manifests there
         zip_name = os.path.basename(zip_src_path)
         zip_dst_path = os.path.join(self.run_dir, zip_name)
@@ -437,7 +440,6 @@ def aggregate_demux_results(self, demux_results_dirs):
                     shutil.move(data_dir, self.demux_dir)
 
     def upload_demux_results_to_statusdb(self):
-        # TODO: dump contents of IndexAssignment.csv and UnassignedSequences.csv into statusdb document
         doc_obj = self.db.get_db_entry(self.NGI_run_id)
         index_assignement_file = os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv")
         with open(index_assignement_file, 'r') as index_file:

From 591e6339c887e520cad4676a81fa9b1fc0baa8cb Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Thu, 19 Sep 2024 14:53:22 +0200
Subject: [PATCH 053/187] Remove pdb

---
 tests/element/test_Element_Runs.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py
index 498b4b65..1100aff9 100644
--- a/tests/element/test_Element_Runs.py
+++ b/tests/element/test_Element_Runs.py
@@ -111,9 +111,7 @@ def test_get_demultiplexing_status(
         tmp: tempfile.TemporaryDirectory = create_dirs
 
         if p["demux_dir"] and not p["demux_done"]:
-            import pdb
 
-            pdb.set_trace()
         run = to_test.Run(
             create_element_run_dir(
                 tmp,

From babcbd0948237ba4c76677285e1632d5556eb047 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 20 Sep 2024 11:22:28 +0200
Subject: [PATCH 054/187] TACA Aviti integration WIP

---
 taca/analysis/analysis_element.py |  10 +-
 taca/element/Aviti_Runs.py        |   1 +
 taca/element/Element_Runs.py      | 150 ++++++++++++++++++++++++++++--
 3 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 5a21d718..63cce774 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -56,10 +56,7 @@ def _process(run):
                 )
                 sub_demux_count = 0
                 for run_manifest in run_manifests.sort():
-                    if len(run_manifests) == 1:
-                        demux_dir = run.demux_dir
-                    elif len(run_manifests) > 1:
-                        demux_dir = f"Demultiplexing_{sub_demux_count}"
+                    demux_dir = f"Demultiplexing_{sub_demux_count}"
                     os.mkdir(demux_dir)
                     run.start_demux(run_manifest, demux_dir)
                     sub_demux_count += 1
@@ -75,10 +72,9 @@ def _process(run):
             transfer_status = run.get_transfer_status()
             if transfer_status == "not started":
                 demux_results_dirs = glob.glob(
-                    os.path.join(run.run_dir, "Delmultiplexing*")
+                    os.path.join(run.run_dir, "Delmultiplexing_*")
                 )
-                if len(demux_results_dirs > 1):
-                    run.aggregate_demux_results(demux_results_dirs)
+                run.aggregate_demux_results(demux_results_dirs)
                 run.upload_demux_results_to_statusdb()
                 run.sync_metadata()
                 run.make_transfer_indicator()
diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py
index ad162ac4..ccfb5a33 100644
--- a/taca/element/Aviti_Runs.py
+++ b/taca/element/Aviti_Runs.py
@@ -5,3 +5,4 @@ class Aviti_Run(Run):
     def __init__(self, run_dir, configuration):
         super().__init__(run_dir, configuration)
         self.sequencer_type = "Aviti"
+        self.demux_dir = "Demultiplexing"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index af54aa03..e566be6d 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -395,7 +395,7 @@ def start_demux(self, run_manifest, demux_dir):
                     f"{self} on {datetime.now()}."
                 )
         return
-    
+
     def get_transfer_status(self):
         if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete():
             return "not started"
@@ -406,7 +406,7 @@ def get_transfer_status(self):
             return "rsync done"
         elif self.in_transfer_log():
             return "unknown"
-    
+
     def in_transfer_log(self):
         with open(self.transfer_file, "r") as transfer_file:
             for row in transfer_file.read():
@@ -428,14 +428,146 @@ def rsync_successful(self):
         else:
             return False
 
-    def aggregate_demux_results(self, demux_results_dirs):
-        # TODO: Correct this based on comments from Chuan
+    # Clear all content under a dir
+    def clear_dir(dir):
+        for filename in os.listdir(dir):
+            file_path = os.path.join(dir, filename)
+            try:
+                if os.path.isfile(file_path) or os.path.islink(file_path):
+                    os.unlink(file_path)
+                elif os.path.isdir(file_path):
+                    shutil.rmtree(file_path)
+            except Exception as e:
+                print(f"Failed to delete {file_path} Reason {e}")
+
+    # Create symlink for a simple demultiplexing dir
+    def symlink_demux_dir(src_dir, dest_dir):
+        # Ensure the destination directory exists
+        if not os.path.exists(dest_dir):
+            os.makedirs(dest_dir)
+        # Clear all content under dest_dir
+        clear_dir(dest_dir)
+        # Loop through all files and directories in the source directory
+        for item in os.listdir(src_dir):
+            src_path = os.path.join(src_dir, item)
+            # Move content of Samples to the parental dir
+            if item == "Samples":
+                dest_path = dest_dir
+            else:
+                dest_path = os.path.join(dest_dir, item)
+            try:
+                # Create symbolic link only if it doesn't already exist
+                if not os.path.exists(dest_path):
+                    os.symlink(src_path, dest_path)
+                    print(f"Linked {src_path} to {dest_path}")
+                else:
+                    print(f"{dest_path} already exists.")
+            except OSError as e:
+                print(f"Error linking {src_path} to {dest_path}: {e}")
+
+
+    # Collect demux info into a list of dictionaries
+    # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}]
+    def collect_demux_runmanifest(self, demux_results_dirs):
+        demux_runmanifest = []
+        for demux_dir in demux_results_dirs:
+            sub_demux_count = demux_dir.split('_')[1]
+            with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file:
+                lines = file.readlines()
+            sample_section = False
+            headers = []
+            # Loop through each line
+            for line in lines:
+                # Check if we reached the "[SAMPLES]" section
+                if '[SAMPLES]' in line:
+                    sample_section = True
+                    continue
+                # Exit the sample section if another section is encountered
+                if sample_section and line.startswith('['):
+                    break
+                # If in the sample section, process the sample lines
+                if sample_section:
+                    # Clean up the line
+                    line = line.strip()
+                    # Skip empty lines
+                    if not line:
+                        continue
+                    # Get the headers from the first line
+                    if not headers:
+                        headers = line.split(',')
+                    else:
+                        # Parse sample data
+                        values = line.split(',')
+                        sample_dict = dict(zip(headers, values))
+                        sample_dict['sub_demux_count'] = sub_demux_count
+                        demux_runmanifest.append(sample_dict)
+        sorted_demux_runmanifest = sorted(demux_runmanifest, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count']))
+        return sorted_demux_runmanifest
+
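The `[SAMPLES]` parser added here is essentially a tiny INI-style section reader; for reference, the same logic as a standalone sketch over a fabricated manifest (section and column names mirror the diff, the data rows are made up):

    import io

    manifest = """[RUNVALUES]
    KeyName,Value

    [SAMPLES]
    SampleName,Index1,Index2,Lane,Project
    P1_101,ACGT,TTGA,1,P1
    PhiX,GGGG,CCCC,1,Control
    """

    rows, in_samples, headers = [], False, None
    for line in io.StringIO(manifest):
        line = line.strip()
        if line == "[SAMPLES]":
            in_samples = True
            continue
        if in_samples and line.startswith("["):
            break  # next section reached
        if in_samples and line:
            if headers is None:
                headers = line.split(",")
            else:
                rows.append(dict(zip(headers, line.split(","))))
    print(rows[0]["SampleName"])  # -> P1_101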
+    # Aggregate the output FastQ files of samples from multiple demux
+    def aggregate_sample_fastq(self, demux_runmanifest):
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        unique_sample_demux = set()
+        for lane in lanes:
+            sample_count = 1
+            for sample in demux_runmanifest:
+                lanenr = sample['Lane']
+                project = sample['Project']
+                sample = sample['SampleName']
+                sub_demux_count = sample['sub_demux_count']
+                # Skip PhiX
+                if lanenr == lane and sample != "PhiX":
+                    sample_tuple = (sample, sub_demux_count)
+                    if sample_tuple not in unique_sample_demux:
+                        project_dest = os.path.join(self.run_dir, self.demux_dir, project)
+                        sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample)
+                        if not os.path.exists(project_dest):
+                            os.makedirs(project_dest)
+                        if not os.path.exists(sample_dest):
+                            os.makedirs(sample_dest)
+                        fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample, f"*L00{lane}*.fastq.gz"))
+                        for fastqfile in fastqfiles:
+                            old_name = os.path.basename(fastqfile)
+                            read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1)
+                            new_name = "_".join([sample, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"])
+                            os.symlink(fastqfile, os.path.join(sample_dest, new_name))
+                        unique_sample_demux.add(sample_tuple)
+                        sample_count += 1
+
+
+    # Symplink the output FastQ files of undet only if a lane does not have multiple demux
+    def aggregate_undet_fastq(self, demux_runmanifest):
+
+
+
+    # Aggregate demux results
+    def aggregate_demux_results(self, demux_results_dirs):
+        # In case of single demux
+        if len(demux_results_dirs) == 1:
+            # TODO: Check NoIndex case. Can Base2Fastq generate FastQs for both reads and indexes for NoIndex sample?
+            # Otherwise just softlink contents of Demultplexing_0 into Demultiplexing
+            symlink_demux_dir(demux_results_dirs[0], os.path.join(self.run_dir, self.demux_dir))
+        else:
+            # Ensure the destination directory exists
+            if not os.path.exists(os.path.join(self.run_dir, self.demux_dir):
+                os.makedirs(os.path.join(self.run_dir, self.demux_dir)
+            # Clear all content under dest_dir
+            clear_dir(os.path.join(self.run_dir, self.demux_dir)
+            demux_runmanifest = collect_demux_runmanifest(demux_results_dirs)
+            # Aggregate the output FastQ files of samples from multiple demux
+            aggregate_sample_fastq(demux_runmanifest)
+            # Symplink the output FastQ files of undet only if a lane does not have multiple demux
+            aggregate_undet_fastq(demux_runmanifest)
+            # Aggregate stats in IndexAssignment.csv
+            TBD
+            # Aggregate stats in UnassignedSequences.csv
+            TBD
+            # Aggregate stats in Project_RunStats.json
+            TBD
 
     def upload_demux_results_to_statusdb(self):
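The renaming step in `aggregate_sample_fastq` extracts the read label (R1/R2/I1, etc.) from the Bases2Fastq file name and rebuilds an Illumina-style name. Isolated, with a hypothetical file name, the pattern looks like this; note also that the commit reassigns the loop variable (`sample = sample['SampleName']` overwrites the dict before `sample['sub_demux_count']` is read), so the sketch uses distinct names:

    import re

    old_name = "P1_101_L001_R1_001.fastq.gz"  # hypothetical Bases2Fastq output
    lane = 1
    sample_name = "P1_101"
    read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1)  # -> "R1"
    new_name = "_".join([sample_name, "S1", f"L00{lane}", read_label, "001.fastq.gz"])
    print(new_name)  # P1_101_S1_L001_R1_001.fastq.gz

The `aggregate_demux_results` block above is also still WIP as committed: the three `os.path.join(self.run_dir, self.demux_dir)` calls are missing their closing parenthesis, and `TBD` is a placeholder, not executable code.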
+                project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined")
+                if not os.path.exists(project_dest):
+                    os.makedirs(project_dest)
+                fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", "*.fastq.gz"))
+                for fastqfile in fastqfiles:
+                    base_name = os.path.basename(fastqfile)
+                    os.symlink(fastqfile, os.path.join(project_dest, base_name))

From 04808462636a8d3d89a8bb7293b5c4564f69c671 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 20 Sep 2024 11:38:35 +0200
Subject: [PATCH 056/187] Fix typo

---
 taca/element/Element_Runs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index cb3b9d5b..2183eafa 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -541,7 +541,7 @@ def aggregate_undet_fastq(self, demux_runmanifest):
         lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
         for lane in lanes:
             sub_demux = list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane))
-            if sub_demux == 1:
+            if len(sub_demux) == 1:
                 project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined")
                 if not os.path.exists(project_dest):
                     os.makedirs(project_dest)

From f01333db08d80b47449f7daa0b1c769c2e819419 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 20 Sep 2024 16:16:09 +0200
Subject: [PATCH 057/187] Still WIP

---
 taca/element/Element_Runs.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 2183eafa..67c89975 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -551,6 +551,15 @@ def aggregate_undet_fastq(self, demux_runmanifest):
                     os.symlink(fastqfile, os.path.join(project_dest, base_name))
 
+    # Aggregate
+    def aggregate_stats_unassigned(demux_runmanifest):
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        for lane in lanes:
+            sub_demux = list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane))
+            if len(sub_demux) == 1:
+                unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "UnassignedSequences.csv")
+
+
     # Aggregate demux results
     def aggregate_demux_results(self, demux_results_dirs):
         # In case of single demux

From d118931d47e849f53acbc82f367476f1df56ac0d Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Mon, 23 Sep 2024 11:38:10 +0200
Subject: [PATCH 058/187] Fix config settings

---
 taca/analysis/analysis_element.py |  2 +-
 taca/element/Element_Runs.py      | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 7249964f..209ac163 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -134,7 +134,7 @@ def _process(run):
         return
 
     if given_run:
-        run = Aviti_Run(given_run)
+        run = Aviti_Run(given_run, CONFIG.get("element_analysis"))
         # TODO: Needs to change if more types of Element machines are aquired in the future
         _process(run)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 21f8989a..3c87ba25 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -211,9 +211,9 @@ def get_lims_step_id(self) -> str | None:
     def find_manifest_zip(self):
         # Specify dir in which LIMS drop the manifest zip files
         dir_to_search = os.path.join(
-            self.CONFIG.get("Aviti").get(
-                "manifest_zip_location"
-            ),  # TODO: change and add to taca.yaml
+            self.CONFIG.get("Element", {})
+            .get(self.sequencer_type, {})
+            .get("manifest_zip_location"),  # TODO: add to taca.yaml
             datetime.now().year,
         )
@@ -383,7 +383,7 @@ def make_demux_manifests(
 
     def generate_demux_command(self, run_manifest, demux_dir):
         command = (
-            f"{self.CONFIG.get(self.software)['bin']}"  # TODO: add path to bases2fastq executable to config
+            f"{self.CONFIG.get('bases2fastq')}"  # TODO: add path to bases2fastq executable to config
             + f" {self.run_dir}"
             + f" {demux_dir}"
             + " -p 8"
@@ -523,7 +523,7 @@ def make_transfer_indicator(self):
 
     def transfer(self):
         transfer_details = (
-            self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details")
+            self.CONFIG.get(self.sequencer_type).get("transfer_details")
         )  # TODO: Add section to taca.yaml
         command = (
             "rsync"
@@ -533,7 +533,7 @@
             + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
             + " --exclude Alignment"
             + f" {self.run_dir}"
-            + f" {transfer_details.get('user')@transfer_details.get('host')}:/"
+            + f" {transfer_details.get('user')@transfer_details.get('host')}:/aviti"
             + "; echo $? > .rsync_exit_status"
         )  # TODO: any other options?

From 73c54a3822827b1b1281bbd48d6385b99b1ca209 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Mon, 23 Sep 2024 14:47:46 +0200
Subject: [PATCH 059/187] Fix UnassignedSequences.csv

---
 taca/element/Element_Runs.py | 64 ++++++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 67c89975..c757fc94 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -550,14 +550,66 @@ def aggregate_undet_fastq(self, demux_runmanifest):
                     base_name = os.path.basename(fastqfile)
                     os.symlink(fastqfile, os.path.join(project_dest, base_name))
 
-    # Aggregate
-    def aggregate_stats_unassigned(demux_runmanifest):
+    # Write to csv
+    def write_to_csv(data, filename):
+        # Get the fieldnames from the keys of the first dictionary
+        fieldnames = data[0].keys()
+        # Open the file and write the CSV
+        with open(filename, mode='w', newline='') as file:
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            # Write the header (fieldnames)
+            writer.writeheader()
+            # Write the data (rows)
+            writer.writerows(data)
+
+
+    # Aggregate stats in UnassignedSequences.csv
+    def aggregate_stats_unassigned(self, demux_runmanifest):
+        aggregated_unassigned_indexes = []
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        for lane in lanes:
+            sub_demux_index_lens = set()
+            for sample in demux_runmanifest:
+                if sample['Lane'] == lane:
+                    sub_demux_index_lens.add((sample['sub_demux_count'], (len(sample.get("Index1", "")), len(sample.get("Index2", "")))))
+            # List of sub-demux with a decreasing order of index lengths
+            sub_demux_list = [x[0] for x in sorted(sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True)]
+            sub_demux_with_max_index_lens = sub_demux_list[0]
+            # Start with the unassigned list with the longest index
+            max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv")
+            with open(max_unassigned_csv, 'r') as max_unassigned_file:
+                reader = csv.DictReader(max_unassigned_file)
+                max_unassigned_indexes = [row for row in reader]
+            # Filter by lane
+            max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane]
+            # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise
+            if len(sub_demux_list) > 1:
+                # Order: from longer to shorter indexes
+                sub_demux_with_shorter_index_lens = sub_demux_list[1:]
+                for sub_demux in sub_demux_with_shorter_index_lens:
+                    unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv")
+                    with open(unassigned_csv, 'r') as unassigned_file:
+                        reader = csv.DictReader(unassigned_file)
+                        unassigned_indexes = [row for row in reader]
+                    # Filter by lane
+                    unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane]
+                    # Remove overlapped indexes from the list of max_unassigned_indexes
+                    idx1_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][0],
+                                              [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][0])
+                    idx2_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][1],
+                                              [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][1])
+                    for unassigned_index in unassigned_indexes:
+                        idx1_overlapped_seq = unassigned_index['I1'][:idx1_overlapped_len]
+                        idx2_overlapped_seq = unassigned_index['I2'][:idx2_overlapped_len]
+                        # Remove the overlapped record from the max_unassigned_indexes list
+                        max_unassigned_indexes = [max_unassigned_index for max_unassigned_index in max_unassigned_indexes if not (max_unassigned_index['I1'][:idx1_overlapped_len] == idx1_overlapped_seq and max_unassigned_index['I2'][:idx2_overlapped_len] == idx2_overlapped_seq)]
+            # Append to the aggregated_unassigned_indexes list
+            aggregated_unassigned_indexes += max_unassigned_indexes
+        # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order
+        aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count'])))
+        # Write to a new UnassignedSequences.csv file under
+        aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv")
+        write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv)
 
     # Aggregate demux results
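The de-overlap logic in PATCH 059 trims both index reads to the shorter of the two demuxes' index lengths before comparing, so only the region both demultiplexings actually share is tested. Reduced to its core, with made-up records:

    def overlaps(long_rec: dict, short_rec: dict, len1: int, len2: int) -> bool:
        # Compare only the first len1 bases of I1 and the first len2 bases of I2,
        # i.e. the index region both sub-demultiplexings share
        return (long_rec["I1"][:len1] == short_rec["I1"][:len1]
                and long_rec["I2"][:len2] == short_rec["I2"][:len2])

    a = {"I1": "ACGTACGT", "I2": "TTGACCGT"}  # longer-index demux record
    b = {"I1": "ACGT", "I2": "TTGA"}          # shorter-index demux record
    print(overlaps(a, b, 4, 4))  # True: a matches b over the shared prefix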
From 5b4f94732cffebf3987952282da17e2a080330dd Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Mon, 23 Sep 2024 16:19:38 +0200
Subject: [PATCH 060/187] Fix IndexAssignment.csv

---
 taca/element/Element_Runs.py | 68 +++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 8 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index c757fc94..3391824f 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -563,6 +563,50 @@ def write_to_csv(data, filename):
             writer.writerows(data)
 
+    # Aggregate stats in IndexAssignment.csv
+    def aggregate_stats_assigned(self, demux_runmanifest):
+        aggregated_assigned_indexes = []
+        sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest)))
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        for sub_demux in sub_demux_list:
+            # Read in IndexAssignment.csv
+            assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv")
+            if os.path.exists(assigned_csv):
+                with open(assigned_csv, 'r') as assigned_file:
+                    reader = csv.DictReader(assigned_file)
+                    index_assignment = [row for row in reader]
+                for sample in index_assignment:
+                    if sample['Lane'] in lanes:
+                        sample['sub_demux_count'] = sub_demux
+                        aggregated_assigned_indexes.append(sample)
+            else:
+                logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.")
+        # Remove redundant rows for PhiX
+        aggregated_assigned_indexes_filtered = []
+        unique_phiX_combination = set()
+        for sample in aggregated_assigned_indexes:
+            if sample['SampleName'] == 'PhiX':
+                combination = (sample['I1'], sample['I2'], sample['Lane'])
+                if combination not in unique_phiX_combination:
+                    aggregated_assigned_indexes_filtered.append(sample)
+                    unique_phiX_combination.add(combination)
+            else:
+                aggregated_assigned_indexes_filtered.append(sample)
+        # Sort the list by Lane, SampleName and sub_demux_count
+        aggregated_assigned_indexes_filtered_sorted = sorted(aggregated_assigned_indexes_filtered, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count']))
+        # Fix new sample number based on SampleName and Lane
+        sample_count = 0
+        previous_samplename_lane = ('NA', 'NA')
+        for sample in aggregated_assigned_indexes_filtered_sorted:
+            if (sample['SampleName'], sample['Lane']) != previous_samplename_lane:
+                sample_count += 1
+                previous_samplename_lane = (sample['SampleName'], sample['Lane'])
+            sample['SampleNumber'] = sample_count
+        # Write to a new UnassignedSequences.csv file under demux_dir
+        aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv")
+        write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv)
+
 
     # Aggregate stats in UnassignedSequences.csv
     def aggregate_stats_unassigned(self, demux_runmanifest):
         aggregated_unassigned_indexes = []
@@ -577,9 +621,13 @@
             sub_demux_with_max_index_lens = sub_demux_list[0]
             # Start with the unassigned list with the longest index
             max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv")
-            with open(max_unassigned_csv, 'r') as max_unassigned_file:
-                reader = csv.DictReader(max_unassigned_file)
-                max_unassigned_indexes = [row for row in reader]
+            if os.path.exists(max_unassigned_csv):
+                with open(max_unassigned_csv, 'r') as max_unassigned_file:
+                    reader = csv.DictReader(max_unassigned_file)
+                    max_unassigned_indexes = [row for row in reader]
+            else:
+                logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}.")
+                break
             # Filter by lane
             max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane]
             # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise
Take the full list if there is only one sub-demux otherwise @@ -588,9 +636,13 @@ def aggregate_stats_unassigned(self, demux_runmanifest): sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv") - with open(unassigned_csv, 'r') as unassigned_file: - reader = csv.DictReader(unassigned_file) - unassigned_indexes = [row for row in reader] + if os.path.exists(unassigned_csv): + with open(unassigned_csv, 'r') as unassigned_file: + reader = csv.DictReader(unassigned_file) + unassigned_indexes = [row for row in reader] + else: + logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}.") + continue # Filter by lane unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] # Remove overlapped indexes from the list of max_unassigned_indexes @@ -607,7 +659,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes += max_unassigned_indexes # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) - # Write to a new UnassignedSequences.csv file under + # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) @@ -631,7 +683,7 @@ def aggregate_demux_results(self, demux_results_dirs): # Symplink the output FastQ files of undet only if a lane does not have multiple demux aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - TBD + aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv aggregate_stats_unassigned(demux_runmanifest) # Aggregate stats in Project_RunStats.json From fac9912c8af86273d44cb0eba2c964c535a5492e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 24 Sep 2024 11:35:42 +0200 Subject: [PATCH 061/187] Fix project run stats WIP --- taca/element/Element_Runs.py | 63 ++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 3391824f..b8cf07e3 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -466,6 +466,19 @@ def symlink_demux_dir(src_dir, dest_dir): print(f"Error linking {src_path} to {dest_path}: {e}") + # Write to csv + def write_to_csv(data, filename): + # Get the fieldnames from the keys of the first dictionary + fieldnames = data[0].keys() + # Open the file and write the CSV + with open(filename, mode='w', newline='') as file: + writer = csv.DictWriter(file, fieldnames=fieldnames) + # Write the header (fieldnames) + writer.writeheader() + # Write the data (rows) + writer.writerows(data) + + # Collect demux info into a list of dictionaries # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}] def collect_demux_runmanifest(self, demux_results_dirs): @@ -550,17 +563,38 @@ def aggregate_undet_fastq(self, demux_runmanifest): base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) - # Write to csv - def write_to_csv(data, filename): - # Get the fieldnames from the keys of the 
first dictionary - fieldnames = data[0].keys() - # Open the file and write the CSV - with open(filename, mode='w', newline='') as file: - writer = csv.DictWriter(file, fieldnames=fieldnames) - # Write the header (fieldnames) - writer.writeheader() - # Write the data (rows) - writer.writerows(data) + + # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean + # Note that Element promised that they would include these stats into IndexAssignment.csv + # But for now we have to do this by ourselves in this hard way + def get_project_runstats(self, sub_demux, demux_runmanifest): + project_runstats = [] + project_list = sorted(list(set(sample['Project'] for sample in demux_runmanifest if sample['sub_demux_count']==sub_demux))) + for project in project_list: + project_runstats_json_path = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "Samples", project, f"{project}_RunStats.json") + if os.path.exists(project_runstats_json_path): + with open(project_runstats_json_path) as stats_json: + project_runstats_json = json.load(stats_json) + for sample in project_runstats_json["SampleStats"]: + sample_name = sample["SampleName"] + for occurrence in sample["Occurrences"]: + lane = occurrence["Lane"] + expected_sequence = occurrence["ExpectedSequence"] + percentage_mismatch = occurrence["PercentMismatch"] + percentage_q30 = occurrence["PercentQ30"] + percentage_q40 = occurrence["PercentQ40"] + quality_score_mean = occurrence["QualityScoreMean"] + project_runstats.append({ "SampleName" : sample_name, + "Lane" : lane, + "ExpectedSequence" : expected_sequence, + "PercentMismatch" : percentage_mismatch, + "PercentQ30" : percentage_q30, + "PercentQ40" : percentage_q40, + "QualityScoreMean" : quality_score_mean + }) + else: + continue + return project_runstats # Aggregate stats in IndexAssignment.csv @@ -569,6 +603,10 @@ def aggregate_stats_assigned(self, demux_runmanifest): sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest))) lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) for sub_demux in sub_demux_list: + # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean + # Note that Element promised that they would include these stats into IndexAssignment.csv + # But for now we have to do this by ourselves in this hard way + project_runstats = get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") if os.path.exists(assigned_csv): @@ -686,9 +724,6 @@ def aggregate_demux_results(self, demux_results_dirs): aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv aggregate_stats_unassigned(demux_runmanifest) - # Aggregate stats in Project_RunStats.json - TBD - def upload_demux_results_to_statusdb(self): From 3a0b10a2c7cad860915fc8640492c30faf106bfc Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 24 Sep 2024 13:15:40 +0200 Subject: [PATCH 062/187] bug fixes --- taca/analysis/analysis_element.py | 4 ++-- taca/element/Element_Runs.py | 18 ++++++++++-------- taca/utils/statusdb.py | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 209ac163..9b0b62cf 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -134,7 +134,7 @@ def _process(run): return if given_run: - 
run = Aviti_Run(given_run, CONFIG.get("element_analysis")) + run = Aviti_Run(given_run, CONFIG) # TODO: Needs to change if more types of Element machines are aquired in the future _process(run) @@ -148,7 +148,7 @@ def _process(run): os.path.join(data_dir, "[1-9]*_*_*_*") ) # TODO: adapt to aviti format for run in runs: - runObj = Aviti_Run(run) + runObj = Aviti_Run(run, CONFIG) try: _process(runObj) except: # TODO: chatch error message and print it diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 3c87ba25..88ba3967 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -39,7 +39,7 @@ def __init__(self, run_dir, configuration): "RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( - self.CONFIG.get("Element", {}) + self.CONFIG.get("element_analysis").get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") ) # TODO: change and add to taca.yaml @@ -112,7 +112,8 @@ def parse_run_parameters(self) -> None: ) # Sequencing, wash or prime I believe? self.flowcell_id = run_parameters.get("FlowcellID") self.instrument_name = run_parameters.get("InstrumentName") - self.date = run_parameters.get("Date") + self.date = run_parameters.get("Date")[0:10].replace("-", "") + self.year = self.date[0:4] self.operator_name = run_parameters.get("OperatorName") self.run_parameters_parsed = True @@ -136,6 +137,7 @@ def to_doc_obj(self): instrument_generated_files[os.path.basename(file)] = None doc_obj = { + "name": self.NGI_run_id, "run_path": self.run_dir, "run_status": self.status, "NGI_run_id": self.NGI_run_id, @@ -158,7 +160,7 @@ def check_sequencing_status(self): def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" - demux_dirs = glob.glob(os.path.join(self.run_dir, "Delmultiplexing*")) + demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing*")) finished_count = 0 for demux_dir in demux_dirs: if os.path.exists(self.demux_dir) and not os.path.isfile( @@ -211,10 +213,10 @@ def get_lims_step_id(self) -> str | None: def find_manifest_zip(self): # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( - self.CONFIG.get("Element", {}) + self.CONFIG.get("element_analysis").get("Element", {}) .get(self.sequencer_type, {}) .get("manifest_zip_location"), # TODO: add to taca.yaml - datetime.now().year, + str(self.year), ) # Use LIMS step ID if available, else flowcell ID, to make a query pattern @@ -230,7 +232,7 @@ def find_manifest_zip(self): glob_pattern = f"{dir_to_search}/*{self.flowcell_id}*.zip" # Find paths matching the pattern - glob_results = glob(glob_pattern) + glob_results = glob.glob(glob_pattern) if len(glob_results) == 0: logger.warning( f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." 
@@ -383,7 +385,7 @@ def make_demux_manifests( def generate_demux_command(self, run_manifest, demux_dir): command = ( - f"{self.CONFIG.get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f"{self.CONFIG.get("element_analysis").get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f" {self.run_dir}" + f" {demux_dir}" + " -p 8" @@ -523,7 +525,7 @@ def make_transfer_indicator(self): def transfer(self): transfer_details = ( - self.CONFIG.get(self.sequencer_type).get("transfer_details") + self.CONFIG.get("element_analysis").get(self.sequencer_type).get("transfer_details") ) # TODO: Add section to taca.yaml command = ( "rsync" diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 47620ea2..a2920550 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -210,7 +210,7 @@ def update_doc(db, obj, over_write_db_entry=False): db.save(obj) logger.info("Saving {}".format(obj["name"])) else: - logger.warn("More than one row with name {} found".format(obj["name"])) + logger.warning("More than one row with name {} found".format(obj["name"])) def merge_dicts(d1, d2): From 9e80a4f6be4371d691284222e54045f867699ae3 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 24 Sep 2024 13:54:12 +0200 Subject: [PATCH 063/187] Fix getting demux status --- taca/element/Element_Runs.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 88ba3967..1606df50 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -36,7 +36,7 @@ def __init__(self, run_dir, configuration): self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") self.demux_stats_file = ( - "RunStats.json" # Assumes demux is finished when this file is created + "*RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( self.CONFIG.get("element_analysis").get("Element", {}) @@ -160,20 +160,17 @@ def check_sequencing_status(self): def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" - demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing*")) + sub_demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing_*")) finished_count = 0 - for demux_dir in demux_dirs: - if os.path.exists(self.demux_dir) and not os.path.isfile( - os.path.join(demux_dir, self.demux_stats_file) - ): + for demux_dir in sub_demux_dirs: + found_demux_stats_file = glob.glob(os.path.join(demux_dir, self.demux_stats_file)) + if not found_demux_stats_file: return "ongoing" - elif os.path.exists(self.demux_dir) and os.path.isfile( - os.path.join(demux_dir, self.demux_stats_file) - ): + elif found_demux_stats_file: finished_count += ( 1 # TODO: check exit status of demux in exit status file ) - if finished_count == len(demux_dirs): + if finished_count == len(sub_demux_dirs): return "finished" else: return "unknown" From feec3217e2f980c4afad5df9d5c39425fb1a964d Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 24 Sep 2024 14:18:42 +0200 Subject: [PATCH 064/187] Finalize scripts and fix bugs --- VERSIONLOG.md | 4 +++ taca/analysis/analysis_element.py | 2 +- taca/element/Element_Runs.py | 57 +++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 73323deb..d0563c9a 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 
20240924.1 + +Aggregate aviti demultiplexing results + ## 20240705.1 Add section header in samplesheet for run folder transfer diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 63cce774..867f13c6 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -72,7 +72,7 @@ def _process(run): transfer_status = run.get_transfer_status() if transfer_status == "not started": demux_results_dirs = glob.glob( - os.path.join(run.run_dir, "Delmultiplexing_*") + os.path.join(run.run_dir, "Demultiplexing_*") ) run.aggregate_demux_results(demux_results_dirs) run.upload_demux_results_to_statusdb() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index b8cf07e3..fb5de39c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -484,7 +484,7 @@ def write_to_csv(data, filename): def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: - sub_demux_count = demux_dir.split('_')[1] + sub_demux_count = os.path.basename(demux_dir).split('_')[1] with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file: lines = file.readlines() sample_section = False @@ -527,23 +527,23 @@ def aggregate_sample_fastq(self, demux_runmanifest): for sample in demux_runmanifest: lanenr = sample['Lane'] project = sample['Project'] - sample = sample['SampleName'] + sample_name = sample['SampleName'] sub_demux_count = sample['sub_demux_count'] # Skip PhiX - if lanenr == lane and sample != "PhiX": - sample_tuple = (sample, sub_demux_count) + if lanenr == lane and sample_name != "PhiX": + sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample) + sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample_name) if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample, f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample_name, f"*L00{lane}*.fastq.gz")) for fastqfile in fastqfiles: old_name = os.path.basename(fastqfile) read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1) - new_name = "_".join([sample, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) + new_name = "_".join([sample_name, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) os.symlink(fastqfile, os.path.join(sample_dest, new_name)) unique_sample_demux.add(sample_tuple) sample_count += 1 @@ -558,7 +558,7 @@ def aggregate_undet_fastq(self, demux_runmanifest): project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined") if not os.path.exists(project_dest): os.makedirs(project_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", "*.fastq.gz")) + fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", f"*L00{lane}*.fastq.gz")) for fastqfile in fastqfiles: base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) @@ -585,7 +585,7 @@ def get_project_runstats(self, sub_demux, demux_runmanifest): percentage_q40 = 
occurrence["PercentQ40"] quality_score_mean = occurrence["QualityScoreMean"] project_runstats.append({ "SampleName" : sample_name, - "Lane" : lane, + "Lane" : str(lane), "ExpectedSequence" : expected_sequence, "PercentMismatch" : percentage_mismatch, "PercentQ30" : percentage_q30, @@ -615,7 +615,12 @@ def aggregate_stats_assigned(self, demux_runmanifest): index_assignment = [row for row in reader] for sample in index_assignment: if sample['Lane'] in lanes: + project_runstats_sample = [d for d in project_runstats if d['SampleName'] == sample['SampleName'] and d['Lane'] == sample['Lane'] and d['ExpectedSequence'] == sample['I1']+sample['I2']] sample['sub_demux_count'] = sub_demux + sample['PercentMismatch'] = project_runstats_sample[0]['PercentMismatch'] + sample['PercentQ30'] = project_runstats_sample[0]['PercentQ30'] + sample['PercentQ40'] = project_runstats_sample[0]['PercentQ40'] + sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] aggregated_assigned_indexes.append(sample) else: logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.") @@ -704,26 +709,20 @@ def aggregate_stats_unassigned(self, demux_runmanifest): # Aggregate demux results def aggregate_demux_results(self, demux_results_dirs): - # In case of single demux - if len(demux_results_dirs) == 1: - # TODO: Check NoIndex case. Can Base2Fastq generate FastQs for both reads and indexes for NoIndex sample? - # Otherwise just softlink contents of Demultplexing_0 into Demultiplexing - symlink_demux_dir(demux_results_dirs[0], os.path.join(self.run_dir, self.demux_dir)) - else: - # Ensure the destination directory exists - if not os.path.exists(os.path.join(self.run_dir, self.demux_dir): - os.makedirs(os.path.join(self.run_dir, self.demux_dir) - # Clear all content under dest_dir - clear_dir(os.path.join(self.run_dir, self.demux_dir) - demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) - # Aggregate the output FastQ files of samples from multiple demux - aggregate_sample_fastq(demux_runmanifest) - # Symplink the output FastQ files of undet only if a lane does not have multiple demux - aggregate_undet_fastq(demux_runmanifest) - # Aggregate stats in IndexAssignment.csv - aggregate_stats_assigned(demux_runmanifest) - # Aggregate stats in UnassignedSequences.csv - aggregate_stats_unassigned(demux_runmanifest) + # Ensure the destination directory exists + if not os.path.exists(os.path.join(self.run_dir, self.demux_dir)): + os.makedirs(os.path.join(self.run_dir, self.demux_dir)) + # Clear all content under dest_dir + clear_dir(os.path.join(self.run_dir, self.demux_dir)) + demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) + # Aggregate the output FastQ files of samples from multiple demux + aggregate_sample_fastq(demux_runmanifest) + # Symplink the output FastQ files of undet only if a lane does not have multiple demux + aggregate_undet_fastq(demux_runmanifest) + # Aggregate stats in IndexAssignment.csv + aggregate_stats_assigned(demux_runmanifest) + # Aggregate stats in UnassignedSequences.csv + aggregate_stats_unassigned(demux_runmanifest) def upload_demux_results_to_statusdb(self): From 23e9b6407627f0603d88da9da2002e191dfe0ef7 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 25 Sep 2024 09:37:21 +0200 Subject: [PATCH 065/187] Bug fixes --- taca/analysis/analysis_element.py | 6 +++--- taca/element/Element_Runs.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/taca/analysis/analysis_element.py 
From 23e9b6407627f0603d88da9da2002e191dfe0ef7 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 25 Sep 2024 09:37:21 +0200
Subject: [PATCH 065/187] Bug fixes

---
 taca/analysis/analysis_element.py | 6 +++---
 taca/element/Element_Runs.py      | 9 +++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 9b0b62cf..b469a5af 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -85,7 +85,7 @@ def _process(run):
             demux_results_dirs = glob.glob(
                 os.path.join(run.run_dir, "Delmultiplexing*")
             )
-            if len(demux_results_dirs > 1):
+            if len(demux_results_dirs) > 1:
                 run.aggregate_demux_results(demux_results_dirs)
             run.sync_metadata()
             run.make_transfer_indicator()
@@ -93,13 +93,13 @@ def _process(run):
         if run.status_changed:
             run.update_statusdb()
         # TODO: Also update statusdb with a timestamp of when the transfer started
-        run.transfer()  # I think this should be a detached command as well
+        run.transfer()
         return
     elif transfer_status == "ongoing":
         run.status = "transferring"
         if run.status_changed:
             run.update_statusdb()
-        logger.info(f"{run} is being transferred. Skipping.")
+        logger.info(f"{run} is being transferred. Skipping.")  # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred"
         return
     elif transfer_status == "rsync done":
         if run.rsync_successful():
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 1606df50..7efce962 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -441,7 +441,7 @@ def rsync_complete(self):
     def rsync_successful(self):
         with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file:
             rsync_exit_status = rsync_exit_file.readlines()
-        if rsync_exit_status[0].strip() == 0:
+        if rsync_exit_status[0].strip() == '0':
             return True
         else:
             return False
@@ -522,7 +522,7 @@ def make_transfer_indicator(self):
 
     def transfer(self):
         transfer_details = (
-            self.CONFIG.get("element_analysis").get(self.sequencer_type).get("transfer_details")
+            self.CONFIG.get("element_analysis").get("transfer_details")
         )  # TODO: Add section to taca.yaml
         command = (
             "rsync"
@@ -532,8 +532,8 @@ def transfer(self):
             + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
             + " --exclude Alignment"
             + f" {self.run_dir}"
-            + f" {transfer_details.get('user')@transfer_details.get('host')}:/aviti"
-            + "; echo $? > .rsync_exit_status"
+            + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/aviti"
+            + f"; echo $? > {os.path.join(self.run_dir, ".rsync_exit_status")}"
         )  # TODO: any other options?
try: p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) @@ -568,3 +568,4 @@ def archive(self): src = self.run_dir dst = os.path.join(self.run_dir, os.pardir, "nosync") shutil.move(src, dst) + self.run_dir = From ab99114bad809b76c287bd09d95695545174c640 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 08:49:51 +0200 Subject: [PATCH 066/187] Update taca/element/Element_Runs.py Co-authored-by: Johannes Alneberg --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 4dd1684b..58a6b646 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -567,7 +567,7 @@ def aggregate_sample_fastq(self, demux_runmanifest): sample_count += 1 - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) for lane in lanes: From cf64192b549440c43f2e6b54d79032855edae296 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 08:50:17 +0200 Subject: [PATCH 067/187] Update taca/element/Element_Runs.py Co-authored-by: Johannes Alneberg --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 58a6b646..e3d25989 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -735,7 +735,7 @@ def aggregate_demux_results(self, demux_results_dirs): demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) # Aggregate the output FastQ files of samples from multiple demux aggregate_sample_fastq(demux_runmanifest) - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv aggregate_stats_assigned(demux_runmanifest) From 2559eccae794230e602743311f0ac0533cbfdc01 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 08:52:31 +0200 Subject: [PATCH 068/187] Update taca/element/Element_Runs.py --- taca/element/Element_Runs.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e3d25989..ee9a2cc2 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -459,29 +459,6 @@ def clear_dir(dir): print(f"Failed to delete {file_path} Reason {e}") # Create symlink for a simple demultiplexing dir - def symlink_demux_dir(src_dir, dest_dir): - # Ensure the destination directory exists - if not os.path.exists(dest_dir): - os.makedirs(dest_dir) - # Clear all content under dest_dir - clear_dir(dest_dir) - # Loop through all files and directories in the source directory - for item in os.listdir(src_dir): - src_path = os.path.join(src_dir, item) - # Move content of Samples to the parental dir - if item == "Samples": - dest_path = dest_dir - else: - dest_path = os.path.join(dest_dir, item) - try: - # Create symbolic link only if it doesn't already exist - if not os.path.exists(dest_path): - os.symlink(src_path, dest_path) - print(f"Linked {src_path} to {dest_path}") - else: - print(f"{dest_path} already exists.") - except OSError as e: - print(f"Error linking {src_path} to 
{dest_path}: {e}") # Write to csv From 9627961ceb6ac295ce69fd1ff56744f197605920 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Thu, 26 Sep 2024 08:53:08 +0200 Subject: [PATCH 069/187] Remove unused function --- taca/element/Element_Runs.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index fb5de39c..ad977c9f 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -440,31 +440,6 @@ def clear_dir(dir): except Exception as e: print(f"Failed to delete {file_path} Reason {e}") - # Create symlink for a simple demultiplexing dir - def symlink_demux_dir(src_dir, dest_dir): - # Ensure the destination directory exists - if not os.path.exists(dest_dir): - os.makedirs(dest_dir) - # Clear all content under dest_dir - clear_dir(dest_dir) - # Loop through all files and directories in the source directory - for item in os.listdir(src_dir): - src_path = os.path.join(src_dir, item) - # Move content of Samples to the parental dir - if item == "Samples": - dest_path = dest_dir - else: - dest_path = os.path.join(dest_dir, item) - try: - # Create symbolic link only if it doesn't already exist - if not os.path.exists(dest_path): - os.symlink(src_path, dest_path) - print(f"Linked {src_path} to {dest_path}") - else: - print(f"{dest_path} already exists.") - except OSError as e: - print(f"Error linking {src_path} to {dest_path}: {e}") - # Write to csv def write_to_csv(data, filename): @@ -549,7 +524,7 @@ def aggregate_sample_fastq(self, demux_runmanifest): sample_count += 1 - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) for lane in lanes: @@ -717,7 +692,7 @@ def aggregate_demux_results(self, demux_results_dirs): demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) # Aggregate the output FastQ files of samples from multiple demux aggregate_sample_fastq(demux_runmanifest) - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv aggregate_stats_assigned(demux_runmanifest) From 8a9ce0f4dce83934511466eec234bc296e120dd6 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 10:12:01 +0200 Subject: [PATCH 070/187] fix references to functions --- taca/element/Element_Runs.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index ee9a2cc2..8dee9732 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -458,9 +458,6 @@ def clear_dir(dir): except Exception as e: print(f"Failed to delete {file_path} Reason {e}") - # Create symlink for a simple demultiplexing dir - - # Write to csv def write_to_csv(data, filename): # Get the fieldnames from the keys of the first dictionary @@ -601,7 +598,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this 
hard way - project_runstats = get_project_runstats(sub_demux, demux_runmanifest) + project_runstats = self.get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") if os.path.exists(assigned_csv): @@ -642,7 +639,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): sample['SampleNumber'] = sample_count # Write to a new UnassignedSequences.csv file under demux_dir aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv") - write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) + self.write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) # Aggregate stats in UnassignedSequences.csv @@ -699,7 +696,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") - write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) + self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) # Aggregate demux results @@ -708,16 +705,16 @@ def aggregate_demux_results(self, demux_results_dirs): if not os.path.exists(os.path.join(self.run_dir, self.demux_dir)): os.makedirs(os.path.join(self.run_dir, self.demux_dir)) # Clear all content under dest_dir - clear_dir(os.path.join(self.run_dir, self.demux_dir)) - demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) + self.clear_dir(os.path.join(self.run_dir, self.demux_dir)) + demux_runmanifest = self.collect_demux_runmanifest(demux_results_dirs) # Aggregate the output FastQ files of samples from multiple demux - aggregate_sample_fastq(demux_runmanifest) + self.aggregate_sample_fastq(demux_runmanifest) # Symlink the output FastQ files of undet only if a lane does not have multiple demux - aggregate_undet_fastq(demux_runmanifest) + self.aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - aggregate_stats_assigned(demux_runmanifest) + self.aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv - aggregate_stats_unassigned(demux_runmanifest) + self.aggregate_stats_unassigned(demux_runmanifest) def upload_demux_results_to_statusdb(self): doc_obj = self.db.get_db_entry(self.NGI_run_id) @@ -829,4 +826,4 @@ def archive(self): src = self.run_dir dst = os.path.join(self.run_dir, os.pardir, "nosync") shutil.move(src, dst) - self.run_dir = + self.run_dir = os.path.join(dst, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb From 80d03c443061e0b07aaaad6072775012e7fb88cf Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 13:58:31 +0200 Subject: [PATCH 071/187] make config to use for tests --- tests/element/test_Element_Runs.py | 47 ++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 1100aff9..5196e678 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -7,6 +7,17 @@ from taca.element import Element_Runs as to_test +CONFIG = { + "element_analysis": { + "Element": { + "GenericElement": { + "demux_dir": "mock_demux_dir_path", + "transfer_log": 
"mock_transfer_log_file.log", + }, + }, + }, +} + def create_element_run_dir( tmp: tempfile.TemporaryDirectory, @@ -15,6 +26,7 @@ def create_element_run_dir( run_finished: bool = True, sync_finished: bool = True, demux_dir: bool = True, + n_demux_subdirs: int = 1, demux_done: bool = True, outcome_completed: bool = True, ) -> str: @@ -27,8 +39,12 @@ def create_element_run_dir( ├── RunParameters.json ├── RunUploaded.json ├── .sync_finished - └── Demultiplexing - └── RunStats.json + ├── Demultiplexing + ├── Demultiplexing_0 + | └── RunStats.json + ├── Demultiplexing_1 + | └── RunStats.json + └── ... """ @@ -53,9 +69,18 @@ def create_element_run_dir( if demux_dir: os.mkdir(os.path.join(run_path, "Demultiplexing")) - - if demux_done: - open(os.path.join(run_path, "Demultiplexing", "RunStats.json"), "w").close() + for i in range(n_demux_subdirs): + os.mkdir(os.path.join(run_path, "Demultiplexing", f"Demultiplexing_{i}")) + if demux_done: + open( + os.path.join( + run_path, + "Demultiplexing", + f"Demultiplexing_{i}", + "RunStats.json", + ), + "w", + ).close() return run_path @@ -66,7 +91,7 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs run_dir = create_element_run_dir(tmp) - run = to_test.Run(run_dir, {}) + run = to_test.Run(run_dir, CONFIG) assert run.run_dir == run_dir @pytest.mark.parametrize( @@ -92,7 +117,7 @@ def test_check_sequencing_status( run_finished=p["run_finished"], outcome_completed=p["outcome_completed"], ), - {}, + CONFIG, ) assert run.check_sequencing_status() is p["expected"] @@ -110,15 +135,13 @@ def test_get_demultiplexing_status( ): tmp: tempfile.TemporaryDirectory = create_dirs - if p["demux_dir"] and not p["demux_done"]: - run = to_test.Run( create_element_run_dir( tmp, demux_dir=p["demux_dir"], demux_done=p["demux_done"], ), - {}, + CONFIG, ) assert run.get_demultiplexing_status() == p["expected"] @@ -140,7 +163,7 @@ def test_manifest_exists( tmp, run_finished=p["run_finished"], ), - {}, + CONFIG, ) assert run.manifest_exists() == p["expected"] @@ -155,7 +178,7 @@ def test_start_demux(self, mock_db, create_dirs): "taca.element.Element_Runs.Run.generate_demux_command" ) as mock_command: mock_command.return_value = "test command" - run = to_test.Run(create_element_run_dir(create_dirs), {}) + run = to_test.Run(create_element_run_dir(create_dirs), CONFIG) run.start_demux() mock_command.assert_called_once() mock_call.assert_called_once_with( From 7936e8fb325a45466db030dd7fa1925c7b548744 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 14:12:18 +0200 Subject: [PATCH 072/187] add config --- tests/element/test_Aviti_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py index bf32089c..3c61276a 100644 --- a/tests/element/test_Aviti_Runs.py +++ b/tests/element/test_Aviti_Runs.py @@ -4,7 +4,7 @@ import pytest from taca.element import Aviti_Runs as to_test -from tests.element.test_Element_Runs import create_element_run_dir +from tests.element.test_Element_Runs import CONFIG, create_element_run_dir class TestAviti_Run: @@ -16,6 +16,6 @@ def test_init(self, create_dirs: pytest.fixture): mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") mock_db.start() - run = to_test.Aviti_Run(run_dir, {}) + run = to_test.Aviti_Run(run_dir, CONFIG) assert run.run_dir == run_dir assert run.sequencer_type == "Aviti" From 41dd4777a56e8d7978c1d98da69923cd85ab49ad Mon Sep 17 00:00:00 2001 From: kedhammar Date: 
Thu, 26 Sep 2024 14:14:01 +0200 Subject: [PATCH 073/187] bugfix --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 8dee9732..40beb05c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -382,7 +382,7 @@ def make_demux_manifests( def generate_demux_command(self, run_manifest, demux_dir): command = ( - f"{self.CONFIG.get("element_analysis").get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f"{self.CONFIG.get('element_analysis').get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f" {self.run_dir}" + f" {demux_dir}" + " -p 8" @@ -791,7 +791,7 @@ def transfer(self): + " --exclude Alignment" + f" {self.run_dir}" + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/aviti" - + f"; echo $? > {os.path.join(self.run_dir, ".rsync_exit_status")}" + + f"; echo $? > {os.path.join(self.run_dir, '.rsync_exit_status')}" ) # TODO: any other options? try: p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) From 48ec3438925e4c4cdfa625ac9c122067a8dd2d51 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 15:24:42 +0200 Subject: [PATCH 074/187] Upload more stuff to statusdb --- taca/analysis/analysis_element.py | 1 - taca/element/Element_Runs.py | 126 ++++++++++++++++-------------- 2 files changed, 68 insertions(+), 59 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index ab142d69..1d2f4d3f 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -84,7 +84,6 @@ def _process(run): os.path.join(run.run_dir, "Delmultiplexing_*") ) run.aggregate_demux_results(demux_results_dirs) - run.upload_demux_results_to_statusdb() run.sync_metadata() run.make_transfer_indicator() run.status = "transferring" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 8dee9732..0d48edf5 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -42,12 +42,12 @@ def __init__(self, run_dir, configuration): self.CONFIG.get("element_analysis").get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") - ) # TODO: change and add to taca.yaml + ) # TODO: add to taca.yaml self.rsync_exit_file = os.path.join(self.run_dir, ".rsync_exit_status") # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") - self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_stats_file = os.path.join(self.run_dir, "AvitiRunStats.json") self.run_manifest_file_from_instrument = os.path.join( self.run_dir, "RunManifest.json" ) @@ -135,13 +135,64 @@ def to_doc_obj(self): ) else: instrument_generated_files[os.path.basename(file)] = None + # Aggregated demux stats files + index_assignement_file = os.path.join( + self.run_dir, "Demultiplexing", "IndexAssignment.csv" + ) + if os.path.exists(index_assignement_file): + with open(index_assignement_file) as index_file: + reader = csv.DictReader(index_file) + index_assignments = [row for row in reader] + else: + index_assignments = None + + unassigned_sequences_file = os.path.join( + self.run_dir, "Demultiplexing", "UnassignedSequences.csv" + ) + if os.path.exists(unassigned_sequences_file): + with open(unassigned_sequences_file) as unassigned_file: + reader = csv.DictReader(unassigned_file) + unassigned_sequences = [row for row in reader] + else: + unassigned_sequences = None 
+        demultiplex_stats = {
+            "Demultiplex_Stats": {
+                "Index_Assignment": index_assignments,
+                "Unassigned_Sequences": unassigned_sequences,
+            }
+        }
+
+        demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command")
+        if os.path.exists(demux_command_file):
+            with open(demux_command_file) as command_file:
+                demux_command = command_file.readlines()[0]
+        else:
+            demux_command = None
+        demux_version_file = os.path.join(self.run_dir,"Demultiplexing_0", "RunStats.json")
+        if os.path.exists(demux_version_file):
+            with open(demux_version_file) as json_file:
+                demux_info = json.load(
+                    json_file
+                )
+            demux_version = demux_info.get("AnalysisVersion")
+        else:
+            demux_version = None
+
+        software_info = {
+            "Version": demux_version,
+            "bin": self.CONFIG.get("element_analysis").get("bases2fastq"),
+            "options": demux_command,
+        }
+
         doc_obj = {
             "name": self.NGI_run_id,
             "run_path": self.run_dir,
             "run_status": self.status,
             "NGI_run_id": self.NGI_run_id,
             "instrument_generated_files": instrument_generated_files,
+            "Element": demultiplex_stats,
+            "Software": software_info,
         }
 
         return doc_obj
@@ -390,6 +441,8 @@ def generate_demux_command(self, run_manifest, demux_dir):
             + " --legacy-fastq"  # TODO: except if Smart-seq3
             + " --force-index-orientation"
         )  # TODO: any other options?
+        with open(os.path.join(self.run_dir, '.bases2fastq_command'), 'w') as command_file:
+            command_file.write(command)
         return command
 
     def start_demux(self, run_manifest, demux_dir):
@@ -716,60 +769,6 @@ def aggregate_demux_results(self, demux_results_dirs):
         # Aggregate stats in UnassignedSequences.csv
         self.aggregate_stats_unassigned(demux_runmanifest)
 
-    def upload_demux_results_to_statusdb(self):
-        doc_obj = self.db.get_db_entry(self.NGI_run_id)
-        index_assignement_file = os.path.join(
-            self.run_dir, "Demultiplexing", "IndexAssignment.csv"
-        )
-        with open(index_assignement_file) as index_file:
-            reader = csv.DictReader(index_file)
-            index_assignments = [row for row in reader]
-        unassigned_sequences_file = os.path.join(
-            self.run_dir, "Demultiplexing", "UnassignedSequences.csv"
-        )
-        with open(unassigned_sequences_file) as unassigned_file:
-            reader = csv.DictReader(unassigned_file)
-            unassigned_sequences = [row for row in reader]
-        dirs = os.scandir("Demultiplexing")
-        project_dirs = []
-        for directory in dirs:
-            if os.path.isdir(directory.path) and "Unassigned" not in directory.path:
-                project_dirs.append(directory.path)
-        for project_dir in project_dirs:  # TODO: remove this block when q30 is added to IndexAssignment.csv by Element
-            run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json"))
-            with open(run_stats_file) as stats_json:
-                project_sample_stats_raw = json.load(stats_json)
-            collected_sample_stats = {}
-            for sample_stats in project_sample_stats_raw["SampleStats"]:
-                sample_name = sample_stats["SampleName"]
-                percent_q30 = sample_stats["PercentQ30"]
-                quality_score_mean = sample_stats["QualityScoreMean"]
-                percent_mismatch = sample_stats["PercentMismatch"]
-                collected_sample_stats[sample_name] = {
-                    "PercentQ30": percent_q30,
-                    "QualityScoreMean": quality_score_mean,
-                    "PercentMismatch": percent_mismatch,
-                }
-            for assignment in index_assignments:
-                sample = assignment.get("SampleName")
-                if sample != "PhiX":
-                    sample_stats_to_add = collected_sample_stats.get(sample)
-                    assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30")
-                    assignment["QualityScoreMean"] = sample_stats_to_add.get(
-                        "QualityScoreMean"
-                    )
-                    assignment["PercentMismatch"] = sample_stats_to_add.get(
-                        "PercentMismatch"
-                    )
-
-        
demultiplex_stats = { - "Demultiplex_Stats": { - "Index_Assignment": index_assignments, - "Unassigned_Sequences": unassigned_sequences, - } - } - doc_obj["Aviti"] = demultiplex_stats - self.db.upload_to_statusdb(doc_obj) - def sync_metadata(self): # TODO: copy metadata from demuxed run to ngi-nas-ns pass @@ -821,9 +820,20 @@ def update_transfer_log(self): logger.error(msg) raise OSError(msg) + def update_paths_after_archiving(self, new_location): + self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") + self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_manifest_file_from_instrument = os.path.join( + self.run_dir, "RunManifest.json" + ) + self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json") + # TODO: also update location of demux files? + def archive(self): """Move directory to nosync.""" src = self.run_dir - dst = os.path.join(self.run_dir, os.pardir, "nosync") + parent_dir = Path(self.run_dir).parent.absolute() + dst = os.path.join(parent_dir, "nosync") shutil.move(src, dst) - self.run_dir = os.path.join(dst, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.update_paths_after_archiving(dst) From d16ef8f0e94e88ad9ff3ff8dbf5c6d8c97820bf7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:28:58 +0200 Subject: [PATCH 075/187] fix formatting of .toml and scale down defaults args of pytest to enable IDE debugging --- pyproject.toml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cf0d04c8..f9ceff6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,3 @@ -title = "taca" - # === LINTING ================================================================ [tool.ruff] @@ -37,7 +35,12 @@ filterwarnings = [ 'ignore::DeprecationWarning:couchdb.*', 'ignore::DeprecationWarning:pkg_resources.*', ] -addopts = "--cov=./taca --cov-report term-missing -vv --cache-clear tests/" +# Default addopts +addopts = "--ignore tests_old/ " #--cov=./taca --cov-report=xml -vv" + +# CLI coverage reports, messes with IDE debugging +#addopts = "--ignore tests_old/ --cov=./taca --cov-report=xml -vv" + [tool.coverage.run] # The comment "# pragma: no cover" can be used to exclude a line from coverage From 04001fd5f4a82f609d78627d8a3eda2697e6d8ed Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:29:13 +0200 Subject: [PATCH 076/187] add aviti transfer log --- tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index c26d4c03..e9a3fd89 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ def create_dirs(): │ ├── transfer_minion_qc.tsv │ ├── transfer_minion.tsv │ └── transfer_promethion.tsv + │ └── transfer_aviti.tsv │ └── transfer.tsv │ └── taca.log ├── miarka @@ -95,6 +96,7 @@ def create_dirs(): open(f"{tmp.name}/log/transfer_promethion.tsv", "w").close() open(f"{tmp.name}/log/transfer_minion.tsv", "w").close() open(f"{tmp.name}/log/transfer_minion_qc.tsv", "w").close() + open(f"{tmp.name}/log/transfer_aviti.tsv", "w").close() open(f"{tmp.name}/log/transfer.tsv", "w").close() open(f"{tmp.name}/log/taca.log", "w").close() From bcec98fda552c2a98ebfcecf409d475d66bd3717 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:29:41 +0200 Subject: [PATCH 077/187] build 
config from tmp dir and update tests to pass --- tests/element/test_Aviti_Runs.py | 4 +- tests/element/test_Element_Runs.py | 67 +++++++++++++++++------------- 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py index 3c61276a..62d142bc 100644 --- a/tests/element/test_Aviti_Runs.py +++ b/tests/element/test_Aviti_Runs.py @@ -4,7 +4,7 @@ import pytest from taca.element import Aviti_Runs as to_test -from tests.element.test_Element_Runs import CONFIG, create_element_run_dir +from tests.element.test_Element_Runs import create_element_run_dir, get_config class TestAviti_Run: @@ -16,6 +16,6 @@ def test_init(self, create_dirs: pytest.fixture): mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") mock_db.start() - run = to_test.Aviti_Run(run_dir, CONFIG) + run = to_test.Aviti_Run(run_dir, get_config(tmp)) assert run.run_dir == run_dir assert run.sequencer_type == "Aviti" diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 5196e678..af373891 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -7,16 +7,19 @@ from taca.element import Element_Runs as to_test -CONFIG = { - "element_analysis": { - "Element": { - "GenericElement": { - "demux_dir": "mock_demux_dir_path", - "transfer_log": "mock_transfer_log_file.log", + +def get_config(tmp: tempfile.TemporaryDirectory) -> dict: + config = { + "element_analysis": { + "Element": { + "GenericElement": { + "manifest_zip_location": f"{tmp}/ngi-nas-ns/samplesheets/Aviti", + "transfer_log": f"{tmp}/log/transfer_aviti.tsv", + }, }, }, - }, -} + } + return config def create_element_run_dir( @@ -40,11 +43,11 @@ def create_element_run_dir( ├── RunUploaded.json ├── .sync_finished ├── Demultiplexing - ├── Demultiplexing_0 - | └── RunStats.json - ├── Demultiplexing_1 - | └── RunStats.json - └── ... + ├── Demultiplexing_0 + | └── RunStats.json + ├── Demultiplexing_1 + | └── RunStats.json + └── ... 
""" @@ -69,13 +72,21 @@ def create_element_run_dir( if demux_dir: os.mkdir(os.path.join(run_path, "Demultiplexing")) + if demux_done: + open( + os.path.join( + run_path, + f"Demultiplexing", + "RunStats.json", + ), + "w", + ).close() for i in range(n_demux_subdirs): - os.mkdir(os.path.join(run_path, "Demultiplexing", f"Demultiplexing_{i}")) + os.mkdir(os.path.join(run_path, f"Demultiplexing_{i}")) if demux_done: open( os.path.join( run_path, - "Demultiplexing", f"Demultiplexing_{i}", "RunStats.json", ), @@ -91,7 +102,7 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs run_dir = create_element_run_dir(tmp) - run = to_test.Run(run_dir, CONFIG) + run = to_test.Run(run_dir, get_config(tmp)) assert run.run_dir == run_dir @pytest.mark.parametrize( @@ -117,7 +128,7 @@ def test_check_sequencing_status( run_finished=p["run_finished"], outcome_completed=p["outcome_completed"], ), - CONFIG, + get_config(tmp), ) assert run.check_sequencing_status() is p["expected"] @@ -141,10 +152,12 @@ def test_get_demultiplexing_status( demux_dir=p["demux_dir"], demux_done=p["demux_done"], ), - CONFIG, + get_config(tmp), ) + assert run.get_demultiplexing_status() == p["expected"] + @pytest.mark.skip(reason="Not implemented yet") @pytest.mark.parametrize( "p", [ @@ -163,8 +176,9 @@ def test_manifest_exists( tmp, run_finished=p["run_finished"], ), - CONFIG, + get_config(tmp), ) + assert run.manifest_exists() == p["expected"] @pytest.mark.skip(reason="Not implemented yet") @@ -172,18 +186,15 @@ def test_generate_demux_command(self, mock_db): pass def test_start_demux(self, mock_db, create_dirs): - with mock.patch( - "taca.utils.misc.call_external_command_detached" - ) as mock_call, mock.patch( + tmp: tempfile.TemporaryDirectory = create_dirs + with mock.patch("subprocess.Popen") as mock_Popen, mock.patch( "taca.element.Element_Runs.Run.generate_demux_command" ) as mock_command: mock_command.return_value = "test command" - run = to_test.Run(create_element_run_dir(create_dirs), CONFIG) - run.start_demux() - mock_command.assert_called_once() - mock_call.assert_called_once_with( - "test command", with_log_files=True, prefix="demux_" - ) + run = to_test.Run(create_element_run_dir(create_dirs), get_config(tmp)) + run.start_demux("mock_run_manifest", "mock_demux_dir") + mock_command.assert_called_once_with("mock_run_manifest", "mock_demux_dir") + mock_Popen.assert_called_once() @pytest.mark.skip(reason="Not implemented yet") def test_is_transferred(self, mock_db, create_dirs): From eadc072738901898947c62bdfbbacd0934ecae5f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:30:33 +0200 Subject: [PATCH 078/187] remove comment --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f9ceff6f..4c9bfa38 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ filterwarnings = [ 'ignore::DeprecationWarning:pkg_resources.*', ] # Default addopts -addopts = "--ignore tests_old/ " #--cov=./taca --cov-report=xml -vv" +addopts = "--ignore tests_old/" # CLI coverage reports, messes with IDE debugging #addopts = "--ignore tests_old/ --cov=./taca --cov-report=xml -vv" From 4e1c568359c6f5db33e2010159d2f8e6c1a31c71 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:33:04 +0200 Subject: [PATCH 079/187] fix full command with inline results --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 
4c9bfa38..d5d152b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ filterwarnings = [ addopts = "--ignore tests_old/" # CLI coverage reports, messes with IDE debugging -#addopts = "--ignore tests_old/ --cov=./taca --cov-report=xml -vv" +# pytest --ignore tests_old/ --cov=./taca --cov-report term-missing -vv [tool.coverage.run] From a0696b15b9d908ab5efc0c922962eb4151e68ddf Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 17:14:52 +0200 Subject: [PATCH 080/187] bugfix --- tests/element/test_Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index af373891..9eb264b1 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -13,8 +13,8 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: "element_analysis": { "Element": { "GenericElement": { - "manifest_zip_location": f"{tmp}/ngi-nas-ns/samplesheets/Aviti", - "transfer_log": f"{tmp}/log/transfer_aviti.tsv", + "manifest_zip_location": f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti", + "transfer_log": f"{tmp.name}/log/transfer_aviti.tsv", }, }, }, From 891595d66cbdc7ef86e69d40c2e87341a7dbb26c Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 27 Sep 2024 15:56:34 +0200 Subject: [PATCH 081/187] Add project name in IndexAssignment; Correct index percentage in undet --- VERSIONLOG.md | 5 +++++ taca/analysis/analysis_element.py | 6 +++--- taca/element/Element_Runs.py | 21 ++++++++++++++++++--- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index d0563c9a..bc899c3c 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,10 @@ # TACA Version Log +## 20240927.1 + +Add project name in IndexAssignment; +Correct index percentage in undet + ## 20240924.1 Aggregate aviti demultiplexing results diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 1d2f4d3f..7109a027 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -70,7 +70,7 @@ def _process(run): if run.status_changed: run.update_statusdb() return - + elif demultiplexing_status != "finished": logger.warning( f"Unknown demultiplexing status {demultiplexing_status} of run {run}. 
Please investigate"
         )
@@ -81,7 +81,7 @@ def _process(run):
         transfer_status = run.get_transfer_status()
         if transfer_status == "not started":
             demux_results_dirs = glob.glob(
-                os.path.join(run.run_dir, "Delmultiplexing_*")
+                os.path.join(run.run_dir, "Demultiplexing_*")
            )
             run.aggregate_demux_results(demux_results_dirs)
             run.sync_metadata()
             run.make_transfer_indicator()
@@ -107,7 +107,7 @@ def _process(run):
             run.update_statusdb()
         run.archive()
         run.status = "archived"
-
+
     if run.status_changed:
         run.update_statusdb()
     else:
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 397ab0ee..eaab13f6 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -162,7 +162,7 @@ def to_doc_obj(self):
                 "Unassigned_Sequences": unassigned_sequences,
             }
         }
-
+
         demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command")
         if os.path.exists(demux_command_file):
             with open(demux_command_file) as command_file:
@@ -184,7 +184,7 @@ def to_doc_obj(self):
             "bin": self.CONFIG.get("element_analysis").get("bases2fastq"),
             "options": demux_command,
         }
-
+
         doc_obj = {
             "name": self.NGI_run_id,
             "run_path": self.run_dir,
@@ -257,7 +257,7 @@ def get_lims_step_id(self) -> str | None:
                     lims_step_id = line.split(",")[1]
                     return lims_step_id
         return None
-
+
     def find_manifest_zip(self):
         # Specify dir in which LIMS drop the manifest zip files
         dir_to_search = os.path.join(
@@ -674,6 +674,8 @@ def aggregate_stats_assigned(self, demux_runmanifest):
         aggregated_assigned_indexes_filtered = []
         unique_phiX_combination = set()
         for sample in aggregated_assigned_indexes:
+            # Add project name
+            sample['Project'] = [d for d in demux_runmanifest if d['SampleName'] == sample['SampleName']][0]['Project']
             if sample['SampleName'] == 'PhiX':
                 combination = (sample['I1'], sample['I2'], sample['Lane'])
                 if combination not in unique_phiX_combination:
@@ -748,6 +750,19 @@ def aggregate_stats_unassigned(self, demux_runmanifest):
             aggregated_unassigned_indexes += max_unassigned_indexes
         # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order
         aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count'])))
+        # Fetch PFCount for each lane
+        pfcount_lane = {}
+        aviti_runstats_json = os.path.join(self.run_dir, "AvitiRunStats.json")
+        if os.path.exists(aviti_runstats_json):
+            with open(aviti_runstats_json) as stats_json:
+                aviti_runstats_json = json.load(stats_json)
+            for lane_stats in aviti_runstats_json["LaneStats"]:
+                pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"])
+        else:
+            logger.warning(f"No AvitiRunStats.json file found for the run.")
+        # Modify the % Polonies values based on PFCount for each lane
+        for unassigned_index in aggregated_unassigned_indexes:
+            unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100
         # Write to a new UnassignedSequences.csv file under demux_dir
         aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv")
         self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv)
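A note on the % Polonies recalculation added in the patch above: each unassigned index count is re-expressed as a percentage of the lane's pass-filter count from AvitiRunStats.json, so that values aggregated from different sub-demultiplexings are comparable, i.e. % Polonies = 100 * Count / PFCount(lane). A minimal sketch of the same normalization with the lookup guarded (illustrative shapes, not code from these patches):

    def normalize_percent_polonies(unassigned_rows, pfcount_by_lane):
        # Illustrative sketch, not from the TACA patches; names are hypothetical.
        # unassigned_rows: dicts with string 'Lane' and 'Count' fields (from CSV)
        # pfcount_by_lane: mapping of lane number (str) to PFCount (float)
        for row in unassigned_rows:
            pf_count = pfcount_by_lane.get(row["Lane"])
            if pf_count:  # leave the value untouched for unknown lanes
                row["% Polonies"] = float(row["Count"]) / pf_count * 100
        return unassigned_rows

The unguarded loop added in this patch raises a KeyError when a lane is missing from pfcount_lane; the refactor in PATCH 083 below moves the loop inside the os.path.exists branch and adds exactly this kind of pfcount_lane.get() guard.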
@@ -580,7 +580,7 @@ def aggregate_sample_fastq(self, demux_runmanifest): sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample_name) + sample_dest = os.path.join(self.run_dir, self.demux_dir, project, f"Sample_{sample_name}") if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): From 8a1a4981469f0ed23d6aa2563e2b13e7604b35f7 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Mon, 30 Sep 2024 08:51:45 +0200 Subject: [PATCH 083/187] Refactors based on comments from SS --- taca/element/Element_Runs.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 794f6892..833f0460 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -752,17 +752,18 @@ def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) # Fetch PFCount for each lane pfcount_lane = {} - aviti_runstats_json = os.path.join(self.run_dir, "AvitiRunStats.json") - if os.path.exists(aviti_runstats_json): - with open(aviti_runstats_json) as stats_json: + if os.path.exists(self.run_stats_file): + with open(self.run_stats_file) as stats_json: aviti_runstats_json = json.load(stats_json) for lane_stats in aviti_runstats_json["LaneStats"]: pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"]) + # Modify the % Polonies values based on PFCount for each lane + for unassigned_index in aggregated_unassigned_indexes: + if pfcount_lane.get(unassigned_index["Lane"]): + unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 else: logger.warning(f"No AvitiRunStats.json file found for the run.") - # Modify the % Polonies values based on PFCount for each lane - for unassigned_index in aggregated_unassigned_indexes: - unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 + # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) @@ -839,7 +840,7 @@ def update_transfer_log(self): def update_paths_after_archiving(self, new_location): self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") - self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_stats_file = os.path.join(self.run_dir, "AvitiRunStats.json") self.run_manifest_file_from_instrument = os.path.join( self.run_dir, "RunManifest.json" ) From 9a1c25924abae4f6a38e04c83c3b456262dbb05a Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Mon, 30 Sep 2024 09:08:11 +0200 Subject: [PATCH 084/187] Replace hard-coded file names --- taca/element/Element_Runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 833f0460..3c4816bc 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -669,7 +669,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): 
sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] aggregated_assigned_indexes.append(sample) else: - logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning(f"No {os.path.basename(assigned_csv)} file found for sub-demultiplexing {sub_demux}.") # Remove redundant rows for PhiX aggregated_assigned_indexes_filtered = [] unique_phiX_combination = set() @@ -717,7 +717,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") + logger.warning(f"No {os.path.basename(max_unassigned_csv)} file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") break # Filter by lane max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane] @@ -732,7 +732,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning(f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}.") continue # Filter by lane unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] @@ -762,7 +762,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): if pfcount_lane.get(unassigned_index["Lane"]): unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 else: - logger.warning(f"No AvitiRunStats.json file found for the run.") + logger.warning(f"No {os.path.basename(self.run_stats_file)} file found for the run.") # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") From 8c0263ef88c9594396dd68e2ce245e1cf1c0f814 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 10:10:22 +0200 Subject: [PATCH 085/187] Small fixes --- taca/analysis/analysis_element.py | 23 +++++++---------------- taca/element/Element_Runs.py | 8 +++----- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 7109a027..72c3c74d 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -47,7 +47,7 @@ def _process(run): run_manifests = glob.glob( os.path.join( run.run_dir, "RunManifest_*.csv" - ) # TODO: is this filename right? + ) ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): @@ -117,38 +117,29 @@ def _process(run): ) # TODO: email warning to operator return - elif transfer_status == "unknown": - logger.warning( - f"The run {run} has already been transferred but has not been archived. Please investigate" - ) - # TODO: email operator warning - return else: - # TODO Merge with the one above? logger.warning( f"Unknown transfer status {transfer_status} of run {run}. 
Please investigate" - ) + ) # TODO: email warning to operator return if given_run: run = Aviti_Run(given_run, CONFIG) - # TODO: Needs to change if more types of Element machines are aquired in the future - _process(run) else: data_dirs = CONFIG.get("element_analysis").get( "data_dirs" ) # TODO: add to config - for data_dir in data_dirs: # TODO: make sure to look in both side A and B - # Run folder looks like DATE_*_*_*, the last section is the FC name. + for data_dir in data_dirs: + # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name runs = glob.glob( - os.path.join(data_dir, "[1-9]*_*_*_*") - ) # TODO: adapt to aviti format + os.path.join(data_dir, "[1-9]*_*_*") + ) for run in runs: runObj = Aviti_Run(run, CONFIG) try: _process(runObj) - except: # TODO: chatch error message and print it + except: # This function might throw and exception, # it is better to continue processing other runs logger.warning(f"There was an error processing the run {run}") diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 3c4816bc..e1923b27 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -285,7 +285,7 @@ def find_manifest_zip(self): logger.warning( f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." ) - return False # TODO determine whether to raise an error here instead + return False # TODO: determine whether to raise an error here instead elif len(glob_results) > 1: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." @@ -448,7 +448,6 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) - # TODO: handle multiple composite manifests for demux try: p_handle = subprocess.Popen( cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir @@ -803,12 +802,12 @@ def transfer(self): + " -rLav" + f" --chown={transfer_details.get('owner')}" + f" --chmod={transfer_details.get('permissions')}" - + " --exclude BaseCalls" # TODO: check that we actually want to exclude these + + " --exclude BaseCalls" + " --exclude Alignment" + f" {self.run_dir}" + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/aviti" + f"; echo $? > {os.path.join(self.run_dir, '.rsync_exit_status')}" - ) # TODO: any other options? + ) try: p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) logger.info( @@ -845,7 +844,6 @@ def update_paths_after_archiving(self, new_location): self.run_dir, "RunManifest.json" ) self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json") - # TODO: also update location of demux files? 
def archive(self): """Move directory to nosync.""" From c86b6bf6d617fab4ab0041a0a30518f8908571e3 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 10:30:41 +0200 Subject: [PATCH 086/187] Sync metadata --- taca/element/Element_Runs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e1923b27..1e67c4ba 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -786,8 +786,15 @@ def aggregate_demux_results(self, demux_results_dirs): self.aggregate_stats_unassigned(demux_runmanifest) def sync_metadata(self): - # TODO: copy metadata from demuxed run to ngi-nas-ns - pass + files_to_copy = [self.run_stats_file, + os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), + os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv") + ] + metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to aca.yaml + dest = os.path.join(metadata_archive, self.NGI_run_id) + os.makedirs(dest) + for f in files_to_copy: + shutil.copy(f, dest) def make_transfer_indicator(self): transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") From f221c984cb66c9030c3d96ba596cbb985bc16bdb Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 11:13:50 +0200 Subject: [PATCH 087/187] Also sync RunParameters.json --- taca/element/Element_Runs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 1e67c4ba..543f9b55 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -437,6 +437,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + f" {self.run_dir}" + f" {demux_dir}" + " -p 8" + + " --num-unassigned 500" + f" -r {run_manifest}" + " --legacy-fastq" # TODO: except if Smart-seq3 + " --force-index-orientation" @@ -788,7 +789,8 @@ def aggregate_demux_results(self, demux_results_dirs): def sync_metadata(self): files_to_copy = [self.run_stats_file, os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), - os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv") + os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), + self.run_parameters_file, ] metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to aca.yaml dest = os.path.join(metadata_archive, self.NGI_run_id) From da9222ac36c98f87a0c260479d6f98f03ed104a5 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 11:36:05 +0200 Subject: [PATCH 088/187] Cleanup --- taca/element/Element_Runs.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 543f9b55..d6dd0b3a 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -106,7 +106,7 @@ def parse_run_parameters(self) -> None: "runID" ) # Unique hash that we don't really use self.side = run_parameters.get("Side") # SideA or SideB - self.side_letter = self.side[-1] # A or B + self.side_letter = self.side[-1] # A or B TODO: compare side letter with manually entered letter in run name self.run_type = run_parameters.get( "RunType" ) # Sequencing, wash or prime I believe? 
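# The TODO added above (comparing the parsed side letter against the letter entered
# manually in the run name) could be resolved along these lines. A sketch only:
# side_letter_matches_run_name is a hypothetical helper, not part of this changeset.
# Run directories are named <date>_<instrument>_<side+flowcell>, e.g.
# 20240910_AV242106_B2403418431, so the first character of the last
# underscore-separated field should equal the letter parsed from "Side".
def side_letter_matches_run_name(run_name: str, side_letter: str) -> bool:
    flowcell_field = run_name.split("_")[-1]
    return bool(flowcell_field) and flowcell_field[0] == side_letter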
@@ -299,7 +299,6 @@ def find_manifest_zip(self): def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" - # TODO: test me zip_src_path = self.find_manifest_zip() # Make a run subdir named after the zip file and extract manifests there zip_name = os.path.basename(zip_src_path) @@ -439,9 +438,9 @@ def generate_demux_command(self, run_manifest, demux_dir): + " -p 8" + " --num-unassigned 500" + f" -r {run_manifest}" - + " --legacy-fastq" # TODO: except if Smart-seq3 + + " --legacy-fastq" + " --force-index-orientation" - ) # TODO: any other options? + ) with open(os.path.join(self.run_dir, '.bases2fastq_command')) as command_file: command_file.write(command) return command @@ -792,7 +791,7 @@ def sync_metadata(self): os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), self.run_parameters_file, ] - metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to aca.yaml + metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to taca.yaml dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: From 58b72529ee8f58002c74ed5870d3df0d747fd934 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:26:28 +0200 Subject: [PATCH 089/187] bugfix --- tests/element/test_Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 9eb264b1..0471b0a4 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -12,7 +12,7 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: config = { "element_analysis": { "Element": { - "GenericElement": { + "Aviti": { "manifest_zip_location": f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti", "transfer_log": f"{tmp.name}/log/transfer_aviti.tsv", }, From 3b87b45d7bde6428640cf9d42ded0981dcc6c0db Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:26:56 +0200 Subject: [PATCH 090/187] ruff --- taca/element/Element_Runs.py | 348 +++++++++++++++++++++++++---------- 1 file changed, 249 insertions(+), 99 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6faf5a53..610f77fc 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -39,7 +39,8 @@ def __init__(self, run_dir, configuration): "*RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") ) # TODO: add to taca.yaml @@ -162,19 +163,19 @@ def to_doc_obj(self): "Unassigned_Sequences": unassigned_sequences, } } - + demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command") if os.path.exists(demux_command_file): with open(demux_command_file) as command_file: demux_command = command_file.readlines()[0] else: demux_command = None - demux_version_file = os.path.join(self.run_dir,"Demultiplexing_0", "RunStats.json") + demux_version_file = os.path.join( + self.run_dir, "Demultiplexing_0", "RunStats.json" + ) if os.path.exists(demux_version_file): with open(demux_version_file) as json_file: - demux_info = json.load( - json_file - ) + demux_info = json.load(json_file) demux_version = demux_info.get("AnalysisVersion") else: demux_version = None @@ -184,7 +185,7 @@ def to_doc_obj(self): "bin": 
self.CONFIG.get("element_analysis").get("bases2fastq"), "options": demux_command, } - + doc_obj = { "name": self.NGI_run_id, "run_path": self.run_dir, @@ -214,7 +215,9 @@ def get_demultiplexing_status(self): sub_demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing_*")) finished_count = 0 for demux_dir in sub_demux_dirs: - found_demux_stats_file = glob.glob(os.path.join(demux_dir, self.demux_stats_file)) + found_demux_stats_file = glob.glob( + os.path.join(demux_dir, self.demux_stats_file) + ) if not found_demux_stats_file: return "ongoing" elif found_demux_stats_file: @@ -257,11 +260,12 @@ def get_lims_step_id(self) -> str | None: lims_step_id = line.split(",")[1] return lims_step_id return None - + def find_manifest_zip(self): # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("manifest_zip_location"), # TODO: add to taca.yaml str(self.year), @@ -296,7 +300,6 @@ def find_manifest_zip(self): zip_src_path = glob_results[0] return zip_src_path - def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" # TODO: test me @@ -441,7 +444,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --legacy-fastq" # TODO: except if Smart-seq3 + " --force-index-orientation" ) # TODO: any other options? - with open(os.path.join(self.run_dir, '.bases2fastq_command')) as command_file: + with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: command_file.write(command) return command @@ -494,7 +497,7 @@ def rsync_complete(self): def rsync_successful(self): with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file: rsync_exit_status = rsync_exit_file.readlines() - if rsync_exit_status[0].strip() == '0': + if rsync_exit_status[0].strip() == "0": return True else: return False @@ -516,32 +519,33 @@ def write_to_csv(data, filename): # Get the fieldnames from the keys of the first dictionary fieldnames = data[0].keys() # Open the file and write the CSV - with open(filename, mode='w', newline='') as file: + with open(filename, mode="w", newline="") as file: writer = csv.DictWriter(file, fieldnames=fieldnames) # Write the header (fieldnames) writer.writeheader() # Write the data (rows) writer.writerows(data) - # Collect demux info into a list of dictionaries # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}] def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: - sub_demux_count = os.path.basename(demux_dir).split('_')[1] - with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file: + sub_demux_count = os.path.basename(demux_dir).split("_")[1] + with open( + os.path.join(self.run_dir, demux_dir, "RunManifest.csv"), "r" + ) as file: lines = file.readlines() sample_section = False headers = [] # Loop through each line for line in lines: # Check if we reached the "[SAMPLES]" section - if '[SAMPLES]' in line: + if "[SAMPLES]" in line: sample_section = True continue # Exit the sample section if another section is encountered - if sample_section and line.startswith('['): + if sample_section and line.startswith("["): break # If in the sample section, process the sample lines if sample_section: @@ -552,71 +556,124 @@ def 
collect_demux_runmanifest(self, demux_results_dirs): continue # Get the headers from the first line if not headers: - headers = line.split(',') + headers = line.split(",") else: # Parse sample data - values = line.split(',') + values = line.split(",") sample_dict = dict(zip(headers, values)) - sample_dict['sub_demux_count'] = sub_demux_count + sample_dict["sub_demux_count"] = sub_demux_count demux_runmanifest.append(sample_dict) - sorted_demux_runmanifest = sorted(demux_runmanifest, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + sorted_demux_runmanifest = sorted( + demux_runmanifest, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) return sorted_demux_runmanifest - # Aggregate the output FastQ files of samples from multiple demux def aggregate_sample_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) unique_sample_demux = set() for lane in lanes: sample_count = 1 for sample in demux_runmanifest: - lanenr = sample['Lane'] - project = sample['Project'] - sample_name = sample['SampleName'] - sub_demux_count = sample['sub_demux_count'] + lanenr = sample["Lane"] + project = sample["Project"] + sample_name = sample["SampleName"] + sub_demux_count = sample["sub_demux_count"] # Skip PhiX if lanenr == lane and sample_name != "PhiX": sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: - project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample_name) + project_dest = os.path.join( + self.run_dir, self.demux_dir, project + ) + sample_dest = os.path.join( + self.run_dir, self.demux_dir, project, sample_name + ) if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample_name, f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux_count}", + "Samples", + project, + sample_name, + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: old_name = os.path.basename(fastqfile) - read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1) - new_name = "_".join([sample_name, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) + read_label = re.search( + rf"L00{lane}_(.*?)_001", old_name + ).group(1) + new_name = "_".join( + [ + sample_name, + f"S{sample_count}", + f"L00{lane}", + read_label, + "001.fastq.gz", + ] + ) os.symlink(fastqfile, os.path.join(sample_dest, new_name)) unique_sample_demux.add(sample_tuple) sample_count += 1 - # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: - sub_demux = list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane)) + sub_demux = list( + set( + sample["sub_demux_count"] + for sample in demux_runmanifest + if sample["Lane"] == lane + ) + ) if len(sub_demux) == 1: - project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined") + project_dest = os.path.join( + self.run_dir, self.demux_dir, "Undetermined" + ) if not 
os.path.exists(project_dest): os.makedirs(project_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux[0]}", + "Samples", + "Undetermined", + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) - # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this hard way def get_project_runstats(self, sub_demux, demux_runmanifest): project_runstats = [] - project_list = sorted(list(set(sample['Project'] for sample in demux_runmanifest if sample['sub_demux_count']==sub_demux))) + project_list = sorted( + list( + set( + sample["Project"] + for sample in demux_runmanifest + if sample["sub_demux_count"] == sub_demux + ) + ) + ) for project in project_list: - project_runstats_json_path = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "Samples", project, f"{project}_RunStats.json") + project_runstats_json_path = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "Samples", + project, + f"{project}_RunStats.json", + ) if os.path.exists(project_runstats_json_path): with open(project_runstats_json_path) as stats_json: project_runstats_json = json.load(stats_json) @@ -629,129 +686,220 @@ def get_project_runstats(self, sub_demux, demux_runmanifest): percentage_q30 = occurrence["PercentQ30"] percentage_q40 = occurrence["PercentQ40"] quality_score_mean = occurrence["QualityScoreMean"] - project_runstats.append({ "SampleName" : sample_name, - "Lane" : str(lane), - "ExpectedSequence" : expected_sequence, - "PercentMismatch" : percentage_mismatch, - "PercentQ30" : percentage_q30, - "PercentQ40" : percentage_q40, - "QualityScoreMean" : quality_score_mean - }) + project_runstats.append( + { + "SampleName": sample_name, + "Lane": str(lane), + "ExpectedSequence": expected_sequence, + "PercentMismatch": percentage_mismatch, + "PercentQ30": percentage_q30, + "PercentQ40": percentage_q40, + "QualityScoreMean": quality_score_mean, + } + ) else: continue return project_runstats - # Aggregate stats in IndexAssignment.csv def aggregate_stats_assigned(self, demux_runmanifest): aggregated_assigned_indexes = [] - sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest))) - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + sub_demux_list = sorted( + list(set(sample["sub_demux_count"] for sample in demux_runmanifest)) + ) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for sub_demux in sub_demux_list: # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this hard way project_runstats = self.get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv - assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") + assigned_csv = os.path.join( + self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv" + ) if os.path.exists(assigned_csv): - with open(assigned_csv, 'r') as assigned_file: + 
with open(assigned_csv, "r") as assigned_file: reader = csv.DictReader(assigned_file) index_assignment = [row for row in reader] for sample in index_assignment: - if sample['Lane'] in lanes: - project_runstats_sample = [d for d in project_runstats if d['SampleName'] == sample['SampleName'] and d['Lane'] == sample['Lane'] and d['ExpectedSequence'] == sample['I1']+sample['I2']] - sample['sub_demux_count'] = sub_demux - sample['PercentMismatch'] = project_runstats_sample[0]['PercentMismatch'] - sample['PercentQ30'] = project_runstats_sample[0]['PercentQ30'] - sample['PercentQ40'] = project_runstats_sample[0]['PercentQ40'] - sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] + if sample["Lane"] in lanes: + project_runstats_sample = [ + d + for d in project_runstats + if d["SampleName"] == sample["SampleName"] + and d["Lane"] == sample["Lane"] + and d["ExpectedSequence"] == sample["I1"] + sample["I2"] + ] + sample["sub_demux_count"] = sub_demux + sample["PercentMismatch"] = project_runstats_sample[0][ + "PercentMismatch" + ] + sample["PercentQ30"] = project_runstats_sample[0]["PercentQ30"] + sample["PercentQ40"] = project_runstats_sample[0]["PercentQ40"] + sample["QualityScoreMean"] = project_runstats_sample[0][ + "QualityScoreMean" + ] aggregated_assigned_indexes.append(sample) else: - logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}." + ) # Remove redundant rows for PhiX aggregated_assigned_indexes_filtered = [] unique_phiX_combination = set() for sample in aggregated_assigned_indexes: - if sample['SampleName'] == 'PhiX': - combination = (sample['I1'], sample['I2'], sample['Lane']) + if sample["SampleName"] == "PhiX": + combination = (sample["I1"], sample["I2"], sample["Lane"]) if combination not in unique_phiX_combination: aggregated_assigned_indexes_filtered.append(sample) unique_phiX_combination.add(combination) else: aggregated_assigned_indexes_filtered.append(sample) # Sort the list by Lane, SampleName and sub_demux_count - aggregated_assigned_indexes_filtered_sorted = sorted(aggregated_assigned_indexes_filtered, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + aggregated_assigned_indexes_filtered_sorted = sorted( + aggregated_assigned_indexes_filtered, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) # Fix new sample number based on SampleName and Lane sample_count = 0 - previous_samplename_lane = ('NA', 'NA') + previous_samplename_lane = ("NA", "NA") for sample in aggregated_assigned_indexes_filtered_sorted: - if (sample['SampleName'], sample['Lane']) != previous_samplename_lane: + if (sample["SampleName"], sample["Lane"]) != previous_samplename_lane: sample_count += 1 - previous_samplename_lane = (sample['SampleName'], sample['Lane']) - sample['SampleNumber'] = sample_count + previous_samplename_lane = (sample["SampleName"], sample["Lane"]) + sample["SampleNumber"] = sample_count # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv") - self.write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) - + aggregated_assigned_indexes_csv = os.path.join( + self.run_dir, self.demux_dir, "IndexAssignment.csv" + ) + self.write_to_csv( + aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv + ) # Aggregate stats in UnassignedSequences.csv 
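    # Strategy of the aggregation below, per lane: the UnassignedSequences.csv of
    # the sub-demultiplexing with the longest index combination serves as the
    # baseline; for every sub-demultiplexing with shorter indexes, index pairs are
    # compared over the shared prefix lengths (the minimum of the two I1 lengths
    # and of the two I2 lengths), matching baseline rows are dropped, and the
    # remaining rows are added to the aggregate, which is finally sorted by lane
    # and by descending read count.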
def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = [] - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: sub_demux_index_lens = set() for sample in demux_runmanifest: - if sample['Lane'] == lane: - sub_demux_index_lens.add((sample['sub_demux_count'], (len(sample.get("Index1", "")), len(sample.get("Index2", ""))))) + if sample["Lane"] == lane: + sub_demux_index_lens.add( + ( + sample["sub_demux_count"], + ( + len(sample.get("Index1", "")), + len(sample.get("Index2", "")), + ), + ) + ) # List of sub-demux with a decreasing order of index lengths - sub_demux_list = [x[0] for x in sorted(sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True)] + sub_demux_list = [ + x[0] + for x in sorted( + sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True + ) + ] sub_demux_with_max_index_lens = sub_demux_list[0] # Start with the unassigned list with the longest index - max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv") + max_unassigned_csv = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux_with_max_index_lens}", + "UnassignedSequences.csv", + ) if os.path.exists(max_unassigned_csv): - with open(max_unassigned_csv, 'r') as max_unassigned_file: + with open(max_unassigned_csv, "r") as max_unassigned_file: reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") + logger.warning( + f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}." + ) break # Filter by lane - max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane] + max_unassigned_indexes = [ + idx for idx in max_unassigned_indexes if idx["Lane"] == lane + ] # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise if len(sub_demux_list) > 1: # Order: from longer to shorter indexes sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: - unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv") + unassigned_csv = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "UnassignedSequences.csv", + ) if os.path.exists(unassigned_csv): - with open(unassigned_csv, 'r') as unassigned_file: + with open(unassigned_csv, "r") as unassigned_file: reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}." 
+ ) continue # Filter by lane - unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] + unassigned_indexes = [ + unassigned_index + for unassigned_index in unassigned_indexes + if unassigned_index["Lane"] == lane + ] # Remove overlapped indexes from the list of max_unassigned_indexes - idx1_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][0], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][0]) - idx2_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][1], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][1]) + idx1_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][0], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][0], + ) + idx2_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][1], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][1], + ) for unassigned_index in unassigned_indexes: - idx1_overlapped_seq = unassigned_index['I1'][:idx1_overlapped_len] - idx2_overlapped_seq = unassigned_index['I2'][:idx2_overlapped_len] + idx1_overlapped_seq = unassigned_index["I1"][ + :idx1_overlapped_len + ] + idx2_overlapped_seq = unassigned_index["I2"][ + :idx2_overlapped_len + ] # Remove the overlapped record from the max_unassigned_indexes list - max_unassigned_indexes = [max_unassigned_index for max_unassigned_index in max_unassigned_indexes if not (max_unassigned_index['I1'][:idx1_overlapped_len] == idx1_overlapped_seq and max_unassigned_index['I2'][:idx2_overlapped_len] == idx2_overlapped_seq)] + max_unassigned_indexes = [ + max_unassigned_index + for max_unassigned_index in max_unassigned_indexes + if not ( + max_unassigned_index["I1"][:idx1_overlapped_len] + == idx1_overlapped_seq + and max_unassigned_index["I2"][:idx2_overlapped_len] + == idx2_overlapped_seq + ) + ] # Append to the aggregated_unassigned_indexes list aggregated_unassigned_indexes += max_unassigned_indexes # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order - aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) + aggregated_unassigned_indexes = sorted( + aggregated_unassigned_indexes, key=lambda x: (x["Lane"], -int(x["Count"])) + ) # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") + aggregated_unassigned_csv = os.path.join( + self.run_dir, self.demux_dir, "UnassignedSequences.csv" + ) self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) - # Aggregate demux results def aggregate_demux_results(self, demux_results_dirs): # Ensure the destination directory exists @@ -778,8 +926,8 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - transfer_details = ( - self.CONFIG.get("element_analysis").get("transfer_details") + transfer_details = 
self.CONFIG.get("element_analysis").get( + "transfer_details" ) # TODO: Add section to taca.yaml command = ( "rsync" @@ -806,7 +954,7 @@ def transfer(self): return def remove_transfer_indicator(self): - transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing') + transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") Path(transfer_indicator).unlink() def update_transfer_log(self): @@ -821,7 +969,9 @@ def update_transfer_log(self): raise OSError(msg) def update_paths_after_archiving(self, new_location): - self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.run_dir = os.path.join( + new_location, self.NGI_run_id + ) # Needs to be redirected to new location so that TACA can find files to upload to statusdb self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") self.run_manifest_file_from_instrument = os.path.join( From e245d1c718ace0ec41197934662dafb97e3aa9f9 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:43:34 +0200 Subject: [PATCH 091/187] add status section to mock config --- tests/element/test_Element_Runs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 0471b0a4..23914a7d 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -18,6 +18,7 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: }, }, }, + "statusdb": {}, } return config From e772189ac8daee7782c64f50a89a18e9fc8e1738 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:43:55 +0200 Subject: [PATCH 092/187] start analysis functional test --- tests/analysis/test_analysis_element.py | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/analysis/test_analysis_element.py diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py new file mode 100644 index 00000000..49067ca4 --- /dev/null +++ b/tests/analysis/test_analysis_element.py @@ -0,0 +1,35 @@ +from tempfile import TemporaryDirectory +from unittest.mock import patch + +import pytest + +from tests.element.test_Element_Runs import create_element_run_dir, get_config + + +@pytest.mark.skip(reason="Not implemented yet") +def test_run_preprocessing(create_dirs): + tmp: TemporaryDirectory = create_dirs + + # Mock config + config = get_config(tmp) + mock_config = patch("taca.utils.config.CONFIG", new=config) + mock_config.start() + + # Mock DB + mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") + mock_db.start() + + # Import module to test + from taca.analysis import analysis_element as to_test + + run_dir = create_element_run_dir( + tmp=tmp, + nosync=False, + run_finished=False, + sync_finished=False, + demux_dir=False, + demux_done=False, + outcome_completed=False, + ) + + to_test.run_preprocessing(run_dir) From f87487187183be3e99fa7953be531be72f817a15 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:48:06 +0200 Subject: [PATCH 093/187] attr for empty manifest --- taca/element/Element_Runs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5fdde1a3..d41635a6 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -321,6 +321,9 @@ def copy_manifests(self) -> bool: self.lims_start_manifest = [ m for m in manifests if 
re.match(r".*_trimmed\.csv$", m) ][0] + self.lims_empty_manifest = [ + m for m in manifests if re.match(r".*_empty\.csv$", m) + ][0] self.lims_demux_manifests = [ m for m in manifests if re.match(r".*_\d+\.csv$", m) ] From 7958caa4285815d33009725044b0cb408736dbb9 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 11:18:46 +0200 Subject: [PATCH 094/187] wip --- taca/element/Element_Runs.py | 74 +++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d41635a6..b42677da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -18,6 +18,69 @@ logger = logging.getLogger(__name__) +def get_mask(seq: str, mask_type: str, which_index: int) -> str: + """Example usage: + + get_mask("ACGTACGTNNNNNNNN", "umi", 1) -> 'I1:N8Y8' + get_mask("ACGTACGTNNNNNNNN", "index", 2) -> 'I2:Y8N8' + """ + + # Input assertions + assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" + assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" + assert which_index in [1, 2], "Index number must be 1 or 2" + + # Define dict to convert base to mask classifier + base2mask = ( + { + "N": "N", + "A": "Y", + "C": "Y", + "G": "Y", + "T": "Y", + } + if mask_type == "index" + else { + "N": "Y", + "A": "N", + "C": "N", + "G": "N", + "T": "N", + } + ) + + # Dynamically build the mask sequence + mask_seq = "I1:" if which_index == 1 else "I2:" + current_group = "" + current_group_len = 0 + for letter in seq: + if base2mask[letter] == current_group: + current_group_len += 1 + else: + mask_seq += ( + f"{current_group}{current_group_len}" if current_group_len > 0 else "" + ) + current_group = base2mask[letter] + current_group_len = 1 + mask_seq += f"{current_group}{current_group_len}" + + # Use the worlds ugliest string parsing to check that the mask length matches the input sequence length + assert sum( + [ + int(n) + for n in mask_seq[3:] + .replace("N", "-") + .replace("Y", "-") + .strip("-") + .split("-") + ] + ) == len( + seq + ), f"Length of mask '{mask_seq}' does not match length of input seq '{seq}'" + + return mask_seq + + class Run: """Defines an Element run""" @@ -344,7 +407,7 @@ def make_demux_manifests( manifest_contents = f.read() # Get '[SAMPLES]' section - split_contents = "[SAMPLES]".split(manifest_contents) + split_contents = manifest_contents.split("[SAMPLES]") assert ( len(split_contents) == 2 ), f"Could not split sample rows out of manifest {manifest_contents}" @@ -375,6 +438,15 @@ def make_demux_manifests( manifest_root_name = f"{self.NGI_run_id}_demux" + # Address UMI masks + for n in [1, 2]: + df_samples[f"I{n}Mask"] = df_samples[f"Index{n}"].apply( + lambda seq: get_mask(seq, "umi", n) + ) + df_samples["UmiMask"] = df_samples[f"Index{n}"].apply( + lambda seq: get_mask(seq, "umi", n) + ) + # Get idx lengths for calculations df_samples.loc[:, "len_idx1"] = df["Index1"].apply(len) df_samples.loc[:, "len_idx2"] = df["Index2"].apply(len) From 962a19cd68183dc1bbb0f4897610c2c0394fd086 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 13:09:49 +0200 Subject: [PATCH 095/187] build manifests based on masks --- taca/element/Element_Runs.py | 75 +++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index b42677da..6f0ecd01 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -396,12 +396,10 @@ def 
copy_manifests(self) -> bool: def make_demux_manifests( self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None ) -> list[os.PathLike]: - """Derive composite demultiplexing manifests (grouped by index duplicity and lengths) + """Derive composite demultiplexing manifests from a single information-rich manifest. """ - # TODO test me - # Read specified manifest with open(manifest_to_split) as f: manifest_contents = f.read() @@ -411,7 +409,7 @@ def make_demux_manifests( assert ( len(split_contents) == 2 ), f"Could not split sample rows out of manifest {manifest_contents}" - sample_section = split_contents[1].split("\n") + sample_section = split_contents[1].strip().split("\n") # Split into header and rows header = sample_section[0] @@ -435,36 +433,40 @@ def make_demux_manifests( outdir = self.run_dir ## Build composite manifests - manifest_root_name = f"{self.NGI_run_id}_demux" - # Address UMI masks + # Bool indicating whether UMI is present + df_samples["has_umi"] = df_samples["Index2"].str.contains("N") + + # Add cols denoting idx and umi masks for n in [1, 2]: df_samples[f"I{n}Mask"] = df_samples[f"Index{n}"].apply( - lambda seq: get_mask(seq, "umi", n) - ) - df_samples["UmiMask"] = df_samples[f"Index{n}"].apply( - lambda seq: get_mask(seq, "umi", n) + lambda seq: get_mask(seq, "index", n) ) + df_samples["UmiMask"] = df_samples["Index2"].apply( + lambda seq: get_mask(seq, "umi", 2) + ) - # Get idx lengths for calculations - df_samples.loc[:, "len_idx1"] = df["Index1"].apply(len) - df_samples.loc[:, "len_idx2"] = df["Index2"].apply(len) + # Re-make idx col without Ns + df_samples["Index2_umi"] = df_samples["Index2"] + df_samples.loc[:, "Index2"] = df_samples["Index2"].apply( + lambda x: x.replace("N", "") + ) - # Break down by index lengths and lane, creating composite manifests + # Break down by masks and lane, creating composite manifests manifests = [] n = 0 - for (len_idx1, len_idx2, lane), group in df_samples.groupby( - ["len_idx1", "len_idx2", "Lane"] - ): + grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Lane"]) + for (I1Mask, I2Mask, UmiMask, lane), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" + runValues_section = "\n".join( [ "[RUNVALUES]", "KeyName, Value", f'manifest_file, "{file_name}"', - f"manifest_group, {n+1}/{len(df.groupby(['len_idx1', 'len_idx2', 'Lane']))}", - f"grouped_by, len_idx1:{len_idx1} len_idx2:{len_idx2} lane:{lane}", + f"manifest_group, {n+1}/{len(grouped_df)}", + f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane}", ] ) @@ -472,24 +474,35 @@ def make_demux_manifests( [ "[SETTINGS]", "SettingName, Value", + f"I1Mask, {I1Mask}", + f"I2Mask, {I2Mask}", ] ) + if group["has_umi"].all(): + settings_section += "\n" + "\n".join( + [ + f"UmiMask, {UmiMask}", + "UmiFastQ, True", + ] + ) + # Add PhiX stratified by index length - if group["phix_loaded"].any(): - # Subset controls by lane - group_controls = df_controls[df_controls["Lane"] == lane].copy() + # Subset controls by lane + group_controls = df_controls[df_controls["Lane"] == lane].copy() - # Trim PhiX indexes to match group - group_controls.loc[:, "Index1"] = group_controls.loc[:, "Index1"].apply( - lambda x: x[:len_idx1] - ) - group_controls.loc[:, "Index2"] = group_controls.loc[:, "Index2"].apply( - lambda x: x[:len_idx2] - ) + # Trim PhiX indexes to match group + i1_len = group["Index1"].apply(len).max() + group_controls.loc[:, "Index1"] = group_controls.loc[:, "Index1"].apply( + lambda x: x[:i1_len] + ) + i2_len = 
group["Index2"].apply(len).max() + group_controls.loc[:, "Index2"] = group_controls.loc[:, "Index2"].apply( + lambda x: x[:i2_len] + ) - # Add PhiX to group - group = pd.concat([group, group_controls], axis=0, ignore_index=True) + # Add PhiX to group + group = pd.concat([group, group_controls], axis=0, ignore_index=True) samples_section = ( f"[SAMPLES]\n{group.iloc[:, 0:6].to_csv(index=None, header=True)}" From bbc844087f5d63dbbc543e94dc6daeb19b2075c1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 1 Oct 2024 13:45:18 +0200 Subject: [PATCH 096/187] ruff formatting --- taca/analysis/analysis_element.py | 14 +- taca/element/Element_Runs.py | 377 +++++++++++++++++++++--------- 2 files changed, 276 insertions(+), 115 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 72c3c74d..eb15a8a4 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -45,9 +45,7 @@ def _process(run): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( - os.path.join( - run.run_dir, "RunManifest_*.csv" - ) + os.path.join(run.run_dir, "RunManifest_*.csv") ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): @@ -96,7 +94,9 @@ def _process(run): run.status = "transferring" if run.status_changed: run.update_statusdb() - logger.info(f"{run} is being transferred. Skipping.") # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" + logger.info( + f"{run} is being transferred. Skipping." + ) # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" return elif transfer_status == "rsync done": if run.rsync_successful(): @@ -120,7 +120,7 @@ def _process(run): else: logger.warning( f"Unknown transfer status {transfer_status} of run {run}. Please investigate" - ) # TODO: email warning to operator + ) # TODO: email warning to operator return if given_run: @@ -132,9 +132,7 @@ def _process(run): ) # TODO: add to config for data_dir in data_dirs: # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name - runs = glob.glob( - os.path.join(data_dir, "[1-9]*_*_*") - ) + runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*")) for run in runs: runObj = Aviti_Run(run, CONFIG) try: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d6dd0b3a..e6264396 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -39,7 +39,8 @@ def __init__(self, run_dir, configuration): "*RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") ) # TODO: add to taca.yaml @@ -106,7 +107,9 @@ def parse_run_parameters(self) -> None: "runID" ) # Unique hash that we don't really use self.side = run_parameters.get("Side") # SideA or SideB - self.side_letter = self.side[-1] # A or B TODO: compare side letter with manually entered letter in run name + self.side_letter = self.side[ + -1 + ] # A or B TODO: compare side letter with manually entered letter in run name self.run_type = run_parameters.get( "RunType" ) # Sequencing, wash or prime I believe? 
@@ -169,12 +172,12 @@ def to_doc_obj(self): demux_command = command_file.readlines()[0] else: demux_command = None - demux_version_file = os.path.join(self.run_dir,"Demultiplexing_0", "RunStats.json") + demux_version_file = os.path.join( + self.run_dir, "Demultiplexing_0", "RunStats.json" + ) if os.path.exists(demux_version_file): with open(demux_version_file) as json_file: - demux_info = json.load( - json_file - ) + demux_info = json.load(json_file) demux_version = demux_info.get("AnalysisVersion") else: demux_version = None @@ -214,7 +217,9 @@ def get_demultiplexing_status(self): sub_demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing_*")) finished_count = 0 for demux_dir in sub_demux_dirs: - found_demux_stats_file = glob.glob(os.path.join(demux_dir, self.demux_stats_file)) + found_demux_stats_file = glob.glob( + os.path.join(demux_dir, self.demux_stats_file) + ) if not found_demux_stats_file: return "ongoing" elif found_demux_stats_file: @@ -261,7 +266,8 @@ def get_lims_step_id(self) -> str | None: def find_manifest_zip(self): # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("manifest_zip_location"), # TODO: add to taca.yaml str(self.year), @@ -296,7 +302,6 @@ def find_manifest_zip(self): zip_src_path = glob_results[0] return zip_src_path - def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" zip_src_path = self.find_manifest_zip() @@ -441,7 +446,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --legacy-fastq" + " --force-index-orientation" ) - with open(os.path.join(self.run_dir, '.bases2fastq_command')) as command_file: + with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: command_file.write(command) return command @@ -493,7 +498,7 @@ def rsync_complete(self): def rsync_successful(self): with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file: rsync_exit_status = rsync_exit_file.readlines() - if rsync_exit_status[0].strip() == '0': + if rsync_exit_status[0].strip() == "0": return True else: return False @@ -510,38 +515,36 @@ def clear_dir(dir): except Exception as e: print(f"Failed to delete {file_path} Reason {e}") - # Write to csv def write_to_csv(data, filename): # Get the fieldnames from the keys of the first dictionary fieldnames = data[0].keys() # Open the file and write the CSV - with open(filename, mode='w', newline='') as file: + with open(filename, mode="w", newline="") as file: writer = csv.DictWriter(file, fieldnames=fieldnames) # Write the header (fieldnames) writer.writeheader() # Write the data (rows) writer.writerows(data) - # Collect demux info into a list of dictionaries # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}] def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: - sub_demux_count = os.path.basename(demux_dir).split('_')[1] - with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file: + sub_demux_count = os.path.basename(demux_dir).split("_")[1] + with open(os.path.join(self.run_dir, demux_dir, "RunManifest.csv")) as file: lines = file.readlines() sample_section = False headers = [] # Loop through each line for line in lines: # Check if we reached the 
"[SAMPLES]" section - if '[SAMPLES]' in line: + if "[SAMPLES]" in line: sample_section = True continue # Exit the sample section if another section is encountered - if sample_section and line.startswith('['): + if sample_section and line.startswith("["): break # If in the sample section, process the sample lines if sample_section: @@ -552,71 +555,127 @@ def collect_demux_runmanifest(self, demux_results_dirs): continue # Get the headers from the first line if not headers: - headers = line.split(',') + headers = line.split(",") else: # Parse sample data - values = line.split(',') + values = line.split(",") sample_dict = dict(zip(headers, values)) - sample_dict['sub_demux_count'] = sub_demux_count + sample_dict["sub_demux_count"] = sub_demux_count demux_runmanifest.append(sample_dict) - sorted_demux_runmanifest = sorted(demux_runmanifest, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + sorted_demux_runmanifest = sorted( + demux_runmanifest, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) return sorted_demux_runmanifest - # Aggregate the output FastQ files of samples from multiple demux def aggregate_sample_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) unique_sample_demux = set() for lane in lanes: sample_count = 1 for sample in demux_runmanifest: - lanenr = sample['Lane'] - project = sample['Project'] - sample_name = sample['SampleName'] - sub_demux_count = sample['sub_demux_count'] + lanenr = sample["Lane"] + project = sample["Project"] + sample_name = sample["SampleName"] + sub_demux_count = sample["sub_demux_count"] # Skip PhiX if lanenr == lane and sample_name != "PhiX": sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: - project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, f"Sample_{sample_name}") + project_dest = os.path.join( + self.run_dir, self.demux_dir, project + ) + sample_dest = os.path.join( + self.run_dir, + self.demux_dir, + project, + f"Sample_{sample_name}", + ) if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample_name, f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux_count}", + "Samples", + project, + sample_name, + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: old_name = os.path.basename(fastqfile) - read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1) - new_name = "_".join([sample_name, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) + read_label = re.search( + rf"L00{lane}_(.*?)_001", old_name + ).group(1) + new_name = "_".join( + [ + sample_name, + f"S{sample_count}", + f"L00{lane}", + read_label, + "001.fastq.gz", + ] + ) os.symlink(fastqfile, os.path.join(sample_dest, new_name)) unique_sample_demux.add(sample_tuple) sample_count += 1 - # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: - sub_demux = 
list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane)) + sub_demux = list( + set( + sample["sub_demux_count"] + for sample in demux_runmanifest + if sample["Lane"] == lane + ) + ) if len(sub_demux) == 1: - project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined") + project_dest = os.path.join( + self.run_dir, self.demux_dir, "Undetermined" + ) if not os.path.exists(project_dest): os.makedirs(project_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux[0]}", + "Samples", + "Undetermined", + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) - # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this hard way def get_project_runstats(self, sub_demux, demux_runmanifest): project_runstats = [] - project_list = sorted(list(set(sample['Project'] for sample in demux_runmanifest if sample['sub_demux_count']==sub_demux))) + project_list = sorted( + list( + set( + sample["Project"] + for sample in demux_runmanifest + if sample["sub_demux_count"] == sub_demux + ) + ) + ) for project in project_list: - project_runstats_json_path = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "Samples", project, f"{project}_RunStats.json") + project_runstats_json_path = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "Samples", + project, + f"{project}_RunStats.json", + ) if os.path.exists(project_runstats_json_path): with open(project_runstats_json_path) as stats_json: project_runstats_json = json.load(stats_json) @@ -629,126 +688,218 @@ def get_project_runstats(self, sub_demux, demux_runmanifest): percentage_q30 = occurrence["PercentQ30"] percentage_q40 = occurrence["PercentQ40"] quality_score_mean = occurrence["QualityScoreMean"] - project_runstats.append({ "SampleName" : sample_name, - "Lane" : str(lane), - "ExpectedSequence" : expected_sequence, - "PercentMismatch" : percentage_mismatch, - "PercentQ30" : percentage_q30, - "PercentQ40" : percentage_q40, - "QualityScoreMean" : quality_score_mean - }) + project_runstats.append( + { + "SampleName": sample_name, + "Lane": str(lane), + "ExpectedSequence": expected_sequence, + "PercentMismatch": percentage_mismatch, + "PercentQ30": percentage_q30, + "PercentQ40": percentage_q40, + "QualityScoreMean": quality_score_mean, + } + ) else: continue return project_runstats - # Aggregate stats in IndexAssignment.csv def aggregate_stats_assigned(self, demux_runmanifest): aggregated_assigned_indexes = [] - sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest))) - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + sub_demux_list = sorted( + list(set(sample["sub_demux_count"] for sample in demux_runmanifest)) + ) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for sub_demux in sub_demux_list: # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in 
this hard way project_runstats = self.get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv - assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") + assigned_csv = os.path.join( + self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv" + ) if os.path.exists(assigned_csv): - with open(assigned_csv, 'r') as assigned_file: + with open(assigned_csv) as assigned_file: reader = csv.DictReader(assigned_file) index_assignment = [row for row in reader] for sample in index_assignment: - if sample['Lane'] in lanes: - project_runstats_sample = [d for d in project_runstats if d['SampleName'] == sample['SampleName'] and d['Lane'] == sample['Lane'] and d['ExpectedSequence'] == sample['I1']+sample['I2']] - sample['sub_demux_count'] = sub_demux - sample['PercentMismatch'] = project_runstats_sample[0]['PercentMismatch'] - sample['PercentQ30'] = project_runstats_sample[0]['PercentQ30'] - sample['PercentQ40'] = project_runstats_sample[0]['PercentQ40'] - sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] + if sample["Lane"] in lanes: + project_runstats_sample = [ + d + for d in project_runstats + if d["SampleName"] == sample["SampleName"] + and d["Lane"] == sample["Lane"] + and d["ExpectedSequence"] == sample["I1"] + sample["I2"] + ] + sample["sub_demux_count"] = sub_demux + sample["PercentMismatch"] = project_runstats_sample[0][ + "PercentMismatch" + ] + sample["PercentQ30"] = project_runstats_sample[0]["PercentQ30"] + sample["PercentQ40"] = project_runstats_sample[0]["PercentQ40"] + sample["QualityScoreMean"] = project_runstats_sample[0][ + "QualityScoreMean" + ] aggregated_assigned_indexes.append(sample) else: - logger.warning(f"No {os.path.basename(assigned_csv)} file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No {os.path.basename(assigned_csv)} file found for sub-demultiplexing {sub_demux}." 
+ ) # Remove redundant rows for PhiX aggregated_assigned_indexes_filtered = [] unique_phiX_combination = set() for sample in aggregated_assigned_indexes: # Add project name - sample['Project'] = [d for d in demux_runmanifest if d['SampleName'] == sample['SampleName']][0]['Project'] - if sample['SampleName'] == 'PhiX': - combination = (sample['I1'], sample['I2'], sample['Lane']) + sample["Project"] = [ + d for d in demux_runmanifest if d["SampleName"] == sample["SampleName"] + ][0]["Project"] + if sample["SampleName"] == "PhiX": + combination = (sample["I1"], sample["I2"], sample["Lane"]) if combination not in unique_phiX_combination: aggregated_assigned_indexes_filtered.append(sample) unique_phiX_combination.add(combination) else: aggregated_assigned_indexes_filtered.append(sample) # Sort the list by Lane, SampleName and sub_demux_count - aggregated_assigned_indexes_filtered_sorted = sorted(aggregated_assigned_indexes_filtered, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + aggregated_assigned_indexes_filtered_sorted = sorted( + aggregated_assigned_indexes_filtered, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) # Fix new sample number based on SampleName and Lane sample_count = 0 - previous_samplename_lane = ('NA', 'NA') + previous_samplename_lane = ("NA", "NA") for sample in aggregated_assigned_indexes_filtered_sorted: - if (sample['SampleName'], sample['Lane']) != previous_samplename_lane: + if (sample["SampleName"], sample["Lane"]) != previous_samplename_lane: sample_count += 1 - previous_samplename_lane = (sample['SampleName'], sample['Lane']) - sample['SampleNumber'] = sample_count + previous_samplename_lane = (sample["SampleName"], sample["Lane"]) + sample["SampleNumber"] = sample_count # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv") - self.write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) - + aggregated_assigned_indexes_csv = os.path.join( + self.run_dir, self.demux_dir, "IndexAssignment.csv" + ) + self.write_to_csv( + aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv + ) # Aggregate stats in UnassignedSequences.csv def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = [] - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: sub_demux_index_lens = set() for sample in demux_runmanifest: - if sample['Lane'] == lane: - sub_demux_index_lens.add((sample['sub_demux_count'], (len(sample.get("Index1", "")), len(sample.get("Index2", ""))))) + if sample["Lane"] == lane: + sub_demux_index_lens.add( + ( + sample["sub_demux_count"], + ( + len(sample.get("Index1", "")), + len(sample.get("Index2", "")), + ), + ) + ) # List of sub-demux with a decreasing order of index lengths - sub_demux_list = [x[0] for x in sorted(sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True)] + sub_demux_list = [ + x[0] + for x in sorted( + sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True + ) + ] sub_demux_with_max_index_lens = sub_demux_list[0] # Start with the unassigned list with the longest index - max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv") + max_unassigned_csv = os.path.join( + self.run_dir, + 
f"Demultiplexing_{sub_demux_with_max_index_lens}", + "UnassignedSequences.csv", + ) if os.path.exists(max_unassigned_csv): - with open(max_unassigned_csv, 'r') as max_unassigned_file: + with open(max_unassigned_csv) as max_unassigned_file: reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: - logger.warning(f"No {os.path.basename(max_unassigned_csv)} file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") + logger.warning( + f"No {os.path.basename(max_unassigned_csv)} file found for sub-demultiplexing {sub_demux_with_max_index_lens}." + ) break # Filter by lane - max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane] + max_unassigned_indexes = [ + idx for idx in max_unassigned_indexes if idx["Lane"] == lane + ] # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise if len(sub_demux_list) > 1: # Order: from longer to shorter indexes sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: - unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv") + unassigned_csv = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "UnassignedSequences.csv", + ) if os.path.exists(unassigned_csv): - with open(unassigned_csv, 'r') as unassigned_file: + with open(unassigned_csv) as unassigned_file: reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: - logger.warning(f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}." + ) continue # Filter by lane - unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] + unassigned_indexes = [ + unassigned_index + for unassigned_index in unassigned_indexes + if unassigned_index["Lane"] == lane + ] # Remove overlapped indexes from the list of max_unassigned_indexes - idx1_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][0], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][0]) - idx2_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][1], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][1]) + idx1_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][0], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][0], + ) + idx2_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][1], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][1], + ) for unassigned_index in unassigned_indexes: - idx1_overlapped_seq = unassigned_index['I1'][:idx1_overlapped_len] - idx2_overlapped_seq = unassigned_index['I2'][:idx2_overlapped_len] + idx1_overlapped_seq = unassigned_index["I1"][ + :idx1_overlapped_len + ] + idx2_overlapped_seq = unassigned_index["I2"][ + :idx2_overlapped_len + ] # Remove 
the overlapped record from the max_unassigned_indexes list - max_unassigned_indexes = [max_unassigned_index for max_unassigned_index in max_unassigned_indexes if not (max_unassigned_index['I1'][:idx1_overlapped_len] == idx1_overlapped_seq and max_unassigned_index['I2'][:idx2_overlapped_len] == idx2_overlapped_seq)] + max_unassigned_indexes = [ + max_unassigned_index + for max_unassigned_index in max_unassigned_indexes + if not ( + max_unassigned_index["I1"][:idx1_overlapped_len] + == idx1_overlapped_seq + and max_unassigned_index["I2"][:idx2_overlapped_len] + == idx2_overlapped_seq + ) + ] # Append to the aggregated_unassigned_indexes list aggregated_unassigned_indexes += max_unassigned_indexes # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order - aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) + aggregated_unassigned_indexes = sorted( + aggregated_unassigned_indexes, key=lambda x: (x["Lane"], -int(x["Count"])) + ) # Fetch PFCount for each lane pfcount_lane = {} if os.path.exists(self.run_stats_file): @@ -759,15 +910,22 @@ def aggregate_stats_unassigned(self, demux_runmanifest): # Modify the % Polonies values based on PFCount for each lane for unassigned_index in aggregated_unassigned_indexes: if pfcount_lane.get(unassigned_index["Lane"]): - unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 + unassigned_index["% Polonies"] = ( + float(unassigned_index["Count"]) + / pfcount_lane[unassigned_index["Lane"]] + * 100 + ) else: - logger.warning(f"No {os.path.basename(self.run_stats_file)} file found for the run.") + logger.warning( + f"No {os.path.basename(self.run_stats_file)} file found for the run." 
+ ) # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") + aggregated_unassigned_csv = os.path.join( + self.run_dir, self.demux_dir, "UnassignedSequences.csv" + ) self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) - # Aggregate demux results def aggregate_demux_results(self, demux_results_dirs): # Ensure the destination directory exists @@ -786,12 +944,15 @@ def aggregate_demux_results(self, demux_results_dirs): self.aggregate_stats_unassigned(demux_runmanifest) def sync_metadata(self): - files_to_copy = [self.run_stats_file, - os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), - os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), - self.run_parameters_file, - ] - metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to taca.yaml + files_to_copy = [ + self.run_stats_file, + os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), + os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), + self.run_parameters_file, + ] + metadata_archive = self.CONFIG.get("element_analysis").get( + "metadata_location" + ) # TODO: add to taca.yaml dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: @@ -802,8 +963,8 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - transfer_details = ( - self.CONFIG.get("element_analysis").get("transfer_details") + transfer_details = self.CONFIG.get("element_analysis").get( + "transfer_details" ) # TODO: Add section to taca.yaml command = ( "rsync" @@ -830,7 +991,7 @@ def transfer(self): return def remove_transfer_indicator(self): - transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing') + transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") Path(transfer_indicator).unlink() def update_transfer_log(self): @@ -845,7 +1006,9 @@ def update_transfer_log(self): raise OSError(msg) def update_paths_after_archiving(self, new_location): - self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.run_dir = os.path.join( + new_location, self.NGI_run_id + ) # Needs to be redirected to new location so that TACA can find files to upload to statusdb self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") self.run_stats_file = os.path.join(self.run_dir, "AvitiRunStats.json") self.run_manifest_file_from_instrument = os.path.join( From 43d8316680c55eb695a7e98bf173f85a91dc96c9 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 1 Oct 2024 14:06:20 +0200 Subject: [PATCH 097/187] Add cycles in runparameters --- taca/element/Element_Runs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e6264396..40aa0854 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -114,6 +114,7 @@ def parse_run_parameters(self) -> None: "RunType" ) # Sequencing, wash or prime I believe? 
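# For orientation, a minimal standalone sketch of the "Cycles" lookup this
# patch adds. The key names ("Cycles", "R1", "R2", "I1", "I2") are taken from
# the diff itself; the file path is hypothetical, and the zero-cycle defaults
# mirror the fallback used in parse_run_parameters() above.
import json

with open("RunParameters.json") as json_file:  # hypothetical path inside an Aviti run dir
    run_parameters = json.load(json_file)

# Fall back to zero cycles per read segment if the instrument did not record them.
cycles = run_parameters.get("Cycles", {"R1": 0, "R2": 0, "I1": 0, "I2": 0})
print(cycles["I1"])  # e.g. 8 for an 8 nt index read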
self.flowcell_id = run_parameters.get("FlowcellID") + self.cycles = run_parameters.get("Cycles", {'R1': 0, 'R2': 0, 'I1': 0, 'I2': 0}) self.instrument_name = run_parameters.get("InstrumentName") self.date = run_parameters.get("Date")[0:10].replace("-", "") self.year = self.date[0:4] From 13340433a8c6dbda3ad690315a41d2950329e3a0 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:23:50 +0200 Subject: [PATCH 098/187] polishing, adapt masks to true cycles --- taca/element/Element_Runs.py | 71 +++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6f0ecd01..df48bdca 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -18,17 +18,27 @@ logger = logging.getLogger(__name__) -def get_mask(seq: str, mask_type: str, which_index: int) -> str: +def get_mask( + seq: str, + mask_type: str, + prefix: str, + cycles_used: int | None = None, +) -> str: """Example usage: - get_mask("ACGTACGTNNNNNNNN", "umi", 1) -> 'I1:N8Y8' - get_mask("ACGTACGTNNNNNNNN", "index", 2) -> 'I2:Y8N8' + get_mask("ACGTNNN", "umi", "I1:", None) -> 'I1:N4Y3' + get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N3N3' """ # Input assertions assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" - assert which_index in [1, 2], "Index number must be 1 or 2" + assert prefix in [ + "R1:", + "R2:", + "I1:", + "I2:", + ], f"Mask prefix {prefix} not recognized" # Define dict to convert base to mask classifier base2mask = ( @@ -50,7 +60,7 @@ def get_mask(seq: str, mask_type: str, which_index: int) -> str: ) # Dynamically build the mask sequence - mask_seq = "I1:" if which_index == 1 else "I2:" + mask_seq = prefix current_group = "" current_group_len = 0 for letter in seq: @@ -78,6 +88,11 @@ def get_mask(seq: str, mask_type: str, which_index: int) -> str: seq ), f"Length of mask '{mask_seq}' does not match length of input seq '{seq}'" + # TODO update this when we get the actual cycles used from the run parameters + if cycles_used is not None: + if cycles_used > len(mask_seq): + mask_seq += f"N{cycles_used-len(mask_seq)}" + return mask_seq @@ -428,23 +443,20 @@ def make_demux_manifests( df_samples = df[df["Project"] != "Control"].copy() df_controls = df[df["Project"] == "Control"].copy() - # Apply default dir path for output - if outdir is None: - outdir = self.run_dir - - ## Build composite manifests - manifest_root_name = f"{self.NGI_run_id}_demux" - # Bool indicating whether UMI is present df_samples["has_umi"] = df_samples["Index2"].str.contains("N") # Add cols denoting idx and umi masks - for n in [1, 2]: - df_samples[f"I{n}Mask"] = df_samples[f"Index{n}"].apply( - lambda seq: get_mask(seq, "index", n) - ) + df_samples["I1Mask"] = df_samples[ + "Index1" + ].apply( # TODO get cycles from run parameters + lambda seq: get_mask(seq, "index", "I1:", None) + ) + df_samples["I2Mask"] = df_samples["Index2"].apply( + lambda seq: get_mask(seq, "index", "I2:", None) + ) df_samples["UmiMask"] = df_samples["Index2"].apply( - lambda seq: get_mask(seq, "umi", 2) + lambda seq: get_mask(seq, "umi", "I2:", None) ) # Re-make idx col without Ns @@ -453,11 +465,20 @@ def make_demux_manifests( lambda x: x.replace("N", "") ) - # Break down by masks and lane, creating composite manifests + # Apply default dir path for output + if outdir is None: + outdir = self.run_dir + + # Break down into groups by non-consolable 
properties + grouped_df = df_samples.groupby( + ["I1Mask", "I2Mask", "UmiMask", "Lane", "Recipe"] + ) + + # Iterate over groups to build composite manifests + manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Lane"]) - for (I1Mask, I2Mask, UmiMask, lane), group in grouped_df: + for (I1Mask, I2Mask, UmiMask, lane, recipe), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( @@ -466,16 +487,22 @@ def make_demux_manifests( "KeyName, Value", f'manifest_file, "{file_name}"', f"manifest_group, {n+1}/{len(grouped_df)}", - f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane}", + f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane} recipe:'{recipe}'", ] ) + recipe_split = recipe.split("-") + R1Mask = f"R1:Y{recipe_split[0]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters + R2Mask = f"R2:Y{recipe_split[3]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters + settings_section = "\n".join( [ "[SETTINGS]", "SettingName, Value", + f"R1Mask, {R1Mask}", f"I1Mask, {I1Mask}", f"I2Mask, {I2Mask}", + f"R2Mask, {R2Mask}", ] ) @@ -483,7 +510,7 @@ def make_demux_manifests( settings_section += "\n" + "\n".join( [ f"UmiMask, {UmiMask}", - "UmiFastQ, True", + "UmiFastQ, TRUE", ] ) From a1c7cceed5c05c3d0337a16004437365de897642 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:32:06 +0200 Subject: [PATCH 099/187] ruff --- taca/analysis/analysis_element.py | 14 ++++++-------- taca/element/Aviti_Runs.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 72c3c74d..eb15a8a4 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -45,9 +45,7 @@ def _process(run): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( - os.path.join( - run.run_dir, "RunManifest_*.csv" - ) + os.path.join(run.run_dir, "RunManifest_*.csv") ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): @@ -96,7 +94,9 @@ def _process(run): run.status = "transferring" if run.status_changed: run.update_statusdb() - logger.info(f"{run} is being transferred. Skipping.") # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" + logger.info( + f"{run} is being transferred. Skipping." + ) # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" return elif transfer_status == "rsync done": if run.rsync_successful(): @@ -120,7 +120,7 @@ def _process(run): else: logger.warning( f"Unknown transfer status {transfer_status} of run {run}. 
Please investigate" - ) # TODO: email warning to operator + ) # TODO: email warning to operator return if given_run: @@ -132,9 +132,7 @@ def _process(run): ) # TODO: add to config for data_dir in data_dirs: # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name - runs = glob.glob( - os.path.join(data_dir, "[1-9]*_*_*") - ) + runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*")) for run in runs: runObj = Aviti_Run(run, CONFIG) try: diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py index cbe923ae..63b01bf7 100644 --- a/taca/element/Aviti_Runs.py +++ b/taca/element/Aviti_Runs.py @@ -5,4 +5,4 @@ class Aviti_Run(Run): def __init__(self, run_dir, configuration): self.sequencer_type = "Aviti" self.demux_dir = "Demultiplexing" - super().__init__(run_dir, configuration) \ No newline at end of file + super().__init__(run_dir, configuration) From 9197ea4cc3611d66c8f7d409a91107fd308cc06b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:34:40 +0200 Subject: [PATCH 100/187] ruff check --- taca/element/Element_Runs.py | 12 ++++++------ tests/element/test_Element_Runs.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index df48bdca..63ff4c4f 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -575,6 +575,7 @@ def start_demux(self, run_manifest, demux_dir): logger.info( "Bases2Fastq conversion and demultiplexing " f"started for run {self} on {datetime.now()}" + f"with p_handle {p_handle}" ) except subprocess.CalledProcessError: logger.warning( @@ -648,9 +649,7 @@ def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: sub_demux_count = os.path.basename(demux_dir).split("_")[1] - with open( - os.path.join(self.run_dir, demux_dir, "RunManifest.csv"), "r" - ) as file: + with open(os.path.join(self.run_dir, demux_dir, "RunManifest.csv")) as file: lines = file.readlines() sample_section = False headers = [] @@ -834,7 +833,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv" ) if os.path.exists(assigned_csv): - with open(assigned_csv, "r") as assigned_file: + with open(assigned_csv) as assigned_file: reader = csv.DictReader(assigned_file) index_assignment = [row for row in reader] for sample in index_assignment: @@ -924,7 +923,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): "UnassignedSequences.csv", ) if os.path.exists(max_unassigned_csv): - with open(max_unassigned_csv, "r") as max_unassigned_file: + with open(max_unassigned_csv) as max_unassigned_file: reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: @@ -947,7 +946,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): "UnassignedSequences.csv", ) if os.path.exists(unassigned_csv): - with open(unassigned_csv, "r") as unassigned_file: + with open(unassigned_csv) as unassigned_file: reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: @@ -1072,6 +1071,7 @@ def transfer(self): logger.info( "Transfer to analysis cluster " f"started for run {self} on {datetime.now()}" + f"with p_handle {p_handle}" ) except subprocess.CalledProcessError: logger.warning( diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 23914a7d..2ebd777f 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ 
-77,7 +77,7 @@ def create_element_run_dir( open( os.path.join( run_path, - f"Demultiplexing", + "Demultiplexing", "RunStats.json", ), "w", From afe2af67134d1dd08c0b9716fd2fd611e089beb2 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:47:39 +0200 Subject: [PATCH 101/187] mypy fix --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 90dc6c33..ce7c1c39 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -410,7 +410,7 @@ def copy_manifests(self) -> bool: def make_demux_manifests( self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None - ) -> list[os.PathLike]: + ) -> list[str]: """Derive composite demultiplexing manifests from a single information-rich manifest. """ From 4b1610c590e52a64645ca43c06a15de069f3f7a5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 2 Oct 2024 11:55:47 +0200 Subject: [PATCH 102/187] use cycles from runparam for mask generation --- taca/element/Element_Runs.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e5867c4b..42947355 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -192,7 +192,7 @@ def parse_run_parameters(self) -> None: "RunType" ) # Sequencing, wash or prime I believe? self.flowcell_id = run_parameters.get("FlowcellID") - self.cycles = run_parameters.get("Cycles", {'R1': 0, 'R2': 0, 'I1': 0, 'I2': 0}) + self.cycles = run_parameters.get("Cycles", {"R1": 0, "R2": 0, "I1": 0, "I2": 0}) self.instrument_name = run_parameters.get("InstrumentName") self.date = run_parameters.get("Date")[0:10].replace("-", "") self.year = self.date[0:4] @@ -448,16 +448,29 @@ def make_demux_manifests( df_samples["has_umi"] = df_samples["Index2"].str.contains("N") # Add cols denoting idx and umi masks - df_samples["I1Mask"] = df_samples[ - "Index1" - ].apply( # TODO get cycles from run parameters - lambda seq: get_mask(seq, "index", "I1:", None) + df_samples["I1Mask"] = df_samples["Index1"].apply( + lambda seq: get_mask( + seq=seq, + mask_type="index", + prefix="I1:", + cycles_used=self.cycles["I1"], + ) ) df_samples["I2Mask"] = df_samples["Index2"].apply( - lambda seq: get_mask(seq, "index", "I2:", None) + lambda seq: get_mask( + seq=seq, + mask_type="index", + prefix="I2:", + cycles_used=self.cycles["I2"], + ) ) df_samples["UmiMask"] = df_samples["Index2"].apply( - lambda seq: get_mask(seq, "umi", "I2:", None) + lambda seq: get_mask( + seq=seq, + mask_type="umi", + prefix="I2:", + cycles_used=self.cycles["I2"], + ) ) # Re-make idx col without Ns From 298bcbee6a055e8fcf93cc34a5682686e289b359 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 2 Oct 2024 12:18:11 +0200 Subject: [PATCH 103/187] readability improvements, use runparam cycles for r1/r2 masks --- taca/element/Element_Runs.py | 71 ++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 42947355..07efea97 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -22,12 +22,12 @@ def get_mask( seq: str, mask_type: str, prefix: str, - cycles_used: int | None = None, + cycles_used: int, ) -> str: """Example usage: get_mask("ACGTNNN", "umi", "I1:", None) -> 'I1:N4Y3' - get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N3N3' + get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N6' """ # 
Input assertions @@ -59,41 +59,50 @@ def get_mask( } ) - # Dynamically build the mask sequence - mask_seq = prefix + # Instantiate the mask string + mask = "" + # Add the prefix + mask += prefix + # Loop through the input sequence and dynamically add mask groups current_group = "" current_group_len = 0 - for letter in seq: - if base2mask[letter] == current_group: + mask_len = 0 + for base in seq: + mask_len += 1 + if base2mask[base] == current_group: current_group_len += 1 else: - mask_seq += ( + mask += ( f"{current_group}{current_group_len}" if current_group_len > 0 else "" ) - current_group = base2mask[letter] + current_group = base2mask[base] current_group_len = 1 - mask_seq += f"{current_group}{current_group_len}" - - # Use the worlds ugliest string parsing to check that the mask length matches the input sequence length - assert sum( - [ - int(n) - for n in mask_seq[3:] - .replace("N", "-") - .replace("Y", "-") - .strip("-") - .split("-") - ] - ) == len( - seq - ), f"Length of mask '{mask_seq}' does not match length of input seq '{seq}'" - - # TODO update this when we get the actual cycles used from the run parameters - if cycles_used is not None: - if cycles_used > len(mask_seq): - mask_seq += f"N{cycles_used-len(mask_seq)}" + # For the last mask group, check if we need to pad with Ns to match the number of cycles used + if cycles_used > mask_len: + diff = cycles_used - mask_len + if current_group == "N": + current_group_len += diff + mask += f"{current_group}{current_group_len}" + else: + mask += f"{current_group}{current_group_len}" + mask += f"N{diff}" + + # Parse mask string to check that it matches the number of cycles used + assert ( + sum( + [ + int(n) + for n in mask[3:] + .replace("N", "-") + .replace("Y", "-") + .strip("-") + .split("-") + ] + ) + == cycles_used + ), f"Length of mask '{mask}' does not match number of cycles used '{cycles_used}'." 
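    # Worked example of the grouping and padding logic above, traced by hand
    # for the docstring case get_mask("ACGTNNN", "index", "I2:", 10):
    #   A, C, G, T map to "Y" -> group "Y4"; N, N, N map to "N" -> group "N3",
    #   so after the loop mask == "I2:Y4N3" and mask_len == 7.
    #   cycles_used (10) > mask_len (7) and the last group is "N", so that
    #   group is padded by the difference: "N3" becomes "N6", giving "I2:Y4N6".
    # The assert then re-derives 4 + 6 == 10 from the mask string itself.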
- return mask_seq + return mask class Run: @@ -506,8 +515,8 @@ def make_demux_manifests( ) recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters - R2Mask = f"R2:Y{recipe_split[3]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters + R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles["R1"]}" + R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles["R2"]}" settings_section = "\n".join( [ From 95a32580bbcb7d033d3e135e0ad3fe541a05c084 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 2 Oct 2024 12:19:48 +0200 Subject: [PATCH 104/187] syntax fix --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 07efea97..c2ac77c8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -515,8 +515,8 @@ def make_demux_manifests( ) recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles["R1"]}" - R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles["R2"]}" + R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles['R1']}" + R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles['R2']}" settings_section = "\n".join( [ From 8a28e89c6ab1e5660b631707ccc2b06bcffdf786 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 2 Oct 2024 14:14:05 +0200 Subject: [PATCH 105/187] Email warnings to operator --- taca/analysis/analysis_element.py | 33 ++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index eb15a8a4..d18e71f1 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -6,6 +6,7 @@ from taca.element.Aviti_Runs import Aviti_Run from taca.utils.config import CONFIG +from taca.utils.misc import send_mail logger = logging.getLogger(__name__) @@ -27,11 +28,16 @@ def _process(run): logger.warning( f"Cannot reliably set NGI_run_id for {run} due to missing RunParameters.json. Aborting run processing" ) + email_subject = f"Issues processing {run}" + email_message = ( + f"RunParameters.json missing for {run}. Processing was aborted." + ) + send_mail(email_subject, email_message, CONFIG["mail"]["recipients"]) raise #### Sequencing status #### sequencing_done = run.check_sequencing_status() - if not sequencing_done: # Sequencing ongoing + if not sequencing_done: run.status = "sequencing" if run.status_changed: run.update_statusdb() @@ -40,7 +46,6 @@ def _process(run): #### Demultiplexing status #### demultiplexing_status = run.get_demultiplexing_status() if demultiplexing_status == "not started": - # Sequencing done. Start demux if run.manifest_exists(): os.mkdir(run.demux_dir) run.copy_manifests() @@ -61,7 +66,11 @@ def _process(run): logger.warning( f"Run manifest is missing for {run}, demultiplexing aborted" ) - # TODO: email operator warning + email_subject = f"Issues processing {run}" + email_message = ( + f"Run manifest is missing for {run}, demultiplexing aborted" + ) + send_mail(email_subject, email_message, CONFIG["mail"]["recipients"]) return elif demultiplexing_status == "ongoing": run.status = "demultiplexing" @@ -71,8 +80,13 @@ def _process(run): elif demultiplexing_status != "finished": logger.warning( - f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate" + f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." 
             )
+            email_subject = f"Issues processing {run}"
+            email_message = (
+                f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate."
+            )
+            send_mail(email_subject, email_message, CONFIG["mail"]["recipients"])
             return
 
         #### Transfer status ####
@@ -115,12 +129,17 @@ def _process(run):
             logger.warning(
                 f"An issue occurred while transferring {run} to the analysis cluster."
             )
-            # TODO: email warning to operator
+            email_subject = f"Issues processing {run}"
+            email_message = f"An issue occurred while transferring {run} to the analysis cluster."
+            send_mail(email_subject, email_message, CONFIG["mail"]["recipients"])
             return
         else:
             logger.warning(
-                f"Unknown transfer status {transfer_status} of run {run}. Please investigate"
-            )  # TODO: email warning to operator
+                f"Unknown transfer status {transfer_status} of run {run}, please investigate."
+            )
+            email_subject = f"Issues processing {run}"
+            email_message = f"Unknown transfer status {transfer_status} of run {run}, please investigate."
+            send_mail(email_subject, email_message, CONFIG["mail"]["recipients"])
             return
 
     if given_run:
From 1e168a2505d797392b3f315a3762a9c5575a59fc Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo <sara.sjunnebo@scilifelab.se>
Date: Thu, 3 Oct 2024 09:56:03 +0200
Subject: [PATCH 106/187] Cleanup comments

---
 taca/analysis/analysis_element.py |  2 +-
 taca/element/Element_Runs.py      | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index d18e71f1..10b2adf1 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -148,7 +148,7 @@ def _process(run):
         else:
             data_dirs = CONFIG.get("element_analysis").get(
                 "data_dirs"
-            )  # TODO: add to config
+            )
         for data_dir in data_dirs:
             # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name
             runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*"))
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index c2ac77c8..9e51ae3e 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -130,7 +130,7 @@ def __init__(self, run_dir, configuration):
             .get("Element", {})
             .get(self.sequencer_type, {})
             .get("transfer_log")
-        )  # TODO: add to taca.yaml
+        )
         self.rsync_exit_file = os.path.join(self.run_dir, ".rsync_exit_status")
 
         # Instrument generated files
@@ -340,7 +340,7 @@ def get_lims_step_id(self) -> str | None:
         the ID of the LIMS step can be extracted from it.
         """
 
-        # TODO test me
+        # TODO: test me
         assert self.manifest_exists(), "Run manifest not found"
 
         with open(self.run_manifest_file_from_instrument) as csv_file:
@@ -357,7 +357,7 @@ def find_manifest_zip(self):
             self.CONFIG.get("element_analysis")
             .get("Element", {})
             .get(self.sequencer_type, {})
-            .get("manifest_zip_location"),  # TODO: add to taca.yaml
+            .get("manifest_zip_location"),
             str(self.year),
         )
@@ -575,7 +575,7 @@ def make_demux_manifests(
 
     def generate_demux_command(self, run_manifest, demux_dir):
         command = (
-            f"{self.CONFIG.get('element_analysis').get('bases2fastq')}"  # TODO: add path to bases2fastq executable to config
+            f"{self.CONFIG.get('element_analysis').get('bases2fastq')}"
             + f" {self.run_dir}"
             + f" {demux_dir}"
             + " -p 8"
             + f" -r {run_manifest}"
             + " --legacy-fastq"
             + " --force-index-orientation"
-        )  # TODO: any other options?
+ ) with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: command_file.write(command) return command @@ -1091,7 +1091,7 @@ def sync_metadata(self): ] metadata_archive = self.CONFIG.get("element_analysis").get( "metadata_location" - ) # TODO: add to taca.yaml + ) dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: @@ -1104,7 +1104,7 @@ def make_transfer_indicator(self): def transfer(self): transfer_details = self.CONFIG.get("element_analysis").get( "transfer_details" - ) # TODO: Add section to taca.yaml + ) command = ( "rsync" + " -rLav" From ec7b39035aeb649659ae767d47c76098c4c9d302 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 15:16:31 +0200 Subject: [PATCH 107/187] typo --- taca/element/Element_Runs.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 9e51ae3e..50a7e6c7 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -191,7 +191,7 @@ def parse_run_parameters(self) -> None: self.run_name = run_parameters.get("RunName") self.run_id = run_parameters.get( - "runID" + "RunID" ) # Unique hash that we don't really use self.side = run_parameters.get("Side") # SideA or SideB self.side_letter = self.side[ @@ -1089,9 +1089,7 @@ def sync_metadata(self): os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), self.run_parameters_file, ] - metadata_archive = self.CONFIG.get("element_analysis").get( - "metadata_location" - ) + metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: @@ -1102,9 +1100,7 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - transfer_details = self.CONFIG.get("element_analysis").get( - "transfer_details" - ) + transfer_details = self.CONFIG.get("element_analysis").get("transfer_details") command = ( "rsync" + " -rLav" From 178683d05295638a7e12cf056a1323f2c2b7e255 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 15:59:05 +0200 Subject: [PATCH 108/187] wip --- taca/analysis/analysis_element.py | 13 +++++------- taca/element/Element_Runs.py | 34 ++++++++++++------------------- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 10b2adf1..dc0a202c 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -46,9 +46,10 @@ def _process(run): #### Demultiplexing status #### demultiplexing_status = run.get_demultiplexing_status() if demultiplexing_status == "not started": - if run.manifest_exists(): + lims_zip_path = run.find_lims_zip() + if lims_zip_path is not None: os.mkdir(run.demux_dir) - run.copy_manifests() + run.copy_manifests(lims_zip_path) run_manifests = glob.glob( os.path.join(run.run_dir, "RunManifest_*.csv") ) @@ -83,9 +84,7 @@ def _process(run): f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." ) email_subject = f"Issues processing {run}" - email_message = ( - f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." - ) + email_message = f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." 
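# send_mail here comes from taca.utils.misc and is called as
# send_mail(subject, message, recipients). As a rough, hypothetical
# illustration of what such a helper does (not TACA's actual implementation),
# a minimal smtplib-based sketch could look like this:
import smtplib
from email.message import EmailMessage


def send_mail_sketch(subject: str, message: str, recipients) -> None:
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = "taca@example.org"  # hypothetical sender address
    # recipients may be a single address or a list, as in the config above.
    msg["To"] = recipients if isinstance(recipients, str) else ", ".join(recipients)
    msg.set_content(message)
    # Assumes a mail relay listening on localhost.
    with smtplib.SMTP("localhost") as smtp:
        smtp.send_message(msg)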
send_mail(email_subject, email_message, CONFIG["mail"]["recipients"]) return @@ -146,9 +145,7 @@ def _process(run): run = Aviti_Run(given_run, CONFIG) _process(run) else: - data_dirs = CONFIG.get("element_analysis").get( - "data_dirs" - ) + data_dirs = CONFIG.get("element_analysis").get("data_dirs") for data_dir in data_dirs: # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*")) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 50a7e6c7..9bcc2c7e 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -331,27 +331,19 @@ def update_statusdb(self): doc_obj = self.to_doc_obj() self.db.upload_to_statusdb(doc_obj) - def manifest_exists(self): - zip_src_path = self.find_manifest_zip() - return os.path.isfile(zip_src_path) - def get_lims_step_id(self) -> str | None: """If the run was started using a LIMS-generated manifest, the ID of the LIMS step can be extracted from it. """ - # TODO: test me + with open(self.run_manifest_file_from_instrument) as json_file: + manifest_json = json.load(json_file) + + lims_step_id = manifest_json.get("RunValues").get("lims_step_id") - assert self.manifest_exists(), "Run manifest not found" - with open(self.run_manifest_file_from_instrument) as csv_file: - manifest_lines = csv_file.readlines() - for line in manifest_lines: - if "lims_step_id" in line: - lims_step_id = line.split(",")[1] - return lims_step_id - return None + return lims_step_id - def find_manifest_zip(self): + def find_lims_zip(self) -> str | None: # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( self.CONFIG.get("element_analysis") @@ -362,7 +354,8 @@ def find_manifest_zip(self): ) # Use LIMS step ID if available, else flowcell ID, to make a query pattern - if self.lims_step_id: + self.lims_step_id = self.get_lims_step_id() + if self.lims_step_id is not None: logging.info( f"Using LIMS step ID '{self.lims_step_id}' to find LIMS run manifests." ) @@ -379,20 +372,19 @@ def find_manifest_zip(self): logger.warning( f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." ) - return False # TODO: determine whether to raise an error here instead + return None elif len(glob_results) > 1: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." 
) glob_results.sort() - zip_src_path = glob_results[-1] + lims_zip_src_path = glob_results[-1] else: - zip_src_path = glob_results[0] - return zip_src_path + lims_zip_src_path = glob_results[0] + return lims_zip_src_path - def copy_manifests(self) -> bool: + def copy_manifests(self, zip_src_path) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" - zip_src_path = self.find_manifest_zip() # Make a run subdir named after the zip file and extract manifests there zip_name = os.path.basename(zip_src_path) zip_dst_path = os.path.join(self.run_dir, zip_name) From 602b14f522541f32477781404e26a7f8223b4277 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 16:57:15 +0200 Subject: [PATCH 109/187] troubleshooting --- taca/analysis/analysis_element.py | 8 +++---- taca/element/Element_Runs.py | 37 ++++++++++++++----------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index dc0a202c..6e4c0a71 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -50,14 +50,14 @@ def _process(run): if lims_zip_path is not None: os.mkdir(run.demux_dir) run.copy_manifests(lims_zip_path) - run_manifests = glob.glob( - os.path.join(run.run_dir, "RunManifest_*.csv") + demux_manifests = run.make_demux_manifests( + manifest_to_split=run.lims_manifest ) sub_demux_count = 0 - for run_manifest in run_manifests.sort(): + for demux_manifest in demux_manifests.sort(): demux_dir = f"Demultiplexing_{sub_demux_count}" os.mkdir(demux_dir) - run.start_demux(run_manifest, demux_dir) + run.start_demux(demux_manifest, demux_dir) sub_demux_count += 1 run.status = "demultiplexing" if run.status_changed: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 9bcc2c7e..bf804045 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -86,6 +86,8 @@ def get_mask( else: mask += f"{current_group}{current_group_len}" mask += f"N{diff}" + else: + mask += f"{current_group}{current_group_len}" # Parse mask string to check that it matches the number of cycles used assert ( @@ -386,29 +388,24 @@ def find_lims_zip(self) -> str | None: def copy_manifests(self, zip_src_path) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" # Make a run subdir named after the zip file and extract manifests there - zip_name = os.path.basename(zip_src_path) - zip_dst_path = os.path.join(self.run_dir, zip_name) - os.mkdir(zip_dst_path) + # Extract the contents of the zip file into the destination directory + unzipped_manifests = [] with zipfile.ZipFile(zip_src_path, "r") as zip_ref: - zip_ref.extractall(zip_dst_path) - - # Set the paths of the different manifests as attributes - manifests = os.listdir(zip_dst_path) - self.lims_full_manifest = [ - m for m in manifests if re.match(r".*_untrimmed\.csv$", m) - ][0] - self.lims_start_manifest = [ - m for m in manifests if re.match(r".*_trimmed\.csv$", m) - ][0] - self.lims_empty_manifest = [ - m for m in manifests if re.match(r".*_empty\.csv$", m) + for member in zip_ref.namelist(): + # Extract each file individually into the destination directory + filename = os.path.basename(member) + if filename: # Skip directories + source = zip_ref.open(member) + target = open(os.path.join(self.run_dir, filename), "wb") + unzipped_manifests.append(target.name) + with source, target: + target.write(source.read()) + + # Pick out the manifest to use + 
self.lims_manifest = [ + m for m in unzipped_manifests if re.match(r".*_untrimmed\.csv$", m) ][0] - self.lims_demux_manifests = [ - m for m in manifests if re.match(r".*_\d+\.csv$", m) - ] - - return True def make_demux_manifests( self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None From 102f223f05b9f621935065f602f53584580e211b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 17:02:01 +0200 Subject: [PATCH 110/187] mypy --- taca/element/Element_Runs.py | 2 +- tests/element/test_Element_Runs.py | 24 ------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index bf804045..e8d41dba 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -385,7 +385,7 @@ def find_lims_zip(self) -> str | None: lims_zip_src_path = glob_results[0] return lims_zip_src_path - def copy_manifests(self, zip_src_path) -> bool: + def copy_manifests(self, zip_src_path): """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" # Make a run subdir named after the zip file and extract manifests there diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 2ebd777f..7963761d 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -158,30 +158,6 @@ def test_get_demultiplexing_status( assert run.get_demultiplexing_status() == p["expected"] - @pytest.mark.skip(reason="Not implemented yet") - @pytest.mark.parametrize( - "p", - [ - {"run_finished": True, "expected": True}, - {"run_finished": False, "expected": False}, - ], - ids=["exists", "does not exist"], - ) - def test_manifest_exists( - self, mock_db: mock.Mock, create_dirs: pytest.fixture, p: pytest.fixture - ): - tmp: tempfile.TemporaryDirectory = create_dirs - - run = to_test.Run( - create_element_run_dir( - tmp, - run_finished=p["run_finished"], - ), - get_config(tmp), - ) - - assert run.manifest_exists() == p["expected"] - @pytest.mark.skip(reason="Not implemented yet") def test_generate_demux_command(self, mock_db): pass From b57306d4298072a12d50bbc9ea9cea1d7fe776ba Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 4 Oct 2024 09:26:12 +0200 Subject: [PATCH 111/187] Don't group by lane when creating submanifests. Add sanity check for sample grouping. 
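
The grouping below keys submanifests on the index masks, UMI mask, and recipe
rather than on lane. A minimal sketch of the idea on a toy table (column names
taken from this diff; the data itself is made up):

    import pandas as pd

    df_samples = pd.DataFrame(
        {
            "SampleName": ["S1", "S2", "S3"],
            "I1Mask": ["I1:Y8", "I1:Y8", "I1:Y10N2"],
            "I2Mask": ["I2:Y8", "I2:Y8", "I2:Y10N2"],
            "UmiMask": ["I2:N8", "I2:N8", "I2:N12"],
            "Recipe": ["151-8-8-151"] * 3,
        }
    )

    grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Recipe"])

    # Sanity check: every sample must fall into exactly one group, so the
    # group sizes sum to the number of input rows.
    assert sum(len(group) for _, group in grouped_df) == len(df_samples)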
--- taca/element/Element_Runs.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e8d41dba..97a9d2da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -442,10 +442,10 @@ def make_demux_manifests( df_samples = df[df["Project"] != "Control"].copy() df_controls = df[df["Project"] == "Control"].copy() - # Bool indicating whether UMI is present + # Add bool indicating whether UMI is present df_samples["has_umi"] = df_samples["Index2"].str.contains("N") - # Add cols denoting idx and umi masks + # Add masks for indices and UMIs df_samples["I1Mask"] = df_samples["Index1"].apply( lambda seq: get_mask( seq=seq, @@ -471,7 +471,7 @@ def make_demux_manifests( ) ) - # Re-make idx col without Ns + # Re-make Index2 column without any Ns df_samples["Index2_umi"] = df_samples["Index2"] df_samples.loc[:, "Index2"] = df_samples["Index2"].apply( lambda x: x.replace("N", "") @@ -482,15 +482,21 @@ def make_demux_manifests( outdir = self.run_dir # Break down into groups by non-consolable properties - grouped_df = df_samples.groupby( - ["I1Mask", "I2Mask", "UmiMask", "Lane", "Recipe"] - ) + grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Recipe"]) + + # Sanity check + if sum([len(group) for _, group in grouped_df]) < len(df_samples): + msg = "Some samples were not included in any submanifest." + logging.error(msg) + raise AssertionError(msg) + elif sum([len(group) for _, group in grouped_df]) > len(df_samples): + logging.warning("Some samples were included in multiple submanifests.") # Iterate over groups to build composite manifests manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - for (I1Mask, I2Mask, UmiMask, lane, recipe), group in grouped_df: + for (I1Mask, I2Mask, UmiMask, recipe), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( @@ -499,7 +505,7 @@ def make_demux_manifests( "KeyName, Value", f'manifest_file, "{file_name}"', f"manifest_group, {n+1}/{len(grouped_df)}", - f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane} recipe:'{recipe}'", + f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' recipe:'{recipe}'", ] ) @@ -527,8 +533,9 @@ def make_demux_manifests( ) # Add PhiX stratified by index length - # Subset controls by lane - group_controls = df_controls[df_controls["Lane"] == lane].copy() + group_controls = df_controls[ + df_controls["Lane"].isin(group["Lane"].unique()) + ].copy() # Trim PhiX indexes to match group i1_len = group["Index1"].apply(len).max() From ec786f15919d96f050e17dd3407ca25cee9a6b59 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 10:18:46 +0200 Subject: [PATCH 112/187] Fix method definitions --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 97a9d2da..ff1fc0ef 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -639,7 +639,7 @@ def rsync_successful(self): return False # Clear all content under a dir - def clear_dir(dir): + def clear_dir(self, dir): for filename in os.listdir(dir): file_path = os.path.join(dir, filename) try: @@ -651,7 +651,7 @@ def clear_dir(dir): print(f"Failed to delete {file_path} Reason {e}") # Write to csv - def write_to_csv(data, filename): + def write_to_csv(self, data, filename): # Get the fieldnames from the keys of the first 
dictionary fieldnames = data[0].keys() # Open the file and write the CSV From 533f448095ad76a52b3453fdcf1aefa9f06e63eb Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 10:49:07 +0200 Subject: [PATCH 113/187] Handle masking when no R2 --- taca/element/Element_Runs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index ff1fc0ef..34cea966 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -31,6 +31,8 @@ def get_mask( """ # Input assertions + if not seq: + return None assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" assert prefix in [ @@ -519,10 +521,12 @@ def make_demux_manifests( "SettingName, Value", f"R1Mask, {R1Mask}", f"I1Mask, {I1Mask}", - f"I2Mask, {I2Mask}", f"R2Mask, {R2Mask}", ] ) + + if I2Mask: + settings_section += f"\nI2Mask, {I2Mask}" if group["has_umi"].all(): settings_section += "\n" + "\n".join( From 0b030b13f7968a8b278b70b5a8196fe0f8ad85ec Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 12:02:03 +0200 Subject: [PATCH 114/187] Return empty string instead of None for missing I2 --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 34cea966..6550756c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -32,7 +32,7 @@ def get_mask( # Input assertions if not seq: - return None + return "" assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" assert prefix in [ @@ -525,7 +525,7 @@ def make_demux_manifests( ] ) - if I2Mask: + if I2Mask != "": settings_section += f"\nI2Mask, {I2Mask}" if group["has_umi"].all(): From f41778867d1314496ebc5dfeeee8711682b84881 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 13:54:04 +0200 Subject: [PATCH 115/187] fix sorting issue --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 6e4c0a71..95bacaac 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -54,7 +54,7 @@ def _process(run): manifest_to_split=run.lims_manifest ) sub_demux_count = 0 - for demux_manifest in demux_manifests.sort(): + for demux_manifest in sorted(demux_manifests): demux_dir = f"Demultiplexing_{sub_demux_count}" os.mkdir(demux_dir) run.start_demux(demux_manifest, demux_dir) From a0ec98dc6da0b37acf423f6d050a8935742291ce Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 15:27:29 +0200 Subject: [PATCH 116/187] Fixes for masks --- taca/analysis/analysis_element.py | 8 +++++--- taca/element/Element_Runs.py | 21 +++++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 95bacaac..f0274e47 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -55,9 +55,11 @@ def _process(run): ) sub_demux_count = 0 for demux_manifest in sorted(demux_manifests): - demux_dir = f"Demultiplexing_{sub_demux_count}" - os.mkdir(demux_dir) - run.start_demux(demux_manifest, demux_dir) + sub_demux_dir = os.path.join( + run.run_dir, f"Demultiplexing_{sub_demux_count}" + ) + 
os.mkdir(sub_demux_dir) + run.start_demux(demux_manifest, sub_demux_dir) sub_demux_count += 1 run.status = "demultiplexing" if run.status_changed: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6550756c..d1cd15a8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -30,9 +30,14 @@ def get_mask( get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N6' """ + if not seq and prefix == "I2:": + mask = "I2:N*" + return mask + if not seq and mask_type == "umi": + mask = "I2:Y*" + return mask + # Input assertions - if not seq: - return "" assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" assert prefix in [ @@ -519,14 +524,12 @@ def make_demux_manifests( [ "[SETTINGS]", "SettingName, Value", - f"R1Mask, {R1Mask}", + f"R1FastqMask, {R1Mask}", f"I1Mask, {I1Mask}", - f"R2Mask, {R2Mask}", + f"I2Mask, {I2Mask}" + f"R2FastqMask, {R2Mask}", ] ) - - if I2Mask != "": - settings_section += f"\nI2Mask, {I2Mask}" if group["has_umi"].all(): settings_section += "\n" + "\n".join( @@ -584,7 +587,9 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --legacy-fastq" + " --force-index-orientation" ) - with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: + with open( + os.path.join(self.run_dir, ".bases2fastq_command"), "w" + ) as command_file: command_file.write(command) return command From ec28bb74a24da82883a845d2d6025b6f65c01073 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 15:39:34 +0200 Subject: [PATCH 117/187] Fixes to masks --- taca/element/Element_Runs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d1cd15a8..38d3516a 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -517,8 +517,8 @@ def make_demux_manifests( ) recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles['R1']}" - R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles['R2']}" + R1Mask = f"R1:Y{recipe_split[0]}" + R2Mask = f"R2:Y{recipe_split[3]}" settings_section = "\n".join( [ @@ -526,7 +526,7 @@ def make_demux_manifests( "SettingName, Value", f"R1FastqMask, {R1Mask}", f"I1Mask, {I1Mask}", - f"I2Mask, {I2Mask}" + f"I2Mask, {I2Mask}", f"R2FastqMask, {R2Mask}", ] ) From 2874583e865c8eff7b3308519fbb842a9873a84c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 7 Oct 2024 17:37:51 +0200 Subject: [PATCH 118/187] Clarify get_mask() and also use it to generate R1/R2 masks --- taca/element/Element_Runs.py | 73 ++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 38d3516a..5f9bff2c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -20,26 +20,25 @@ def get_mask( seq: str, - mask_type: str, + keep_Ns: bool, prefix: str, cycles_used: int, ) -> str: - """Example usage: - - get_mask("ACGTNNN", "umi", "I1:", None) -> 'I1:N4Y3' - get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N6' + """ + Inputs: + seq Sequence string to make mask from + keep_Ns Whether Ns should be "Y" or "N" in the mask, vice versa for ACGT + prefix Prefix to add to the mask + cycles_used Number of cycles used in the sequencing run + + Example usage: + get_mask( "ACGTNNN", True, "I1:", 7 ) -> 'I1:N4Y3' + get_mask( "ACGTNNN", False, "I2:", 10 ) -> 'I2:Y4N6' """ - if not seq and prefix == "I2:": - 
mask = "I2:N*" - return mask - if not seq and mask_type == "umi": - mask = "I2:Y*" - return mask - # Input assertions - assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" - assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" + if seq != "": + assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert prefix in [ "R1:", "R2:", @@ -47,6 +46,11 @@ def get_mask( "I2:", ], f"Mask prefix {prefix} not recognized" + # Handle no-input cases + if seq == "": + mask = f"{prefix}N{cycles_used}" + return mask + # Define dict to convert base to mask classifier base2mask = ( { @@ -56,7 +60,7 @@ def get_mask( "G": "Y", "T": "Y", } - if mask_type == "index" + if keep_Ns is False else { "N": "Y", "A": "N", @@ -452,11 +456,11 @@ def make_demux_manifests( # Add bool indicating whether UMI is present df_samples["has_umi"] = df_samples["Index2"].str.contains("N") - # Add masks for indices and UMIs + # Add masks df_samples["I1Mask"] = df_samples["Index1"].apply( lambda seq: get_mask( seq=seq, - mask_type="index", + keep_Ns=False, prefix="I1:", cycles_used=self.cycles["I1"], ) @@ -464,7 +468,7 @@ def make_demux_manifests( df_samples["I2Mask"] = df_samples["Index2"].apply( lambda seq: get_mask( seq=seq, - mask_type="index", + keep_Ns=False, prefix="I2:", cycles_used=self.cycles["I2"], ) @@ -472,14 +476,30 @@ def make_demux_manifests( df_samples["UmiMask"] = df_samples["Index2"].apply( lambda seq: get_mask( seq=seq, - mask_type="umi", + keep_Ns=True, prefix="I2:", cycles_used=self.cycles["I2"], ) ) + df_samples["R1Mask"] = df_samples["Recipe"].apply( + lambda recipe: get_mask( + seq="N" * int(recipe.split("-")[0]), + keep_Ns=True, + prefix="R1:", + cycles_used=self.cycles["R1"], + ) + ) + df_samples["R2Mask"] = df_samples["Recipe"].apply( + lambda recipe: get_mask( + seq="N" * int(recipe.split("-")[3]), + keep_Ns=True, + prefix="R2:", + cycles_used=self.cycles["R2"], + ) + ) # Re-make Index2 column without any Ns - df_samples["Index2_umi"] = df_samples["Index2"] + df_samples["Index2_with_Ns"] = df_samples["Index2"] df_samples.loc[:, "Index2"] = df_samples["Index2"].apply( lambda x: x.replace("N", "") ) @@ -489,7 +509,9 @@ def make_demux_manifests( outdir = self.run_dir # Break down into groups by non-consolable properties - grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Recipe"]) + grouped_df = df_samples.groupby( + ["I1Mask", "I2Mask", "UmiMask", "R1Mask", "R2Mask", "Recipe"] + ) # Sanity check if sum([len(group) for _, group in grouped_df]) < len(df_samples): @@ -503,23 +525,18 @@ def make_demux_manifests( manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - for (I1Mask, I2Mask, UmiMask, recipe), group in grouped_df: + for (I1Mask, I2Mask, UmiMask, R1Mask, R2Mask, recipe), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( [ "[RUNVALUES]", "KeyName, Value", - f'manifest_file, "{file_name}"', + f"manifest_file, {file_name}", f"manifest_group, {n+1}/{len(grouped_df)}", - f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' recipe:'{recipe}'", ] ) - recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}" - R2Mask = f"R2:Y{recipe_split[3]}" - settings_section = "\n".join( [ "[SETTINGS]", From 920cc27023113948640f5f4740abb09470a27d3e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 8 Oct 2024 11:07:16 +0200 Subject: [PATCH 119/187] Fix bug that not all FastQ files are symplinked --- taca/element/Element_Runs.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5f9bff2c..50696c11 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -732,8 +732,8 @@ def collect_demux_runmanifest(self, demux_results_dirs): # Aggregate the output FastQ files of samples from multiple demux def aggregate_sample_fastq(self, demux_runmanifest): lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) - unique_sample_demux = set() for lane in lanes: + unique_sample_demux = set() sample_count = 1 for sample in demux_runmanifest: lanenr = sample["Lane"] From 6582eb9fc9a6d4d5d3e3ede66932d7332b2bdec1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 12:25:18 +0200 Subject: [PATCH 120/187] check if dir exists before creating one --- taca/element/Element_Runs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5f9bff2c..a973679d 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1113,7 +1113,8 @@ def sync_metadata(self): ] metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") dest = os.path.join(metadata_archive, self.NGI_run_id) - os.makedirs(dest) + if not os.path.exists(dest): + os.makedirs(dest) for f in files_to_copy: shutil.copy(f, dest) From 160b204421a00c04e7c9a93f35a526a6da6a2de8 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 13:49:53 +0200 Subject: [PATCH 121/187] catch stderr from bases2fastq --- taca/element/Element_Runs.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f7d0b39b..a3b8419d 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -613,14 +613,19 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) + stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" try: - p_handle = subprocess.Popen( - cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir - ) + with open(stderr_abspath, "w") as stderr: + process = subprocess.Popen( + cmd, + shell=True, + cwd=self.run_dir, + stderr=stderr, + ) logger.info( "Bases2Fastq conversion and demultiplexing " f"started for run {self} on {datetime.now()}" - f"with p_handle {p_handle}" + f"with p_handle {process}" ) except subprocess.CalledProcessError: logger.warning( From 401c3a9bf10b4c0971613ddc837b4353bc3d14af Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 14:14:52 +0200 Subject: [PATCH 122/187] cleanup and versioning --- VERSIONLOG.md | 9 ++------- taca/__init__.py | 2 +- taca/element/Element_Runs.py | 8 +++++--- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index bc899c3c..0832b02d 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,13 +1,8 @@ # TACA Version Log -## 20240927.1 +## 20241008.1 -Add project name in IndexAssignment; -Correct index percentage in undet - -## 20240924.1 - -Aggregate aviti demultiplexing results +Add support for processing Element Aviti data ## 20240705.1 diff --git a/taca/__init__.py b/taca/__init__.py index b85b2cf5..c516d006 100644 --- a/taca/__init__.py +++ b/taca/__init__.py @@ -1,3 +1,3 @@ """Main TACA module""" -__version__ = "1.0.0" +__version__ = "1.1.0" diff --git a/taca/element/Element_Runs.py 
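
The stderr-capture pattern the patch above introduces, reduced to a runnable sketch; the command and paths here are invented stand-ins for the real bases2fastq invocation.

import os
import subprocess

run_dir = "/tmp/example_run"  # invented path
os.makedirs(run_dir, exist_ok=True)
cmd = "echo 'demux failed' >&2"  # placeholder for the bases2fastq command line

with open(f"{run_dir}/bases2fastq_stderr.txt", "w") as stderr:
    # Popen returns immediately; the demux keeps running in the background
    # while everything it prints to stderr streams into the log file.
    process = subprocess.Popen(cmd, shell=True, cwd=run_dir, stderr=stderr)

process.wait()  # for this example only; the patched code does not block here
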
b/taca/element/Element_Runs.py index a3b8419d..38603e8c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -390,7 +390,7 @@ def find_lims_zip(self) -> str | None: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." ) - glob_results.sort() + glob_results.sort() # TODO: add CLI option to specify manifest for re-demux lims_zip_src_path = glob_results[-1] else: lims_zip_src_path = glob_results[0] @@ -621,7 +621,7 @@ def start_demux(self, run_manifest, demux_dir): shell=True, cwd=self.run_dir, stderr=stderr, - ) + ) logger.info( "Bases2Fastq conversion and demultiplexing " f"started for run {self} on {datetime.now()}" @@ -817,7 +817,9 @@ def aggregate_undet_fastq(self, demux_runmanifest): ) ) for fastqfile in fastqfiles: - base_name = os.path.basename(fastqfile) + base_name = os.path.basename( + fastqfile + ) # TODO: Make symlinks relative instead of absolute to maintain them after archiving os.symlink(fastqfile, os.path.join(project_dest, base_name)) # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean From 41b90eb2f53bcb5abce5424d4c9009cb238db2df Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 14:18:15 +0200 Subject: [PATCH 123/187] spaaace --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 38603e8c..2a16c14c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -390,7 +390,7 @@ def find_lims_zip(self) -> str | None: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." ) - glob_results.sort() # TODO: add CLI option to specify manifest for re-demux + glob_results.sort() # TODO: add CLI option to specify manifest for re-demux lims_zip_src_path = glob_results[-1] else: lims_zip_src_path = glob_results[0] From e81460d2b6467f21b0042beee1c3810d2e019642 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 8 Oct 2024 16:38:42 +0200 Subject: [PATCH 124/187] Parse UMI masks for both I1 and I2, can only use one though --- taca/element/Element_Runs.py | 38 ++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 2a16c14c..65c41a6b 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -453,9 +453,6 @@ def make_demux_manifests( df_samples = df[df["Project"] != "Control"].copy() df_controls = df[df["Project"] == "Control"].copy() - # Add bool indicating whether UMI is present - df_samples["has_umi"] = df_samples["Index2"].str.contains("N") - # Add masks df_samples["I1Mask"] = df_samples["Index1"].apply( lambda seq: get_mask( @@ -473,7 +470,15 @@ def make_demux_manifests( cycles_used=self.cycles["I2"], ) ) - df_samples["UmiMask"] = df_samples["Index2"].apply( + df_samples["I1UmiMask"] = df_samples["Index1"].apply( + lambda seq: get_mask( + seq=seq, + keep_Ns=True, + prefix="I1:", + cycles_used=self.cycles["I1"], + ) + ) + df_samples["I2UmiMask"] = df_samples["Index2"].apply( lambda seq: get_mask( seq=seq, keep_Ns=True, @@ -510,7 +515,7 @@ def make_demux_manifests( # Break down into groups by non-consolable properties grouped_df = df_samples.groupby( - ["I1Mask", "I2Mask", "UmiMask", "R1Mask", "R2Mask", "Recipe"] + ["I1Mask", "I2Mask", "I1UmiMask", "I2UmiMask", "R1Mask", "R2Mask", "Recipe"] ) # Sanity check @@ 
-525,7 +530,15 @@ def make_demux_manifests( manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - for (I1Mask, I2Mask, UmiMask, R1Mask, R2Mask, recipe), group in grouped_df: + for ( + I1Mask, + I2Mask, + I1UmiMask, + I2UmiMask, + R1Mask, + R2Mask, + recipe, + ), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( @@ -548,13 +561,22 @@ def make_demux_manifests( ] ) - if group["has_umi"].all(): + if "Y" in I1UmiMask: settings_section += "\n" + "\n".join( [ - f"UmiMask, {UmiMask}", + f"UmiMask, {I1UmiMask}", "UmiFastQ, TRUE", ] ) + elif "Y" in I2UmiMask: + settings_section += "\n" + "\n".join( + [ + f"UmiMask, {I2UmiMask}", + "UmiFastQ, TRUE", + ] + ) + else: + pass # Add PhiX stratified by index length group_controls = df_controls[ From 34e260f5d92ed58682b5222556940c4c08719766 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 8 Oct 2024 16:40:47 +0200 Subject: [PATCH 125/187] stricter check --- taca/element/Element_Runs.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 65c41a6b..233249d8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -561,22 +561,24 @@ def make_demux_manifests( ] ) - if "Y" in I1UmiMask: + if "Y" in I1UmiMask and "Y" not in I2UmiMask: settings_section += "\n" + "\n".join( [ f"UmiMask, {I1UmiMask}", "UmiFastQ, TRUE", ] ) - elif "Y" in I2UmiMask: + elif "Y" in I2UmiMask and "Y" not in I1UmiMask: settings_section += "\n" + "\n".join( [ f"UmiMask, {I2UmiMask}", "UmiFastQ, TRUE", ] ) - else: + elif "Y" not in I1UmiMask and "Y" not in I2UmiMask: pass + else: + raise AssertionError("Both I1 and I2 appear to contain UMIs.") # Add PhiX stratified by index length group_controls = df_controls[ From ae961e94e500963fff9e284e3eb9ccc8172807ec Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 9 Oct 2024 13:17:50 +0200 Subject: [PATCH 126/187] overwrite group settings by settings row specified in lims manifest --- taca/element/Element_Runs.py | 60 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 233249d8..0b32accb 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -515,7 +515,15 @@ def make_demux_manifests( # Break down into groups by non-consolable properties grouped_df = df_samples.groupby( - ["I1Mask", "I2Mask", "I1UmiMask", "I2UmiMask", "R1Mask", "R2Mask", "Recipe"] + [ + "I1Mask", + "I2Mask", + "I1UmiMask", + "I2UmiMask", + "R1Mask", + "R2Mask", + "settings", + ] ) # Sanity check @@ -537,7 +545,7 @@ def make_demux_manifests( I2UmiMask, R1Mask, R2Mask, - recipe, + settings, ), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" @@ -547,39 +555,43 @@ def make_demux_manifests( "KeyName, Value", f"manifest_file, {file_name}", f"manifest_group, {n+1}/{len(grouped_df)}", + f"built_from, {manifest_to_split}", ] ) - settings_section = "\n".join( - [ - "[SETTINGS]", - "SettingName, Value", - f"R1FastqMask, {R1Mask}", - f"I1Mask, {I1Mask}", - f"I2Mask, {I2Mask}", - f"R2FastqMask, {R2Mask}", - ] - ) + # Instantiate settings + settings_kvs = { + "R1FastqMask": R1Mask, + "I1Mask": I1Mask, + "I2Mask": I2Mask, + "R2FastqMask": R2Mask, + } + # Add UMI settings if "Y" in I1UmiMask and "Y" not in I2UmiMask: - settings_section += "\n" + "\n".join( - [ - f"UmiMask, {I1UmiMask}", - "UmiFastQ, TRUE", - ] - ) + settings_kvs["UmiMask"] = I1UmiMask + 
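
The UMI-selection branching above condenses to a small pure function. This is an illustrative sketch under the same rule (a UMI may sit in I1 or I2, never both); the function name and mask strings are invented.

def pick_umi_mask(I1UmiMask: str, I2UmiMask: str) -> str | None:
    """Return the UMI mask to use, or None when neither index read has a UMI."""
    if "Y" in I1UmiMask and "Y" not in I2UmiMask:
        return I1UmiMask
    elif "Y" in I2UmiMask and "Y" not in I1UmiMask:
        return I2UmiMask
    elif "Y" not in I1UmiMask and "Y" not in I2UmiMask:
        return None
    else:
        raise AssertionError("Both I1 and I2 appear to contain UMIs.")

assert pick_umi_mask("I1:N8", "I2:N10Y14") == "I2:N10Y14"  # UMI in I2 only
assert pick_umi_mask("I1:N8", "I2:N24") is None            # no UMI at all
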
settings_kvs["UmiFastQ"] = "TRUE" elif "Y" in I2UmiMask and "Y" not in I1UmiMask: - settings_section += "\n" + "\n".join( - [ - f"UmiMask, {I2UmiMask}", - "UmiFastQ, TRUE", - ] - ) + settings_kvs["UmiMask"] = I2UmiMask + settings_kvs["UmiFastQ"] = "TRUE" elif "Y" not in I1UmiMask and "Y" not in I2UmiMask: pass else: raise AssertionError("Both I1 and I2 appear to contain UMIs.") + # Unpack settings from LIMS manifest + for kv in settings.split(" "): + k, v = kv.split(":") + settings_kvs[k] = v + + settings_section = "\n".join( + [ + "[SETTINGS]", + "SettingName, Value", + ] + + [f"{k}, {v}" for k, v in settings.items()] + ) + # Add PhiX stratified by index length group_controls = df_controls[ df_controls["Lane"].isin(group["Lane"].unique()) From 01dd52fd7ac215785062266047a3299f94f5f634 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 9 Oct 2024 13:20:14 +0200 Subject: [PATCH 127/187] fix ref --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 0b32accb..bfb8bfc7 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -589,7 +589,7 @@ def make_demux_manifests( "[SETTINGS]", "SettingName, Value", ] - + [f"{k}, {v}" for k, v in settings.items()] + + [f"{k}, {v}" for k, v in settings_kvs.items()] ) # Add PhiX stratified by index length From f20c3d7b9fc10ab7d86ff543ab532376fc5758e7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 11:25:10 +0200 Subject: [PATCH 128/187] use tree output for docs --- tests/conftest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e9a3fd89..a4945938 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,31 +14,33 @@ def create_dirs(): │ ├── Chromium_10X_indexes.txt │ └── Smart-seq3_v1.5.csv ├── log - │ ├── transfer_minion_qc.tsv + │ ├── taca.log + │ ├── transfer.tsv + │ ├── transfer_aviti.tsv │ ├── transfer_minion.tsv + │ ├── transfer_minion_qc.tsv │ └── transfer_promethion.tsv - │ └── transfer_aviti.tsv - │ └── transfer.tsv - │ └── taca.log ├── miarka │ ├── minion │ │ └── qc │ └── promethion ├── minknow_reports ├── ngi-nas-ns + │ ├── Aviti_data │ ├── NextSeq_data │ ├── NovaSeqXPlus_data │ ├── NovaSeq_data │ ├── minion_data │ ├── miseq_data │ ├── promethion_data - │ ├── Aviti_data │ └── samplesheets + │ ├── Aviti │ ├── NovaSeqXPlus │ └── anglerfish - │ └── Aviti └── ngi_data └── sequencing + ├── AV242106 + │ └── nosync ├── MiSeq │ └── nosync ├── NextSeq @@ -52,8 +54,6 @@ def create_dirs(): │ └── qc │ └── nosync └── promethion - │ └── nosync - └── AV242106 └── nosync --> Return the the temporary directory object From 1d48b47cd493c46ce9a83d5b097e5f1745e13965 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 11:25:21 +0200 Subject: [PATCH 129/187] add files for func test --- tests/element/test_Element_Runs.py | 356 ++++++++++++++++++++++++++++- 1 file changed, 347 insertions(+), 9 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7963761d..c7dffc18 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -1,6 +1,6 @@ -import json import os import tempfile +import zipfile from unittest import mock import pytest @@ -25,7 +25,8 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: def create_element_run_dir( tmp: tempfile.TemporaryDirectory, - run_name: str = "20240716_AV242106_testrun", + run_name: str = 
"20240926_AV242106_A2349523513", + lims_manifest: bool = True, nosync: bool = False, run_finished: bool = True, sync_finished: bool = True, @@ -35,9 +36,11 @@ def create_element_run_dir( outcome_completed: bool = True, ) -> str: """ - Conditionally build a file tree for an Element run. + Build a run dir for an Element run for test purposes. - . + Some file contents are replaced with "MOCK" to shorten them. + + 20240926_AV242106_A2349523513 ├── RunManifest.csv ├── RunManifest.json ├── RunParameters.json @@ -59,14 +62,349 @@ def create_element_run_dir( run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" os.mkdir(run_path) + # Create LIMS manifest + if lims_manifest: + manifest_root_name = "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson" + manifest_pdir = f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti/2024" + + csv_path = f"{manifest_pdir}/{manifest_root_name}_untrimmed.csv" + zip_path = f"{manifest_pdir}/{manifest_root_name}.zip" + + with open(csv_path, "w") as stream: + stream.write("""[RUNVALUES] +KeyName, Value +lims_step_name, "Load to Flowcell (AVITI) v1.0" +lims_step_id, "24-1061390" +manifest_file, "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson_untrimmed.csv" + +[SETTINGS] +SettingName, Value + +[SAMPLES] +SampleName,Index1,Index2,Lane,Project,Recipe +P32105_1001,AAAGCATA,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1001,CTGCAGCC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1001,GCCTTTAT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1001,TGTAGCGG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,ATTGGACG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,CAGCTTAC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,GGCAAGGA,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,TCATCCTT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,ACGTTACA,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,CGTAGGTT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,GACGACGG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,TTACCTAC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,ACTTCACT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,CGAAGTTG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,GAGCACGC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,TTCGTGAA,,1,I__Adameyko_24_06,50-8-24-49 +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,1,Control,0-0 +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,1,Control,0-0 +PhiX_Adept,GCACATAGTC,GACTACTAGC,1,Control,0-0 +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,1,Control,0-0 +P32105_1001,AAAGCATA,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1001,CTGCAGCC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1001,GCCTTTAT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1001,TGTAGCGG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,ATTGGACG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,CAGCTTAC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,GGCAAGGA,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,TCATCCTT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,ACGTTACA,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,CGTAGGTT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,GACGACGG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,TTACCTAC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,ACTTCACT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,CGAAGTTG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,GAGCACGC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,TTCGTGAA,,2,I__Adameyko_24_06,50-8-24-49 +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,2,Control,0-0 +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,2,Control,0-0 +PhiX_Adept,GCACATAGTC,GACTACTAGC,2,Control,0-0 +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,2,Control,0-0 +""") + + with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf: + # Add the CSV 
file to the zip file + zipf.write(csv_path, os.path.basename(csv_path)) + # Populate run dir with files and folders if run_finished: - open(f"{run_path}/RunManifest.csv", "w").close() - open(f"{run_path}/RunManifest.json", "w").close() - open(f"{run_path}/RunParameters.json", "w").close() - with open(f"{run_path}/RunUploaded.json", "w") as f: + with open(f"{run_path}/RunManifest.json", "w") as stream: + stream.write("""{ + "KitConfiguration": { + "MaxCycles": 334, + "DefaultR1Cycles": 151, + "DefaultR2Cycles": 151, + "DefaultI1Cycles": -1, + "DefaultI2Cycles": -1, + "MinimumR1Cycles": 5, + "MinimumR2Cycles": 0, + "MinimumI1Cycles": 0, + "MinimumI2Cycles": 0, + "DefaultI1FastQ": false, + "DefaultI2FastQ": false, + "DefaultUMIFastQ": false, + "DefaultI1Mask": "I1:Y*", + "DefaultI2Mask": "I2:Y*", + "DefaultUmiMask": "I1:N*", + "DefaultR1FastQMask": "R1:Y*N", + "DefaultR2FastQMask": "R2:Y*N", + "DefaultI1MaskRead": "I1", + "DefaultI2MaskRead": "I2", + "DefaultUmiMaskRead": "I1", + "DefaultR1FastQMaskRead": "R1", + "DefaultR2FastQMaskRead": "R2", + "DefaultR1Adapter": "", + "DefaultR2Adapter": "", + "DefaultR1AdapterTrim": false, + "DefaultR2AdapterTrim": false, + "DefaultR1AdapterNMask": false, + "DefaultR2AdapterNMask": false, + "DefaultR1AdapterMinimumTrimmedLength": 16, + "DefaultR2AdapterMinimumTrimmedLength": 16, + "DefaultR1AdapterMinimumStringency": 0.9, + "DefaultR2AdapterMinimumStringency": 0.9, + "DefaultR1AdapterMinimumOverlap": 3, + "DefaultR2AdapterMinimumOverlap": 3, + "DefaultAdapterTrimType": "Paired-End" + }, + "RunParameters": { + "PreparationWorkflow": "Adept", + "KitConfiguration": "300Cycles", + "ChemistryVersion": "Cloudbreak", + "LowDiversity": false, + "I1Cycles": 8, + "I2Cycles": 24, + "R1Cycles": 50, + "R2Cycles": 49 + }, + "RunValues": { + "lims_step_id": "24-1061390", + "lims_step_name": "Load to Flowcell (AVITI) v1.0", + "manifest_file": "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson_trimmed.csv" + }, + "Settings": [ + { + "Lane": 1, + "I1MismatchThreshold": 1, + "I2MismatchThreshold": 1, + "R1Adapter": [], + "R2Adapter": [], + "I1MaskManifest": "I1:N*", + "I1Mask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "I1FastQ": false, + "I2MaskManifest": "I2:N*", + "I2Mask": [ + { + "Read": "I2", + "Cycles": [] + } + ], + "I2FastQ": false, + "UmiMaskManifest": "I1:N*", + "UmiMask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "UmiFastQ": false, + "R1FastQMaskManifest": "R1:Y*N", + "R1FastQMask": [ + { + "Read": "R1", + "Cycles": "MOCK" + } + ], + "R2FastQMaskManifest": "R2:Y*N", + "R2FastQMask": [ + { + "Read": "R2", + "Cycles": "MOCK" + } + ], + "SpikeInAsUnassigned": true, + "R1AdapterTrim": false, + "R2AdapterTrim": false, + "R1AdapterNMask": false, + "R2AdapterNMask": false, + "R1AdapterMinimumTrimmedLength": 16, + "R2AdapterMinimumTrimmedLength": 16, + "R1AdapterMinimumStringency": 0.9, + "R2AdapterMinimumStringency": 0.9, + "R1AdapterMinimumOverlap": 3, + "R2AdapterMinimumOverlap": 3, + "AdapterTrimType": "Paired-End" + }, + { + "Lane": 2, + "I1MismatchThreshold": 1, + "I2MismatchThreshold": 1, + "R1Adapter": [], + "R2Adapter": [], + "I1MaskManifest": "I1:N*", + "I1Mask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "I1FastQ": false, + "I2MaskManifest": "I2:N*", + "I2Mask": [ + { + "Read": "I2", + "Cycles": [] + } + ], + "I2FastQ": false, + "UmiMaskManifest": "I1:N*", + "UmiMask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "UmiFastQ": false, + "R1FastQMaskManifest": "R1:Y*N", + "R1FastQMask": [ + { + "Read": "R1", + 
"Cycles": "MOCK" + } + ], + "R2FastQMaskManifest": "R2:Y*N", + "R2FastQMask": [ + { + "Read": "R2", + "Cycles": "MOCK" + } + ], + "SpikeInAsUnassigned": true, + "R1AdapterTrim": false, + "R2AdapterTrim": false, + "R1AdapterNMask": false, + "R2AdapterNMask": false, + "R1AdapterMinimumTrimmedLength": 16, + "R2AdapterMinimumTrimmedLength": 16, + "R1AdapterMinimumStringency": 0.9, + "R2AdapterMinimumStringency": 0.9, + "R1AdapterMinimumOverlap": 3, + "R2AdapterMinimumOverlap": 3, + "AdapterTrimType": "Paired-End" + } + ], + "Samples": [ + { + "SampleName": "DefaultSample", + "SampleNumber": 1, + "ExternalId": "", + "Indexes": [ + { + "Lane": 1, + "Index1": "", + "Index2": "" + }, + { + "Lane": 2, + "Index1": "", + "Index2": "" + } + ], + "CustomMetadata": {}, + "Project": "DefaultProject" + } + ] +} +""") + with open(f"{run_path}/RunParameters.json", "w") as stream: + stream.write("""{ + "FileVersion": "5.0.0", + "RunName": "A2349523513", + "RecipeExecutionID": "rec.9590c80c95fc4eee8b3eb10c31251915", + "RunID": "seq_66f5837f1ae1a35f10a2e594", + "RunType": "Sequencing", + "RunDescription": "", + "Side": "SideA", + "FlowcellID": "2349523513", + "Date": "2024-09-26T16:34:55.978072698Z", + "InstrumentName": "AV242106", + "OperatorName": "christian.natanael@scilifelab.se ", + "RunFolderName": "20240926_AV242106_A2349523513", + "Tiles": "MOCK", + "Cycles": { + "R1": 50, + "R2": 49, + "I1": 8, + "I2": 24 + }, + "ReadOrder": "I1,I2,R1,R2", + "ThroughputSelection": "High", + "KitConfiguration": "300Cycles", + "PreparationWorkflow": "Adept", + "ChemistryVersion": "Cloudbreak", + "LowDiversity": false, + "PlatformVersion": "2.6.2", + "AnalysisLanes": "1+2", + "StorageConnectionID": "local:66866355d07c3234c01b67b1", + "PMGMask": "P1:Y4N*", + "Consumables": { + "Flowcell": { + "SerialNumber": "2349523513", + "PartNumber": "810-00002", + "LotNumber": "2405300233", + "Expiration": "2025-05-31T00:00:00Z", + "ExpirationStr": "20250531", + "BarcodeStr": "2349523513,810-00002,2405300233,20250531" + }, + "SequencingCartridge": { + "SerialNumber": "24062600390028", + "PartNumber": "820-00013", + "LotNumber": "2406260039", + "Expiration": "2025-05-22T00:00:00Z", + "ExpirationStr": "20250522", + "BarcodeStr": "24062600390028,820-00013,2406260039,20250522" + }, + "Buffer": { + "SerialNumber": "24062400390041", + "PartNumber": "820-00002", + "LotNumber": "2406240039", + "Expiration": "2026-06-25T00:00:00Z", + "ExpirationStr": "20260625", + "BarcodeStr": "24062400390041,820-00002,2406240039,20260625" + } + }, + "LibraryType": "Linear", + "RecipeValues": [ + { + "Name": "filterMask", + "Value": "R1:Y15N*-R2:Y15N*" + } + ], + "AdvancedSettings": { + "PolonyDensity": "HighDensity" + } +} +""") + with open(f"{run_path}/RunUploaded.json", "w") as stream: outcome = "OutcomeCompleted" if outcome_completed else "OutcomeFailed" - f.write(json.dumps({"outcome": outcome})) + stream.write( + "{" + + '"version":"1.0.0",' + + '"instrument":"AV242106",' + + '"instrumentId":"0000024023696901c5621014",' + + '"runType":"Sequencing",' + + '"recipeExecutionId":"rec.9590c80c95fc4eee8b3eb10c31251915",' + + '"runID":"seq_66f5837f1ae1a35f10a2e594",' + + f'"outcome":"{outcome}"' + + "}" + ) if sync_finished: open(f"{run_path}/.sync_finished", "w").close() From 76da6a5ff70e1b89d2d4392fb289e3e93e79c616 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 14:52:44 +0200 Subject: [PATCH 130/187] fix presumed method call bug --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index f0274e47..915fcfa7 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -62,7 +62,7 @@ def _process(run): run.start_demux(demux_manifest, sub_demux_dir) sub_demux_count += 1 run.status = "demultiplexing" - if run.status_changed: + if run.status_changed(): run.update_statusdb() return else: From 2fb014f936d5aa983b2d998fc00f9a615a1924a7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 14:54:05 +0200 Subject: [PATCH 131/187] reset kwargs, update lims manifest, fix bugs, remove csv after zipping it --- tests/element/test_Element_Runs.py | 103 +++++++++++++++-------------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index c7dffc18..63e8fe89 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -27,13 +27,13 @@ def create_element_run_dir( tmp: tempfile.TemporaryDirectory, run_name: str = "20240926_AV242106_A2349523513", lims_manifest: bool = True, - nosync: bool = False, run_finished: bool = True, - sync_finished: bool = True, - demux_dir: bool = True, - n_demux_subdirs: int = 1, - demux_done: bool = True, outcome_completed: bool = True, + sync_finished: bool = True, + demux_dir: bool = False, + n_demux_subdirs: int = 0, + demux_done: bool = False, + nosync: bool = False, ) -> str: """ Build a run dir for an Element run for test purposes. @@ -67,66 +67,71 @@ def create_element_run_dir( manifest_root_name = "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson" manifest_pdir = f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti/2024" + os.mkdir(manifest_pdir) + csv_path = f"{manifest_pdir}/{manifest_root_name}_untrimmed.csv" zip_path = f"{manifest_pdir}/{manifest_root_name}.zip" with open(csv_path, "w") as stream: + # This run manifest was generated after the sequencing run, + # and is different from what it's file name implies. 
stream.write("""[RUNVALUES] KeyName, Value -lims_step_name, "Load to Flowcell (AVITI) v1.0" -lims_step_id, "24-1061390" -manifest_file, "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson_untrimmed.csv" +lims_step_name, Load to Flowcell (AVITI) v1.0 +lims_step_id, 24-1061411 +manifest_file, AVITI_run_manifest_2349523513_24-1061411_241011_142515_AlfredKedhammar_untrimmed.csv [SETTINGS] SettingName, Value [SAMPLES] -SampleName,Index1,Index2,Lane,Project,Recipe -P32105_1001,AAAGCATA,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1001,CTGCAGCC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1001,GCCTTTAT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1001,TGTAGCGG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,ATTGGACG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,CAGCTTAC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,GGCAAGGA,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,TCATCCTT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,ACGTTACA,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,CGTAGGTT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,GACGACGG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,TTACCTAC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,ACTTCACT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,CGAAGTTG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,GAGCACGC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,TTCGTGAA,,1,I__Adameyko_24_06,50-8-24-49 -PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,1,Control,0-0 -PhiX_Adept,CACAGATCGT,ACGAGAGTCT,1,Control,0-0 -PhiX_Adept,GCACATAGTC,GACTACTAGC,1,Control,0-0 -PhiX_Adept,TGTGTCGACA,TGTCTGACAG,1,Control,0-0 -P32105_1001,AAAGCATA,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1001,CTGCAGCC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1001,GCCTTTAT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1001,TGTAGCGG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,ATTGGACG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,CAGCTTAC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,GGCAAGGA,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,TCATCCTT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,ACGTTACA,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,CGTAGGTT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,GACGACGG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,TTACCTAC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,ACTTCACT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,CGAAGTTG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,GAGCACGC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,TTCGTGAA,,2,I__Adameyko_24_06,50-8-24-49 -PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,2,Control,0-0 -PhiX_Adept,CACAGATCGT,ACGAGAGTCT,2,Control,0-0 -PhiX_Adept,GCACATAGTC,GACTACTAGC,2,Control,0-0 -PhiX_Adept,TGTGTCGACA,TGTCTGACAG,2,Control,0-0 +SampleName,Index1,Index2,Lane,Project,Recipe,lims_label,settings +P32105_1001,AAAGCATA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,CTGCAGCC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,GCCTTTAT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,TGTAGCGG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1002,ATTGGACG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,CAGCTTAC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,GGCAAGGA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,TCATCCTT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True 
+P32105_1003,ACGTTACA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,CGTAGGTT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,GACGACGG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,TTACCTAC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1004,ACTTCACT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,CGAAGTTG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,GAGCACGC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,TTCGTGAA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,1,Control,0-0,, +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,1,Control,0-0,, +PhiX_Adept,GCACATAGTC,GACTACTAGC,1,Control,0-0,, +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,1,Control,0-0,, +P32105_1001,AAAGCATA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,CTGCAGCC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,GCCTTTAT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,TGTAGCGG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1002,ATTGGACG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,CAGCTTAC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,GGCAAGGA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,TCATCCTT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1003,ACGTTACA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,CGTAGGTT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,GACGACGG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,TTACCTAC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1004,ACTTCACT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,CGAAGTTG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,GAGCACGC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,TTCGTGAA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,2,Control,0-0,, +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,2,Control,0-0,, +PhiX_Adept,GCACATAGTC,GACTACTAGC,2,Control,0-0,, +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,2,Control,0-0,, """) with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf: # Add the CSV file to the zip file zipf.write(csv_path, os.path.basename(csv_path)) + os.remove(csv_path) # Populate run dir with files and folders if run_finished: From 120f9475f4bd3364e7e22a9b2948442283815d19 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 14:54:43 +0200 Subject: [PATCH 132/187] mock subprocess, use default kwargs --- tests/analysis/test_analysis_element.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 49067ca4..ff4d380b 100644 --- a/tests/analysis/test_analysis_element.py +++ 
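
The fixture's zip-then-remove step in isolation, with invented paths and a trimmed-down CSV body; only the zip is left behind, which is the form the manifest-lookup code expects to find.

import os
import zipfile

csv_path = "/tmp/manifest_untrimmed.csv"  # invented paths
zip_path = "/tmp/manifest.zip"

with open(csv_path, "w") as stream:
    stream.write("[SAMPLES]\nSampleName,Index1\nP0_1001,AAAGCATA\n")

with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
    # Use the basename as the archive member name so the zip has no parent dirs
    zipf.write(csv_path, os.path.basename(csv_path))

os.remove(csv_path)  # leave only the zip behind
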
b/tests/analysis/test_analysis_element.py
@@ -1,12 +1,9 @@
 from tempfile import TemporaryDirectory
 from unittest.mock import patch

-import pytest
-
 from tests.element.test_Element_Runs import create_element_run_dir, get_config


-@pytest.mark.skip(reason="Not implemented yet")
 def test_run_preprocessing(create_dirs):
     tmp: TemporaryDirectory = create_dirs

@@ -19,17 +16,14 @@ def test_run_preprocessing(create_dirs):
     mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
     mock_db.start()

+    # Mock subprocess
+    mock_subprocess = patch("subprocess.Popen")
+    mock_subprocess.start()
+
+    # Create run dir and associated LIMS manifest
+    run_dir = create_element_run_dir(tmp=tmp)
+
     # Import module to test
     from taca.analysis import analysis_element as to_test

-    run_dir = create_element_run_dir(
-        tmp=tmp,
-        nosync=False,
-        run_finished=False,
-        sync_finished=False,
-        demux_dir=False,
-        demux_done=False,
-        outcome_completed=False,
-    )
-
     to_test.run_preprocessing(run_dir)

From dd4917744e2dd4e719d5f6a38ee826eeda9d31b6 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Fri, 11 Oct 2024 15:01:11 +0200
Subject: [PATCH 133/187] add stop mocks

---
 tests/analysis/test_analysis_element.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index ff4d380b..41e7f305 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -26,4 +26,10 @@ def test_run_preprocessing(create_dirs):
     # Import module to test
     from taca.analysis import analysis_element as to_test

+    # Test
     to_test.run_preprocessing(run_dir)
+
+    # Stop mocks
+    mock_config.stop()
+    mock_db.stop()
+    mock_subprocess.stop()

From e27bc66353bd4b716157ed7e6d4f301d5499a579 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 11 Oct 2024 15:01:15 +0200
Subject: [PATCH 134/187] Fix issue with 0 lane number; Add percentage in total unassigned

---
 taca/element/Element_Runs.py | 49 ++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index bfb8bfc7..d7fb42c7 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -986,8 +986,11 @@ def aggregate_stats_assigned(self, demux_runmanifest):
             aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv
         )

+        yield aggregated_assigned_indexes_filtered_sorted
+
     # Aggregate stats in UnassignedSequences.csv
-    def aggregate_stats_unassigned(self, demux_runmanifest):
+    def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_indexes_filtered_sorted):
         aggregated_unassigned_indexes = []
         lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest)))
         for lane in lanes:
@@ -1105,12 +1108,37 @@ def aggregate_stats_unassigned(self, demux_runmanifest):
             aggregated_unassigned_indexes, key=lambda x: (x["Lane"], -int(x["Count"]))
         )
         # Fetch PFCount for each lane
+        # to calculate % of unassigned index in total lane PF polonies
         pfcount_lane = {}
         if os.path.exists(self.run_stats_file):
             with open(self.run_stats_file) as stats_json:
                 aviti_runstats_json = json.load(stats_json)
-            for lane_stats in aviti_runstats_json["LaneStats"]:
-                pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"])
+            # Check whether the lane numbers match between the run stat json and run manifests
+            if len(aviti_runstats_json["LaneStats"]) != len(lanes):
+                logger.warning(
+                    f"Inconsistent lane numbers between the {os.path.basename(self.run_stats_file)} file and
run manifests!" + ) + else: + # When there is no RunManifest uploaded at the sequencer, the lane numbers will all be 0 + # In this case we assume that the lanes are ordered by their numbers + if all(lane_stats["Lane"] == 0 for lane_stats in aviti_runstats_json["LaneStats"]): + lane_counter = 1 + for lane_stats in aviti_runstats_json["LaneStats"]: + pfcount_lane[str(lane_counter)] = float(lane_stats["PFCount"]) + lane_counter += 1 + # Otherwise we parse the PF counts by matching the lane numbers + else: + for lane_stats in aviti_runstats_json["LaneStats"]: + pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"]) + # Prepare the dict for pf assigned coutn for each lane + pf_assigned_lane = {} + for sample in aggregated_assigned_indexes_filtered_sorted: + lane = sample['Lane'] + num_polonies_assigned = int(sample['NumPoloniesAssigned']) + if lane in pf_assigned_lane: + pf_assigned_lane[lane] += num_polonies_assigned + else: + pf_assigned_lane[lane] = num_polonies_assigned # Modify the % Polonies values based on PFCount for each lane for unassigned_index in aggregated_unassigned_indexes: if pfcount_lane.get(unassigned_index["Lane"]): @@ -1119,6 +1147,17 @@ def aggregate_stats_unassigned(self, demux_runmanifest): / pfcount_lane[unassigned_index["Lane"]] * 100 ) + # Calculate the % Polonies values in the total unassigned for each lane + if pf_assigned_lane.get(unassigned_index["Lane"]): + unassigned_index["% Unassigned"] = ( + float(unassigned_index["Count"]) + / (pfcount_lane[unassigned_index["Lane"]] - pf_assigned_lane[unassigned_index["Lane"]]) + * 100 + ) + else: + unassigned_index["% Unassigned"] = 0 + else: + unassigned_index["% Polonies"] = 0 else: logger.warning( f"No {os.path.basename(self.run_stats_file)} file found for the run." @@ -1143,9 +1182,9 @@ def aggregate_demux_results(self, demux_results_dirs): # Symlink the output FastQ files of undet only if a lane does not have multiple demux self.aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - self.aggregate_stats_assigned(demux_runmanifest) + aggregated_assigned_indexes_filtered_sorted = self.aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv - self.aggregate_stats_unassigned(demux_runmanifest) + self.aggregate_stats_unassigned(demux_runmanifest, aggregated_assigned_indexes_filtered_sorted) def sync_metadata(self): files_to_copy = [ From 447045f5fe9cbe718d036945ac0ff4f10c82287e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 11 Oct 2024 15:02:55 +0200 Subject: [PATCH 135/187] Fix bug' --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d7fb42c7..f74a30ee 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -986,7 +986,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv ) - yield aggregated_assigned_indexes_filtered_sorted + return aggregated_assigned_indexes_filtered_sorted # Aggregate stats in UnassignedSequences.csv From cfcef6834152eac61aea719b00a9b326f4aeba3e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 11 Oct 2024 15:04:52 +0200 Subject: [PATCH 136/187] ruff format --- taca/element/Element_Runs.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f74a30ee..be0fce49 100644 --- 
a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -988,9 +988,10 @@ def aggregate_stats_assigned(self, demux_runmanifest): return aggregated_assigned_indexes_filtered_sorted - # Aggregate stats in UnassignedSequences.csv - def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_indexes_filtered_sorted): + def aggregate_stats_unassigned( + self, demux_runmanifest, aggregated_assigned_indexes_filtered_sorted + ): aggregated_unassigned_indexes = [] lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: @@ -1121,7 +1122,10 @@ def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_inde else: # When there is no RunManifest uploaded at the sequencer, the lane numbers will all be 0 # In this case we assume that the lanes are ordered by their numbers - if all(lane_stats["Lane"] == 0 for lane_stats in aviti_runstats_json["LaneStats"]): + if all( + lane_stats["Lane"] == 0 + for lane_stats in aviti_runstats_json["LaneStats"] + ): lane_counter = 1 for lane_stats in aviti_runstats_json["LaneStats"]: pfcount_lane[str(lane_counter)] = float(lane_stats["PFCount"]) @@ -1129,12 +1133,14 @@ def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_inde # Otherwise we parse the PF counts by matching the lane numbers else: for lane_stats in aviti_runstats_json["LaneStats"]: - pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"]) + pfcount_lane[str(lane_stats["Lane"])] = float( + lane_stats["PFCount"] + ) # Prepare the dict for pf assigned coutn for each lane pf_assigned_lane = {} for sample in aggregated_assigned_indexes_filtered_sorted: - lane = sample['Lane'] - num_polonies_assigned = int(sample['NumPoloniesAssigned']) + lane = sample["Lane"] + num_polonies_assigned = int(sample["NumPoloniesAssigned"]) if lane in pf_assigned_lane: pf_assigned_lane[lane] += num_polonies_assigned else: @@ -1151,7 +1157,10 @@ def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_inde if pf_assigned_lane.get(unassigned_index["Lane"]): unassigned_index["% Unassigned"] = ( float(unassigned_index["Count"]) - / (pfcount_lane[unassigned_index["Lane"]] - pf_assigned_lane[unassigned_index["Lane"]]) + / ( + pfcount_lane[unassigned_index["Lane"]] + - pf_assigned_lane[unassigned_index["Lane"]] + ) * 100 ) else: @@ -1182,9 +1191,13 @@ def aggregate_demux_results(self, demux_results_dirs): # Symlink the output FastQ files of undet only if a lane does not have multiple demux self.aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - aggregated_assigned_indexes_filtered_sorted = self.aggregate_stats_assigned(demux_runmanifest) + aggregated_assigned_indexes_filtered_sorted = self.aggregate_stats_assigned( + demux_runmanifest + ) # Aggregate stats in UnassignedSequences.csv - self.aggregate_stats_unassigned(demux_runmanifest, aggregated_assigned_indexes_filtered_sorted) + self.aggregate_stats_unassigned( + demux_runmanifest, aggregated_assigned_indexes_filtered_sorted + ) def sync_metadata(self): files_to_copy = [ From 9e8b375bce9facac8491d19307e3d44f0b973872 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 15:10:25 +0200 Subject: [PATCH 137/187] fix tests --- tests/element/test_Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 63e8fe89..eb4a714c 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -31,7 
+31,7 @@ def create_element_run_dir( outcome_completed: bool = True, sync_finished: bool = True, demux_dir: bool = False, - n_demux_subdirs: int = 0, + n_demux_subdirs: int = 2, demux_done: bool = False, nosync: bool = False, ) -> str: From bd6ac211acebe8ba7b9f0a93908d25e86cb7d1f8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 15:22:45 +0200 Subject: [PATCH 138/187] stop mocks --- tests/analysis/test_analysis_element.py | 4 +--- tests/analysis/test_analysis_nanopore.py | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 41e7f305..bc3bdd8b 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -30,6 +30,4 @@ def test_run_preprocessing(create_dirs): to_test.run_preprocessing(run_dir) # Stop mocks - mock_config.stop() - mock_db.stop() - mock_subprocess.stop() + patch.stopall() diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index 01fec070..bd4499ca 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -147,3 +147,6 @@ def side_effect(*args, **kwargs): # Start testing analysis_nanopore.ont_transfer(run_abspath=None, qc=False) + + # Stop mocks + patch.stopall() From 9b1a4953799e27f9fffc858cc17c90c8158ee933 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 11 Oct 2024 15:49:12 +0200 Subject: [PATCH 139/187] Refactor based on comments from JoA; Fix VERSIONLOG --- VERSIONLOG.md | 4 ++++ taca/element/Element_Runs.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 80a77861..2b5fed5c 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 20241011.1 + +Fix issue with 0 lane number; Add percentage of unassigned in total unassigned per lane + ## 20241008.1 Add support for processing Element Aviti data diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index be0fce49..4473b9a8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1136,7 +1136,7 @@ def aggregate_stats_unassigned( pfcount_lane[str(lane_stats["Lane"])] = float( lane_stats["PFCount"] ) - # Prepare the dict for pf assigned coutn for each lane + # Prepare the dict for pf assigned count for each lane pf_assigned_lane = {} for sample in aggregated_assigned_indexes_filtered_sorted: lane = sample["Lane"] From 81b91ea4922f88231a5119a8557a4807f18622b3 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 16:05:36 +0200 Subject: [PATCH 140/187] fix faulty annotation --- tests/analysis/test_analysis_nanopore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index bd4499ca..e06059cc 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -15,7 +15,7 @@ ) -def build_run_properties() -> dict: +def build_run_properties() -> list[dict]: """In order to parametrize the test in a comprehensive way, the parametrization is tabulated as a string here. 
""" From fb9218903f782738a8938d9c816f3390b460b83e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 16:13:21 +0200 Subject: [PATCH 141/187] parametrization --- tests/analysis/test_analysis_element.py | 33 +++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index bc3bdd8b..8f12bf10 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -1,10 +1,39 @@ +from io import StringIO from tempfile import TemporaryDirectory from unittest.mock import patch +import pandas as pd +import pytest + from tests.element.test_Element_Runs import create_element_run_dir, get_config -def test_run_preprocessing(create_dirs): +def get_run_kwargs() -> list[dict]: + parameter_string_table = """ +lims_manifest run_finished outcome_completed sync_finished demux_dir demux_done nosync +False False False False False False False +True False False False False False False +True True False False False False False +True True True False False False False +True True True True False False False +True True True True True False False +True True True True True True False +True True True True True True True +""" + # Turn string table to datastream + data = StringIO(parameter_string_table) + + # Read data, trimming whitespace + df = pd.read_csv(data, sep=r"\s+") + + # Compile into list of parameters to use + run_kwargs = df.to_dict(orient="records") + + return run_kwargs + + +@pytest.mark.parametrize("run_kwargs", get_run_kwargs()) +def test_run_preprocessing(create_dirs, run_kwargs): tmp: TemporaryDirectory = create_dirs # Mock config @@ -21,7 +50,7 @@ def test_run_preprocessing(create_dirs): mock_subprocess.start() # Create run dir and associated LIMS manifest - run_dir = create_element_run_dir(tmp=tmp) + run_dir = create_element_run_dir(tmp=tmp, **run_kwargs) # Import module to test from taca.analysis import analysis_element as to_test From 47e912d27ab9fe3b21a0b0d816608d8e03c62e9e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 16:53:40 +0200 Subject: [PATCH 142/187] test polishing --- tests/analysis/test_analysis_element.py | 35 ++++++++++++++----------- tests/element/test_Element_Runs.py | 24 +++++++++++------ 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 8f12bf10..4fb5acbe 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -8,31 +8,34 @@ from tests.element.test_Element_Runs import create_element_run_dir, get_config -def get_run_kwargs() -> list[dict]: - parameter_string_table = """ -lims_manifest run_finished outcome_completed sync_finished demux_dir demux_done nosync -False False False False False False False -True False False False False False False -True True False False False False False -True True True False False False False -True True True True False False False -True True True True True False False -True True True True True True False -True True True True True True True -""" +def parametrize_testruns() -> list[dict]: + """Helper function to build test parametrization from a friendly string table.""" + + testrun_descs = ["ready to demux"] + + kwarg_table = """ + lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync + True True True True False False False None False + """ + # Turn string table to datastream - 
data = StringIO(parameter_string_table) + data = StringIO(kwarg_table) # Read data, trimming whitespace df = pd.read_csv(data, sep=r"\s+") # Compile into list of parameters to use - run_kwargs = df.to_dict(orient="records") + testrun_kwargs = df.to_dict(orient="records") + + assert len(testrun_descs) == len(testrun_kwargs) + + return testrun_kwargs, testrun_descs + - return run_kwargs +testrun_kwargs, testrun_descs = parametrize_testruns() -@pytest.mark.parametrize("run_kwargs", get_run_kwargs()) +@pytest.mark.parametrize("run_kwargs", testrun_kwargs, ids=testrun_descs) def test_run_preprocessing(create_dirs, run_kwargs): tmp: TemporaryDirectory = create_dirs diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index eb4a714c..7ba86e9f 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -26,13 +26,15 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: def create_element_run_dir( tmp: tempfile.TemporaryDirectory, run_name: str = "20240926_AV242106_A2349523513", - lims_manifest: bool = True, - run_finished: bool = True, - outcome_completed: bool = True, - sync_finished: bool = True, + metadata_files: bool = False, + lims_manifest: bool = False, + run_finished: bool = False, + outcome_completed: bool = False, demux_dir: bool = False, n_demux_subdirs: int = 2, demux_done: bool = False, + rsync_ongoing: bool = False, + rsync_exit_status: int | None = None, nosync: bool = False, ) -> str: """ @@ -134,7 +136,7 @@ def create_element_run_dir( os.remove(csv_path) # Populate run dir with files and folders - if run_finished: + if metadata_files: with open(f"{run_path}/RunManifest.json", "w") as stream: stream.write("""{ "KitConfiguration": { @@ -397,6 +399,8 @@ def create_element_run_dir( } } """) + + if run_finished: with open(f"{run_path}/RunUploaded.json", "w") as stream: outcome = "OutcomeCompleted" if outcome_completed else "OutcomeFailed" stream.write( @@ -411,8 +415,12 @@ def create_element_run_dir( + "}" ) - if sync_finished: - open(f"{run_path}/.sync_finished", "w").close() + if rsync_ongoing: + open(f"{run_path}/.rsync_ongoing", "w").close() + + if rsync_exit_status is not None: + with open(f"{run_path}/.rsync_exit_status", "w") as stream: + stream.write(str(rsync_exit_status)) if demux_dir: os.mkdir(os.path.join(run_path, "Demultiplexing")) @@ -469,7 +477,7 @@ def test_check_sequencing_status( run = to_test.Run( create_element_run_dir( tmp, - run_finished=p["run_finished"], + metadata_files=p["run_finished"], outcome_completed=p["outcome_completed"], ), get_config(tmp), From b16423bcc6f985d1ab172422a1e4b523c9f9b40c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:02:03 +0200 Subject: [PATCH 143/187] test troubleshooting --- tests/element/test_Element_Runs.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7ba86e9f..f0e3e0c6 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -460,9 +460,24 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): @pytest.mark.parametrize( "p", [ - {"run_finished": True, "outcome_completed": True, "expected": True}, - {"run_finished": True, "outcome_completed": False, "expected": False}, - {"run_finished": False, "outcome_completed": False, "expected": False}, + { + "run_finished": True, + "metadata_files": True, + "outcome_completed": True, + "expected": True, + }, + { + "run_finished": 
True, + "metadata_files": True, + "outcome_completed": False, + "expected": False, + }, + { + "run_finished": False, + "metadata_files ": False, + "outcome_completed": False, + "expected": False, + }, ], ids=["success", "failure", "ongoing"], ) From 81dcc40ae2ef4ea03f92941d1484c3bc4021f72b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:11:22 +0200 Subject: [PATCH 144/187] all tests functional --- tests/element/test_Element_Runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index f0e3e0c6..2accd282 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -474,7 +474,7 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): }, { "run_finished": False, - "metadata_files ": False, + "metadata_files": False, "outcome_completed": False, "expected": False, }, @@ -489,15 +489,15 @@ def test_check_sequencing_status( ): tmp: tempfile.TemporaryDirectory = create_dirs + expected_outcome = p.pop("expected") run = to_test.Run( create_element_run_dir( tmp, - metadata_files=p["run_finished"], - outcome_completed=p["outcome_completed"], + **p, ), get_config(tmp), ) - assert run.check_sequencing_status() is p["expected"] + assert run.check_sequencing_status() is expected_outcome @pytest.mark.parametrize( "p", From b0593da06117e85033f7df794d229793f65e5a9f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:15:47 +0200 Subject: [PATCH 145/187] add some parametrization --- tests/analysis/test_analysis_element.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 4fb5acbe..1657667f 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -11,11 +11,12 @@ def parametrize_testruns() -> list[dict]: """Helper function to build test parametrization from a friendly string table.""" - testrun_descs = ["ready to demux"] + testrun_descs = ["ready to demux", "demux_ongoing"] kwarg_table = """ lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync True True True True False False False None False + True True True True True False False None False """ # Turn string table to datastream From b290bb53c8904655398046857161c3370322cd94 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:23:24 +0200 Subject: [PATCH 146/187] mypy --- tests/analysis/test_analysis_element.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 1657667f..96a3817e 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -8,10 +8,10 @@ from tests.element.test_Element_Runs import create_element_run_dir, get_config -def parametrize_testruns() -> list[dict]: +def parametrize_testruns() -> tuple[list[dict], list[str]]: """Helper function to build test parametrization from a friendly string table.""" - testrun_descs = ["ready to demux", "demux_ongoing"] + testrun_descs: list[str] = ["ready to demux", "demux_ongoing"] kwarg_table = """ lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync @@ -26,7 +26,7 @@ def parametrize_testruns() -> list[dict]: df = pd.read_csv(data, sep=r"\s+") # Compile into list of parameters to use - 
testrun_kwargs = df.to_dict(orient="records") + testrun_kwargs: list[dict] = df.to_dict(orient="records") assert len(testrun_descs) == len(testrun_kwargs) From 35248bd33d5332f3d94f369ed7b414e1e77d77e1 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:24:04 +0200 Subject: [PATCH 147/187] remove empty, skipped tests --- tests/element/test_Element_Runs.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 2accd282..f558c194 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -524,10 +524,6 @@ def test_get_demultiplexing_status( assert run.get_demultiplexing_status() == p["expected"] - @pytest.mark.skip(reason="Not implemented yet") - def test_generate_demux_command(self, mock_db): - pass - def test_start_demux(self, mock_db, create_dirs): tmp: tempfile.TemporaryDirectory = create_dirs with mock.patch("subprocess.Popen") as mock_Popen, mock.patch( @@ -538,11 +534,3 @@ def test_start_demux(self, mock_db, create_dirs): run.start_demux("mock_run_manifest", "mock_demux_dir") mock_command.assert_called_once_with("mock_run_manifest", "mock_demux_dir") mock_Popen.assert_called_once() - - @pytest.mark.skip(reason="Not implemented yet") - def test_is_transferred(self, mock_db, create_dirs): - pass - - @pytest.mark.skip(reason="Not implemented yet") - def test_parse_rundir(self, mock_db, create_dirs): - pass From bb888bcd7f206911b0b299d01510f1e4e6872cb1 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 14 Oct 2024 11:46:39 +0200 Subject: [PATCH 148/187] Reconfigure pytest arguments w / wo CI --- .github/workflows/test-code.yml | 4 ++-- pyproject.toml | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index 1ff360e3..eec2d5fb 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -21,8 +21,8 @@ jobs: - name: Install TACA run: pip install -e . 
- name: pytest - # Options are configured in pyproject.toml - run: pytest --cov=genologics --cov-report=xml + # Default options are configured in pyproject.toml + run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv - name: CodeCov uses: codecov/codecov-action@v4 with: diff --git a/pyproject.toml b/pyproject.toml index d5d152b2..15c6c907 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,10 +38,6 @@ filterwarnings = [ # Default addopts addopts = "--ignore tests_old/" -# CLI coverage reports, messes with IDE debugging -# pytest --ignore tests_old/ --cov=./taca --cov-report term-missing -vv - - [tool.coverage.run] # The comment "# pragma: no cover" can be used to exclude a line from coverage source = ["taca"] From fd54a125cbab15caf397a1e97b40954399fa78a1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 15 Oct 2024 14:00:42 +0200 Subject: [PATCH 149/187] Small fixes --- taca/element/Element_Runs.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 4473b9a8..02ca8810 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -270,9 +270,9 @@ def to_doc_obj(self): demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command") if os.path.exists(demux_command_file): with open(demux_command_file) as command_file: - demux_command = command_file.readlines()[0] + demux_commands = command_file.readlines() else: - demux_command = None + demux_commands = None demux_version_file = os.path.join( self.run_dir, "Demultiplexing_0", "RunStats.json" ) @@ -286,7 +286,7 @@ def to_doc_obj(self): software_info = { "Version": demux_version, "bin": self.CONFIG.get("element_analysis").get("bases2fastq"), - "options": demux_command, + "options": demux_commands, } doc_obj = { @@ -580,9 +580,10 @@ def make_demux_manifests( raise AssertionError("Both I1 and I2 appear to contain UMIs.") # Unpack settings from LIMS manifest - for kv in settings.split(" "): - k, v = kv.split(":") - settings_kvs[k] = v + if settings: + for kv in settings.split(" "): + k, v = kv.split(":") + settings_kvs[k] = v settings_section = "\n".join( [ @@ -641,7 +642,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --force-index-orientation" ) with open( - os.path.join(self.run_dir, ".bases2fastq_command"), "w" + os.path.join(self.run_dir, ".bases2fastq_command"), "a" ) as command_file: command_file.write(command) return command From 5f46a2fdb86119372097d403af58cc1cab822f6f Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Wed, 16 Oct 2024 10:01:54 +0200 Subject: [PATCH 150/187] Fix bug with empty aggregated_unassigned_indexes --- taca/element/Element_Runs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 02ca8810..afeb2777 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1174,10 +1174,11 @@ def aggregate_stats_unassigned( ) # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_unassigned_csv = os.path.join( - self.run_dir, self.demux_dir, "UnassignedSequences.csv" - ) - self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) + if aggregated_unassigned_indexes: + aggregated_unassigned_csv = os.path.join( + self.run_dir, self.demux_dir, "UnassignedSequences.csv" + ) + self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) # Aggregate demux results def aggregate_demux_results(self, 
demux_results_dirs): From 340de287301c7a7b5e1dbe86cd7016cfdafba30e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Wed, 16 Oct 2024 10:36:54 +0200 Subject: [PATCH 151/187] Fix wrong logic for collecting unassigned indexes --- VERSIONLOG.md | 4 ++++ taca/element/Element_Runs.py | 30 ++++++++---------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 2b5fed5c..2e6d992b 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 20241016.1 + +Fix wrong logic for collecting unassigned indexes + ## 20241011.1 Fix issue with 0 lane number; Add percentage of unassigned in total unassigned per lane diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index afeb2777..fb2ce2b2 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1040,25 +1040,11 @@ def aggregate_stats_unassigned( # Order: from longer to shorter indexes sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: - unassigned_csv = os.path.join( - self.run_dir, - f"Demultiplexing_{sub_demux}", - "UnassignedSequences.csv", - ) - if os.path.exists(unassigned_csv): - with open(unassigned_csv) as unassigned_file: - reader = csv.DictReader(unassigned_file) - unassigned_indexes = [row for row in reader] - else: - logger.warning( - f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}." - ) - continue - # Filter by lane - unassigned_indexes = [ - unassigned_index - for unassigned_index in unassigned_indexes - if unassigned_index["Lane"] == lane + sub_demux_assigned_indexes = [ + sub_demux_assigned_index + for sub_demux_assigned_index in aggregated_assigned_indexes_filtered_sorted + if sub_demux_assigned_index["sub_demux_count"] == sub_demux + and sub_demux_assigned_index["Lane"] == lane ] # Remove overlapped indexes from the list of max_unassigned_indexes idx1_overlapped_len = min( @@ -1085,11 +1071,11 @@ def aggregate_stats_unassigned( if demux_lens_pair[0] == sub_demux_with_max_index_lens ][0][1], ) - for unassigned_index in unassigned_indexes: - idx1_overlapped_seq = unassigned_index["I1"][ + for sub_demux_assigned_index in sub_demux_assigned_indexes: + idx1_overlapped_seq = sub_demux_assigned_index["I1"][ :idx1_overlapped_len ] - idx2_overlapped_seq = unassigned_index["I2"][ + idx2_overlapped_seq = sub_demux_assigned_index["I2"][ :idx2_overlapped_len ] # Remove the overlapped record from the max_unassigned_indexes list From 8134461c45ba0a86756466eebad8ad7da9c7bc1e Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 16 Oct 2024 13:22:30 +0200 Subject: [PATCH 152/187] Warn about missing files --- taca/element/Element_Runs.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index fb2ce2b2..28dc81da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -650,7 +650,7 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) - stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" + stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" #TODO: individual files for each sub-demux try: with open(stderr_abspath, "w") as stderr: process = subprocess.Popen( @@ -1198,8 +1198,11 @@ def sync_metadata(self): dest = os.path.join(metadata_archive, self.NGI_run_id) if not 
os.path.exists(dest): os.makedirs(dest) - for f in files_to_copy: - shutil.copy(f, dest) + for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case + if os.path.exists(f): + shutil.copy(f, dest) + else: + logger.warning(f"File {f} missing for run {self.run}") def make_transfer_indicator(self): transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") From ad2d01515b731b4b798559ea1bf9d74c4f608ac3 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 15:20:21 +0200 Subject: [PATCH 153/187] modernize ont test parametrization --- tests/analysis/test_analysis_nanopore.py | 80 ++++++++++-------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index e06059cc..142d1a38 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -15,70 +15,58 @@ ) -def build_run_properties() -> list[dict]: +def parametrize_testruns() -> list[dict]: """In order to parametrize the test in a comprehensive way, the parametrization is tabulated as a string here. """ - col_names = [ - "instrument", - "qc", - "run_finished", - "sync_finished", - "raw_dirs", - "fastq_dirs", - "barcode_dirs", - "anglerfish_samplesheets", - "anglerfish_ongoing", - "anglerfish_exit", - ] - parameter_string_table = """ - promethion False False False False False False False False NA - promethion False True False False False False False False NA - promethion False True True False False False False False NA - promethion False True True True False False False False NA - promethion False True True True True False False False NA - promethion False True True True True True False False NA - minion False False False False False False False False NA - minion False True False False False False False False NA - minion False True True False False False False False NA - minion False True True True False False False False NA - minion False True True True True False False False NA - minion False True True True True True False False NA - minion True False False False False False False False NA - minion True True False False False False False False NA - minion True True True False False False False False NA - minion True True True True False False False False NA - minion True True True True True False False False NA - minion True True True True True True False False NA - minion True True True True True True True False NA - minion True True True True True True True True NA - minion True True True True True True True False 0 + desc instrument qc run_finished sync_finished raw_dirs fastq_dirs barcode_dirs anglerfish_samplesheets anglerfish_ongoing anglerfish_exit + prom_ongoing promethion False False False False False False False False NA + prom_done promethion False True False False False False False False NA + prom_synced promethion False True True False False False False False NA + prom_reads promethion False True True True False False False False NA + prom_fastq promethion False True True True True False False False NA + prom_bcs promethion False True True True True True False False NA + min_ongoing minion False False False False False False False False NA + min_done minion False True False False False False False False NA + min_synced minion False True True False False False False False NA + min_reads minion False True True True False False False False NA + min_fastq minion False True True True True False False False NA + min_bcs minion False True True True True True False False 
NA + min_qc_ongoing minion True False False False False False False False NA + min_qc_done minion True True False False False False False False NA + min_qc_synced minion True True True False False False False False NA + min_qc_reads minion True True True True False False False False NA + min_qc_fastq minion True True True True True False False False NA + min_qc_bcs minion True True True True True True False False NA + min_qc_ang_ss minion True True True True True True True False NA + min_qc_ang_run minion True True True True True True True True NA + min_qc_ang_done minion True True True True True True True False 0 """ + # Turn string table to datastream data = StringIO(parameter_string_table) # Read data, trimming whitespace - df = pd.read_csv(data, header=None, sep=r"\s+") - assert len(df.columns) == len(col_names) - df.columns = col_names + df = pd.read_csv(data, sep=r"\s+") # Replace nan(s) with None(s) df = df.replace(np.nan, None) - # Convert to dict - run_properties = df.to_dict("records") + # Drop the "desc" column and retain it as a list + testrun_descs = df.pop("desc").tolist() + + # Compile into list of parameters to use + testrun_kwargs: list[dict] = df.to_dict(orient="records") + + return testrun_kwargs, testrun_descs - # Convert float exit codes to ints - for d in run_properties: - if d["anglerfish_exit"] == 0.0: - d["anglerfish_exit"] = int(d["anglerfish_exit"]) - return run_properties +testrun_kwargs, testrun_descs = parametrize_testruns() -@pytest.mark.parametrize("run_properties", build_run_properties()) +@pytest.mark.parametrize("run_properties", testrun_kwargs, ids=testrun_descs) def test_ont_transfer(create_dirs, run_properties, caplog): """Test the "taca analaysis ont-transfer" subcommand automation from start to finish for a variety of runs. 
From 554d38a2ef650fac8d97f921961df8b214430868 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 15:20:47 +0200 Subject: [PATCH 154/187] remove rsync options causing vscode pytest crash due to outdated rsync version --- tests/nanopore/test_ONT_run_classes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/nanopore/test_ONT_run_classes.py b/tests/nanopore/test_ONT_run_classes.py index a91a2f97..99cd558f 100644 --- a/tests/nanopore/test_ONT_run_classes.py +++ b/tests/nanopore/test_ONT_run_classes.py @@ -52,8 +52,6 @@ def make_ONT_test_config(tmp: tempfile.TemporaryDirectory) -> dict: minknow_reports_dir: {tmp.name}/minknow_reports/ rsync_options: '-Lav': None - '--chown': ':ngi2016003' - '--chmod': 'Dg+s,g+rw' '-r': None '--exclude': ['work']""" From 4b2f8d8d5a1d268f5ef7804a809002661e7ea842 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:33:46 +0200 Subject: [PATCH 155/187] add mock mail to config and overwrite functionality to element dir --- tests/element/test_Element_Runs.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index f558c194..7a54bd56 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -18,6 +18,9 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: }, }, }, + "mail": { + "recipients": ["mock@mock.com"], + }, "statusdb": {}, } return config @@ -25,6 +28,7 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: def create_element_run_dir( tmp: tempfile.TemporaryDirectory, + overwrite: bool = False, run_name: str = "20240926_AV242106_A2349523513", metadata_files: bool = False, lims_manifest: bool = False, @@ -62,6 +66,11 @@ def create_element_run_dir( run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/nosync/{run_name}" else: run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" + if os.path.exists(run_path): + if overwrite: + os.rmdir(run_path) + else: + raise FileExistsError(f"Directory {run_path} already exists.") os.mkdir(run_path) # Create LIMS manifest From 70d51bcb57cf6b972341ab423c438563da21fe40 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:35:43 +0200 Subject: [PATCH 156/187] Add conftest fixture for logging when testing. Send to stdout and specified log file in tempdir. 
--- tests/conftest.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index a4945938..8b53b4e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import logging import os import shutil import tempfile @@ -115,3 +116,46 @@ def create_dirs(): yield tmp tmp.cleanup() + + +@pytest.fixture(autouse=True) +def configure_logging(create_dirs): + """Configure logging for the entire test session.""" + + # Use fixture + tmp = create_dirs + + # Specify log file path + log_file = os.path.join(tmp.name, "log", "taca.log") + assert os.path.exists(log_file) + + # Get the root logger + logger = logging.getLogger() + + # Clear any existing handlers to avoid duplicate logs + if logger.hasHandlers(): + logger.handlers.clear() + + # Configure logging + file_handler = logging.FileHandler(log_file) + stream_handler = logging.StreamHandler() + + # Set a common formatter + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + file_handler.setFormatter(formatter) + stream_handler.setFormatter(formatter) + + # Add handlers to the root logger + logger.addHandler(file_handler) + logger.addHandler(stream_handler) + + # Set log level + logger.setLevel(logging.INFO) + + # Log to confirm the logger is working + logger.info(f"Logging is set up. Logs will be stored in {log_file}.") + + # Return the log file path to use in tests if needed + return log_file From 825182e77ecac7dfb777b3b1a57d9d981936c601 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:35:56 +0200 Subject: [PATCH 157/187] start work on incremental test function --- tests/analysis/test_analysis_element.py | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 96a3817e..760ab1be 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -1,3 +1,4 @@ +import logging from io import StringIO from tempfile import TemporaryDirectory from unittest.mock import patch @@ -64,3 +65,53 @@ def test_run_preprocessing(create_dirs, run_kwargs): # Stop mocks patch.stopall() + + +def test_incremental(create_dirs, caplog): + # Create tempdir + tmp: TemporaryDirectory = create_dirs + + # Capture log + caplog.at_level(logging.INFO) + + # Mock config + config = get_config(tmp) + mock_config = patch("taca.utils.config.CONFIG", new=config) + mock_config.start() + + # Mock DB + mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") + mock_db.start() + + # Mock send mail + mock_mail = patch("taca.analysis.analysis_element.send_mail").start() + + # Mock subprocess + mock_subprocess = patch("subprocess.Popen") + mock_subprocess.start() + + # Import module to test + from taca.analysis import analysis_element as to_test + + # Test: Empty dir, should raise error and send mail + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=False, + metadata_files=False, + run_finished=False, + outcome_completed=False, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + with pytest.raises(FileNotFoundError): + to_test.run_preprocessing(run_dir) + + mock_mail.assert_called_once() + assert "Run parameters file not found" in caplog.text + + # Stop mocks + patch.stopall() From f69beea89bc76ee9b444575b9b08e0e45d69dc06 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:58:46 +0200 Subject: [PATCH 
158/187] add package for checking dir hashes, add 2nd test increment --- requirements-dev.txt | 16 +++++++------- tests/analysis/test_analysis_element.py | 28 ++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 0ef1b795..8126d039 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,13 +1,13 @@ -r requirements.txt - -nose +dirhash +ipdb +ipython mock -sphinx -sphinx-rtd-theme +mypy +nose +pipreqs pytest pytest-cov -ipython -ipdb ruff -mypy -pipreqs +sphinx +sphinx-rtd-theme diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 760ab1be..4b924ef8 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from dirhash import dirhash from tests.element.test_Element_Runs import create_element_run_dir, get_config @@ -93,7 +94,9 @@ def test_incremental(create_dirs, caplog): # Import module to test from taca.analysis import analysis_element as to_test - # Test: Empty dir, should raise error and send mail + ### Test: Empty dir, should raise error and send mail + + # Create dir run_dir = create_element_run_dir( tmp=tmp, lims_manifest=False, @@ -107,11 +110,34 @@ def test_incremental(create_dirs, caplog): nosync=False, ) + # Run code (1) with pytest.raises(FileNotFoundError): to_test.run_preprocessing(run_dir) + # Assertions mock_mail.assert_called_once() assert "Run parameters file not found" in caplog.text + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + overwrite=True, + lims_manifest=False, + metadata_files=True, + run_finished=False, + outcome_completed=False, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + # Run code (2) with snapshots + before = dirhash(run_dir, "md5") + to_test.run_preprocessing(run_dir) + after = dirhash(run_dir, "md5") + assert before == after + # Stop mocks patch.stopall() From f383665f4aa2ba0b90ef2e321bdd6830845abbf7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:09:30 +0200 Subject: [PATCH 159/187] suspected bug fix --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 915fcfa7..984c2a7c 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -39,7 +39,7 @@ def _process(run): sequencing_done = run.check_sequencing_status() if not sequencing_done: run.status = "sequencing" - if run.status_changed: + if run.status_changed(): run.update_statusdb() return From f659fa6f06f6f2264efad5c58e44e372482f1978 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:16:45 +0200 Subject: [PATCH 160/187] start working on nicer tests --- tests/analysis/test_analysis_element.py | 34 +++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 4b924ef8..b05109c8 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -68,7 +68,8 @@ def test_run_preprocessing(create_dirs, run_kwargs): patch.stopall() -def test_incremental(create_dirs, caplog): +@pytest.fixture +def aviti_fixture(create_dirs, caplog): # Create tempdir tmp: TemporaryDirectory = create_dirs @@ -77,12 +78,10 @@ def test_incremental(create_dirs, caplog): # Mock 
config config = get_config(tmp) - mock_config = patch("taca.utils.config.CONFIG", new=config) - mock_config.start() + mock_config = patch("taca.utils.config.CONFIG", new=config).start() # Mock DB - mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") - mock_db.start() + mock_db = patch("taca.element.Element_Runs.ElementRunsConnection").start() # Mock send mail mock_mail = patch("taca.analysis.analysis_element.send_mail").start() @@ -94,7 +93,16 @@ def test_incremental(create_dirs, caplog): # Import module to test from taca.analysis import analysis_element as to_test - ### Test: Empty dir, should raise error and send mail + # Yield fixtures + yield to_test, tmp, mock_mail, mock_db, caplog + + # Stop mocks + patch.stopall() + + +def test_process_empty_dir(aviti_fixture): + to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture + """Should raise FileNotFoundError when no files are present in the run dir and send mail.""" # Create dir run_dir = create_element_run_dir( @@ -110,7 +118,6 @@ def test_incremental(create_dirs, caplog): nosync=False, ) - # Run code (1) with pytest.raises(FileNotFoundError): to_test.run_preprocessing(run_dir) @@ -118,6 +125,10 @@ def test_incremental(create_dirs, caplog): mock_mail.assert_called_once() assert "Run parameters file not found" in caplog.text + +def test_process_dir_metadata(aviti_fixture): + to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture + # Add metadata files run_dir = create_element_run_dir( tmp=tmp, @@ -133,11 +144,8 @@ def test_incremental(create_dirs, caplog): nosync=False, ) - # Run code (2) with snapshots - before = dirhash(run_dir, "md5") to_test.run_preprocessing(run_dir) - after = dirhash(run_dir, "md5") - assert before == after - # Stop mocks - patch.stopall() + assert mock_db.upload_to_statusdb.called + + print(caplog.text) From 2ef5bae5f4a0f5c281e4b1029515dd9d1f3a24ff Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:17:13 +0200 Subject: [PATCH 161/187] syntax fix --- tests/analysis/test_analysis_element.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index b05109c8..18408143 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -87,8 +87,7 @@ def aviti_fixture(create_dirs, caplog): mock_mail = patch("taca.analysis.analysis_element.send_mail").start() # Mock subprocess - mock_subprocess = patch("subprocess.Popen") - mock_subprocess.start() + mock_subprocess = patch("subprocess.Popen").start() # Import module to test from taca.analysis import analysis_element as to_test From 6db637889e1cbe21c11a0dca70ad32164b321743 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:47:02 +0200 Subject: [PATCH 162/187] add db mock assertions! 
--- tests/analysis/test_analysis_element.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 18408143..2b3ca0fe 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -81,7 +81,10 @@ def aviti_fixture(create_dirs, caplog): mock_config = patch("taca.utils.config.CONFIG", new=config).start() # Mock DB - mock_db = patch("taca.element.Element_Runs.ElementRunsConnection").start() + mock_db = patch( + "taca.element.Element_Runs.ElementRunsConnection", autospec=True + ).start() + print("BOOYAH", mock_db) # Mock send mail mock_mail = patch("taca.analysis.analysis_element.send_mail").start() @@ -128,6 +131,10 @@ def test_process_empty_dir(aviti_fixture): def test_process_dir_metadata(aviti_fixture): to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture + # Sub-mock configuration + mock_db.return_value.check_db_run_status.return_value = "ongoing" + mock_db.return_value.upload_to_statusdb.return_value = None + # Add metadata files run_dir = create_element_run_dir( tmp=tmp, @@ -145,6 +152,6 @@ def test_process_dir_metadata(aviti_fixture): to_test.run_preprocessing(run_dir) - assert mock_db.upload_to_statusdb.called + assert mock_db.return_value.upload_to_statusdb.called print(caplog.text) From 4b649ebdfb0a3206e12b9324cacc6fcea6561aa3 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:04:43 +0200 Subject: [PATCH 163/187] use dict of mocks for flexibility --- tests/analysis/test_analysis_element.py | 45 +++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 2b3ca0fe..c7cc81e0 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -76,34 +76,28 @@ def aviti_fixture(create_dirs, caplog): # Capture log caplog.at_level(logging.INFO) - # Mock config - config = get_config(tmp) - mock_config = patch("taca.utils.config.CONFIG", new=config).start() - - # Mock DB - mock_db = patch( - "taca.element.Element_Runs.ElementRunsConnection", autospec=True - ).start() - print("BOOYAH", mock_db) - - # Mock send mail - mock_mail = patch("taca.analysis.analysis_element.send_mail").start() - - # Mock subprocess - mock_subprocess = patch("subprocess.Popen").start() + # Mocks + mocks = { + "mock_config": patch("taca.utils.config.CONFIG", new=get_config(tmp)).start(), + "mock_db": patch( + "taca.element.Element_Runs.ElementRunsConnection", autospec=True + ).start(), + "mock_mail": patch("taca.analysis.analysis_element.send_mail").start(), + "mock_subprocess": patch("subprocess.Popen").start(), + } # Import module to test from taca.analysis import analysis_element as to_test # Yield fixtures - yield to_test, tmp, mock_mail, mock_db, caplog + yield to_test, tmp, caplog, mocks # Stop mocks patch.stopall() -def test_process_empty_dir(aviti_fixture): - to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture +def test_process_on_empty_dir(aviti_fixture): + to_test, tmp, caplog, mocks = aviti_fixture """Should raise FileNotFoundError when no files are present in the run dir and send mail.""" # Create dir @@ -124,16 +118,17 @@ def test_process_empty_dir(aviti_fixture): to_test.run_preprocessing(run_dir) # Assertions - mock_mail.assert_called_once() + mocks["mock_mail"].assert_called_once() assert "Run parameters file not found" in caplog.text -def test_process_dir_metadata(aviti_fixture): - 
to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture +def test_process_on_dir_w_metadata(aviti_fixture): + """Should update statusdb.""" + to_test, tmp, caplog, mocks = aviti_fixture # Sub-mock configuration - mock_db.return_value.check_db_run_status.return_value = "ongoing" - mock_db.return_value.upload_to_statusdb.return_value = None + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None # Add metadata files run_dir = create_element_run_dir( @@ -152,6 +147,4 @@ def test_process_dir_metadata(aviti_fixture): to_test.run_preprocessing(run_dir) - assert mock_db.return_value.upload_to_statusdb.called - - print(caplog.text) + assert mocks["mock_db"].return_value.upload_to_statusdb.called From ddfb36536c3ac667958b40eeefd01114336e5e4d Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:18:56 +0200 Subject: [PATCH 164/187] prep for merge --- tests/analysis/test_analysis_element.py | 55 ++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index c7cc81e0..939b132e 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -133,7 +133,6 @@ def test_process_on_dir_w_metadata(aviti_fixture): # Add metadata files run_dir = create_element_run_dir( tmp=tmp, - overwrite=True, lims_manifest=False, metadata_files=True, run_finished=False, @@ -148,3 +147,57 @@ def test_process_on_dir_w_metadata(aviti_fixture): to_test.run_preprocessing(run_dir) assert mocks["mock_db"].return_value.upload_to_statusdb.called + + +@pytest.skip("Not implemented") +def test_process_on_failed_run(aviti_fixture): + """""" + to_test, tmp, caplog, mocks = aviti_fixture + + # Sub-mock configuration + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None + + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=False, + metadata_files=True, + run_finished=True, + outcome_completed=False, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + to_test.run_preprocessing(run_dir) + + +def test_process_on_finished_run_wo_lims_manifest(aviti_fixture): + """Should fail to find LIMS run manifest and send mail.""" + to_test, tmp, caplog, mocks = aviti_fixture + + # Sub-mock configuration + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None + + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=False, + metadata_files=True, + run_finished=True, + outcome_completed=True, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + to_test.run_preprocessing(run_dir) + + assert "No manifest found for run" in caplog.text + mocks["mock_mail"].assert_called_once() From 3527588262e15ea58c1a25e8b8168bbab9156be8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:58:36 +0200 Subject: [PATCH 165/187] add mock bases2fastq executable --- tests/element/test_Element_Runs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7a54bd56..37a5c9a4 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -17,6 +17,7 @@ def 
get_config(tmp: tempfile.TemporaryDirectory) -> dict: "transfer_log": f"{tmp.name}/log/transfer_aviti.tsv", }, }, + "bases2fastq": "mock_bases2fastq_path", }, "mail": { "recipients": ["mock@mock.com"], From 8aef1c95b47eb5f32c03854fcddb5e95e3819b3c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:59:07 +0200 Subject: [PATCH 166/187] add demux test w assertions --- tests/analysis/test_analysis_element.py | 46 +++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 939b132e..7202c589 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -83,7 +83,7 @@ def aviti_fixture(create_dirs, caplog): "taca.element.Element_Runs.ElementRunsConnection", autospec=True ).start(), "mock_mail": patch("taca.analysis.analysis_element.send_mail").start(), - "mock_subprocess": patch("subprocess.Popen").start(), + "mock_popen": patch("subprocess.Popen").start(), } # Import module to test @@ -149,9 +149,8 @@ def test_process_on_dir_w_metadata(aviti_fixture): assert mocks["mock_db"].return_value.upload_to_statusdb.called -@pytest.skip("Not implemented") +@pytest.mark.skip("Currently a failed run is treated as an ongoing run.") def test_process_on_failed_run(aviti_fixture): - """""" to_test, tmp, caplog, mocks = aviti_fixture # Sub-mock configuration @@ -201,3 +200,44 @@ def test_process_on_finished_run_wo_lims_manifest(aviti_fixture): assert "No manifest found for run" in caplog.text mocks["mock_mail"].assert_called_once() + + +def test_process_on_finished_run(aviti_fixture): + """Should start demux.""" + to_test, tmp, caplog, mocks = aviti_fixture + + # Sub-mock configuration + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None + + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=True, + metadata_files=True, + run_finished=True, + outcome_completed=True, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + to_test.run_preprocessing(run_dir) + + expected_call = " ".join( + [ + "mock_bases2fastq_path", + f"{tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513", + f"{tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513/Demultiplexing_0", + "-p 8", + "--num-unassigned 500", + f"-r {tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513/20240926_AV242106_A2349523513_demux_0.csv", + "--legacy-fastq", + "--force-index-orientation", + ] + ) + assert mocks["mock_popen"].call_args.args[0] == expected_call + assert "Bases2Fastq conversion and demultiplexing started for run " in caplog.text + assert mocks["mock_db"].return_value.upload_to_statusdb.called From fa506dd317259ee973c3744a1e528e1557bf97bc Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 15:33:31 +0200 Subject: [PATCH 167/187] formatting --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 28dc81da..eaa700fe 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -650,7 +650,7 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) - stderr_abspath = 
f"{self.run_dir}/bases2fastq_stderr.txt" #TODO: individual files for each sub-demux + stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" # TODO: individual files for each sub-demux try: with open(stderr_abspath, "w") as stderr: process = subprocess.Popen( @@ -1198,7 +1198,7 @@ def sync_metadata(self): dest = os.path.join(metadata_archive, self.NGI_run_id) if not os.path.exists(dest): os.makedirs(dest) - for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case + for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case if os.path.exists(f): shutil.copy(f, dest) else: From e208826aa69379a8967e99bbee302a51b65fa594 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 16:54:30 +0200 Subject: [PATCH 168/187] IT'S ALIVE --- tests/analysis/test_analysis_element.py | 66 ++----------------------- 1 file changed, 5 insertions(+), 61 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 7202c589..b5f442ad 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -1,74 +1,15 @@ import logging -from io import StringIO +import sys from tempfile import TemporaryDirectory from unittest.mock import patch -import pandas as pd import pytest from dirhash import dirhash from tests.element.test_Element_Runs import create_element_run_dir, get_config -def parametrize_testruns() -> tuple[list[dict], list[str]]: - """Helper function to build test parametrization from a friendly string table.""" - - testrun_descs: list[str] = ["ready to demux", "demux_ongoing"] - - kwarg_table = """ - lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync - True True True True False False False None False - True True True True True False False None False - """ - - # Turn string table to datastream - data = StringIO(kwarg_table) - - # Read data, trimming whitespace - df = pd.read_csv(data, sep=r"\s+") - - # Compile into list of parameters to use - testrun_kwargs: list[dict] = df.to_dict(orient="records") - - assert len(testrun_descs) == len(testrun_kwargs) - - return testrun_kwargs, testrun_descs - - -testrun_kwargs, testrun_descs = parametrize_testruns() - - -@pytest.mark.parametrize("run_kwargs", testrun_kwargs, ids=testrun_descs) -def test_run_preprocessing(create_dirs, run_kwargs): - tmp: TemporaryDirectory = create_dirs - - # Mock config - config = get_config(tmp) - mock_config = patch("taca.utils.config.CONFIG", new=config) - mock_config.start() - - # Mock DB - mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") - mock_db.start() - - # Mock subprocess - mock_subprocess = patch("subprocess.Popen") - mock_subprocess.start() - - # Create run dir and associated LIMS manifest - run_dir = create_element_run_dir(tmp=tmp, **run_kwargs) - - # Import module to test - from taca.analysis import analysis_element as to_test - - # Test - to_test.run_preprocessing(run_dir) - - # Stop mocks - patch.stopall() - - -@pytest.fixture +@pytest.fixture() def aviti_fixture(create_dirs, caplog): # Create tempdir tmp: TemporaryDirectory = create_dirs @@ -95,6 +36,9 @@ def aviti_fixture(create_dirs, caplog): # Stop mocks patch.stopall() + # Purge module + del sys.modules["taca.analysis.analysis_element"] + def test_process_on_empty_dir(aviti_fixture): to_test, tmp, caplog, mocks = aviti_fixture From efffd89847106da8cadeb2d0597bb7524a0dc869 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 16:57:03 +0200 Subject: 
[PATCH 169/187] ruff fix --- tests/analysis/test_analysis_element.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index b5f442ad..905ca963 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -4,10 +4,11 @@ from unittest.mock import patch import pytest -from dirhash import dirhash from tests.element.test_Element_Runs import create_element_run_dir, get_config +# from dirhash import dirhash TODO this might be useful for validating dir tree snapshots + @pytest.fixture() def aviti_fixture(create_dirs, caplog): From 4807f747a0aafcdc7a7e5313f580f844a9d4c601 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 17:03:08 +0200 Subject: [PATCH 170/187] mypy fix --- pyproject.toml | 1 + tests/analysis/test_analysis_nanopore.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 15c6c907..0fc1fcb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ ignore = [ [tool.mypy] ignore_missing_imports = true follow_imports = 'skip' +exclude = "build" # === Testing ================================================================ diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index 142d1a38..30a7105a 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -15,7 +15,7 @@ ) -def parametrize_testruns() -> list[dict]: +def parametrize_testruns(): """In order to parametrize the test in a comprehensive way, the parametrization is tabulated as a string here. """ From 636dcddbd3effab8a5dce53893f56ce0891e2693 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 17:06:39 +0200 Subject: [PATCH 171/187] try to placate GHA --- tests/analysis/test_analysis_element.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 905ca963..3cbdbfb5 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -38,7 +38,13 @@ def aviti_fixture(create_dirs, caplog): patch.stopall() # Purge module - del sys.modules["taca.analysis.analysis_element"] + try: + del sys.modules["taca.analysis.analysis_element"] + except KeyError: + try: + del sys.modules["to_test"] + except KeyError: + pass def test_process_on_empty_dir(aviti_fixture): From 9130d43e2fcb6c28fdb5d71a19303d299185644b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 24 Oct 2024 12:44:19 +0200 Subject: [PATCH 172/187] try different way of extracting mock call args --- tests/analysis/test_analysis_element.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 3cbdbfb5..0d4e328b 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -177,7 +177,7 @@ def test_process_on_finished_run(aviti_fixture): to_test.run_preprocessing(run_dir) - expected_call = " ".join( + expected_cmd = " ".join( [ "mock_bases2fastq_path", f"{tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513", @@ -189,6 +189,6 @@ def test_process_on_finished_run(aviti_fixture): "--force-index-orientation", ] ) - assert mocks["mock_popen"].call_args.args[0] == expected_call + assert mocks["mock_popen"].call_args_list[0].args[0] == expected_cmd assert "Bases2Fastq conversion 
and demultiplexing started for run " in caplog.text
     assert mocks["mock_db"].return_value.upload_to_statusdb.called

From a91d868efb0503bb0c9bb4360352b43b2120919d Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 12:55:28 +0200
Subject: [PATCH 173/187] potential fix

---
 tests/analysis/test_analysis_element.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 0d4e328b..1be0860f 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -189,6 +189,9 @@ def test_process_on_finished_run(aviti_fixture):
             "--force-index-orientation",
         ]
     )
-    assert mocks["mock_popen"].call_args_list[0].args[0] == expected_cmd
+    assert any(
+        expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list
+ ), f"Expected command '{expected_cmd}' not found in any Popen calls: {debug_msg}" assert "Bases2Fastq conversion and demultiplexing started for run " in caplog.text assert mocks["mock_db"].return_value.upload_to_statusdb.called From 63495ef10d765bf301df13db7383167d16fee6a7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 24 Oct 2024 13:08:08 +0200 Subject: [PATCH 176/187] try getting python ver up to date and decreasing debugging resolution --- .github/workflows/lint-code.yml | 8 ++++---- .github/workflows/test-code.yml | 2 +- tests/analysis/test_analysis_element.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index 59536920..bbb74445 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -11,7 +11,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -29,7 +29,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -46,7 +46,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -67,7 +67,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index eec2d5fb..7ac1ed0d 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -12,7 +12,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index beb4fbc8..91398db2 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -190,7 +190,7 @@ def test_process_on_finished_run(aviti_fixture): ] ) - debug_msg = "\n".join([call.args[0] for call in mocks["mock_popen"].call_args_list]) + debug_msg = "\n".join([call for call in mocks["mock_popen"].call_args_list]) assert any( expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list From 0a4fd33ffaf75722457fd6dda9a2e4e575f5a741 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 24 Oct 2024 13:11:54 +0200 Subject: [PATCH 177/187] more debugging --- .github/workflows/test-code.yml | 2 +- tests/analysis/test_analysis_element.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index 7ac1ed0d..4248a17d 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -22,7 +22,7 @@ jobs: run: pip install -e . - name: pytest # Default options are configured in pyproject.toml - run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv + run: pytest -s . 
From 63495ef10d765bf301df13db7383167d16fee6a7 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:08:08 +0200
Subject: [PATCH 176/187] try getting python ver up to date and decreasing
 debugging resolution

---
 .github/workflows/lint-code.yml         | 8 ++++----
 .github/workflows/test-code.yml         | 2 +-
 tests/analysis/test_analysis_element.py | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml
index 59536920..bbb74445 100644
--- a/.github/workflows/lint-code.yml
+++ b/.github/workflows/lint-code.yml
@@ -11,7 +11,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -29,7 +29,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -46,7 +46,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
        with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -67,7 +67,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
 
diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index eec2d5fb..7ac1ed0d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index beb4fbc8..91398db2 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -190,7 +190,7 @@ def test_process_on_finished_run(aviti_fixture):
         ]
     )
 
-    debug_msg = "\n".join([call.args[0] for call in mocks["mock_popen"].call_args_list])
+    debug_msg = "\n".join([call for call in mocks["mock_popen"].call_args_list])
 
     assert any(
         expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list

From 0a4fd33ffaf75722457fd6dda9a2e4e575f5a741 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:11:54 +0200
Subject: [PATCH 177/187] more debugging

---
 .github/workflows/test-code.yml         | 2 +-
 tests/analysis/test_analysis_element.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index 7ac1ed0d..4248a17d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -22,7 +22,7 @@ jobs:
         run: pip install -e .
       - name: pytest
         # Default options are configured in pyproject.toml
-        run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv
+        run: pytest -s .
       - name: CodeCov
         uses: codecov/codecov-action@v4
         with:
diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 91398db2..506bd538 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -190,11 +190,12 @@ def test_process_on_finished_run(aviti_fixture):
         ]
     )
 
-    debug_msg = "\n".join([call for call in mocks["mock_popen"].call_args_list])
+    for call in mocks["mock_popen"].call_args_list:
+        print(call)
 
     assert any(
         expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list
-    ), f"Expected command '{expected_cmd}' not found in any Popen calls: {debug_msg}"
+    ), f"Expected command '{expected_cmd}' not found in any Popen calls."
 
     assert "Bases2Fastq conversion and demultiplexing started for run " in caplog.text
     assert mocks["mock_db"].return_value.upload_to_statusdb.called

From 60ed258b35eba1b8b53027189dce954f4fed6f89 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:16:07 +0200
Subject: [PATCH 178/187] last commit worked! Try re-instating cov pytest

---
 .github/workflows/test-code.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index 4248a17d..f0f9af7d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -22,7 +22,7 @@ jobs:
         run: pip install -e .
       - name: pytest
         # Default options are configured in pyproject.toml
-        run: pytest -s .
+        run: pytest -s --cov=./taca --cov-report=xml --cov-report term-missing -vv
       - name: CodeCov
         uses: codecov/codecov-action@v4
         with:

From e6c9043f8f7edef4a217fed5bcf26d9c1bc101a6 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:18:47 +0200
Subject: [PATCH 179/187] last commit worked, try not capturing pytest output

---
 .github/workflows/test-code.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index f0f9af7d..7ac1ed0d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -22,7 +22,7 @@ jobs:
         run: pip install -e .
       - name: pytest
         # Default options are configured in pyproject.toml
-        run: pytest -s --cov=./taca --cov-report=xml --cov-report term-missing -vv
+        run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv
       - name: CodeCov
         uses: codecov/codecov-action@v4
         with:

From 221a57a29eba4a0a9521c4feda297c60262bcd0c Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:20:49 +0200
Subject: [PATCH 180/187] last commit worked, try removing debug statement

---
 tests/analysis/test_analysis_element.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 506bd538..0b783875 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -190,9 +190,6 @@ def test_process_on_finished_run(aviti_fixture):
         ]
     )
 
-    for call in mocks["mock_popen"].call_args_list:
-        print(call)
-
     assert any(
         expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list
     ), f"Expected command '{expected_cmd}' not found in any Popen calls."
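
The workflow back-and-forth above hinges on pytest's output capturing:
captured stdout is only echoed in the report of a failing test, whereas -s
(shorthand for --capture=no) streams print() output straight to the CI log. A
tiny hypothetical test file that makes the difference observable:

    # Save as demo_capture.py (illustrative name) and compare:
    #   pytest demo_capture.py      -> the print is captured and hidden
    #   pytest -s demo_capture.py   -> the print streams to the terminal
    def test_capture_demo():
        print("visible live only with -s / --capture=no")
        assert True
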
From 80c687b39967e3c4612e1ebcd9c177e95c2486ab Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:27:36 +0200
Subject: [PATCH 181/187] try cleaner module purge

---
 tests/analysis/test_analysis_element.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 0b783875..356a695a 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -38,13 +38,7 @@ def aviti_fixture(create_dirs, caplog):
     patch.stopall()
 
     # Purge module
-    try:
-        del sys.modules["taca.analysis.analysis_element"]
-    except KeyError:
-        try:
-            del sys.modules["to_test"]
-        except KeyError:
-            pass
+    del sys.modules["taca.analysis.analysis_element"]
 
 
 def test_process_on_empty_dir(aviti_fixture):

From db2b99e3bf39ea4c4a14da1367228e6b090b1b05 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Thu, 24 Oct 2024 15:44:03 +0200
Subject: [PATCH 182/187] Fix issue with redundant PhiX record

---
 taca/element/Element_Runs.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 28dc81da..6aed0ebb 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -953,19 +953,39 @@ def aggregate_stats_assigned(self, demux_runmanifest):
         )
         # Remove redundant rows for PhiX
         aggregated_assigned_indexes_filtered = []
-        unique_phiX_combination = set()
+        phix_filtered = []
         for sample in aggregated_assigned_indexes:
             # Add project name
             sample["Project"] = [
                 d for d in demux_runmanifest if d["SampleName"] == sample["SampleName"]
             ][0]["Project"]
+            # Get the PhiX with the longest index combination.
             if sample["SampleName"] == "PhiX":
-                combination = (sample["I1"], sample["I2"], sample["Lane"])
-                if combination not in unique_phiX_combination:
-                    aggregated_assigned_indexes_filtered.append(sample)
-                    unique_phiX_combination.add(combination)
+                lane = sample["Lane"]
+                idx1 = sample["I1"]
+                idx2 = sample["I2"]
+                num_polonies_assigned = sample["NumPoloniesAssigned"]
+                if not phix_filtered:
+                    phix_filtered.append(sample)
+                else:
+                    found_flag = False
+                    for phix_record in phix_filtered:
+                        if lane == phix_record["Lane"]:
+                            idx1_shorter_len = min(len(idx1), len(phix_record["I1"]))
+                            idx2_shorter_len = min(len(idx2), len(phix_record["I2"]))
+                            if idx1[:idx1_shorter_len] == phix_record["I1"][:idx1_shorter_len] and idx2[:idx2_shorter_len] == phix_record["I2"][:idx2_shorter_len]:
+                                found_flag = True
+                                # When the new record has a longer index combination length, take the new record and remove the old one
+                                # When the index combination length happens to be the same, keep the one with the higher polonies assigned
+                                if len(idx1)+len(idx2) > len(phix_record["I1"])+len(phix_record["I2"]) or (len(idx1)+len(idx2) == len(phix_record["I1"])+len(phix_record["I2"]) and num_polonies_assigned >= phix_record["NumPoloniesAssigned"]):
+                                    phix_filtered.remove(phix_record)
+                                    phix_filtered.append(sample)
+                    if not found_flag:
+                        phix_filtered.append(sample)
             else:
                 aggregated_assigned_indexes_filtered.append(sample)
+        # Combine the list of samples and PhiX
+        aggregated_assigned_indexes_filtered += phix_filtered
         # Sort the list by Lane, SampleName and sub_demux_count
         aggregated_assigned_indexes_filtered_sorted = sorted(
             aggregated_assigned_indexes_filtered,
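
The de-duplication introduced above keeps, per lane, one PhiX record among
those whose I1/I2 agree over their shared prefix: the longest combined index
wins, and the polony count breaks ties. A condensed sketch of the same rule on
invented records (not the TACA data structures themselves, which carry more
fields):

    phix = [
        {"Lane": "1", "I1": "ATGC", "I2": "CGTA", "NumPoloniesAssigned": 100},
        {"Lane": "1", "I1": "ATGCAC", "I2": "CGTAGT", "NumPoloniesAssigned": 90},
        {"Lane": "2", "I1": "ATGC", "I2": "CGTA", "NumPoloniesAssigned": 80},
    ]

    def same_phix(a, b):
        # Same lane, and both indexes agree over the shared prefix lengths.
        n1 = min(len(a["I1"]), len(b["I1"]))
        n2 = min(len(a["I2"]), len(b["I2"]))
        return (
            a["Lane"] == b["Lane"]
            and a["I1"][:n1] == b["I1"][:n1]
            and a["I2"][:n2] == b["I2"][:n2]
        )

    def rank(record):
        # Longest combined index first; polony count as tie-breaker.
        return (len(record["I1"]) + len(record["I2"]), record["NumPoloniesAssigned"])

    kept = []
    for rec in phix:
        match = next((k for k in kept if same_phix(k, rec)), None)
        if match is None:
            kept.append(rec)
        elif rank(rec) >= rank(match):
            kept.remove(match)
            kept.append(rec)

    # Lane 1 collapses to the six-base record; lane 2 is untouched.
    assert [r["I1"] for r in kept] == ["ATGCAC", "ATGC"]
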
From 2de49941256cc02b1ea315b964f787a09fcbd330 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Thu, 24 Oct 2024 16:13:11 +0200
Subject: [PATCH 183/187] ruff format change

---
 taca/element/Element_Runs.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 6aed0ebb..8b246858 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -650,7 +650,7 @@ def generate_demux_command(self, run_manifest, demux_dir):
     def start_demux(self, run_manifest, demux_dir):
         with chdir(self.run_dir):
             cmd = self.generate_demux_command(run_manifest, demux_dir)
-            stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" #TODO: individual files for each sub-demux
+            stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt"  # TODO: individual files for each sub-demux
            try:
                 with open(stderr_abspath, "w") as stderr:
                     process = subprocess.Popen(
@@ -973,11 +973,23 @@ def aggregate_stats_assigned(self, demux_runmanifest):
                         if lane == phix_record["Lane"]:
                             idx1_shorter_len = min(len(idx1), len(phix_record["I1"]))
                             idx2_shorter_len = min(len(idx2), len(phix_record["I2"]))
-                            if idx1[:idx1_shorter_len] == phix_record["I1"][:idx1_shorter_len] and idx2[:idx2_shorter_len] == phix_record["I2"][:idx2_shorter_len]:
+                            if (
+                                idx1[:idx1_shorter_len]
+                                == phix_record["I1"][:idx1_shorter_len]
+                                and idx2[:idx2_shorter_len]
+                                == phix_record["I2"][:idx2_shorter_len]
+                            ):
                                 found_flag = True
                                 # When the new record has a longer index combination length, take the new record and remove the old one
                                 # When the index combination length happens to be the same, keep the one with the higher polonies assigned
-                                if len(idx1)+len(idx2) > len(phix_record["I1"])+len(phix_record["I2"]) or (len(idx1)+len(idx2) == len(phix_record["I1"])+len(phix_record["I2"]) and num_polonies_assigned >= phix_record["NumPoloniesAssigned"]):
+                                if len(idx1) + len(idx2) > len(phix_record["I1"]) + len(
+                                    phix_record["I2"]
+                                ) or (
+                                    len(idx1) + len(idx2)
+                                    == len(phix_record["I1"]) + len(phix_record["I2"])
+                                    and num_polonies_assigned
+                                    >= phix_record["NumPoloniesAssigned"]
+                                ):
                                     phix_filtered.remove(phix_record)
                                     phix_filtered.append(sample)
                                 if not found_flag:
@@ -1218,7 +1230,7 @@ def sync_metadata(self):
         dest = os.path.join(metadata_archive, self.NGI_run_id)
         if not os.path.exists(dest):
             os.makedirs(dest)
-        for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case
+        for f in files_to_copy:  # UnassignedSequences.csv missing in NoIndex case
             if os.path.exists(f):
                 shutil.copy(f, dest)
             else:

From 05979c7208b377d60d569a3a79512639ea84e53f Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 16:34:47 +0200
Subject: [PATCH 184/187] propagate bugfix

---
 taca/analysis/analysis_element.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 984c2a7c..6937b0c5 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -77,7 +77,7 @@ def _process(run):
                 return
             elif demultiplexing_status == "ongoing":
                 run.status = "demultiplexing"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
                 return
 
@@ -100,14 +100,14 @@ def _process(run):
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
                     # TODO: Also update statusdb with a timestamp of when the transfer started
                 run.transfer()
                 return
             elif transfer_status == "ongoing":
                 run.status = "transferring"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
                 logger.info(
                     f"{run} is being transferred. Skipping."
@@ -118,12 +118,12 @@ def _process(run):
                 run.remove_transfer_indicator()
                 run.update_transfer_log()
                 run.status = "transferred"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
 
                 run.archive()
                 run.status = "archived"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
             else:
                 run.status = "transfer failed"
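
The "propagate bugfix" commit above deserves a note: without parentheses,
run.status_changed evaluates the bound method object, which is always truthy,
so every branch would have written to statusdb regardless of whether the
status actually changed. A minimal reproduction of the pitfall:

    class Run:
        def status_changed(self) -> bool:
            return False

    run = Run()

    assert bool(run.status_changed) is True  # the method object is always truthy
    assert run.status_changed() is False     # the call returns the real answer
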
From 5599bed29ecda24d44b7de1fbc52f5f0b00b21cf Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 16:42:01 +0200
Subject: [PATCH 185/187] set required python version, same as for GHA build

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index e278a522..da2b5026 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,7 @@
     keywords="bioinformatics",
     author="NGI-stockholm",
     author_email="ngi_pipeline_operators@scilifelab.se",
+    python_requires=">=3.11.5",
     url="http://taca.readthedocs.org/en/latest/",
     license="MIT",
     packages=find_packages(exclude=["ez_setup", "examples", "tests"]),

From 488cb39f01e2d2806ae1a5c351f0b16d6fb07cdb Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 16:43:50 +0200
Subject: [PATCH 186/187] propagate python version explication to Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 93fd631b..5f9e41f0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10 AS base
+FROM python:3.11.5 AS base
 
 # Update pip to latest version
 RUN python -m pip install --upgrade pip

From 6b6e05631563a9db41840a395538c833f906c3c3 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Fri, 25 Oct 2024 08:52:43 +0200
Subject: [PATCH 187/187] Rename archiving

---
 taca/analysis/analysis_element.py | 4 ++--
 taca/element/Element_Runs.py      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 6937b0c5..9f71efb5 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -120,8 +120,8 @@ def _process(run):
                 run.status = "transferred"
                 if run.status_changed():
                     run.update_statusdb()
 
-                run.archive()
-                run.status = "archived"
+                run.move_to_nosync()
+                run.status = "processed"
                 if run.status_changed():
                     run.update_statusdb()
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 8b246858..3fa697a0 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -1293,7 +1293,7 @@ def update_paths_after_archiving(self, new_location):
         )
         self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json")
 
-    def archive(self):
+    def move_to_nosync(self):
         """Move directory to nosync."""
         src = self.run_dir
         parent_dir = Path(self.run_dir).parent.absolute()
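
The closing rename makes the intent explicit: the run directory is moved into
a sibling nosync folder rather than archived. As a rough illustration of that
kind of move (assumed layout and names; not the TACA implementation beyond
what the diff shows):

    import shutil
    from pathlib import Path

    def move_to_nosync(run_dir: str) -> Path:
        """Move a finished run directory into its parent's nosync folder."""
        src = Path(run_dir).absolute()
        nosync_dir = src.parent / "nosync"
        nosync_dir.mkdir(exist_ok=True)
        return Path(shutil.move(str(src), str(nosync_dir)))
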