From 22bbbcf3dd244b6de0e5f2648cdf946f6ec0a03d Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 29 Aug 2024 10:19:04 +0200 Subject: [PATCH 001/187] Initial setup for element aviti --- taca/analysis/analysis_element.py | 65 +++++++++++++++++++++++++++++++ taca/analysis/cli.py | 23 +++++++++++ taca/element/Aviti_Runs.py | 7 ++++ taca/element/Element_Runs.py | 18 +++++++++ taca/element/__init__.py | 3 ++ 5 files changed, 116 insertions(+) create mode 100755 taca/analysis/analysis_element.py create mode 100644 taca/element/Aviti_Runs.py create mode 100644 taca/element/Element_Runs.py create mode 100644 taca/element/__init__.py diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py new file mode 100755 index 00000000..1099ffc9 --- /dev/null +++ b/taca/analysis/analysis_element.py @@ -0,0 +1,65 @@ +"""Analysis methods for sequencing runs produced by Element instruments.""" + +import glob +import logging +import os + +from taca.element.Element_Runs import Aviti_Run +from taca.utils.config import CONFIG + +logger = logging.getLogger(__name__) + + +def run_preprocessing(given_run): + """Run demultiplexing in all data directories. + + :param str given_run: Process a particular run instead of looking for runs + """ + + def _process(run): + """Process a run/flowcell and transfer to analysis server. + + :param taca.element.Run run: Run to be processed and transferred + """ + # Check if sequencing is finished. (is the final file there and was it completed OK) + # if sequencing is not done + # Update statusdb? + # return + # else If sequencing finished and demux not started + # Update statusdb + # Get/generate sample sheet + # Start demux + # else if sequencing finished and demux ongoing + # do nothing + # Else if sequencing started and demux finished + # check if run is transferred or transfer is ongoing + # if run has not been transferred and transfer is not ongoing + # make a hidden file to indicate that transfer has started + # transfer run to miarka + # remove hidden file if transfer was successful + # Update transfer log + # archive run to nosync + # elif run is being transferred (hidden file exists) + # return + # elif run is already transferred (in transfer log) + # warn that transferred run has not been archived + + + + if given_run: + run = Aviti_Run(run) #TODO: Needs to change if more Element machines are aquired in the future + _process(runObj) + else: + data_dirs = CONFIG.get("element_analysis").get("data_dirs") #TODO: add to config + for data_dir in data_dirs: + # Run folder looks like DATE_*_*_*, the last section is the FC name. 
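+            # e.g. a name like 240829_AV242106_0001_ABCDEFGHI (hypothetical example;
+            # the exact Aviti folder naming is still TBD, hence the TODO below)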
+ runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*")) #TODO: adapt to aviti format + for run in runs: + runObj = Aviti_Run(run) + try: + _process(runObj) + except: #TODO: chatch error message and print it + # This function might throw and exception, + # it is better to continue processing other runs + logger.warning(f"There was an error processing the run {run}") + pass diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index 342a8b1c..13250f61 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -4,6 +4,7 @@ from taca.analysis import analysis as an from taca.analysis import analysis_nanopore +from taca.analysis import analysis_element @click.group() @@ -71,6 +72,28 @@ def updatedb(rundir, software): """Save the run to statusdb.""" an.upload_to_statusdb(rundir, software) +# Element analysis subcommands + + +@analysis.command() +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + default=None, + help="Demultiplex only a particular run", +) +def demultiplex_element(run): + """Demultiplex and transfer all runs present in the data directories.""" + analysis_element.run_preprocessing(run) + + +@analysis.command() +@click.argument("run") +def element_updatedb(run): + """Save the run to statusdb.""" + analysis_element.upload_to_statusdb(run) + # Nanopore analysis subcommands diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py new file mode 100644 index 00000000..ad162ac4 --- /dev/null +++ b/taca/element/Aviti_Runs.py @@ -0,0 +1,7 @@ +from taca.element.Element_Runs import Run + + +class Aviti_Run(Run): + def __init__(self, run_dir, configuration): + super().__init__(run_dir, configuration) + self.sequencer_type = "Aviti" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py new file mode 100644 index 00000000..8bd18273 --- /dev/null +++ b/taca/element/Element_Runs.py @@ -0,0 +1,18 @@ +import logging +import os + +logger = logging.getLogger(__name__) + + +class Run: + """Defines an Element run""" + + def __init__(self, run_dir, configuration): + if not os.path.exists(run_dir): + raise RuntimeError(f"Could not locate run directory {run_dir}") + self.run_dir = os.path.abspath(run_dir) + self.CONFIG = configuration + self.demux_dir = "Demultiplexing" + + def is_transferred(self, transfer_file): + pass \ No newline at end of file diff --git a/taca/element/__init__.py b/taca/element/__init__.py new file mode 100644 index 00000000..75a569ff --- /dev/null +++ b/taca/element/__init__.py @@ -0,0 +1,3 @@ +""" +Classes to parse and work with Element data +""" From 36dbafd21734ad0b0b7aeea9c470390f475476c8 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 29 Aug 2024 12:50:07 +0200 Subject: [PATCH 002/187] Updated outline for Aviti --- taca/analysis/analysis_element.py | 18 ++++++++++++++---- taca/element/Element_Runs.py | 3 +++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 1099ffc9..7d9648b7 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -21,27 +21,37 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ + # Fetch statusdb document for run + + # Get previous status of run from statusdb document # Check if sequencing is finished. (is the final file there and was it completed OK) # if sequencing is not done - # Update statusdb? 
+ # compare previous status with current status and update statusdb document if different # return - # else If sequencing finished and demux not started - # Update statusdb + # else if sequencing finished and demux not started # Get/generate sample sheet # Start demux + # compare previous status with current status and update statusdb document if different # else if sequencing finished and demux ongoing - # do nothing + # compare previous status with current status and update statusdb document if different + # return # Else if sequencing started and demux finished # check if run is transferred or transfer is ongoing # if run has not been transferred and transfer is not ongoing # make a hidden file to indicate that transfer has started + # compare previous status with current status and update statusdb document if different + # Also update statusdb with a timestamp of when the transfer started # transfer run to miarka # remove hidden file if transfer was successful # Update transfer log + # update statusdb document # archive run to nosync + # update statusdb document # elif run is being transferred (hidden file exists) + # compare previous status with current status and update statusdb document if different # return # elif run is already transferred (in transfer log) + # compare previous status with current status and update statusdb document if different # warn that transferred run has not been archived diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 8bd18273..e718cc85 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -15,4 +15,7 @@ def __init__(self, run_dir, configuration): self.demux_dir = "Demultiplexing" def is_transferred(self, transfer_file): + pass + + def parse_rundir(self): pass \ No newline at end of file From d0054466ec098701d551f6203d8883409c182e30 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 29 Aug 2024 16:22:20 +0200 Subject: [PATCH 003/187] Process aviti data from sequencing to demux --- taca/analysis/analysis_element.py | 43 +++++++++++++++++--------- taca/element/Element_Runs.py | 51 ++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 7d9648b7..6e6a395f 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -6,9 +6,17 @@ from taca.element.Element_Runs import Aviti_Run from taca.utils.config import CONFIG +from taca.utils import statusdb + logger = logging.getLogger(__name__) +def _upload_to_statusdb(run): + """Triggers the upload to statusdb. + + :param Run run: the object run + """ + pass def run_preprocessing(given_run): """Run demultiplexing in all data directories. @@ -21,21 +29,26 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - # Fetch statusdb document for run - - # Get previous status of run from statusdb document - # Check if sequencing is finished. 
(is the final file there and was it completed OK) - # if sequencing is not done - # compare previous status with current status and update statusdb document if different - # return - # else if sequencing finished and demux not started - # Get/generate sample sheet + #TODO: Fetch statusdb document for run + #TODO: Get previous status of run from statusdb document + sequencing_done = run.check_sequencing_status() + demultiplexing_status = run.get_demultiplexing_status() + if not sequencing_done: + #TODO: compare previous status with current status and update statusdb document if different + return + elif sequencing_done and demultiplexing_status == "not started": + if not run.manifest_exists(): # Assumes that we use the same manifest as for sequencing. TODO: demux settings need to be added to the original manifest by lims + #TODO: email operator that manifest is missing + return # Start demux - # compare previous status with current status and update statusdb document if different - # else if sequencing finished and demux ongoing - # compare previous status with current status and update statusdb document if different - # return - # Else if sequencing started and demux finished + run.start_demux() + #TODO: compare previous status with current status and update statusdb document if different + return + elif sequencing_done and demultiplexing_status == "ongoing": + #TODO: compare previous status with current status and update statusdb document if different + return + elif sequencing_done and demultiplexing_status == "finished": + # Sync metadata to ngi-data-ns # check if run is transferred or transfer is ongoing # if run has not been transferred and transfer is not ongoing # make a hidden file to indicate that transfer has started @@ -53,7 +66,7 @@ def _process(run): # elif run is already transferred (in transfer log) # compare previous status with current status and update statusdb document if different # warn that transferred run has not been archived - + pass if given_run: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e718cc85..f3007dff 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,5 +1,10 @@ import logging import os +import json +from datetime import datetime + +from taca.utils import misc +from taca.utils.filesystem import chdir logger = logging.getLogger(__name__) @@ -12,7 +17,51 @@ def __init__(self, run_dir, configuration): raise RuntimeError(f"Could not locate run directory {run_dir}") self.run_dir = os.path.abspath(run_dir) self.CONFIG = configuration - self.demux_dir = "Demultiplexing" + self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") + self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") + self.demux_stats_file = os.path.join(self.demux_dir, "RunStats.json") #TODO: How to handle SideA/SideB? 
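+        # Assumption: bases2fastq only writes RunStats.json once demultiplexing has
+        # completed, so the presence of this file is used as the demux-finished marker.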
+ self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") + + def check_sequencing_status(self): + if os.path.exists(self.final_sequencing_file): + with open(self.final_sequencing_file) as json_file: + sequencing_outcome = json.load(json_file).get("outcome") + if sequencing_outcome != "OutcomeCompleted": + return False + else: + return True + else: + return False + + def get_demultiplexing_status(self): + if not os.path.exists(self.demux_dir): + return "not started" + elif os.path.exists(self.demux_dir) and not os.path.isfile(self.demux_stats_file): + return "ongoing" + elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): + return "finished" + + def manifest_exists(self): + return os.path.isfile(self.run_manifest_file) + + def generate_demux_command(self): + command = [self.CONFIG.get(self.software)["bin"], #TODO add path to bases2fastq executable to config + self.run_dir, + self.demux_dir, #TODO: how to handle SideA/SideB? + "-p 12" + ] + return command + + def start_demux(self): + with chdir(self.run_dir): + cmd = self.generate_demux_command() + misc.call_external_command_detached( + cmd, with_log_files=True, prefix=f"demux_" + ) + logger.info( + "Bases2Fastq conversion and demultiplexing " + f"started for run {os.path.basename(self.id)} on {datetime.now()}" + ) def is_transferred(self, transfer_file): pass From 6b9e885c728daeab5516986099223b80e0901953 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 11:59:33 +0200 Subject: [PATCH 004/187] add aviti stuff to test conf tempdir --- tests/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 171a5667..d699945a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,9 +31,11 @@ def create_dirs(): │ ├── minion_data │ ├── miseq_data │ ├── promethion_data + │ ├── Aviti_data │ └── samplesheets │ ├── NovaSeqXPlus │ └── anglerfish + │ └── Aviti └── ngi_data └── sequencing ├── MiSeq @@ -49,6 +51,8 @@ def create_dirs(): │ └── qc │ └── nosync └── promethion + │ └── nosync + └── AV242106 └── nosync --> Return the the temporary directory object @@ -65,6 +69,8 @@ def create_dirs(): os.makedirs(f"{tmp.name}/ngi_data/sequencing/promethion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/qc/nosync") + ## AVITI + os.makedirs(f"{tmp.name}/ngi_data/sequencing/AV242106/nosync") # Sequencing metadata ## Illumina @@ -75,10 +81,13 @@ def create_dirs(): ## ONT os.makedirs(f"{tmp.name}/ngi-nas-ns/promethion_data") os.makedirs(f"{tmp.name}/ngi-nas-ns/minion_data") + ## AVITI + os.makedirs(f"{tmp.name}/ngi-nas-ns/Aviti_data") # Samplesheets os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/anglerfish") os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/NovaSeqXPlus") + os.makedirs(f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti") # Misc. 
ONT dirs/files os.makedirs(f"{tmp.name}/minknow_reports") From 36fc1c64faa6945b891c94a28270e57993165839 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 12:36:53 +0200 Subject: [PATCH 005/187] modularize --- tests/element/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/element/__init__.py diff --git a/tests/element/__init__.py b/tests/element/__init__.py new file mode 100644 index 00000000..e69de29b From bafbf6a7ef978b44761fba9559cd77491db03598 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 12:37:03 +0200 Subject: [PATCH 006/187] add test class --- tests/element/test_Element_Runs.py | 66 ++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/element/test_Element_Runs.py diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py new file mode 100644 index 00000000..9f421400 --- /dev/null +++ b/tests/element/test_Element_Runs.py @@ -0,0 +1,66 @@ +import os +import tempfile + +import pytest + +from taca.element import Element_Runs as to_test + + +def create_aviti_run_dir( + tmp: tempfile.TemporaryDirectory, + run_name: str = "20240716_AV242106_testrun", + nosync: bool = False, + run_finished: bool = True, + sync_finished: bool = True, +) -> str: + # Create run dir + if nosync: + run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/nosync/{run_name}" + else: + run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" + os.mkdir(run_path) + + # Create files + if run_finished: + open(f"{run_path}/AvitiRunStats.json", "w").close() + open(f"{run_path}/RunManifest.csv", "w").close() + open(f"{run_path}/RunManifest.json", "w").close() + open(f"{run_path}/RunParameters.json", "w").close() + open(f"{run_path}/RunUploaded.json", "w").close() + + if sync_finished: + open(f"{run_path}/.sync_finished", "w").close() + + return run_path + + +class TestRun: + @pytest.fixture(autouse=True) + def setup(self, create_dirs: pytest.fixture): + self.tmp: tempfile.TemporaryDirectory = create_dirs + self.run_path = create_aviti_run_dir(self.tmp) + self.run = to_test.Run(self.run_path, {}) + + def test_init(self): + assert self.run.run_dir == self.run_path + + def test_check_sequencing_status(self): + assert False + + def test_get_demultiplexing_status(self): + assert False + + def test_manifest_exists(self): + assert False + + def test_generate_demux_command(self): + assert False + + def test_start_demux(self): + assert False + + def test_is_transferred(self): + assert False + + def test_parse_rundir(self): + assert False From 7ff7a26f5a234094bda3028e596ec89f79fb06cb Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:05:02 +0200 Subject: [PATCH 007/187] add tests --- tests/element/test_Element_Runs.py | 116 +++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 22 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 9f421400..33050200 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -1,3 +1,4 @@ +import json import os import tempfile @@ -12,6 +13,9 @@ def create_aviti_run_dir( nosync: bool = False, run_finished: bool = True, sync_finished: bool = True, + demux_dir: bool = True, + demux_done: bool = True, + outcome_completed: bool = True, ) -> str: # Create run dir if nosync: @@ -20,47 +24,115 @@ def create_aviti_run_dir( run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" os.mkdir(run_path) - # Create files + # Populate run dir with files and folders if 
run_finished: open(f"{run_path}/AvitiRunStats.json", "w").close() open(f"{run_path}/RunManifest.csv", "w").close() open(f"{run_path}/RunManifest.json", "w").close() open(f"{run_path}/RunParameters.json", "w").close() - open(f"{run_path}/RunUploaded.json", "w").close() + with open(f"{run_path}/RunUploaded.json", "w") as f: + outcome = "OutcomeCompleted" if outcome_completed else "OutcomeFailed" + f.write(json.dumps({"outcome": outcome})) if sync_finished: open(f"{run_path}/.sync_finished", "w").close() - return run_path - - -class TestRun: - @pytest.fixture(autouse=True) - def setup(self, create_dirs: pytest.fixture): - self.tmp: tempfile.TemporaryDirectory = create_dirs - self.run_path = create_aviti_run_dir(self.tmp) - self.run = to_test.Run(self.run_path, {}) - - def test_init(self): - assert self.run.run_dir == self.run_path + if demux_dir: + os.mkdir(os.path.join(run_path, "Demultiplexing")) - def test_check_sequencing_status(self): - assert False + if demux_done: + open(os.path.join(run_path, "Demultiplexing", "RunStats.json"), "w").close() - def test_get_demultiplexing_status(self): - assert False + return run_path - def test_manifest_exists(self): - assert False +class TestRun: + def test_init(self, create_dirs: pytest.fixture): + tmp: tempfile.TemporaryDirectory = create_dirs + run_dir = create_aviti_run_dir(tmp) + run = to_test.Run(run_dir, {}) + assert run.run_dir == run_dir + + @pytest.mark.parametrize( + "p", + [ + {"run_finished": True, "outcome_completed": True, "expected": True}, + {"run_finished": True, "outcome_completed": False, "expected": False}, + {"run_finished": False, "outcome_completed": False, "expected": False}, + ], + ids=["success", "failure", "ongoing"], + ) + def test_check_sequencing_status( + self, p: pytest.fixture, create_dirs: pytest.fixture + ): + tmp: tempfile.TemporaryDirectory = create_dirs + + run = to_test.Run( + create_aviti_run_dir( + tmp, + run_finished=p["run_finished"], + outcome_completed=p["outcome_completed"], + ), + {}, + ) + assert run.check_sequencing_status() is p["expected"] + + @pytest.mark.parametrize( + "p", + [ + {"demux_dir": False, "demux_done": False, "expected": "not started"}, + {"demux_dir": True, "demux_done": False, "expected": "ongoing"}, + {"demux_dir": True, "demux_done": True, "expected": "finished"}, + ], + ids=["not started", "ongoing", "finished"], + ) + def test_get_demultiplexing_status( + self, p: pytest.fixture, create_dirs: pytest.fixture + ): + tmp: tempfile.TemporaryDirectory = create_dirs + + run = to_test.Run( + create_aviti_run_dir( + tmp, + demux_dir=p["demux_dir"], + demux_done=p["demux_done"], + ), + {}, + ) + assert run.get_demultiplexing_status() == p["expected"] + + @pytest.mark.parametrize( + "p", + [ + {"run_finished": True, "expected": True}, + {"run_finished": False, "expected": False}, + ], + ids=["exists", "does not exist"], + ) + def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): + tmp: tempfile.TemporaryDirectory = create_dirs + + run = to_test.Run( + create_aviti_run_dir( + tmp, + run_finished=p["run_finished"], + ), + {}, + ) + assert run.manifest_exists() == p["expected"] + + @pytest.mark.skip def test_generate_demux_command(self): assert False + @pytest.mark.skip def test_start_demux(self): assert False + @pytest.mark.skip def test_is_transferred(self): - assert False + pass + @pytest.mark.skip def test_parse_rundir(self): - assert False + pass From 781ef95e9d45cd84194dcc382bc79fc0dfcc3f91 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:09:22 
+0200 Subject: [PATCH 008/187] docs --- tests/element/test_Element_Runs.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 33050200..2f491441 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -17,6 +17,21 @@ def create_aviti_run_dir( demux_done: bool = True, outcome_completed: bool = True, ) -> str: + """ + Conditionally build a file tree for an Aviti run. + + . + ├── AvitiRunStats.json + ├── RunManifest.csv + ├── RunManifest.json + ├── RunParameters.json + ├── RunUploaded.json + ├── .sync_finished + └── Demultiplexing + └── RunStats.json + + """ + # Create run dir if nosync: run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/nosync/{run_name}" From d32b7de8b776a132eacdf7de4c10b4c1572afd71 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:54:37 +0200 Subject: [PATCH 009/187] bugfix --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f3007dff..c850446c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -60,7 +60,7 @@ def start_demux(self): ) logger.info( "Bases2Fastq conversion and demultiplexing " - f"started for run {os.path.basename(self.id)} on {datetime.now()}" + f"started for run {os.path.basename(self.run_dir)} on {datetime.now()}" ) def is_transferred(self, transfer_file): From 1d3408b8beb4ca1ad79122131ce776b0bc172abc Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:55:13 +0200 Subject: [PATCH 010/187] add tests --- tests/element/test_Element_Runs.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 2f491441..cfeb013f 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -1,6 +1,7 @@ import json import os import tempfile +from unittest.mock import patch import pytest @@ -137,17 +138,27 @@ def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): assert run.manifest_exists() == p["expected"] @pytest.mark.skip - def test_generate_demux_command(self): - assert False + def test_generate_demux_command(self, create_dirs): + pass - @pytest.mark.skip - def test_start_demux(self): - assert False + def test_start_demux(self, create_dirs): + with patch( + "taca.utils.misc.call_external_command_detached" + ) as mock_call, patch( + "taca.element.Element_Runs.Run.generate_demux_command" + ) as mock_command: + mock_command.return_value = "test command" + run = to_test.Run(create_aviti_run_dir(create_dirs), {}) + run.start_demux() + mock_command.assert_called_once() + mock_call.assert_called_once_with( + "test command", with_log_files=True, prefix="demux_" + ) @pytest.mark.skip - def test_is_transferred(self): + def test_is_transferred(self, create_dirs): pass @pytest.mark.skip - def test_parse_rundir(self): + def test_parse_rundir(self, create_dirs): pass From b5d307c1c1b5ce20a76327fb44af5a3da4d58d76 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 14:59:36 +0200 Subject: [PATCH 011/187] differentiate element/aviti --- tests/conftest.py | 4 ++-- tests/element/test_Element_Runs.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d699945a..c26d4c03 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,7 +69,7 @@ 
def create_dirs(): os.makedirs(f"{tmp.name}/ngi_data/sequencing/promethion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/nosync") os.makedirs(f"{tmp.name}/ngi_data/sequencing/minion/qc/nosync") - ## AVITI + ## Element os.makedirs(f"{tmp.name}/ngi_data/sequencing/AV242106/nosync") # Sequencing metadata @@ -81,7 +81,7 @@ def create_dirs(): ## ONT os.makedirs(f"{tmp.name}/ngi-nas-ns/promethion_data") os.makedirs(f"{tmp.name}/ngi-nas-ns/minion_data") - ## AVITI + ## Element os.makedirs(f"{tmp.name}/ngi-nas-ns/Aviti_data") # Samplesheets diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index cfeb013f..7377742b 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -8,7 +8,7 @@ from taca.element import Element_Runs as to_test -def create_aviti_run_dir( +def create_element_run_dir( tmp: tempfile.TemporaryDirectory, run_name: str = "20240716_AV242106_testrun", nosync: bool = False, @@ -19,10 +19,9 @@ def create_aviti_run_dir( outcome_completed: bool = True, ) -> str: """ - Conditionally build a file tree for an Aviti run. + Conditionally build a file tree for an Element run. . - ├── AvitiRunStats.json ├── RunManifest.csv ├── RunManifest.json ├── RunParameters.json @@ -42,7 +41,6 @@ def create_aviti_run_dir( # Populate run dir with files and folders if run_finished: - open(f"{run_path}/AvitiRunStats.json", "w").close() open(f"{run_path}/RunManifest.csv", "w").close() open(f"{run_path}/RunManifest.json", "w").close() open(f"{run_path}/RunParameters.json", "w").close() @@ -65,7 +63,7 @@ def create_aviti_run_dir( class TestRun: def test_init(self, create_dirs: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs - run_dir = create_aviti_run_dir(tmp) + run_dir = create_element_run_dir(tmp) run = to_test.Run(run_dir, {}) assert run.run_dir == run_dir @@ -84,7 +82,7 @@ def test_check_sequencing_status( tmp: tempfile.TemporaryDirectory = create_dirs run = to_test.Run( - create_aviti_run_dir( + create_element_run_dir( tmp, run_finished=p["run_finished"], outcome_completed=p["outcome_completed"], @@ -108,7 +106,7 @@ def test_get_demultiplexing_status( tmp: tempfile.TemporaryDirectory = create_dirs run = to_test.Run( - create_aviti_run_dir( + create_element_run_dir( tmp, demux_dir=p["demux_dir"], demux_done=p["demux_done"], @@ -129,7 +127,7 @@ def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs run = to_test.Run( - create_aviti_run_dir( + create_element_run_dir( tmp, run_finished=p["run_finished"], ), @@ -148,7 +146,7 @@ def test_start_demux(self, create_dirs): "taca.element.Element_Runs.Run.generate_demux_command" ) as mock_command: mock_command.return_value = "test command" - run = to_test.Run(create_aviti_run_dir(create_dirs), {}) + run = to_test.Run(create_element_run_dir(create_dirs), {}) run.start_demux() mock_command.assert_called_once() mock_call.assert_called_once_with( From 047ac2222907d14cc26c963ac5288ea9a7eed8aa Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:09:34 +0200 Subject: [PATCH 012/187] add skip reasons --- tests/element/test_Element_Runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7377742b..a6eca20a 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -135,8 +135,8 @@ def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture): ) 
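        # create_element_run_dir only writes RunManifest.csv when run_finished=True,
        # which is exactly the file that manifest_exists() checks for.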
assert run.manifest_exists() == p["expected"] - @pytest.mark.skip - def test_generate_demux_command(self, create_dirs): + @pytest.mark.skip(reason="Not implemented yet") + def test_generate_demux_command(self): pass def test_start_demux(self, create_dirs): @@ -153,10 +153,10 @@ def test_start_demux(self, create_dirs): "test command", with_log_files=True, prefix="demux_" ) - @pytest.mark.skip + @pytest.mark.skip(reason="Not implemented yet") def test_is_transferred(self, create_dirs): pass - @pytest.mark.skip + @pytest.mark.skip(reason="Not implemented yet") def test_parse_rundir(self, create_dirs): pass From fa473cd1e3c867f88430679a54c219b29eb44de8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:09:38 +0200 Subject: [PATCH 013/187] add test --- tests/element/test_Aviti_Runs.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/element/test_Aviti_Runs.py diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py new file mode 100644 index 00000000..0ff3e7db --- /dev/null +++ b/tests/element/test_Aviti_Runs.py @@ -0,0 +1,15 @@ +import tempfile + +import pytest + +from taca.element import Aviti_Runs as to_test +from tests.element.test_Element_Runs import create_element_run_dir + + +class TestAviti_Run: + def test_init(self, create_dirs: pytest.fixture): + tmp: tempfile.TemporaryDirectory = create_dirs + run_dir = create_element_run_dir(tmp) + run = to_test.Aviti_Run(run_dir, {}) + assert run.run_dir == run_dir + assert run.sequencer_type == "Aviti" From 7e84ac16b0677adb54b9f96e83bff1f419d606e2 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:13:35 +0200 Subject: [PATCH 014/187] ruff format --- taca/analysis/analysis_element.py | 57 +++++++++++++++++-------------- taca/analysis/cli.py | 1 + taca/element/Element_Runs.py | 37 ++++++++++++-------- 3 files changed, 55 insertions(+), 40 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 6e6a395f..20bc0c80 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -11,6 +11,7 @@ logger = logging.getLogger(__name__) + def _upload_to_statusdb(run): """Triggers the upload to statusdb. @@ -18,6 +19,7 @@ def _upload_to_statusdb(run): """ pass + def run_preprocessing(given_run): """Run demultiplexing in all data directories. @@ -29,59 +31,64 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - #TODO: Fetch statusdb document for run - #TODO: Get previous status of run from statusdb document + # TODO: Fetch statusdb document for run + # TODO: Get previous status of run from statusdb document sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() if not sequencing_done: - #TODO: compare previous status with current status and update statusdb document if different + # TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "not started": - if not run.manifest_exists(): # Assumes that we use the same manifest as for sequencing. TODO: demux settings need to be added to the original manifest by lims - #TODO: email operator that manifest is missing + if not run.manifest_exists(): # Assumes that we use the same manifest as for sequencing. 
TODO: demux settings need to be added to the original manifest by lims + # TODO: email operator that manifest is missing return # Start demux run.start_demux() - #TODO: compare previous status with current status and update statusdb document if different + # TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "ongoing": - #TODO: compare previous status with current status and update statusdb document if different + # TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "finished": # Sync metadata to ngi-data-ns # check if run is transferred or transfer is ongoing # if run has not been transferred and transfer is not ongoing - # make a hidden file to indicate that transfer has started - # compare previous status with current status and update statusdb document if different - # Also update statusdb with a timestamp of when the transfer started - # transfer run to miarka - # remove hidden file if transfer was successful - # Update transfer log - # update statusdb document - # archive run to nosync - # update statusdb document + # make a hidden file to indicate that transfer has started + # compare previous status with current status and update statusdb document if different + # Also update statusdb with a timestamp of when the transfer started + # transfer run to miarka + # remove hidden file if transfer was successful + # Update transfer log + # update statusdb document + # archive run to nosync + # update statusdb document # elif run is being transferred (hidden file exists) - # compare previous status with current status and update statusdb document if different - # return + # compare previous status with current status and update statusdb document if different + # return # elif run is already transferred (in transfer log) - # compare previous status with current status and update statusdb document if different - # warn that transferred run has not been archived + # compare previous status with current status and update statusdb document if different + # warn that transferred run has not been archived pass - if given_run: - run = Aviti_Run(run) #TODO: Needs to change if more Element machines are aquired in the future + run = Aviti_Run( + run + ) # TODO: Needs to change if more Element machines are aquired in the future _process(runObj) else: - data_dirs = CONFIG.get("element_analysis").get("data_dirs") #TODO: add to config + data_dirs = CONFIG.get("element_analysis").get( + "data_dirs" + ) # TODO: add to config for data_dir in data_dirs: # Run folder looks like DATE_*_*_*, the last section is the FC name. 
- runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*")) #TODO: adapt to aviti format + runs = glob.glob( + os.path.join(data_dir, "[1-9]*_*_*_*") + ) # TODO: adapt to aviti format for run in runs: runObj = Aviti_Run(run) try: _process(runObj) - except: #TODO: chatch error message and print it + except: # TODO: chatch error message and print it # This function might throw and exception, # it is better to continue processing other runs logger.warning(f"There was an error processing the run {run}") diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index 13250f61..2e433a4c 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -72,6 +72,7 @@ def updatedb(rundir, software): """Save the run to statusdb.""" an.upload_to_statusdb(rundir, software) + # Element analysis subcommands diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index c850446c..a86e7d8e 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -19,9 +19,11 @@ def __init__(self, run_dir, configuration): self.CONFIG = configuration self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") - self.demux_stats_file = os.path.join(self.demux_dir, "RunStats.json") #TODO: How to handle SideA/SideB? + self.demux_stats_file = os.path.join( + self.demux_dir, "RunStats.json" + ) # TODO: How to handle SideA/SideB? self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - + def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): with open(self.final_sequencing_file) as json_file: @@ -32,26 +34,31 @@ def check_sequencing_status(self): return True else: return False - + def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" - elif os.path.exists(self.demux_dir) and not os.path.isfile(self.demux_stats_file): + elif os.path.exists(self.demux_dir) and not os.path.isfile( + self.demux_stats_file + ): return "ongoing" elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): return "finished" - + def manifest_exists(self): return os.path.isfile(self.run_manifest_file) - + def generate_demux_command(self): - command = [self.CONFIG.get(self.software)["bin"], #TODO add path to bases2fastq executable to config - self.run_dir, - self.demux_dir, #TODO: how to handle SideA/SideB? - "-p 12" - ] + command = [ + self.CONFIG.get(self.software)[ + "bin" + ], # TODO add path to bases2fastq executable to config + self.run_dir, + self.demux_dir, # TODO: how to handle SideA/SideB? 
+ "-p 12", + ] return command - + def start_demux(self): with chdir(self.run_dir): cmd = self.generate_demux_command() @@ -62,9 +69,9 @@ def start_demux(self): "Bases2Fastq conversion and demultiplexing " f"started for run {os.path.basename(self.run_dir)} on {datetime.now()}" ) - + def is_transferred(self, transfer_file): pass - + def parse_rundir(self): - pass \ No newline at end of file + pass From 9052caaf41a545ef866a22623cf2f90ebd52e28b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:20:02 +0200 Subject: [PATCH 015/187] ruff check --- taca/analysis/analysis_element.py | 2 -- taca/analysis/cli.py | 3 +-- taca/element/Element_Runs.py | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 20bc0c80..189cf01d 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -6,8 +6,6 @@ from taca.element.Element_Runs import Aviti_Run from taca.utils.config import CONFIG -from taca.utils import statusdb - logger = logging.getLogger(__name__) diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index 2e433a4c..131e78e0 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -3,8 +3,7 @@ import click from taca.analysis import analysis as an -from taca.analysis import analysis_nanopore -from taca.analysis import analysis_element +from taca.analysis import analysis_element, analysis_nanopore @click.group() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index a86e7d8e..b3888c55 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,6 +1,6 @@ +import json import logging import os -import json from datetime import datetime from taca.utils import misc @@ -63,7 +63,7 @@ def start_demux(self): with chdir(self.run_dir): cmd = self.generate_demux_command() misc.call_external_command_detached( - cmd, with_log_files=True, prefix=f"demux_" + cmd, with_log_files=True, prefix="demux_" ) logger.info( "Bases2Fastq conversion and demultiplexing " From b63cd4d6544c5f1c2f55d88a1b99121c0146c5d7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 2 Sep 2024 15:20:19 +0200 Subject: [PATCH 016/187] bugfix --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 189cf01d..8f2b29d9 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -4,7 +4,7 @@ import logging import os -from taca.element.Element_Runs import Aviti_Run +from taca.element.Aviti_Runs import Aviti_Run from taca.utils.config import CONFIG logger = logging.getLogger(__name__) From 5ddd2f082520d0ee0d2b12bb688bbe557b89f62d Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 4 Sep 2024 13:00:48 +0200 Subject: [PATCH 017/187] Add funktions for aviti processing --- taca/analysis/analysis_element.py | 62 +++++++++++++++++++------------ taca/element/Element_Runs.py | 41 ++++++++++++++++++++ 2 files changed, 79 insertions(+), 24 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 6e6a395f..55fc6b42 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -37,37 +37,51 @@ def _process(run): #TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "not started": - if not run.manifest_exists(): # Assumes that we use the same manifest as 
for sequencing. TODO: demux settings need to be added to the original manifest by lims - #TODO: email operator that manifest is missing + if not run.manifest_exists(): + #TODO: email operator warning return - # Start demux - run.start_demux() - #TODO: compare previous status with current status and update statusdb document if different - return + elif run.manifest_exists(): + # Get sample info from manifest + sample_info = run.get_sample_info() + sample_types = run.get_sample_types(sample_info) + if len(sample_types) == 1: + run.start_demux() + elif len(sample_types) > 1: + for sample_type in sample_types: + run.make_manifest(sample_info, sample_type) + run.start_demux() + else: + #TODO: warn that no samples were found in the run manifest + return + #TODO: compare previous status with current status and update statusdb document if different elif sequencing_done and demultiplexing_status == "ongoing": #TODO: compare previous status with current status and update statusdb document if different return elif sequencing_done and demultiplexing_status == "finished": - # Sync metadata to ngi-data-ns - # check if run is transferred or transfer is ongoing - # if run has not been transferred and transfer is not ongoing - # make a hidden file to indicate that transfer has started - # compare previous status with current status and update statusdb document if different + transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') + if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): + run.sync_metadata() + run.make_transfer_indicator() + #TODO: compare previous status with current status and update statusdb document if different # Also update statusdb with a timestamp of when the transfer started - # transfer run to miarka - # remove hidden file if transfer was successful - # Update transfer log - # update statusdb document - # archive run to nosync - # update statusdb document - # elif run is being transferred (hidden file exists) - # compare previous status with current status and update statusdb document if different - # return - # elif run is already transferred (in transfer log) - # compare previous status with current status and update statusdb document if different + run.transfer() + run.remove_transfer_indicator() + run.update_transfer_log(transfer_file) + #TODO: update statusdb document + run.archive() + elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): + #TODO: compare previous status with current status and update statusdb document if different + logger.info("Run is being transferred. Skipping.") + return + elif run.is_transferred(transfer_file): + #TODO: compare previous status with current status and update statusdb document if different # warn that transferred run has not been archived - pass - + logger.warn("The run has already been transferred but has not been archived. Please investigate") + return + else: + logger.warn("Unknown transfer status. 
Please investigate") + + if given_run: run = Aviti_Run(run) #TODO: Needs to change if more Element machines are aquired in the future diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f3007dff..2cb113e3 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -44,6 +44,18 @@ def get_demultiplexing_status(self): def manifest_exists(self): return os.path.isfile(self.run_manifest_file) + def get_sample_info(self): + sample_info = {} #TODO: populate + return sample_info + + def get_sample_types(self, sample_info): + sample_types = () #TODO: populate + return sample_types + + def make_manifest(self, sample_info, sample_type): + #TODO: make a manifest for a sample_type based on sample_info + return + def generate_demux_command(self): command = [self.CONFIG.get(self.software)["bin"], #TODO add path to bases2fastq executable to config self.run_dir, @@ -64,6 +76,35 @@ def start_demux(self): ) def is_transferred(self, transfer_file): + #TODO: return true if run in transfer log, else false + pass + + def transfer_ongoing(self): + #TODO: return true if hidden transfer file marker exists, else false + pass + + def sync_metadata(self): + #TODO: copy metadata from demuxed run to ngi-nas-ns + pass + + def make_transfer_indicator(self): + #TODO: touch a hidden file in the run directory + pass + + def transfer(self): + #TODO: rsync run to analysis cluster + pass + + def remove_transfer_indicator(self): + #TODO: remove hidden file in run directory + pass + + def update_transfer_log(self, transfer_file): + #TODO: update the transfer log + pass + + def archive(self): + #TODO: move run dir to nosync pass def parse_rundir(self): From 1a4320750e8b13294cdc996658f3f36d3a649db1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 10 Sep 2024 15:20:00 +0200 Subject: [PATCH 018/187] Add status of a run --- taca/analysis/analysis_element.py | 66 ++++++++++++++++--------------- taca/element/Element_Runs.py | 25 ++++++++---- 2 files changed, 52 insertions(+), 39 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 50f95cef..433b2483 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -10,14 +10,6 @@ logger = logging.getLogger(__name__) -def _upload_to_statusdb(run): - """Triggers the upload to statusdb. - - :param Run run: the object run - """ - pass - - def run_preprocessing(given_run): """Run demultiplexing in all data directories. @@ -29,20 +21,19 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - # TODO: Fetch statusdb document for run - # TODO: Get previous status of run from statusdb document sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() - if not sequencing_done: - # TODO: compare previous status with current status and update statusdb document if different - return - elif sequencing_done and demultiplexing_status == "not started": + if not sequencing_done: # Sequencing ongoing + current_run_status = 'sequencing' + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) #TODO: what info needs to be gathered and uploaded? + elif sequencing_done and demultiplexing_status == "not started": # Sequencing done. 
Start demux if not run.manifest_exists(): + logger.warn(f"Run manifest is missing for {run.flowcell_id}") #TODO: email operator warning return elif run.manifest_exists(): - # Get sample info from manifest - sample_info = run.get_sample_info() + sample_info = run.get_sample_info_from_manifest() sample_types = run.get_sample_types(sample_info) if len(sample_types) == 1: run.start_demux() @@ -51,48 +42,59 @@ def _process(run): run.make_manifest(sample_info, sample_type) run.start_demux() else: - #TODO: warn that no samples were found in the run manifest + logger.warn(f"No samples were found in the sample manifest for run {run.flowcell_id}.") + #TODO: email operator warning return - #TODO: compare previous status with current status and update statusdb document if different + current_run_status = "demultiplexing" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) elif sequencing_done and demultiplexing_status == "ongoing": - # TODO: compare previous status with current status and update statusdb document if different + current_run_status = "demultiplexing" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) return elif sequencing_done and demultiplexing_status == "finished": transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): run.sync_metadata() run.make_transfer_indicator() - #TODO: compare previous status with current status and update statusdb document if different - # Also update statusdb with a timestamp of when the transfer started + current_run_status = "transferring" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) + #TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() run.remove_transfer_indicator() run.update_transfer_log(transfer_file) - #TODO: update statusdb document + current_run_status = "transferred" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) run.archive() + current_run_status = "archived" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): - #TODO: compare previous status with current status and update statusdb document if different - logger.info("Run is being transferred. Skipping.") + current_run_status = "transferring" + if run.status_changed(current_run_status): + run.update_statusdb(current_run_status) + logger.info(f"Run {run.flowcell_id} is being transferred. Skipping.") return elif run.is_transferred(transfer_file): - #TODO: compare previous status with current status and update statusdb document if different - # warn that transferred run has not been archived - logger.warn("The run has already been transferred but has not been archived. Please investigate") + logger.warn(f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate") + #TODO: email operator warning return else: - logger.warn("Unknown transfer status. Please investigate") + logger.warn(f"Unknown transfer status of run {run.flowcell_id}. 
Please investigate") if given_run: - run = Aviti_Run( - run - ) # TODO: Needs to change if more Element machines are aquired in the future + run = Aviti_Run(given_run) # TODO: Needs to change if more Element machines are aquired in the future _process(run) else: data_dirs = CONFIG.get("element_analysis").get( "data_dirs" ) # TODO: add to config - for data_dir in data_dirs: + for data_dir in data_dirs: #TODO: make sure to look in both side A and B # Run folder looks like DATE_*_*_*, the last section is the FC name. runs = glob.glob( os.path.join(data_dir, "[1-9]*_*_*_*") diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index b8c273b8..813e2ea4 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -15,15 +15,16 @@ class Run: def __init__(self, run_dir, configuration): if not os.path.exists(run_dir): raise RuntimeError(f"Could not locate run directory {run_dir}") - self.run_dir = os.path.abspath(run_dir) + self.flowcell_id = run_dir #TODO: get flowcell id from json instead + self.run_dir = os.path.abspath(run_dir) # TODO: How to handle SideA/SideB? self.CONFIG = configuration self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") self.demux_stats_file = os.path.join( - self.demux_dir, "RunStats.json" - ) # TODO: How to handle SideA/SideB? + self.demux_dir, "RunStats.json" # Assumes demux is finished when this file is created + ) self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - + def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): with open(self.final_sequencing_file) as json_file: @@ -34,7 +35,7 @@ def check_sequencing_status(self): return True else: return False - + def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" @@ -44,12 +45,22 @@ def get_demultiplexing_status(self): return "ongoing" elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): return "finished" + else: + return "unknown" + + def status_changed(self, current_run_status): + #TODO: get document from statusdb, check status field, return true if status of run changed + pass + + def update_statusdb(self, current_run_status): + #TODO: Get document from statusdb. Gather data about run and update the statusdb document, then upload to statusdb + pass def manifest_exists(self): return os.path.isfile(self.run_manifest_file) - def get_sample_info(self): - sample_info = {} #TODO: populate + def get_sample_info_from_manifest(self): + sample_info = {} #TODO: populate with sample info from manifest return sample_info def get_sample_types(self, sample_info): From 38b3276001e388303f241555472df03f042c2b9b Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 11 Sep 2024 14:17:23 +0200 Subject: [PATCH 019/187] Get run manifests from lims instead --- taca/analysis/analysis_element.py | 30 +++++++++++++++++------------- taca/element/Element_Runs.py | 18 +++++------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 433b2483..91902d05 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -28,23 +28,25 @@ def _process(run): if run.status_changed(current_run_status): run.update_statusdb(current_run_status) #TODO: what info needs to be gathered and uploaded? elif sequencing_done and demultiplexing_status == "not started": # Sequencing done. 
Start demux - if not run.manifest_exists(): + if not run.manifest_exists(): #TODO: this should check for the zip file in lims output location logger.warn(f"Run manifest is missing for {run.flowcell_id}") #TODO: email operator warning return elif run.manifest_exists(): - sample_info = run.get_sample_info_from_manifest() - sample_types = run.get_sample_types(sample_info) - if len(sample_types) == 1: - run.start_demux() - elif len(sample_types) > 1: - for sample_type in sample_types: - run.make_manifest(sample_info, sample_type) - run.start_demux() - else: - logger.warn(f"No samples were found in the sample manifest for run {run.flowcell_id}.") - #TODO: email operator warning - return + os.mkdir(run.demux_dir) + run.copy_manifests() + run_manifests = glob.glob( + os.path.join(run.run_dir, "RunManifest_*.csv") + ) # TODO: is this filename right? + sub_demux_count = 0 + for run_manifest in run_manifests.sort(): + if len(run_manifests) == 1: + demux_dir = run.demux_dir + elif len(run_manifests) > 1: + demux_dir = f"Demultiplexing_{sub_demux_count}" + os.mkdir(demux_dir) + run.start_demux(run_manifest, demux_dir) + sub_demux_count += 1 current_run_status = "demultiplexing" if run.status_changed(current_run_status): run.update_statusdb(current_run_status) @@ -56,6 +58,8 @@ def _process(run): elif sequencing_done and demultiplexing_status == "finished": transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): + #TODO: if multiple demux dirs, aggregate the results into Demultiplexing? + run.aggregate_demux_results run.sync_metadata() run.make_transfer_indicator() current_run_status = "transferring" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 813e2ea4..560eb4da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -57,19 +57,11 @@ def update_statusdb(self, current_run_status): pass def manifest_exists(self): - return os.path.isfile(self.run_manifest_file) + return os.path.isfile(self.run_manifest_file) #TODO: still true? 
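+        # Presumably this check will move to the zipped manifest delivered by LIMS
+        # once copy_manifests() is fleshed out (hence the TODO above).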
- def get_sample_info_from_manifest(self): - sample_info = {} #TODO: populate with sample info from manifest - return sample_info - - def get_sample_types(self, sample_info): - sample_types = () #TODO: populate - return sample_types - - def make_manifest(self, sample_info, sample_type): - #TODO: make a manifest for a sample_type based on sample_info - return + def copy_manifests(): + #TODO: copy manifest zip file from lims location and unzip + pass def generate_demux_command(self): command = [ @@ -82,7 +74,7 @@ def generate_demux_command(self): ] return command - def start_demux(self): + def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command() misc.call_external_command_detached( From ad657179a501acdf316bbdbcd7f847fc80a04e73 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 12 Sep 2024 12:03:19 +0200 Subject: [PATCH 020/187] Draft for Element upload to statusdb --- taca/element/Aviti_Runs.py | 25 +++++++++++++++++++++++++ taca/utils/statusdb.py | 27 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py index ad162ac4..3f672d5a 100644 --- a/taca/element/Aviti_Runs.py +++ b/taca/element/Aviti_Runs.py @@ -1,7 +1,32 @@ from taca.element.Element_Runs import Run +from taca.utils.config import CONFIG +from taca.utils.statusdb import ElementRunsConnection class Aviti_Run(Run): def __init__(self, run_dir, configuration): super().__init__(run_dir, configuration) self.sequencer_type = "Aviti" + self.db = ElementRunsConnection(CONFIG["statusdb"], dbname="element_runs") + + def update_statusdb(self): + doc_obj = self.to_doc_obj() + self.db.upload_to_statusdb(doc_obj) + + def construct_NGI_run_id(self): + pass + + def to_doc_obj(self): + doc_obj = { + "name": self.run_name, + "run_status": self.run_status, + "run_id": self.run_id, + "run_dir": self.run_dir, + "run_type": self.run_type, + "sequencer_type": self.sequencer_type, + "samples": self.samples, + "demux": self.demux, + "date": self.date, + "flowcell": self.flowcell, + } + return doc_obj diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 939e0606..47620ea2 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -166,6 +166,33 @@ def finish_ongoing_run(self, ont_run, dict_json: dict): self.db[doc.id] = doc +class ElementRunsConnection(StatusdbSession): + def __init__(self, config, dbname="element_runs"): + super().__init__(config) + self.db = self.connection[dbname] + + def get_db_entry(self, run_id): + view_run_id = self.db.view("info/id") + try: + return view_run_id[run_id].rows[0] + except IndexError: + return None + + def check_if_run_exists(self, run_id) -> bool: + return self.get_db_entry(run_id) is not None + + def check_db_run_status(self, run_name) -> str: + view_status = self.db.view("info/status") + try: + status = view_status[run_name].rows[0].value + except IndexError: # No rows found + return "Unknown" + return status + + def upload_to_statusdb(self, run_obj: dict): + update_doc(self.db, run_obj) + + def update_doc(db, obj, over_write_db_entry=False): view = db.view("info/name") if len(view[obj["name"]].rows) == 1: From dcb450448208042c84bb0e07066d39f1e2250357 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 12 Sep 2024 14:16:54 +0200 Subject: [PATCH 021/187] more functions --- taca/analysis/analysis_element.py | 4 ++-- taca/element/Element_Runs.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 91902d05..e33b655b 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -36,8 +36,8 @@ def _process(run): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( - os.path.join(run.run_dir, "RunManifest_*.csv") - ) # TODO: is this filename right? + os.path.join(run.run_dir, "RunManifest_*.csv") # TODO: is this filename right? + ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): if len(run_manifests) == 1: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 560eb4da..d2287659 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,6 +1,7 @@ import json import logging import os +import shutil from datetime import datetime from taca.utils import misc @@ -23,7 +24,8 @@ def __init__(self, run_dir, configuration): self.demux_stats_file = os.path.join( self.demux_dir, "RunStats.json" # Assumes demux is finished when this file is created ) - self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") + self.run_manifest_zip_file = os.path.join(self.CONFIG.get('Aviti').get('manifest_zip_location'), + self.flowcell_id + '.tar.gz') #TODO: change and add to taca.yaml def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): @@ -57,26 +59,24 @@ def update_statusdb(self, current_run_status): pass def manifest_exists(self): - return os.path.isfile(self.run_manifest_file) #TODO: still true? + return os.path.isfile(self.run_manifest_zip_file) - def copy_manifests(): - #TODO: copy manifest zip file from lims location and unzip - pass + def copy_manifests(self): + shutil.copy(self.run_manifest_zip_file, self.run_dir) + #TODO: unzip - def generate_demux_command(self): + def generate_demux_command(self, run_manifest, demux_dir): command = [ - self.CONFIG.get(self.software)[ - "bin" - ], # TODO add path to bases2fastq executable to config + self.CONFIG.get(self.software)["bin"], # TODO add path to bases2fastq executable to config self.run_dir, - self.demux_dir, # TODO: how to handle SideA/SideB? 
+ demux_dir, "-p 12", ] return command def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): - cmd = self.generate_demux_command() + cmd = self.generate_demux_command(run_manifest, demux_dir) misc.call_external_command_detached( cmd, with_log_files=True, prefix="demux_" ) From d23be69672c260546df0c187f0b00b0a3dbb0323 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 12 Sep 2024 14:44:20 +0200 Subject: [PATCH 022/187] RunParameters parsing --- taca/analysis/analysis_element.py | 48 ++++++++----- taca/element/Aviti_Runs.py | 25 ------- taca/element/Element_Runs.py | 112 ++++++++++++++++++++++-------- 3 files changed, 111 insertions(+), 74 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 91902d05..d632c2f0 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -21,23 +21,28 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ + run.parse_run_parameters() sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() - if not sequencing_done: # Sequencing ongoing - current_run_status = 'sequencing' + if not sequencing_done: # Sequencing ongoing + current_run_status = "sequencing" if run.status_changed(current_run_status): - run.update_statusdb(current_run_status) #TODO: what info needs to be gathered and uploaded? - elif sequencing_done and demultiplexing_status == "not started": # Sequencing done. Start demux - if not run.manifest_exists(): #TODO: this should check for the zip file in lims output location - logger.warn(f"Run manifest is missing for {run.flowcell_id}") - #TODO: email operator warning + run.update_statusdb() + elif ( + sequencing_done and demultiplexing_status == "not started" + ): # Sequencing done. Start demux + if ( + not run.manifest_exists() + ): # TODO: this should check for the zip file in lims output location + logger.warn(f"Run manifest is missing for {run}") + # TODO: email operator warning return elif run.manifest_exists(): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( os.path.join(run.run_dir, "RunManifest_*.csv") - ) # TODO: is this filename right? + ) # TODO: is this filename right? sub_demux_count = 0 for run_manifest in run_manifests.sort(): if len(run_manifests) == 1: @@ -56,16 +61,16 @@ def _process(run): run.update_statusdb(current_run_status) return elif sequencing_done and demultiplexing_status == "finished": - transfer_file = CONFIG.get('Element').get('Aviti').get('transfer_log') + transfer_file = CONFIG.get("Element").get("Aviti").get("transfer_log") if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): - #TODO: if multiple demux dirs, aggregate the results into Demultiplexing? + # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.aggregate_demux_results run.sync_metadata() run.make_transfer_indicator() current_run_status = "transferring" if run.status_changed(current_run_status): run.update_statusdb(current_run_status) - #TODO: Also update statusdb with a timestamp of when the transfer started + # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() run.remove_transfer_indicator() run.update_transfer_log(transfer_file) @@ -80,25 +85,29 @@ def _process(run): current_run_status = "transferring" if run.status_changed(current_run_status): run.update_statusdb(current_run_status) - logger.info(f"Run {run.flowcell_id} is being transferred. 
Skipping.") + logger.info(f"{run} is being transferred. Skipping.") return elif run.is_transferred(transfer_file): - logger.warn(f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate") - #TODO: email operator warning + logger.warn( + f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate" + ) + # TODO: email operator warning return else: - logger.warn(f"Unknown transfer status of run {run.flowcell_id}. Please investigate") - - + logger.warn( + f"Unknown transfer status of run {run.flowcell_id}. Please investigate" + ) if given_run: - run = Aviti_Run(given_run) # TODO: Needs to change if more Element machines are aquired in the future + run = Aviti_Run(given_run) + # TODO: Needs to change if more types of Element machines are aquired in the future + _process(run) else: data_dirs = CONFIG.get("element_analysis").get( "data_dirs" ) # TODO: add to config - for data_dir in data_dirs: #TODO: make sure to look in both side A and B + for data_dir in data_dirs: # TODO: make sure to look in both side A and B # Run folder looks like DATE_*_*_*, the last section is the FC name. runs = glob.glob( os.path.join(data_dir, "[1-9]*_*_*_*") @@ -111,4 +120,5 @@ def _process(run): # This function might throw and exception, # it is better to continue processing other runs logger.warning(f"There was an error processing the run {run}") + # TODO: Think about how to avoid silent errors (email?) pass diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py index 3f672d5a..ad162ac4 100644 --- a/taca/element/Aviti_Runs.py +++ b/taca/element/Aviti_Runs.py @@ -1,32 +1,7 @@ from taca.element.Element_Runs import Run -from taca.utils.config import CONFIG -from taca.utils.statusdb import ElementRunsConnection class Aviti_Run(Run): def __init__(self, run_dir, configuration): super().__init__(run_dir, configuration) self.sequencer_type = "Aviti" - self.db = ElementRunsConnection(CONFIG["statusdb"], dbname="element_runs") - - def update_statusdb(self): - doc_obj = self.to_doc_obj() - self.db.upload_to_statusdb(doc_obj) - - def construct_NGI_run_id(self): - pass - - def to_doc_obj(self): - doc_obj = { - "name": self.run_name, - "run_status": self.run_status, - "run_id": self.run_id, - "run_dir": self.run_dir, - "run_type": self.run_type, - "sequencer_type": self.sequencer_type, - "samples": self.samples, - "demux": self.demux, - "date": self.date, - "flowcell": self.flowcell, - } - return doc_obj diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 560eb4da..2b745f8b 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -5,6 +5,7 @@ from taca.utils import misc from taca.utils.filesystem import chdir +from taca.utils.statusdb import ElementRunsConnection logger = logging.getLogger(__name__) @@ -15,16 +16,67 @@ class Run: def __init__(self, run_dir, configuration): if not os.path.exists(run_dir): raise RuntimeError(f"Could not locate run directory {run_dir}") - self.flowcell_id = run_dir #TODO: get flowcell id from json instead - self.run_dir = os.path.abspath(run_dir) # TODO: How to handle SideA/SideB? + self.run_parameters_parsed = False + + self.run_dir = os.path.abspath(run_dir) # TODO: How to handle SideA/SideB? 
self.CONFIG = configuration + self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") + self.demux_stats_file = os.path.join( - self.demux_dir, "RunStats.json" # Assumes demux is finished when this file is created + self.demux_dir, + "RunStats.json", # Assumes demux is finished when this file is created ) self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - + + # Instrument generated files + self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") + self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_manifest_file_from_instrument = os.path.join( + self.run_dir, "RunManifest.json" + ) + self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json") + + self.db = ElementRunsConnection(self.CONFIG["statusdb"], dbname="element_runs") + + def __str__(self) -> str: + if self.run_parameters_parsed: + return f"ElementRun({self.NGI_run_id})" + else: + return f"ElementRun({self.run_dir})" + + @property + def NGI_run_id(self): + if self.run_parameters_parsed: + return f"{self.date}_{self.instrument_name}_{self.side_letter}{self.flowcell_id}" + else: + raise RuntimeError(f"Run parameters not parsed for run {self.run_dir}") + + def parse_run_parameters(self) -> None: + with open(self.run_parameters_file) as json_file: + run_parameters = json.load(json_file) + + # Manually entered, but should be side and flowcell id + self.run_name = run_parameters.get("RunName") + + self.run_id = run_parameters.get( + "runID" + ) # Unique hash that we don't really use + self.side = run_parameters.get("Side") # SideA or SideB + self.side_letter = self.side[-1] # A or B + self.run_type = run_parameters.get( + "RunType" + ) # Sequencing, wash or prime I believe? + self.flowcell_id = run_parameters.get("FlowcellID") + self.instrument_name = run_parameters.get("InstrumentName") + self.date = run_parameters.get("Date") + self.operator_name = run_parameters.get("OperatorName") + + def to_doc_obj(self): + # TODO + pass + def check_sequencing_status(self): if os.path.exists(self.final_sequencing_file): with open(self.final_sequencing_file) as json_file: @@ -35,7 +87,7 @@ def check_sequencing_status(self): return True else: return False - + def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" @@ -47,22 +99,22 @@ def get_demultiplexing_status(self): return "finished" else: return "unknown" - + def status_changed(self, current_run_status): - #TODO: get document from statusdb, check status field, return true if status of run changed + # TODO: get document from statusdb, check status field, return true if status of run changed pass - def update_statusdb(self, current_run_status): - #TODO: Get document from statusdb. Gather data about run and update the statusdb document, then upload to statusdb - pass + def update_statusdb(self): + doc_obj = self.to_doc_obj() + self.db.upload_to_statusdb(doc_obj) def manifest_exists(self): - return os.path.isfile(self.run_manifest_file) #TODO: still true? - - def copy_manifests(): - #TODO: copy manifest zip file from lims location and unzip + return os.path.isfile(self.run_manifest_file) # TODO: still true? 
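# --- Editorial sketch (not part of the original patch): a minimal example of
# the RunParameters.json fields that parse_run_parameters() above relies on,
# with invented values, and the NGI run ID derived from them. The field names
# come from the code; this is not a complete Element AVITI schema.
example_run_parameters = {
    "RunName": "A2349523513",   # manually entered; should be side letter + flowcell ID
    "runID": "d8f2a7b1",        # unique hash, not used downstream
    "Side": "SideA",
    "RunType": "Sequencing",
    "FlowcellID": "2349523513",
    "InstrumentName": "AV242106",
    "Date": "2024-09-12",
    "OperatorName": "NN",
}
# NGI_run_id = f"{date}_{instrument_name}_{side_letter}{flowcell_id}"
#            = "2024-09-12_AV242106_A2349523513"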
+ + def copy_manifests(self): + # TODO: copy manifest zip file from lims location and unzip pass - + def generate_demux_command(self): command = [ self.CONFIG.get(self.software)[ @@ -86,35 +138,35 @@ def start_demux(self, run_manifest, demux_dir): ) def is_transferred(self, transfer_file): - #TODO: return true if run in transfer log, else false + # TODO: return true if run in transfer log, else false pass - + def transfer_ongoing(self): - #TODO: return true if hidden transfer file marker exists, else false + # TODO: return true if hidden transfer file marker exists, else false pass - + def sync_metadata(self): - #TODO: copy metadata from demuxed run to ngi-nas-ns + # TODO: copy metadata from demuxed run to ngi-nas-ns pass - + def make_transfer_indicator(self): - #TODO: touch a hidden file in the run directory + # TODO: touch a hidden file in the run directory pass - + def transfer(self): - #TODO: rsync run to analysis cluster + # TODO: rsync run to analysis cluster pass - + def remove_transfer_indicator(self): - #TODO: remove hidden file in run directory + # TODO: remove hidden file in run directory pass - + def update_transfer_log(self, transfer_file): - #TODO: update the transfer log + # TODO: update the transfer log pass - + def archive(self): - #TODO: move run dir to nosync + # TODO: move run dir to nosync pass def parse_rundir(self): From 6ac2bee3d7a8c0a5ec850da6e157186c276128a3 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 13 Sep 2024 08:33:13 +0200 Subject: [PATCH 023/187] No need to give the status again --- taca/analysis/analysis_element.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index e10230b3..1987260f 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -22,6 +22,8 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ run.parse_run_parameters() + # TODO Should we just abort if the run parameters is not found? We cannot assign the run id without it. 
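# --- Editorial sketch (not part of the original patch): from this commit on,
# _process() tests `run.status_changed` without parentheses. If status_changed
# remains a plain method, that expression is always truthy and the statusdb
# update always fires. Declaring it a property, as in this hypothetical
# sketch, makes the attribute-style access behave as the calls below expect.
class _RunSketch:
    def __init__(self, db, ngi_run_id: str):
        self.db = db  # e.g. an ElementRunsConnection
        self.NGI_run_id = ngi_run_id
        self.status = None

    @property
    def status_changed(self) -> bool:
        # True when the in-memory status differs from the last recorded one.
        return self.db.check_db_run_status(self.NGI_run_id) != self.status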
+ sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() if not sequencing_done: # Sequencing ongoing @@ -34,7 +36,9 @@ def _process(run): if ( not run.manifest_exists() ): # TODO: this should check for the zip file in lims output location - logger.warn(f"Run manifest is missing for {run}") + logger.warn( + f"Run manifest is missing for {run}, demultiplexing aborted" + ) # TODO: email operator warning return elif run.manifest_exists(): @@ -56,11 +60,11 @@ def _process(run): sub_demux_count += 1 run.status = "demultiplexing" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() elif sequencing_done and demultiplexing_status == "ongoing": run.status = "demultiplexing" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": transfer_file = CONFIG.get("Element").get("Aviti").get("transfer_log") @@ -71,22 +75,22 @@ def _process(run): run.make_transfer_indicator() run.status = "transferring" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() run.remove_transfer_indicator() run.update_transfer_log(transfer_file) run.status = "transferred" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() run.archive() run.status = "archived" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): run.status = "transferring" if run.status_changed: - run.update_statusdb(run.status) + run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return elif run.is_transferred(transfer_file): From 539bbee825f25780db05120c0e7e511499e95ef1 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 13 Sep 2024 11:22:09 +0200 Subject: [PATCH 024/187] Use run for printing --- taca/analysis/analysis_element.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 1987260f..c7230a90 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -67,7 +67,9 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - transfer_file = CONFIG.get("Element").get("Aviti").get("transfer_log") + transfer_file = ( + CONFIG.get("Element").get(run.sequencer_type).get("transfer_log") + ) if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.aggregate_demux_results @@ -77,9 +79,10 @@ def _process(run): if run.status_changed: run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started - run.transfer() + run.transfer() # I think this should be a detached command as well run.remove_transfer_indicator() run.update_transfer_log(transfer_file) + run.status = "transferred" if run.status_changed: run.update_statusdb() @@ -95,14 +98,12 @@ def _process(run): return elif run.is_transferred(transfer_file): logger.warn( - f"The run {run.flowcell_id} has already been transferred but has not been archived. Please investigate" + f"The run {run} has already been transferred but has not been archived. 
Please investigate" ) # TODO: email operator warning return else: - logger.warn( - f"Unknown transfer status of run {run.flowcell_id}. Please investigate" - ) + logger.warn(f"Unknown transfer status of run {run}. Please investigate") if given_run: run = Aviti_Run(given_run) From 4c15b67ea99307a5d936c45a46a5d5c7f1d732e9 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 13 Sep 2024 15:47:35 +0200 Subject: [PATCH 025/187] Read instrument generated files to doc obj and implemented the status_changed method --- taca/element/Element_Runs.py | 38 +++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 36a7ef9b..78f6763e 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -73,6 +73,7 @@ def NGI_run_id(self): raise RuntimeError(f"Run parameters not parsed for run {self.run_dir}") def parse_run_parameters(self) -> None: + """Parse run-information from the RunParameters.json file""" try: with open(self.run_parameters_file) as json_file: run_parameters = json.load(json_file) @@ -100,12 +101,31 @@ def parse_run_parameters(self) -> None: self.run_parameters_parsed = True def to_doc_obj(self): - # TODO + # TODO, are we sure what we should do when the RunParameters.json file is missing? + + # Read in all instrument generated files + instrument_generated_files = {} + for file in [ + self.run_parameters_file, + self.run_stats_file, + self.run_manifest_file_from_instrument, + self.run_uploaded_file, + ]: + if os.path.exists(file): + with open(file) as json_file: + instrument_generated_files[os.path.basename(file)] = json.load( + json_file + ) + else: + instrument_generated_files[os.path.basename(file)] = None + doc_obj = { "run_path": self.run_dir, "run_status": self.status, - "pore_count_history": [], + "NGI_run_id": self.NGI_run_id, + "instrument_generated_files": instrument_generated_files, } + return doc_obj def check_sequencing_status(self): @@ -131,9 +151,13 @@ def get_demultiplexing_status(self): else: return "unknown" - def status_changed(self, current_run_status): - # TODO: get document from statusdb, check status field, return true if status of run changed - pass + def status_changed(self): + if not self.run_parameters_parsed: + raise RuntimeError( + f"Run parameters not parsed for run {self.run_dir}, cannot check status" + ) + db_run_status = self.db.check_db_run_status(self.NGI_run_id) + return db_run_status != self.status def update_statusdb(self): doc_obj = self.to_doc_obj() @@ -199,3 +223,7 @@ def update_transfer_log(self, transfer_file): def archive(self): # TODO: move run dir to nosync pass + + def aggregate_demux_results(self): + # TODO: aggregate demux results + pass From bf101872345eb8f46400265e4a455f85f455a1bc Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 16 Sep 2024 15:20:06 +0200 Subject: [PATCH 026/187] Abort processing if RunParameters.json is not found --- taca/analysis/analysis_element.py | 9 +++++++-- taca/element/Element_Runs.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index c7230a90..4877218e 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -21,8 +21,13 @@ def _process(run): :param taca.element.Run run: Run to be processed and transferred """ - run.parse_run_parameters() - # TODO Should we just abort if the run parameters is not found? We cannot assign the run id without it. 
+ try: + run.parse_run_parameters() + except FileNotFoundError: + logger.warn( + f"Cannot reliably set NGI_run_id for {run} due to missing RunParameters.json. Aborting run processing" + ) + raise sequencing_done = run.check_sequencing_status() demultiplexing_status = run.get_demultiplexing_status() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 78f6763e..83eb6334 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -81,7 +81,7 @@ def parse_run_parameters(self) -> None: logger.warning( f"Run parameters file not found for {self}, might not be ready yet" ) - return + raise # Manually entered, but should be side and flowcell id self.run_name = run_parameters.get("RunName") From cd857d7161bb7e03224ab2a5e6a6b579c1be25b1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 16 Sep 2024 15:29:25 +0200 Subject: [PATCH 027/187] Restructure transfer logic --- taca/analysis/analysis_element.py | 41 +++++++++++++------------- taca/element/Element_Runs.py | 48 ++++++++++++++++++++++--------- 2 files changed, 56 insertions(+), 33 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index c7230a90..20ec17d2 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -67,12 +67,8 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - transfer_file = ( - CONFIG.get("Element").get(run.sequencer_type).get("transfer_log") - ) - if not run.is_transferred(transfer_file) and not run.transfer_ongoing(): - # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? - run.aggregate_demux_results + if not run.is_transferred() and not run.transfer_ongoing() and not run.rsync_complete(): + run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.sync_metadata() run.make_transfer_indicator() run.status = "transferring" @@ -80,30 +76,35 @@ def _process(run): run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() # I think this should be a detached command as well - run.remove_transfer_indicator() - run.update_transfer_log(transfer_file) - - run.status = "transferred" - if run.status_changed: - run.update_statusdb() - run.archive() - run.status = "archived" - if run.status_changed: - run.update_statusdb() - elif not run.is_transferred(transfer_file) and run.transfer_ongoing(): + elif run.transfer_ongoing() and not run.rsync_complete(): run.status = "transferring" if run.status_changed: run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif run.is_transferred(transfer_file): - logger.warn( + elif run.rsync_complete() and not run.is_transferred(): + if run.rsync_success(): + run.remove_transfer_indicator() + run.update_transfer_log() + run.status = "transferred" + if run.status_changed: + run.update_statusdb() + run.archive() + run.status = "archived" + if run.status_changed: + run.update_statusdb() + else: + run.status = "transfer failed" + logger.warning(f"An issue occurred while transfering {run} to the analysis cluster." ) + # TODO: email warning to operator + elif run.is_transferred(): + logger.warning( f"The run {run} has already been transferred but has not been archived. Please investigate" ) # TODO: email operator warning return else: - logger.warn(f"Unknown transfer status of run {run}. Please investigate") + logger.warning(f"Unknown transfer status of run {run}. 
Please investigate") if given_run: run = Aviti_Run(given_run) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 78f6763e..4bb65ffe 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -34,6 +34,8 @@ def __init__(self, run_dir, configuration): self.flowcell_id + ".tar.gz", ) # TODO: change and add to taca.yaml # TODO, need to be real careful when using the flowcell_id as it is manually entered and can mean three different things + self.transfer_file = ( + self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log")) # TODO: change and add to taca.yaml # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") @@ -145,9 +147,9 @@ def get_demultiplexing_status(self): elif os.path.exists(self.demux_dir) and not os.path.isfile( self.demux_stats_file ): - return "ongoing" + return "ongoing" # TODO: check for exit status file instead elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file): - return "finished" + return "finished" # TODO: check exit status of demux in exit status file else: return "unknown" @@ -177,8 +179,12 @@ def generate_demux_command(self, run_manifest, demux_dir): ], # TODO add path to bases2fastq executable to config self.run_dir, demux_dir, - "-p 12", - ] + "-p 12", # TODO: how many? Considering that we may start several demux runs at once + f"-r {run_manifest}", + "--legacy-fastq", # TODO: except if Smart-seq3 + "--force-index-orientation", + ] # TODO: any other options? + # TODO: write exit status of command to file return command def start_demux(self, run_manifest, demux_dir): @@ -189,17 +195,35 @@ def start_demux(self, run_manifest, demux_dir): ) logger.info( "Bases2Fastq conversion and demultiplexing " - f"started for run {os.path.basename(self.run_dir)} on {datetime.now()}" + f"started for run {self} on {datetime.now()}" ) - def is_transferred(self, transfer_file): - # TODO: return true if run in transfer log, else false - pass - + def is_transferred(self): + with open(self.transfer_file, 'r') as transfer_file: + for row in transfer_file.read(): + if self.NGI_run_id in row: + return True + return False + def transfer_ongoing(self): # TODO: return true if hidden transfer file marker exists, else false + + pass + + def rsync_complete(self): + # TODO: return true if .rsync_exit_status exists + pass + + def get_rsync_exit_status(): + # TODO: return status of rsync from .rsync_exit_status pass + def aggregate_demux_results(self): + # TODO: aggregate demux results + pass + + + def sync_metadata(self): # TODO: copy metadata from demuxed run to ngi-nas-ns pass @@ -216,7 +240,7 @@ def remove_transfer_indicator(self): # TODO: remove hidden file in run directory pass - def update_transfer_log(self, transfer_file): + def update_transfer_log(self): # TODO: update the transfer log pass @@ -224,6 +248,4 @@ def archive(self): # TODO: move run dir to nosync pass - def aggregate_demux_results(self): - # TODO: aggregate demux results - pass + From 3be54840d57a19ee0df91e405840d99a9778d5ce Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 16 Sep 2024 15:35:59 +0200 Subject: [PATCH 028/187] rename is_transferred function --- taca/analysis/analysis_element.py | 6 +++--- taca/element/Element_Runs.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 0c0bf977..42fd3e00 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -72,7 
+72,7 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - if not run.is_transferred() and not run.transfer_ongoing() and not run.rsync_complete(): + if not run.in_transfer_log() and not run.transfer_ongoing() and not run.rsync_complete(): run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.sync_metadata() run.make_transfer_indicator() @@ -87,7 +87,7 @@ def _process(run): run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif run.rsync_complete() and not run.is_transferred(): + elif run.rsync_complete() and not run.in_transfer_log(): if run.rsync_success(): run.remove_transfer_indicator() run.update_transfer_log() @@ -102,7 +102,7 @@ def _process(run): run.status = "transfer failed" logger.warning(f"An issue occurred while transfering {run} to the analysis cluster." ) # TODO: email warning to operator - elif run.is_transferred(): + elif run.in_transfer_log(): logger.warning( f"The run {run} has already been transferred but has not been archived. Please investigate" ) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 541bb319..6b8f03e5 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -198,7 +198,7 @@ def start_demux(self, run_manifest, demux_dir): f"started for run {self} on {datetime.now()}" ) - def is_transferred(self): + def in_transfer_log(self): with open(self.transfer_file, 'r') as transfer_file: for row in transfer_file.read(): if self.NGI_run_id in row: From 4ca441d33aaa87d5404075412985952b8195a084 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 16 Sep 2024 15:43:36 +0200 Subject: [PATCH 029/187] add methods for finding and copying LIMS-generated manifests --- taca/element/Element_Runs.py | 90 ++++++++++++++++++++++++++++++++---- 1 file changed, 81 insertions(+), 9 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 78f6763e..0a7cab61 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1,8 +1,11 @@ import json import logging import os +import re import shutil +import zipfile from datetime import datetime +from glob import glob from taca.utils import misc from taca.utils.filesystem import chdir @@ -28,12 +31,6 @@ def __init__(self, run_dir, configuration): self.demux_dir, "RunStats.json", # Assumes demux is finished when this file is created ) - self.run_manifest_file = os.path.join(self.run_dir, "RunManifest.csv") - self.run_manifest_zip_file = os.path.join( - self.CONFIG.get("Aviti").get("manifest_zip_location"), - self.flowcell_id + ".tar.gz", - ) # TODO: change and add to taca.yaml - # TODO, need to be real careful when using the flowcell_id as it is manually entered and can mean three different things # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") @@ -47,6 +44,10 @@ def __init__(self, run_dir, configuration): # Fields to be set by TACA self.status = None + self.lims_step_id = None + self.lims_full_manifest = None + self.lims_start_manifest = None + self.lims_demux_manifests = None # Fields that will be set when parsing run parameters self.run_name = None @@ -166,9 +167,79 @@ def update_statusdb(self): def manifest_exists(self): return os.path.isfile(self.run_manifest_zip_file) - def copy_manifests(self): - shutil.copy(self.run_manifest_zip_file, self.run_dir) - # TODO: unzip + def get_lims_step_id(self) -> str | None: + """If the run was 
started using a LIMS-generated manifest, + the ID of the LIMS step can be extracted from it. + """ + assert self.manifest_exists(), "Run manifest not found" + with open(self.run_manifest_file_from_instrument) as csv_file: + manifest_lines = csv_file.readlines() + for line in manifest_lines: + if "lims_step_id" in line: + lims_step_id = line.split(",")[1] + return lims_step_id + return None + + def copy_manifests(self) -> bool: + """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" + + # Specify dir in which LIMS drop the manifest zip files + dir_to_search = os.path.join( + self.CONFIG.get("Aviti").get( + "manifest_zip_location" + ), # TODO: change and add to taca.yaml + datetime.now().year, + ) + + # Use LIMS step ID if available, else flowcell ID, to make a query pattern + if self.lims_step_id: + logging.info( + f"Using LIMS step ID '{self.lims_step_id}' to find LIMS run manifests." + ) + glob_pattern = f"{dir_to_search}/*{self.lims_step_id}*.zip" + else: + logging.warning( + "LIMS step ID not available, using flowcell ID to find LIMS run manifests." + ) + glob_pattern = f"{dir_to_search}/*{self.flowcell_id}*.zip" + + # Find paths matching the pattern + glob_results = glob(glob_pattern) + if len(glob_results) == 0: + logger.warning( + f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." + ) + return False # TODO determine whether to raise an error here instead + elif len(glob_results) > 1: + logger.warning( + f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." + ) + glob_results.sort() + zip_src_path = glob_results[-1] + else: + zip_src_path = glob_results[0] + + # Make a run subdir named after the zip file and extract manifests there + zip_name = os.path.basename(zip_src_path) + zip_dst_path = os.path.join(self.run_dir, zip_name) + os.mkdir(zip_dst_path) + + with zipfile.ZipFile(zip_src_path, "r") as zip_ref: + zip_ref.extractall(zip_dst_path) + + # Set the paths of the different manifests as attributes + manifests = os.listdir(zip_dst_path) + self.lims_full_manifest = [ + m for m in manifests if re.match(r".*_untrimmed\.csv$", m) + ][0] + self.lims_start_manifest = [ + m for m in manifests if re.match(r".*_trimmed\.csv$", m) + ][0] + self.lims_demux_manifests = [ + m for m in manifests if re.match(r".*_\d+\.csv$", m) + ] + + return True def generate_demux_command(self, run_manifest, demux_dir): command = [ @@ -184,6 +255,7 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) + # TODO handle multiple composite manifests for demux misc.call_external_command_detached( cmd, with_log_files=True, prefix="demux_" ) From c840ea8f749513b4d0fc452ecbff4ebc548cf52c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 16 Sep 2024 16:48:43 +0200 Subject: [PATCH 030/187] add very rushed function for deriving new composite demux manifests --- taca/element/Element_Runs.py | 115 +++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index c0c2c0c2..a338d691 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -7,6 +7,8 @@ from datetime import datetime from glob import glob +import pandas as pd + from taca.utils import misc from taca.utils.filesystem import chdir from taca.utils.statusdb import ElementRunsConnection @@ -174,6 +176,9 @@ def 
get_lims_step_id(self) -> str | None: """If the run was started using a LIMS-generated manifest, the ID of the LIMS step can be extracted from it. """ + + # TODO test me + assert self.manifest_exists(), "Run manifest not found" with open(self.run_manifest_file_from_instrument) as csv_file: manifest_lines = csv_file.readlines() @@ -186,6 +191,8 @@ def get_lims_step_id(self) -> str | None: def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" + # TODO test me + # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( self.CONFIG.get("Aviti").get( @@ -244,6 +251,114 @@ def copy_manifests(self) -> bool: return True + def make_demux_manifests( + self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None + ) -> list[os.PathLike]: + """Derive composite demultiplexing manifests (grouped by index duplicity and lengths) + from a single information-rich manifest. + """ + + # TODO test me + + # Read specified manifest + with open(manifest_to_split) as f: + manifest_contents = f.read() + + # Get '[SAMPLES]' section + split_contents = "[SAMPLES]".split(manifest_contents) + assert ( + len(split_contents) == 2 + ), f"Could not split sample rows out of manifest {manifest_contents}" + sample_section = split_contents[1].split("\n") + + # Split into header and rows + header = sample_section[0] + sample_rows = sample_section[1:] + + # Convert to list of dicts + sample_dicts = [] + for row in sample_rows: + row_dict = dict(zip(header.split(","), row.split(","))) + sample_dicts.append(row_dict) + + # Convert to dataframe + df = pd.DataFrame.from_dict(sample_dicts) + + # Separate samples from controls + df_samples = df[df["Project"] != "Control"].copy() + df_controls = df[df["Project"] == "Control"].copy() + + # Apply default dir path for output + if outdir is None: + outdir = self.run_dir + + ## Build composite manifests + + manifest_root_name = f"{self.NGI_run_id}_demux" + + # Get idx lengths for calculations + df_samples.loc[:, "len_idx1"] = df["Index1"].apply(len) + df_samples.loc[:, "len_idx2"] = df["Index2"].apply(len) + + # Break down by index lengths and lane, creating composite manifests + manifests = [] + n = 0 + for (len_idx1, len_idx2, lane), group in df_samples.groupby( + ["len_idx1", "len_idx2", "Lane"] + ): + file_name = f"{manifest_root_name}_{n}.csv" + runValues_section = "\n".join( + [ + "[RUNVALUES]", + "KeyName, Value", + f'manifest_file, "{file_name}"', + f"manifest_group, {n+1}/{len(df.groupby(['len_idx1', 'len_idx2', 'Lane']))}", + f"grouped_by, len_idx1:{len_idx1} len_idx2:{len_idx2} lane:{lane}", + ] + ) + + settings_section = "\n".join( + [ + "[SETTINGS]", + "SettingName, Value", + ] + ) + + # Add PhiX stratified by index length + if group["phix_loaded"].any(): + # Subset controls by lane + group_controls = df_controls[df_controls["Lane"] == lane].copy() + + # Trim PhiX indexes to match group + group_controls.loc[:, "Index1"] = group_controls.loc[:, "Index1"].apply( + lambda x: x[:len_idx1] + ) + group_controls.loc[:, "Index2"] = group_controls.loc[:, "Index2"].apply( + lambda x: x[:len_idx2] + ) + + # Add PhiX to group + group = pd.concat([group, group_controls], axis=0, ignore_index=True) + + samples_section = ( + f"[SAMPLES]\n{group.iloc[:, 0:6].to_csv(index=None, header=True)}" + ) + + manifest_contents = "\n\n".join( + [runValues_section, settings_section, samples_section] + ) + + file_path = os.path.join(outdir, file_name) + manifests.append((file_path, 
manifest_contents)) + n += 1 + + for manifest_path, manifest_contents in manifests: + with open(os.path.join(outdir, manifest_path), "w") as f: + f.write(manifest_contents) + + manifest_paths = [t[0] for t in manifests] + return manifest_paths + def generate_demux_command(self, run_manifest, demux_dir): command = [ self.CONFIG.get(self.software)[ From eda9f3ff7c5e28d6af1f1fb0d479785ce74bf581 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 08:45:53 +0200 Subject: [PATCH 031/187] Restructure transfer status --- taca/analysis/analysis_element.py | 9 +++++---- taca/element/Element_Runs.py | 12 +++++++++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 42fd3e00..0f20ba8a 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -72,7 +72,8 @@ def _process(run): run.update_statusdb() return elif sequencing_done and demultiplexing_status == "finished": - if not run.in_transfer_log() and not run.transfer_ongoing() and not run.rsync_complete(): + transfer_status = run.get_transfer_status() + if transfer_status == "not started": run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? run.sync_metadata() run.make_transfer_indicator() @@ -81,13 +82,13 @@ def _process(run): run.update_statusdb() # TODO: Also update statusdb with a timestamp of when the transfer started run.transfer() # I think this should be a detached command as well - elif run.transfer_ongoing() and not run.rsync_complete(): + elif transfer_status == "ongoing": run.status = "transferring" if run.status_changed: run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif run.rsync_complete() and not run.in_transfer_log(): + elif transfer_status == "finished": if run.rsync_success(): run.remove_transfer_indicator() run.update_transfer_log() @@ -102,7 +103,7 @@ def _process(run): run.status = "transfer failed" logger.warning(f"An issue occurred while transfering {run} to the analysis cluster." ) # TODO: email warning to operator - elif run.in_transfer_log(): + elif transfer_status == "unknown": logger.warning( f"The run {run} has already been transferred but has not been archived. Please investigate" ) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6b8f03e5..2010a933 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -179,7 +179,7 @@ def generate_demux_command(self, run_manifest, demux_dir): ], # TODO add path to bases2fastq executable to config self.run_dir, demux_dir, - "-p 12", # TODO: how many? 
Considering that we may start several demux runs at once + "-p 8", f"-r {run_manifest}", "--legacy-fastq", # TODO: except if Smart-seq3 "--force-index-orientation", @@ -198,6 +198,16 @@ def start_demux(self, run_manifest, demux_dir): f"started for run {self} on {datetime.now()}" ) + def get_transfer_status(self): + if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete(): + return "not started" + elif self.transfer_ongoing() and not self.rsync_complete(): + return "ongoing" + elif self.rsync_complete() and not self.in_transfer_log(): + return "finished" + elif self.in_transfer_log(): + return "unknown" + def in_transfer_log(self): with open(self.transfer_file, 'r') as transfer_file: for row in transfer_file.read(): From 0baa93c933df4a082b359a600cf64c5041493e21 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 09:24:06 +0200 Subject: [PATCH 032/187] refrase --- taca/analysis/analysis_element.py | 2 +- taca/element/Element_Runs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 0f20ba8a..e57052a6 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -88,7 +88,7 @@ def _process(run): run.update_statusdb() logger.info(f"{run} is being transferred. Skipping.") return - elif transfer_status == "finished": + elif transfer_status == "rsync done": if run.rsync_success(): run.remove_transfer_indicator() run.update_transfer_log() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index bd807b52..5267d6ba 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -392,7 +392,7 @@ def get_transfer_status(self): elif self.transfer_ongoing() and not self.rsync_complete(): return "ongoing" elif self.rsync_complete() and not self.in_transfer_log(): - return "finished" + return "rsync done" elif self.in_transfer_log(): return "unknown" From 1fe015daf1be02f4c4a868dbb96d4a99ea720f85 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 13:28:52 +0200 Subject: [PATCH 033/187] Add command for starting demux --- taca/analysis/analysis_element.py | 2 +- taca/element/Element_Runs.py | 64 ++++++++++++++++--------------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index e57052a6..250fc013 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -89,7 +89,7 @@ def _process(run): logger.info(f"{run} is being transferred. 
Skipping.") return elif transfer_status == "rsync done": - if run.rsync_success(): + if run.rsync_successful(): run.remove_transfer_indicator() run.update_transfer_log() run.status = "transferred" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5267d6ba..7f3e8eae 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -2,9 +2,10 @@ import logging import os import re -import shutil import zipfile +import subprocess from datetime import datetime +from pathlib import Path from glob import glob import pandas as pd @@ -36,6 +37,7 @@ def __init__(self, run_dir, configuration): self.transfer_file = ( self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log") ) # TODO: change and add to taca.yaml + self.rsync_exit_file = os.path.join(self.run_dir, '.rsync_exit_status') # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") @@ -360,31 +362,32 @@ def make_demux_manifests( return manifest_paths def generate_demux_command(self, run_manifest, demux_dir): - command = [ - self.CONFIG.get(self.software)[ - "bin" - ], # TODO add path to bases2fastq executable to config - self.run_dir, - demux_dir, - "-p 8", - f"-r {run_manifest}", - "--legacy-fastq", # TODO: except if Smart-seq3 - "--force-index-orientation", - ] # TODO: any other options? - # TODO: write exit status of command to file + command = (f"{self.CONFIG.get(self.software)["bin"]}" # TODO: add path to bases2fastq executable to config + + f" {self.run_dir}" + + f" {demux_dir}" + + " -p 8" + + f" -r {run_manifest}" + + " --legacy-fastq" # TODO: except if Smart-seq3 + + f" --force-index-orientation; echo $? > {self.rsync_exit_file}" + ) # TODO: any other options? return command def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) # TODO handle multiple composite manifests for demux - misc.call_external_command_detached( - cmd, with_log_files=True, prefix="demux_" - ) - logger.info( - "Bases2Fastq conversion and demultiplexing " - f"started for run {self} on {datetime.now()}" - ) + try: + p_handle = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir) + logger.info( + "Bases2Fastq conversion and demultiplexing " + f"started for run {self} on {datetime.now()}" + ) + except subprocess.CalledProcessError: + logger.warning("An error occurred while starting demultiplexing for " + f"{self} on {datetime.now()}." 
+ ) + return + def get_transfer_status(self): if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete(): @@ -404,17 +407,18 @@ def in_transfer_log(self): return False def transfer_ongoing(self): - # TODO: return true if hidden transfer file marker exists, else false - - pass + return os.path.isfile(os.path.join(self.run_dir, '.rsync_ongoing')) def rsync_complete(self): - # TODO: return true if .rsync_exit_status exists - pass + return os.path.isfile(self.rsync_exit_file) - def get_rsync_exit_status(): - # TODO: return status of rsync from .rsync_exit_status - pass + def rsync_successful(self): + with open(os.path.join(self.run_dir, '.rsync_exit_status')) as rsync_exit_file: + rsync_exit_status = rsync_exit_file.readlines() + if rsync_exit_status[0].strip() == 0: + return True + else: + return False def aggregate_demux_results(self): # TODO: aggregate demux results @@ -425,8 +429,8 @@ def sync_metadata(self): pass def make_transfer_indicator(self): - # TODO: touch a hidden file in the run directory - pass + transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing') + Path(transfer_indicator).touch() def transfer(self): # TODO: rsync run to analysis cluster From 5790c1aacd05e8608c98607a121b69abd12e553c Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 17 Sep 2024 14:24:49 +0200 Subject: [PATCH 034/187] Check all demux dirs if demux is done --- taca/analysis/analysis_element.py | 7 ++++--- taca/element/Element_Runs.py | 28 +++++++++++++++++----------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 250fc013..32439610 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -24,7 +24,7 @@ def _process(run): try: run.parse_run_parameters() except FileNotFoundError: - logger.warn( + logger.warning( f"Cannot reliably set NGI_run_id for {run} due to missing RunParameters.json. Aborting run processing" ) raise @@ -41,7 +41,7 @@ def _process(run): if ( not run.manifest_exists() ): # TODO: this should check for the zip file in lims output location - logger.warn( + logger.warning( f"Run manifest is missing for {run}, demultiplexing aborted" ) # TODO: email operator warning @@ -74,7 +74,8 @@ def _process(run): elif sequencing_done and demultiplexing_status == "finished": transfer_status = run.get_transfer_status() if transfer_status == "not started": - run.aggregate_demux_results() # TODO: if multiple demux dirs, aggregate the results into Demultiplexing? 
+                #TODO: if multiple demux dir exist, move the data dirs into Demultiplexing
+                run.aggregate_demux_results()
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 7f3e8eae..fed22e28 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -30,10 +30,7 @@ def __init__(self, run_dir, configuration):
 
         self.demux_dir = os.path.join(self.run_dir, "Demultiplexing")
         self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json")
-        self.demux_stats_file = os.path.join(
-            self.demux_dir,
-            "RunStats.json",  # Assumes demux is finished when this file is created
-        )
+        self.demux_stats_file = "RunStats.json"  # Assumes demux is finished when this file is created
         self.transfer_file = (
             self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log")
         )  # TODO: change and add to taca.yaml
         self.rsync_exit_file = os.path.join(self.run_dir, '.rsync_exit_status')
@@ -150,12 +147,21 @@ def check_sequencing_status(self):
     def get_demultiplexing_status(self):
         if not os.path.exists(self.demux_dir):
             return "not started"
-        elif os.path.exists(self.demux_dir) and not os.path.isfile(
-            self.demux_stats_file
-        ):
-            return "ongoing"  # TODO: check for exit status file instead
-        elif os.path.exists(self.demux_dir) and os.path.isfile(self.demux_stats_file):
-            return "finished"  # TODO: check exit status of demux in exit status file
+        demux_dirs = glob(
+            os.path.join(self.run_dir, "Demultiplexing*")
+        )
+        finished_count = 0
+        for demux_dir in demux_dirs:
+            if os.path.exists(demux_dir) and not os.path.isfile(
+                os.path.join(demux_dir, self.demux_stats_file)
+            ):
+                return "ongoing"
+            elif os.path.exists(demux_dir) and os.path.isfile(
+                os.path.join(demux_dir, self.demux_stats_file)
+            ):
+                finished_count += 1  # TODO: check exit status of demux in exit status file
+        if finished_count == len(demux_dirs):
             return "finished"
         else:
             return "unknown"

From 32f812cb72913e6a401c362da1a839e6f4377712 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Tue, 17 Sep 2024 14:53:37 +0200
Subject: [PATCH 035/187] Aggregate demux results if more than one

---
 taca/analysis/analysis_element.py |  7 +++++--
 taca/element/Element_Runs.py      | 14 +++++++++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 32439610..06d4e252 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -74,8 +74,11 @@ def _process(run):
         elif sequencing_done and demultiplexing_status == "finished":
             transfer_status = run.get_transfer_status()
             if transfer_status == "not started":
-                #TODO: if multiple demux dir exist, move the data dirs into Demultiplexing
-                run.aggregate_demux_results()
+                demux_results_dirs = glob.glob(
+                    os.path.join(run.run_dir, "Demultiplexing*")
+                )
+                if len(demux_results_dirs) > 1:
+                    run.aggregate_demux_results(demux_results_dirs)
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index fed22e28..67222304 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -4,6 +4,7 @@ import re
 import zipfile
 import subprocess
+import shutil
 from datetime import datetime
 from pathlib import Path
 from glob import glob
@@ -374,14 +375,14 @@ def generate_demux_command(self, run_manifest, demux_dir):
             + " -p 8"
             + f" -r {run_manifest}"
             + " --legacy-fastq"  # TODO: except if Smart-seq3
-            + f" --force-index-orientation; echo $? > {self.rsync_exit_file}"
+            + " --force-index-orientation"
         )  # TODO: any other options?
         return command
 
     def start_demux(self, run_manifest, demux_dir):
         with chdir(self.run_dir):
             cmd = self.generate_demux_command(run_manifest, demux_dir)
             # TODO: handle multiple composite manifests for demux
             try:
                 p_handle = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir)
@@ -426,9 +427,12 @@ def rsync_successful(self):
         else:
             return False
 
-    def aggregate_demux_results(self):
-        # TODO: aggregate demux results. Move project data dir from each sub demux dir to Demultiplexing
-        pass
+    def aggregate_demux_results(self, demux_results_dirs):
+        for demux_dir in demux_results_dirs:
+            data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
+            for data_dir in data_dirs:
+                if "PhiX" not in data_dir and "Unassigned" not in data_dir:
+                    shutil.move(data_dir, self.demux_dir)
 
     def sync_metadata(self):
         # TODO: copy metadata from demuxed run to ngi-nas-ns
         pass

From 253b9d1ea65a356c7b050e3768b1a29469b202cf Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 18 Sep 2024 08:37:50 +0200
Subject: [PATCH 036/187] rsync function

---
 taca/element/Element_Runs.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 67222304..1cfdf953 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -428,6 +428,7 @@ def rsync_successful(self):
             return False
 
     def aggregate_demux_results(self, demux_results_dirs):
+        # TODO: Correct this based on comments from Chuan
         for demux_dir in demux_results_dirs:
             data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
             for data_dir in data_dirs:
                 if "PhiX" not in data_dir and "Unassigned" not in data_dir:
                     shutil.move(data_dir, self.demux_dir)
 
     def sync_metadata(self):
         # TODO: copy metadata from demuxed run to ngi-nas-ns
         pass
 
     def make_transfer_indicator(self):
         transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing')
         Path(transfer_indicator).touch()
 
     def transfer(self):
-        # TODO: rsync run to analysis cluster
-        pass
+        transfer_details = self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details")  # TODO: Add section to taca.yaml
+        command = ("rsync"
+                   + " -rLav"
+                   + f" --chown={transfer_details.get('owner')}"
+                   + f" --chmod={transfer_details.get('permissions')}"
+                   + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
+                   + " --exclude Alignment"
+                   + f" {self.run_dir}"
+                   + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/"
+                   + "; echo $? > .rsync_exit_status"
+        )  # TODO: any other options?
+        try:
+            p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
+            logger.info(
+                "Transfer to analysis cluster "
+                f"started for run {self} on {datetime.now()}"
+            )
+        except subprocess.CalledProcessError:
+            logger.warning("An error occurred while starting transfer to analysis cluster "
+                           f"for {self} on {datetime.now()}."
Move project data dir from each sub demux dir to Demultiplexing - pass + def aggregate_demux_results(self, demux_results_dirs): + for demux_dir in demux_results_dirs: + data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()] + for data_dir in data_dirs: + if not "PhiX" in data_dir and not "Unassigned" in data_dir: + shutil.move(data_dir, self.demux_dir) def sync_metadata(self): # TODO: copy metadata from demuxed run to ngi-nas-ns From 253b9d1ea65a356c7b050e3768b1a29469b202cf Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 18 Sep 2024 08:37:50 +0200 Subject: [PATCH 036/187] rsync function --- taca/element/Element_Runs.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 67222304..1cfdf953 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -428,6 +428,7 @@ def rsync_successful(self): return False def aggregate_demux_results(self, demux_results_dirs): + # TODO: Correct this based on comments from Chuan for demux_dir in demux_results_dirs: data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()] for data_dir in data_dirs: @@ -443,8 +444,28 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - # TODO: rsync run to analysis cluster - pass + transfer_details = self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details") #TODO: Add section to taca.yaml + command = ("rsync" + + " -rLav" + + f" --chown={transfer_details.get("owner")}" + + f" --chmod={transfer_details.get("permissions")}" + + " --exclude BaseCalls" # TODO: check that we actually want to exclude these + + " --exclude Alignment" + + f" {self.run_dir}" + + f" {transfer_details.get("user")@transfer_details.get("host")}:/" + + "; echo $? > .rsync_exit_status" + ) # TODO: any other options? + try: + p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) + logger.info( + "Transfer to analysis cluster " + f"started for run {self} on {datetime.now()}" + ) + except subprocess.CalledProcessError: + logger.warning("An error occurred while starting transfer to analysis cluster " + f"for {self} on {datetime.now()}." + ) + return def remove_transfer_indicator(self): # TODO: remove hidden file in run directory From 90bc0ed60b5dc9635be420ad4dfd2b7dc53252ae Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 18 Sep 2024 10:40:43 +0200 Subject: [PATCH 037/187] A suggestion for structure for the _process function --- taca/analysis/analysis_element.py | 121 +++++++++++++++++------------- 1 file changed, 69 insertions(+), 52 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 06d4e252..e69e0902 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -29,24 +29,19 @@ def _process(run): ) raise + #### Sequencing status #### sequencing_done = run.check_sequencing_status() - demultiplexing_status = run.get_demultiplexing_status() if not sequencing_done: # Sequencing ongoing run.status = "sequencing" if run.status_changed: run.update_statusdb() - elif ( - sequencing_done and demultiplexing_status == "not started" - ): # Sequencing done. 
From 90bc0ed60b5dc9635be420ad4dfd2b7dc53252ae Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Wed, 18 Sep 2024 10:40:43 +0200
Subject: [PATCH 037/187] A suggestion for structure for the _process function

---
 taca/analysis/analysis_element.py | 121 +++++++++++++++++-------------
 1 file changed, 69 insertions(+), 52 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 06d4e252..e69e0902 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -29,24 +29,19 @@ def _process(run):
             )
             raise
 
+        #### Sequencing status ####
         sequencing_done = run.check_sequencing_status()
-        demultiplexing_status = run.get_demultiplexing_status()
         if not sequencing_done:  # Sequencing ongoing
             run.status = "sequencing"
             if run.status_changed:
                 run.update_statusdb()
-        elif (
-            sequencing_done and demultiplexing_status == "not started"
-        ):  # Sequencing done. Start demux
-            if (
-                not run.manifest_exists()
-            ):  # TODO: this should check for the zip file in lims output location
-                logger.warning(
-                    f"Run manifest is missing for {run}, demultiplexing aborted"
-                )
-                # TODO: email operator warning
-                return
-            elif run.manifest_exists():
+            return
+
+        #### Demultiplexing status ####
+        demultiplexing_status = run.get_demultiplexing_status()
+        if demultiplexing_status == "not started":
+            # Sequencing done. Start demux
+            if run.manifest_exists():
                 os.mkdir(run.demux_dir)
                 run.copy_manifests()
                 run_manifests = glob.glob(
@@ -66,55 +61,77 @@ def _process(run):
                 run.status = "demultiplexing"
                 if run.status_changed:
                     run.update_statusdb()
+                return
+            else:
+                # TODO: this should check for the zip file in lims output location
+                logger.warning(
+                    f"Run manifest is missing for {run}, demultiplexing aborted"
+                )
+                # TODO: email operator warning
+                return
+        elif demultiplexing_status == "ongoing":
             run.status = "demultiplexing"
             if run.status_changed:
                 run.update_statusdb()
             return
-        elif sequencing_done and demultiplexing_status == "ongoing":
-            run.status = "demultiplexing"
-            if run.status_changed:
-                run.update_statusdb()
-            return
-        elif sequencing_done and demultiplexing_status == "finished":
-            transfer_status = run.get_transfer_status()
-            if transfer_status == "not started":
-                demux_results_dirs = glob.glob(
-                    os.path.join(run.run_dir, "Delmultiplexing*")
-                )
-                if len(demux_results_dirs > 1):
-                    run.aggregate_demux_results(demux_results_dirs)
-                run.sync_metadata()
-                run.make_transfer_indicator()
-                run.status = "transferring"
+        elif demultiplexing_status != "finished":
+            logger.warning(
+                f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate"
+            )
+            return
+
+        #### Transfer status ####
+        transfer_status = run.get_transfer_status()
+        if transfer_status == "not started":
+            demux_results_dirs = glob.glob(
+                os.path.join(run.run_dir, "Delmultiplexing*")
+            )
+            if len(demux_results_dirs > 1):
+                run.aggregate_demux_results(demux_results_dirs)
+            run.sync_metadata()
+            run.make_transfer_indicator()
+            run.status = "transferring"
+            if run.status_changed:
+                run.update_statusdb()
+            # TODO: Also update statusdb with a timestamp of when the transfer started
+            run.transfer()  # I think this should be a detached command as well
+            return
+        elif transfer_status == "ongoing":
+            run.status = "transferring"
+            if run.status_changed:
+                run.update_statusdb()
+            logger.info(f"{run} is being transferred. Skipping.")
+            return
+        elif transfer_status == "rsync done":
+            if run.rsync_successful():
+                run.remove_transfer_indicator()
+                run.update_transfer_log()
+                run.status = "transferred"
                 if run.status_changed:
                     run.update_statusdb()
-                # TODO: Also update statusdb with a timestamp of when the transfer started
-                run.transfer()  # I think this should be a detached command as well
-            elif transfer_status == "ongoing":
-                run.status = "transferring"
+                run.archive()
+                run.status = "archived"
                 if run.status_changed:
                     run.update_statusdb()
-                logger.info(f"{run} is being transferred. Skipping.")
-                return
-            elif transfer_status == "rsync done":
-                if run.rsync_successful():
-                    run.remove_transfer_indicator()
-                    run.update_transfer_log()
-                    run.status = "transferred"
-                    if run.status_changed:
-                        run.update_statusdb()
-                    run.archive()
-                    run.status = "archived"
-                    if run.status_changed:
-                        run.update_statusdb()
-                else:
-                    run.status = "transfer failed"
-                    logger.warning(f"An issue occurred while transfering {run} to the analysis cluster."
-                    )
-                    # TODO: email warning to operator
-            elif transfer_status == "unknown":
+            else:
+                run.status = "transfer failed"
                 logger.warning(
-                    f"The run {run} has already been transferred but has not been archived. Please investigate"
+                    f"An issue occurred while transfering {run} to the analysis cluster."
                 )
-                # TODO: email operator warning
-                return
-            else:
-                logger.warning(f"Unknown transfer status of run {run}. Please investigate")
+                # TODO: email warning to operator
+            return
+        elif transfer_status == "unknown":
+            logger.warning(
+                f"The run {run} has already been transferred but has not been archived. Please investigate"
+            )
+            # TODO: email operator warning
+            return
+        else:
+            # TODO Merge with the one above?
+            logger.warning(
+                f"Unknown transfer status {transfer_status} of run {run}. Please investigate"
+            )
+            return
 
     if given_run:
         run = Aviti_Run(given_run)

From 08995cb5a881a5548eccc40765fb259cb3d86757 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 18 Sep 2024 12:54:24 +0200
Subject: [PATCH 038/187] gather demux results and upload to statusdb

---
 taca/analysis/analysis_element.py |  1 +
 taca/element/Element_Runs.py      | 49 +++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 06d4e252..5a21d718 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -79,6 +79,7 @@ def _process(run):
                 )
                 if len(demux_results_dirs > 1):
                     run.aggregate_demux_results(demux_results_dirs)
+                run.upload_demux_results_to_statusdb()
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 1cfdf953..af54aa03 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import re
+import csv
 import zipfile
 import subprocess
 import shutil
@@ -107,7 +108,7 @@ def parse_run_parameters(self) -> None:
         self.run_parameters_parsed = True
 
     def to_doc_obj(self):
-        # TODO, are we sure what we should do when the RunParameters.json file is missing?
+        # TODO: are we sure what we should do when the RunParameters.json file is missing?
 
         # Read in all instrument generated files
         instrument_generated_files = {}
@@ -432,8 +433,50 @@ def aggregate_demux_results(self, demux_results_dirs):
         for demux_dir in demux_results_dirs:
             data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
             for data_dir in data_dirs:
-                if not "PhiX" in data_dir and not "Unassigned" in data_dir:
-                    shutil.move(data_dir, self.demux_dir)
+                if not "PhiX" in data_dir in data_dir:
+                    shutil.move(data_dir, self.demux_dir)
+
+    def upload_demux_results_to_statusdb(self):
+        # TODO: dump contents of IndexAssignment.csv and UnassignedSequences.csv into statusdb document
+        doc_obj = self.db.get_db_entry(self.NGI_run_id)
+        index_assignement_file = os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv")
+        with open(index_assignement_file, 'r') as index_file:
+            reader = csv.DictReader(index_file)
+            index_assignments = [row for row in reader]
+        unassigned_sequences_file = os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv")
+        with open(unassigned_sequences_file, 'r') as unassigned_file:
+            reader = csv.DictReader(unassigned_file)
+            unassigned_sequences = [row for row in reader]
+        project_dirs = [f.path for f in os.scandir(os.path.join(self.run_dir, "Demultiplexing")) if f.is_dir() and not "PhiX" in f]
+        for project_dir in project_dirs:
+            run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json"))
+            with open(run_stats_file) as stats_json:
+                project_sample_stats_raw = json.load(stats_json)
+            collected_sample_stats = {}
+            for sample_stats in project_sample_stats_raw["SampleStats"]:
+                sample_name = sample_stats["SampleName"]
+                percent_q30 = sample_stats["PercentQ30"]
+                quality_score_mean = sample_stats["QualityScoreMean"]
+                percent_mismatch = sample_stats["PercentMismatch"]
+                collected_sample_stats[sample_name] = {
+                    "PercentQ30": percent_q30,
+                    "QualityScoreMean": quality_score_mean,
+                    "PercentMismatch": percent_mismatch
+                }
+        for assignment in index_assignments:
+            sample = assignment.get("SampleName")
+            sample_stats_to_add = collected_sample_stats.get(sample)
+            assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30")
+            assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean")
+            assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch")
+        demultiplex_stats = {
+            "Demultiplex_Stats": {
+                "Index_Assignment": index_assignments,
+                "Unassigned_Sequences": unassigned_sequences
+            }
+        }
+        doc_obj["Aviti": demultiplex_stats]
+        self.db.upload_to_statusdb(doc_obj)
 
     def sync_metadata(self):
         # TODO: copy metadata from demuxed run to ngi-nas-ns
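To illustrate the csv.DictReader pattern this commit leans on, here is a self-contained example with a fabricated two-row IndexAssignment.csv; the column names follow the diff but are assumptions about the Element output format:

    import csv
    import io

    # Fabricated example data, not real instrument output
    csv_text = """SampleName,I1,I2,Lane,Count
    P1_101,ACGT,TTGA,1,123456
    PhiX,GGGG,CCCC,1,2345
    """

    reader = csv.DictReader(io.StringIO(csv_text))
    index_assignments = [row for row in reader]  # each row becomes a dict keyed by header
    print(index_assignments[0]["SampleName"])  # -> P1_101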
From ee38b71f71d7665e5d145e0b90dfdb8342fcbf4b Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 18 Sep 2024 15:39:39 +0200
Subject: [PATCH 039/187] Correct path finding

---
 taca/element/Element_Runs.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index af54aa03..875e2cdd 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -447,8 +447,12 @@ def upload_demux_results_to_statusdb(self):
         with open(unassigned_sequences_file, 'r') as unassigned_file:
             reader = csv.DictReader(unassigned_file)
             unassigned_sequences = [row for row in reader]
-        project_dirs = [f.path for f in os.scandir(os.path.join(self.run_dir, "Demultiplexing")) if f.is_dir() and not "PhiX" in f]
-        for project_dir in project_dirs:
+        dirs = os.scandir("Demultiplexing")
+        project_dirs = []
+        for directory in dirs:
+            if os.path.isdir(directory.path) and not "Unassigned" in directory.path:
"Unassigned" in directory.path: + project_dirs.append(directory.path) + for project_dir in project_dirs: # TODO: remove this block when q30 is added to IndexAssignment.csv by Element run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json")) with open(run_stats_file) as stats_json: project_sample_stats_raw = json.load(stats_json) From c9acfb94c19b4204dde99f21e1c6d029621c7497 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 18 Sep 2024 15:43:48 +0200 Subject: [PATCH 040/187] filtter out phix --- taca/element/Element_Runs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 875e2cdd..dcfb0633 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -469,10 +469,11 @@ def upload_demux_results_to_statusdb(self): } for assignment in index_assignments: sample = assignment.get("SampleName") - sample_stats_to_add = collected_sample_stats.get(sample) - assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30") - assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean") - assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch") + if sample != "PhiX": + sample_stats_to_add = collected_sample_stats.get(sample) + assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30") + assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean") + assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch") demultiplex_stats = { "Demultiplex_Stats": { "Index_Assignment": index_assignments, From faa7f68e73e3be5723f94dd2ca8b790724757cec Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 18 Sep 2024 15:45:11 +0200 Subject: [PATCH 041/187] Fix dictionary --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index dcfb0633..8fd53316 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -480,7 +480,7 @@ def upload_demux_results_to_statusdb(self): "Unassigned_Sequences": unassigned_sequences } } - doc_obj["Aviti": demultiplex_stats] + doc_obj["Aviti"] = demultiplex_stats self.db.upload_to_statusdb(doc_obj) def sync_metadata(self): From 70803cd71a12758d90a64923f057f1e80d0d6c96 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 19 Sep 2024 09:42:25 +0200 Subject: [PATCH 042/187] Remove config file for tests from Docker --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index db69561c..93fd631b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,7 +23,6 @@ COPY requirements-dev.txt requirements-dev.txt RUN python -m pip install -r requirements-dev.txt RUN mkdir /root/.taca/ -COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml FROM base AS testing COPY . 
/taca

From c6667b24130b3487298baad897c6c9d8fe39fea3 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 09:43:18 +0200
Subject: [PATCH 043/187] Change order of init in aviti

---
 taca/element/Aviti_Runs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py
index ad162ac4..18c81eda 100644
--- a/taca/element/Aviti_Runs.py
+++ b/taca/element/Aviti_Runs.py
@@ -3,5 +3,5 @@
 class Aviti_Run(Run):
     def __init__(self, run_dir, configuration):
-        super().__init__(run_dir, configuration)
         self.sequencer_type = "Aviti"
+        super().__init__(run_dir, configuration)

From 0ec6924215aec36f214803628d47317f44ef9fa2 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Thu, 19 Sep 2024 10:08:57 +0200
Subject: [PATCH 044/187] Handle run post transfer

---
 taca/element/Element_Runs.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 8fd53316..d22fced6 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -516,13 +516,22 @@ def transfer(self):
         return
 
     def remove_transfer_indicator(self):
-        # TODO: remove hidden file in run directory
-        pass
+        transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing')
+        Path(transfer_indicator).unlink()
 
     def update_transfer_log(self):
-        # TODO: update the transfer log
-        pass
+        """Update transfer log with run id and date."""
+        try:
+            with open(self.transfer_file, "a") as f:
+                tsv_writer = csv.writer(f, delimiter="\t")
+                tsv_writer.writerow([self.NGI_run_id, str(datetime.now())])
+        except OSError:
+            msg = f"{self}: Could not update the transfer logfile {self.transfer_file}"
+            logger.error(msg)
+            raise OSError(msg)
 
     def archive(self):
-        # TODO: move run dir to nosync
-        pass
+        """Move directory to nosync."""
+        src = self.run_dir
+        dst = os.path.join(self.run_dir, os.pardir, "nosync")
+        shutil.move(src, dst)
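A small standalone sketch of the two post-transfer helpers introduced in PATCH 044, runnable in isolation; the TSV log format and the sibling nosync/ layout are taken from the diff, while the run id value is hypothetical:

    import csv
    import os
    import shutil
    from datetime import datetime

    def update_transfer_log(transfer_file: str, run_id: str) -> None:
        # Append "<run_id>\t<timestamp>" to the shared transfer log
        with open(transfer_file, "a") as f:
            csv.writer(f, delimiter="\t").writerow([run_id, str(datetime.now())])

    def archive(run_dir: str) -> None:
        # Move the finished run directory into the sibling nosync/ directory
        shutil.move(run_dir, os.path.join(run_dir, os.pardir, "nosync"))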
From 0a3de0b5a1e49eff7df6389fe7f14b4ff958ce70 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 10:11:26 +0200
Subject: [PATCH 045/187] Some small bugfixes for Element Runs

---
 taca/element/Element_Runs.py | 138 +++++++++++++++++++++--------------
 1 file changed, 83 insertions(+), 55 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 8fd53316..1a660d82 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -1,18 +1,17 @@
+import csv
 import json
 import logging
 import os
 import re
-import csv
-import zipfile
-import subprocess
 import shutil
+import subprocess
+import zipfile
 from datetime import datetime
-from pathlib import Path
 from glob import glob
+from pathlib import Path
 
 import pandas as pd
 
-from taca.utils import misc
 from taca.utils.filesystem import chdir
 from taca.utils.statusdb import ElementRunsConnection
 
         self.demux_dir = os.path.join(self.run_dir, "Demultiplexing")
         self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json")
-        self.demux_stats_file = "RunStats.json" # Assumes demux is finished when this file is created
+        self.demux_stats_file = (
+            "RunStats.json"  # Assumes demux is finished when this file is created
+        )
         self.transfer_file = (
-            self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_log")
+            self.CONFIG.get("Element", {})
+            .get(self.sequencer_type, {})
+            .get("transfer_log")
         )  # TODO: change and add to taca.yaml
-        self.rsync_exit_file = os.path.join(self.run_dir, '.rsync_exit_status')
+        self.rsync_exit_file = os.path.join(self.run_dir, ".rsync_exit_status")
 
         # Instrument generated files
         self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json")
@@ -46,7 +49,9 @@
         )
         self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json")
 
-        self.db = ElementRunsConnection(self.CONFIG["statusdb"], dbname="element_runs")
+        self.db = ElementRunsConnection(
+            self.CONFIG.get("statusdb", {}), dbname="element_runs"
+        )
 
         # Fields to be set by TACA
         self.status = None
@@ -149,19 +154,19 @@ def check_sequencing_status(self):
     def get_demultiplexing_status(self):
         if not os.path.exists(self.demux_dir):
             return "not started"
-        demux_dirs = glob.glob(
-            os.path.join(self.run_dir, "Delmultiplexing*")
-        )
+        demux_dirs = glob.glob(os.path.join(self.run_dir, "Delmultiplexing*"))
         finished_count = 0
         for demux_dir in demux_dirs:
             if os.path.exists(self.demux_dir) and not os.path.isfile(
                 os.path.join(demux_dir, self.demux_stats_file)
-                ):
+            ):
                 return "ongoing"
             elif os.path.exists(self.demux_dir) and os.path.isfile(
                 os.path.join(demux_dir, self.demux_stats_file)
-                ):
-                finished_count += 1 # TODO: check exit status of demux in exit status file
+            ):
+                finished_count += (
+                    1  # TODO: check exit status of demux in exit status file
+                )
         if finished_count == len(demux_dirs):
             return "finished"
         else:
@@ -370,14 +375,15 @@ def make_demux_manifests(
         return manifest_paths
 
     def generate_demux_command(self, run_manifest, demux_dir):
-        command = (f"{self.CONFIG.get(self.software)["bin"]}" # TODO: add path to bases2fastq executable to config
+        command = (
+            f"{self.CONFIG.get(self.software)['bin']}"  # TODO: add path to bases2fastq executable to config
             + f" {self.run_dir}"
             + f" {demux_dir}"
             + " -p 8"
             + f" -r {run_manifest}"
             + " --legacy-fastq"  # TODO: except if Smart-seq3
-            + f" --force-index-orientation"
-        ) # TODO: any other options?
+            + " --force-index-orientation"
+        )  # TODO: any other options?
         return command
 
     def start_demux(self, run_manifest, demux_dir):
@@ -385,20 +391,26 @@ def start_demux(self, run_manifest, demux_dir):
             cmd = self.generate_demux_command(run_manifest, demux_dir)
             # TODO: handle multiple composite manifests for demux
             try:
-                p_handle = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir)
+                p_handle = subprocess.Popen(
+                    cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir
+                )
                 logger.info(
                     "Bases2Fastq conversion and demultiplexing "
                     f"started for run {self} on {datetime.now()}"
                 )
             except subprocess.CalledProcessError:
-                logger.warning("An error occurred while starting demultiplexing for "
-                    f"{self} on {datetime.now()}."
-                )
+                logger.warning(
+                    "An error occurred while starting demultiplexing for "
+                    f"{self} on {datetime.now()}."
+                )
         return
 
     def get_transfer_status(self):
-        if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete():
+        if (
+            not self.in_transfer_log()
+            and not self.transfer_ongoing()
+            and not self.rsync_complete()
+        ):
             return "not started"
         elif self.transfer_ongoing() and not self.rsync_complete():
             return "ongoing"
@@ -406,22 +418,22 @@ def get_transfer_status(self):
             return "rsync done"
         elif self.in_transfer_log():
             return "unknown"
-    
+
     def in_transfer_log(self):
-        with open(self.transfer_file, "r") as transfer_file:
+        with open(self.transfer_file) as transfer_file:
             for row in transfer_file.read():
                 if self.NGI_run_id in row:
                     return True
         return False
 
     def transfer_ongoing(self):
-        return os.path.isfile(os.path.join(self.run_dir, '.rsync_ongoing'))
+        return os.path.isfile(os.path.join(self.run_dir, ".rsync_ongoing"))
 
     def rsync_complete(self):
         return os.path.isfile(self.rsync_exit_file)
 
     def rsync_successful(self):
-        with open(os.path.join(self.run_dir, '.rsync_exit_status')) as rsync_exit_file:
+        with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file:
             rsync_exit_status = rsync_exit_file.readlines()
             if rsync_exit_status[0].strip() == 0:
                 return True
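The four transfer states decided by `get_transfer_status` reduce to a pure function over the three boolean probes; a sketch with the probes passed in rather than read from disk:

    def transfer_status(in_log: bool, ongoing: bool, rsync_done: bool) -> str:
        if not in_log and not ongoing and not rsync_done:
            return "not started"
        if ongoing and not rsync_done:
            return "ongoing"
        if rsync_done and not in_log:
            return "rsync done"
        return "unknown"  # already in the transfer log: transferred but never archived

Note also that `rsync_successful` as committed compares `rsync_exit_status[0].strip()` (a string) against the integer 0, so it always returns False; comparing against "0" or casting with int() would be needed for a true match.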
@@ -431,28 +443,36 @@ def rsync_successful(self):
         else:
             return False
 
     def aggregate_demux_results(self, demux_results_dirs):
         # TODO: Correct this based on comments from Chuan
         for demux_dir in demux_results_dirs:
-            data_dirs = [f.path for f in os.scandir(os.path.join(demux_dir, 'Samples')) if f.is_dir()]
+            data_dirs = [
+                f.path
+                for f in os.scandir(os.path.join(demux_dir, "Samples"))
+                if f.is_dir()
+            ]
             for data_dir in data_dirs:
                 if not "PhiX" in data_dir in data_dir:
                     shutil.move(data_dir, self.demux_dir)
 
     def upload_demux_results_to_statusdb(self):
         doc_obj = self.db.get_db_entry(self.NGI_run_id)
         index_assignement_file = os.path.join(
             self.run_dir, "Demultiplexing", "IndexAssignment.csv"
         )
-        with open(index_assignement_file, 'r') as index_file:
+        with open(index_assignement_file) as index_file:
             reader = csv.DictReader(index_file)
             index_assignments = [row for row in reader]
         unassigned_sequences_file = os.path.join(
             self.run_dir, "Demultiplexing", "UnassignedSequences.csv"
         )
-        with open(unassigned_sequences_file, 'r') as unassigned_file:
+        with open(unassigned_sequences_file) as unassigned_file:
             reader = csv.DictReader(unassigned_file)
             unassigned_sequences = [row for row in reader]
         dirs = os.scandir("Demultiplexing")
         project_dirs = []
         for directory in dirs:
             if os.path.isdir(directory.path) and "Unassigned" not in directory.path:
                 project_dirs.append(directory.path)
-        for project_dir in project_dirs: # TODO: remove this block when q30 is added to IndexAssignment.csv by Element
+        for project_dir in project_dirs:  # TODO: remove this block when q30 is added to IndexAssignment.csv by Element
             run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json"))
             with open(run_stats_file) as stats_json:
                 project_sample_stats_raw = json.load(stats_json)
@@ -465,21 +485,25 @@
                 collected_sample_stats[sample_name] = {
                     "PercentQ30": percent_q30,
                     "QualityScoreMean": quality_score_mean,
-                    "PercentMismatch": percent_mismatch
-                }
+                    "PercentMismatch": percent_mismatch,
+                }
         for assignment in index_assignments:
             sample = assignment.get("SampleName")
             if sample != "PhiX":
                 sample_stats_to_add = collected_sample_stats.get(sample)
                 assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30")
-                assignment["QualityScoreMean"] = sample_stats_to_add.get("QualityScoreMean")
-                assignment["PercentMismatch"] = sample_stats_to_add.get("PercentMismatch")
+                assignment["QualityScoreMean"] = sample_stats_to_add.get(
+                    "QualityScoreMean"
+                )
+                assignment["PercentMismatch"] = sample_stats_to_add.get(
+                    "PercentMismatch"
+                )
         demultiplex_stats = {
             "Demultiplex_Stats": {
                 "Index_Assignment": index_assignments,
-                "Unassigned_Sequences": unassigned_sequences
-            }
+                "Unassigned_Sequences": unassigned_sequences,
             }
+        }
         doc_obj["Aviti"] = demultiplex_stats
         self.db.upload_to_statusdb(doc_obj)
 
@@ -488,21 +512,24 @@ def sync_metadata(self):
         pass
 
     def make_transfer_indicator(self):
-        transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing')
+        transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing")
         Path(transfer_indicator).touch()
 
     def transfer(self):
-        transfer_details = self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details") #TODO: Add section to taca.yaml
-        command = ("rsync"
-                   + " -rLav"
-                   + f" --chown={transfer_details.get("owner")}"
-                   + f" --chmod={transfer_details.get("permissions")}"
-                   + " --exclude BaseCalls" # TODO: check that we actually want to exclude these
-                   + " --exclude Alignment"
-                   + f" {self.run_dir}"
-                   + f" {transfer_details.get("user")@transfer_details.get("host")}:/"
-                   + "; echo $? > .rsync_exit_status"
-                   ) # TODO: any other options?
+        transfer_details = (
+            self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details")
+        )  # TODO: Add section to taca.yaml
+        command = (
+            "rsync"
+            + " -rLav"
+            + f" --chown={transfer_details.get('owner')}"
+            + f" --chmod={transfer_details.get('permissions')}"
+            + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
+            + " --exclude Alignment"
+            + f" {self.run_dir}"
+            + f" {transfer_details.get('user')@transfer_details.get('host')}:/"
+            + "; echo $? > .rsync_exit_status"
+        )  # TODO: any other options?
         try:
             p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
             logger.info(
                 "Transfer to analysis cluster "
                 f"started for run {self} on {datetime.now()}"
             )
         except subprocess.CalledProcessError:
-            logger.warning("An error occurred while starting transfer to analysis cluster "
-                           f"for {self} on {datetime.now()}."
-            )
+            logger.warning(
+                "An error occurred while starting transfer to analysis cluster "
+                f"for {self} on {datetime.now()}."
+            )
         return

From d9581d561762d18f9da976f269b192a91bc1fb92 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 10:21:38 +0200
Subject: [PATCH 046/187] Trying to use moch patch for element tests

---
 tests/element/test_Aviti_Runs.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py
index 0ff3e7db..7d536547 100644
--- a/tests/element/test_Aviti_Runs.py
+++ b/tests/element/test_Aviti_Runs.py
@@ -1,4 +1,5 @@
 import tempfile
+from unittest.mock import patch
 
 import pytest
 
@@ -10,6 +11,11 @@ class TestAviti_Run:
     def test_init(self, create_dirs: pytest.fixture):
         tmp: tempfile.TemporaryDirectory = create_dirs
         run_dir = create_element_run_dir(tmp)
+
+        # Mock db
+        mock_db = patch("taca.utils.statusdb.ElementRunsConnection")
+        mock_db.start()
+
         run = to_test.Aviti_Run(run_dir, {})
         assert run.run_dir == run_dir
         assert run.sequencer_type == "Aviti"

From b831d429f9c68b8b5212229c6c8e26b0394373e5 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 11:53:08 +0200
Subject: [PATCH 047/187] Added extensions to devcontainer

---
 .devcontainer/devcontainer.json | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 796444ad..0550f01b 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -12,7 +12,12 @@
   "features": {},
   "customizations": {
     "vscode": {
-      "extensions": ["ms-python.python", "eamodio.gitlens"]
+      "extensions": [
+        "ms-python.python",
+        "eamodio.gitlens",
+        "charliermarsh.ruff",
+        "ms-python.mypy-type-checker"
+      ]
     }
   },
   // Features to add to the dev container. More info: https://containers.dev/features.

From 6844ab7036ac2efcf69dab173a229ab11f44f046 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 11:54:05 +0200
Subject: [PATCH 048/187] Fix mocking of statusb for element tests

---
 tests/element/test_Aviti_Runs.py   | 2 +-
 tests/element/test_Element_Runs.py | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py
index 7d536547..bf32089c 100644
--- a/tests/element/test_Aviti_Runs.py
+++ b/tests/element/test_Aviti_Runs.py
@@ -13,7 +13,7 @@ def test_init(self, create_dirs: pytest.fixture):
         run_dir = create_element_run_dir(tmp)
 
         # Mock db
-        mock_db = patch("taca.utils.statusdb.ElementRunsConnection")
+        mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
         mock_db.start()
 
         run = to_test.Aviti_Run(run_dir, {})
diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py
index a6eca20a..3af02a41 100644
--- a/tests/element/test_Element_Runs.py
+++ b/tests/element/test_Element_Runs.py
@@ -64,6 +64,11 @@ class TestRun:
     def test_init(self, create_dirs: pytest.fixture):
         tmp: tempfile.TemporaryDirectory = create_dirs
         run_dir = create_element_run_dir(tmp)
+
+        # Mock db
+        mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
+        mock_db.start()
+
         run = to_test.Run(run_dir, {})
 
         assert run.run_dir == run_dir
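The target change in PATCH 048 follows the standard mock rule of patching a name where it is looked up, not where it is defined; a minimal illustration using the module paths from the diff:

    from unittest import mock

    # Element_Runs.py does `from taca.utils.statusdb import ElementRunsConnection`,
    # so the lookup happens inside taca.element.Element_Runs, and that is the
    # name to patch:
    with mock.patch("taca.element.Element_Runs.ElementRunsConnection") as mock_db:
        # Code that instantiates Run() now receives the mock instead of a real
        # CouchDB connection; return values can be stubbed as needed.
        mock_db.return_value.get_db_entry.return_value = {}

Patching "taca.utils.statusdb.ElementRunsConnection" (the original PATCH 046 form) has no effect on code that already imported the class into its own namespace.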
From 871f7c45c818f9c2e65f792e5355c362eea86ec0 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 09:43:18 +0200
Subject: [PATCH 049/187] For testing, sequencer_type was not defined

---
 taca/element/Element_Runs.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 1a660d82..134788e9 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -22,6 +22,10 @@ class Run:
     """Defines an Element run"""
 
     def __init__(self, run_dir, configuration):
+        if not hasattr(self, "sequencer_type"):
+            # Mostly for testing, since this class is not meant to be instantiated
+            self.sequencer_type = "GenericElement"
+
         if not os.path.exists(run_dir):
             raise RuntimeError(f"Could not locate run directory {run_dir}")

From 7aea5427bac6bd050e2c138da55867712e76fc31 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 13:22:18 +0200
Subject: [PATCH 050/187] Importing glob according to code usage

---
 taca/element/Element_Runs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 134788e9..942b95b0 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -1,4 +1,5 @@
 import csv
+import glob
 import json
 import logging
 import os
@@ -7,7 +8,6 @@
 import subprocess
 import zipfile
 from datetime import datetime
-from glob import glob
 from pathlib import Path
 
 import pandas as pd

From 99c20ea143605bbf0f62f2d17a12293586ea3f53 Mon Sep 17 00:00:00 2001
From: Johannes Alneberg
Date: Thu, 19 Sep 2024 13:23:43 +0200
Subject: [PATCH 051/187] Changed mock behaviour

---
 tests/element/test_Element_Runs.py | 36 +++++++++++++++++-------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py
index 3af02a41..498b4b65 100644
--- a/tests/element/test_Element_Runs.py
+++ b/tests/element/test_Element_Runs.py
@@ -1,7 +1,7 @@
 import json
 import os
 import tempfile
-from unittest.mock import patch
+from unittest import mock
 
 import pytest
 
@@ -60,15 +60,12 @@ def create_element_run_dir(
     return run_path
 
 
+@mock.patch("taca.element.Element_Runs.ElementRunsConnection")
 class TestRun:
-    def test_init(self, create_dirs: pytest.fixture):
+    def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture):
         tmp: tempfile.TemporaryDirectory = create_dirs
         run_dir = create_element_run_dir(tmp)
 
-        # Mock db
-        mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
-        mock_db.start()
-
         run = to_test.Run(run_dir, {})
 
         assert run.run_dir == run_dir
@@ -82,7 +79,10 @@
         ids=["success", "failure", "ongoing"],
     )
     def test_check_sequencing_status(
-        self, p: pytest.fixture, create_dirs: pytest.fixture
+        self,
+        mock_db: mock.Mock,
+        p: pytest.fixture,
+        create_dirs: pytest.fixture,
     ):
         tmp: tempfile.TemporaryDirectory = create_dirs
 
@@ -106,10 +106,14 @@
         ids=["not started", "ongoing", "finished"],
     )
     def test_get_demultiplexing_status(
-        self, p: pytest.fixture, create_dirs: pytest.fixture
+        self, mock_db: mock.Mock, p: pytest.fixture, create_dirs: pytest.fixture
     ):
         tmp: tempfile.TemporaryDirectory = create_dirs
 
+        if p["demux_dir"] and not p["demux_done"]:
+            import pdb
+
+            pdb.set_trace()
         run = to_test.Run(
             create_element_run_dir(
                 tmp,
@@ -128,7 +132,9 @@
         ],
         ids=["exists", "does not exist"],
     )
-    def test_manifest_exists(self, create_dirs: pytest.fixture, p: pytest.fixture):
+    def test_manifest_exists(
+        self, mock_db: mock.Mock, create_dirs: pytest.fixture, p: pytest.fixture
+    ):
         tmp: tempfile.TemporaryDirectory = create_dirs
 
         run = to_test.Run(
@@ -141,13 +147,13 @@
         assert run.manifest_exists() == p["expected"]
 
     @pytest.mark.skip(reason="Not implemented yet")
-    def test_generate_demux_command(self):
+    def test_generate_demux_command(self, mock_db):
         pass
 
-    def test_start_demux(self, create_dirs):
-        with patch(
+    def test_start_demux(self, mock_db, create_dirs):
+        with mock.patch(
             "taca.utils.misc.call_external_command_detached"
-        ) as mock_call, patch(
+        ) as mock_call, mock.patch(
             "taca.element.Element_Runs.Run.generate_demux_command"
         ) as mock_command:
             mock_command.return_value = "test command"
@@ -159,9 +165,9 @@
             )
 
     @pytest.mark.skip(reason="Not implemented yet")
-    def test_is_transferred(self, create_dirs):
+    def test_is_transferred(self, mock_db, create_dirs):
         pass
 
     @pytest.mark.skip(reason="Not implemented yet")
-    def test_parse_rundir(self, create_dirs):
+    def test_parse_rundir(self, mock_db, create_dirs):
         pass

From a9fc4f7b1f7b465419268a39942228e8ce72b027 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Thu, 19 Sep 2024 14:52:35 +0200
Subject: [PATCH 052/187] Fix check for manifest zip file

---
 taca/analysis/analysis_element.py |  4 ++--
 taca/element/Element_Runs.py      | 18 ++++++++++--------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 5a21d718..ccc95ee5 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -40,7 +40,7 @@ def _process(run):
         ):  # Sequencing done. Start demux
             if (
                 not run.manifest_exists()
-            ):  # TODO: this should check for the zip file in lims output location
+            ):
                 logger.warning(
                     f"Run manifest is missing for {run}, demultiplexing aborted"
                 )
@@ -86,7 +86,7 @@ def _process(run):
                 if run.status_changed:
                     run.update_statusdb()
                 # TODO: Also update statusdb with a timestamp of when the transfer started
-                run.transfer()  # I think this should be a detached command as well
+                run.transfer()
             elif transfer_status == "ongoing":
                 run.status = "transferring"
                 if run.status_changed:
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index d22fced6..ed956abd 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -180,7 +180,8 @@ def update_statusdb(self):
         self.db.upload_to_statusdb(doc_obj)
 
     def manifest_exists(self):
-        return os.path.isfile(self.run_manifest_zip_file)
+        zip_src_path = self.find_manifest_zip()
+        return os.path.isfile(zip_src_path)
 
     def get_lims_step_id(self) -> str | None:
         """If the run was started using a LIMS-generated manifest,
@@ -197,12 +198,8 @@ def get_lims_step_id(self) -> str | None:
                     lims_step_id = line.split(",")[1]
                     return lims_step_id
         return None
-
-    def copy_manifests(self) -> bool:
-        """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir."""
-
-        # TODO test me
-
+
+    def find_manifest_zip(self):
         # Specify dir in which LIMS drop the manifest zip files
         dir_to_search = os.path.join(
             self.CONFIG.get("Aviti").get(
                 "manifest_zip_location"
             ),  # TODO: change and add to taca.yaml
             datetime.now().year,
         )
@@ -238,7 +235,13 @@
             zip_src_path = glob_results[-1]
         else:
             zip_src_path = glob_results[0]
+        return zip_src_path
+
+    def copy_manifests(self) -> bool:
+        """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir."""
+        # TODO: test me
+        zip_src_path = self.find_manifest_zip()
         # Make a run subdir named after the zip file and extract manifests there
         zip_name = os.path.basename(zip_src_path)
         zip_dst_path = os.path.join(self.run_dir, zip_name)
@@ -437,7 +440,6 @@ def aggregate_demux_results(self, demux_results_dirs):
                     shutil.move(data_dir, self.demux_dir)
 
     def upload_demux_results_to_statusdb(self):
-        # TODO: dump contents of IndexAssignment.csv and UnassignedSequences.csv into statusdb document
         doc_obj = self.db.get_db_entry(self.NGI_run_id)
         index_assignement_file = os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv")
         with open(index_assignement_file, 'r') as index_file:

From 591e6339c887e520cad4676a81fa9b1fc0baa8cb Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Thu, 19 Sep 2024 14:53:22 +0200
Subject: [PATCH 053/187] Remove pdb

---
 tests/element/test_Element_Runs.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py
index 498b4b65..1100aff9 100644
--- a/tests/element/test_Element_Runs.py
+++ b/tests/element/test_Element_Runs.py
@@ -111,9 +111,7 @@ def test_get_demultiplexing_status(
         tmp: tempfile.TemporaryDirectory = create_dirs
 
         if p["demux_dir"] and not p["demux_done"]:
-            import pdb
 
-            pdb.set_trace()
         run = to_test.Run(
             create_element_run_dir(
                 tmp,

From babcbd0948237ba4c76677285e1632d5556eb047 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 20 Sep 2024 11:22:28 +0200
Subject: [PATCH 054/187] TACA Aviti integration WIP

---
 taca/analysis/analysis_element.py |  10 +-
 taca/element/Aviti_Runs.py        |   1 +
 taca/element/Element_Runs.py      | 150 ++++++++++++++++++++++++++++--
 3 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 5a21d718..63cce774 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -56,10 +56,7 @@ def _process(run):
                 )
                 sub_demux_count = 0
                 for run_manifest in run_manifests.sort():
-                    if len(run_manifests) == 1:
-                        demux_dir = run.demux_dir
-                    elif len(run_manifests) > 1:
-                        demux_dir = f"Demultiplexing_{sub_demux_count}"
+                    demux_dir = f"Demultiplexing_{sub_demux_count}"
                     os.mkdir(demux_dir)
                     run.start_demux(run_manifest, demux_dir)
                     sub_demux_count += 1
@@ -75,10 +72,9 @@ def _process(run):
             transfer_status = run.get_transfer_status()
             if transfer_status == "not started":
                 demux_results_dirs = glob.glob(
-                    os.path.join(run.run_dir, "Delmultiplexing*")
+                    os.path.join(run.run_dir, "Delmultiplexing_*")
                 )
-                if len(demux_results_dirs > 1):
-                    run.aggregate_demux_results(demux_results_dirs)
+                run.aggregate_demux_results(demux_results_dirs)
                 run.upload_demux_results_to_statusdb()
                 run.sync_metadata()
                 run.make_transfer_indicator()
diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py
index ad162ac4..ccfb5a33 100644
--- a/taca/element/Aviti_Runs.py
+++ b/taca/element/Aviti_Runs.py
@@ -5,3 +5,4 @@ class Aviti_Run(Run):
     def __init__(self, run_dir, configuration):
         super().__init__(run_dir, configuration)
         self.sequencer_type = "Aviti"
+        self.demux_dir = "Demultiplexing"
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index af54aa03..e566be6d 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -395,7 +395,7 @@ def start_demux(self, run_manifest, demux_dir):
                     f"{self} on {datetime.now()}."
                 )
         return
-    
+
     def get_transfer_status(self):
         if not self.in_transfer_log() and not self.transfer_ongoing() and not self.rsync_complete():
             return "not started"
@@ -406,7 +406,7 @@ def get_transfer_status(self):
             return "rsync done"
         elif self.in_transfer_log():
             return "unknown"
-    
+
     def in_transfer_log(self):
         with open(self.transfer_file, "r") as transfer_file:
             for row in transfer_file.read():
@@ -428,14 +428,146 @@ def rsync_successful(self):
         else:
             return False
 
-    def aggregate_demux_results(self, demux_results_dirs):
-        # TODO: Correct this based on comments from Chuan
+    # Clear all content under a dir
+    def clear_dir(dir):
+        for filename in os.listdir(dir):
+            file_path = os.path.join(dir, filename)
+            try:
+                if os.path.isfile(file_path) or os.path.islink(file_path):
+                    os.unlink(file_path)
+                elif os.path.isdir(file_path):
+                    shutil.rmtree(file_path)
+            except Exception as e:
+                print(f"Failed to delete {file_path} Reason {e}")
+
+    # Create symlink for a simple demultiplexing dir
+    def symlink_demux_dir(src_dir, dest_dir):
+        # Ensure the destination directory exists
+        if not os.path.exists(dest_dir):
+            os.makedirs(dest_dir)
+        # Clear all content under dest_dir
+        clear_dir(dest_dir)
+        # Loop through all files and directories in the source directory
+        for item in os.listdir(src_dir):
+            src_path = os.path.join(src_dir, item)
+            # Move content of Samples to the parental dir
+            if item == "Samples":
+                dest_path = dest_dir
+            else:
+                dest_path = os.path.join(dest_dir, item)
+            try:
+                # Create symbolic link only if it doesn't already exist
+                if not os.path.exists(dest_path):
+                    os.symlink(src_path, dest_path)
+                    print(f"Linked {src_path} to {dest_path}")
+                else:
+                    print(f"{dest_path} already exists.")
+            except OSError as e:
+                print(f"Error linking {src_path} to {dest_path}: {e}")
+
+
+    # Collect demux info into a list of dictionaries
+    # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}]
+    def collect_demux_runmanifest(self, demux_results_dirs):
+        demux_runmanifest = []
+        for demux_dir in demux_results_dirs:
+            sub_demux_count = demux_dir.split('_')[1]
+            with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file:
+                lines = file.readlines()
+            sample_section = False
+            headers = []
+            # Loop through each line
+            for line in lines:
+                # Check if we reached the "[SAMPLES]" section
+                if '[SAMPLES]' in line:
+                    sample_section = True
+                    continue
+                # Exit the sample section if another section is encountered
+                if sample_section and line.startswith('['):
+                    break
+                # If in the sample section, process the sample lines
+                if sample_section:
+                    # Clean up the line
+                    line = line.strip()
+                    # Skip empty lines
+                    if not line:
+                        continue
+                    # Get the headers from the first line
+                    if not headers:
+                        headers = line.split(',')
+                    else:
+                        # Parse sample data
+                        values = line.split(',')
+                        sample_dict = dict(zip(headers, values))
+                        sample_dict['sub_demux_count'] = sub_demux_count
+                        demux_runmanifest.append(sample_dict)
+        sorted_demux_runmanifest = sorted(demux_runmanifest, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count']))
+        return sorted_demux_runmanifest
+
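The `[SAMPLES]` parser added here is essentially a tiny INI-style section reader; for reference, the same logic as a standalone sketch over a fabricated manifest (section and column names mirror the diff, the data rows are made up):

    import io

    manifest = """[RUNVALUES]
    KeyName,Value

    [SAMPLES]
    SampleName,Index1,Index2,Lane,Project
    P1_101,ACGT,TTGA,1,P1
    PhiX,GGGG,CCCC,1,Control
    """

    rows, in_samples, headers = [], False, None
    for line in io.StringIO(manifest):
        line = line.strip()
        if line == "[SAMPLES]":
            in_samples = True
            continue
        if in_samples and line.startswith("["):
            break  # next section reached
        if in_samples and line:
            if headers is None:
                headers = line.split(",")
            else:
                rows.append(dict(zip(headers, line.split(","))))
    print(rows[0]["SampleName"])  # -> P1_101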
+    # Aggregate the output FastQ files of samples from multiple demux
+    def aggregate_sample_fastq(self, demux_runmanifest):
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        unique_sample_demux = set()
+        for lane in lanes:
+            sample_count = 1
+            for sample in demux_runmanifest:
+                lanenr = sample['Lane']
+                project = sample['Project']
+                sample = sample['SampleName']
+                sub_demux_count = sample['sub_demux_count']
+                # Skip PhiX
+                if lanenr == lane and sample != "PhiX":
+                    sample_tuple = (sample, sub_demux_count)
+                    if sample_tuple not in unique_sample_demux:
+                        project_dest = os.path.join(self.run_dir, self.demux_dir, project)
+                        sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample)
+                        if not os.path.exists(project_dest):
+                            os.makedirs(project_dest)
+                        if not os.path.exists(sample_dest):
+                            os.makedirs(sample_dest)
+                        fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample, f"*L00{lane}*.fastq.gz"))
+                        for fastqfile in fastqfiles:
+                            old_name = os.path.basename(fastqfile)
+                            read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1)
+                            new_name = "_".join([sample, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"])
+                            os.symlink(fastqfile, os.path.join(sample_dest, new_name))
+                        unique_sample_demux.add(sample_tuple)
+                        sample_count += 1
+
+
+    # Symplink the output FastQ files of undet only if a lane does not have multiple demux
+    def aggregate_undet_fastq(self, demux_runmanifest):
+
+
+
+    # Aggregate demux results
+    def aggregate_demux_results(self, demux_results_dirs):
+        # In case of single demux
+        if len(demux_results_dirs) == 1:
+            # TODO: Check NoIndex case. Can Base2Fastq generate FastQs for both reads and indexes for NoIndex sample?
+            # Otherwise just softlink contents of Demultplexing_0 into Demultiplexing
+            symlink_demux_dir(demux_results_dirs[0], os.path.join(self.run_dir, self.demux_dir))
+        else:
+            # Ensure the destination directory exists
+            if not os.path.exists(os.path.join(self.run_dir, self.demux_dir):
+                os.makedirs(os.path.join(self.run_dir, self.demux_dir)
+            # Clear all content under dest_dir
+            clear_dir(os.path.join(self.run_dir, self.demux_dir)
+            demux_runmanifest = collect_demux_runmanifest(demux_results_dirs)
+            # Aggregate the output FastQ files of samples from multiple demux
+            aggregate_sample_fastq(demux_runmanifest)
+            # Symplink the output FastQ files of undet only if a lane does not have multiple demux
+            aggregate_undet_fastq(demux_runmanifest)
+            # Aggregate stats in IndexAssignment.csv
+            TBD
+            # Aggregate stats in UnassignedSequences.csv
+            TBD
+            # Aggregate stats in Project_RunStats.json
+            TBD
 
     def upload_demux_results_to_statusdb(self):
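The renaming step in `aggregate_sample_fastq` extracts the read label (R1/R2/I1, etc.) from the Bases2Fastq file name and rebuilds an Illumina-style name. Isolated, with a hypothetical file name, the pattern looks like this; note also that the commit reassigns the loop variable (`sample = sample['SampleName']` overwrites the dict before `sample['sub_demux_count']` is read), so the sketch uses distinct names:

    import re

    old_name = "P1_101_L001_R1_001.fastq.gz"  # hypothetical Bases2Fastq output
    lane = 1
    sample_name = "P1_101"
    read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1)  # -> "R1"
    new_name = "_".join([sample_name, "S1", f"L00{lane}", read_label, "001.fastq.gz"])
    print(new_name)  # P1_101_S1_L001_R1_001.fastq.gz

The `aggregate_demux_results` block above is also still WIP as committed: the three `os.path.join(self.run_dir, self.demux_dir)` calls are missing their closing parenthesis, and `TBD` is a placeholder, not executable code.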
+                project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined")
+                if not os.path.exists(project_dest):
+                    os.makedirs(project_dest)
+                fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", "*.fastq.gz"))
+                for fastqfile in fastqfiles:
+                    base_name = os.path.basename(fastqfile)
+                    os.symlink(fastqfile, os.path.join(project_dest, base_name))

From 04808462636a8d3d89a8bb7293b5c4564f69c671 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 20 Sep 2024 11:38:35 +0200
Subject: [PATCH 056/187] Fix typo

---
 taca/element/Element_Runs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index cb3b9d5b..2183eafa 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -541,7 +541,7 @@ def aggregate_undet_fastq(self, demux_runmanifest):
         lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
         for lane in lanes:
             sub_demux = list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane))
-            if sub_demux == 1:
+            if len(sub_demux) == 1:
                 project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined")
                 if not os.path.exists(project_dest):
                     os.makedirs(project_dest)

From f01333db08d80b47449f7daa0b1c769c2e819419 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 20 Sep 2024 16:16:09 +0200
Subject: [PATCH 057/187] Still WIP

---
 taca/element/Element_Runs.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 2183eafa..67c89975 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -551,6 +551,15 @@ def aggregate_undet_fastq(self, demux_runmanifest):
                     os.symlink(fastqfile, os.path.join(project_dest, base_name))
 
+    # Aggregate
+    def aggregate_stats_unassigned(demux_runmanifest):
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        for lane in lanes:
+            sub_demux = list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane))
+            if len(sub_demux) == 1:
+                unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "UnassignedSequences.csv")
+
+
     # Aggregate demux results
     def aggregate_demux_results(self, demux_results_dirs):
         # In case of single demux

From d118931d47e849f53acbc82f367476f1df56ac0d Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Mon, 23 Sep 2024 11:38:10 +0200
Subject: [PATCH 058/187] Fix config settings

---
 taca/analysis/analysis_element.py |  2 +-
 taca/element/Element_Runs.py      | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 7249964f..209ac163 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -134,7 +134,7 @@ def _process(run):
         return
 
     if given_run:
-        run = Aviti_Run(given_run)
+        run = Aviti_Run(given_run, CONFIG.get("element_analysis"))
         # TODO: Needs to change if more types of Element machines are aquired in the future
         _process(run)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 21f8989a..3c87ba25 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -211,9 +211,9 @@ def get_lims_step_id(self) -> str | None:
     def find_manifest_zip(self):
         # Specify dir in which LIMS drop the manifest zip files
         dir_to_search = os.path.join(
-            self.CONFIG.get("Aviti").get(
-                "manifest_zip_location"
-            ),  # TODO: change and add to taca.yaml
+            self.CONFIG.get("Element", {})
+            .get(self.sequencer_type, {})
+            .get("manifest_zip_location"),  # TODO: add to taca.yaml
             datetime.now().year,
         )
@@ -383,7 +383,7 @@ def make_demux_manifests(
 
     def generate_demux_command(self, run_manifest, demux_dir):
         command = (
-            f"{self.CONFIG.get(self.software)['bin']}"  # TODO: add path to bases2fastq executable to config
+            f"{self.CONFIG.get('bases2fastq')}"  # TODO: add path to bases2fastq executable to config
             + f" {self.run_dir}"
             + f" {demux_dir}"
             + " -p 8"
@@ -523,7 +523,7 @@ def make_transfer_indicator(self):
 
     def transfer(self):
         transfer_details = (
-            self.CONFIG.get("Element").get(self.sequencer_type).get("transfer_details")
+            self.CONFIG.get(self.sequencer_type).get("transfer_details")
         )  # TODO: Add section to taca.yaml
         command = (
             "rsync"
@@ -533,7 +533,7 @@
             + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
             + " --exclude Alignment"
             + f" {self.run_dir}"
-            + f" {transfer_details.get('user')@transfer_details.get('host')}:/"
+            + f" {transfer_details.get('user')@transfer_details.get('host')}:/aviti"
             + "; echo $? > .rsync_exit_status"
         )  # TODO: any other options?

From 73c54a3822827b1b1281bbd48d6385b99b1ca209 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Mon, 23 Sep 2024 14:47:46 +0200
Subject: [PATCH 059/187] Fix UnassignedSequences.csv

---
 taca/element/Element_Runs.py | 64 ++++++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 67c89975..c757fc94 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -550,14 +550,66 @@ def aggregate_undet_fastq(self, demux_runmanifest):
                     base_name = os.path.basename(fastqfile)
                     os.symlink(fastqfile, os.path.join(project_dest, base_name))
 
-    # Aggregate
-    def aggregate_stats_unassigned(demux_runmanifest):
+    # Write to csv
+    def write_to_csv(data, filename):
+        # Get the fieldnames from the keys of the first dictionary
+        fieldnames = data[0].keys()
+        # Open the file and write the CSV
+        with open(filename, mode='w', newline='') as file:
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            # Write the header (fieldnames)
+            writer.writeheader()
+            # Write the data (rows)
+            writer.writerows(data)
+
+
+    # Aggregate stats in UnassignedSequences.csv
+    def aggregate_stats_unassigned(self, demux_runmanifest):
+        aggregated_unassigned_indexes = []
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        for lane in lanes:
+            sub_demux_index_lens = set()
+            for sample in demux_runmanifest:
+                if sample['Lane'] == lane:
+                    sub_demux_index_lens.add((sample['sub_demux_count'], (len(sample.get("Index1", "")), len(sample.get("Index2", "")))))
+            # List of sub-demux with a decreasing order of index lengths
+            sub_demux_list = [x[0] for x in sorted(sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True)]
+            sub_demux_with_max_index_lens = sub_demux_list[0]
+            # Start with the unassigned list with the longest index
+            max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv")
+            with open(max_unassigned_csv, 'r') as max_unassigned_file:
+                reader = csv.DictReader(max_unassigned_file)
+                max_unassigned_indexes = [row for row in reader]
+            # Filter by lane
+            max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane]
+            # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise
+            if len(sub_demux_list) > 1:
+                # Order: from longer to shorter indexes
+                sub_demux_with_shorter_index_lens = sub_demux_list[1:]
+                for sub_demux in sub_demux_with_shorter_index_lens:
+                    unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv")
+                    with open(unassigned_csv, 'r') as unassigned_file:
+                        reader = csv.DictReader(unassigned_file)
+                        unassigned_indexes = [row for row in reader]
+                    # Filter by lane
+                    unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane]
+                    # Remove overlapped indexes from the list of max_unassigned_indexes
+                    idx1_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][0],
+                                              [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][0])
+                    idx2_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][1],
+                                              [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][1])
+                    for unassigned_index in unassigned_indexes:
+                        idx1_overlapped_seq = unassigned_index['I1'][:idx1_overlapped_len]
+                        idx2_overlapped_seq = unassigned_index['I2'][:idx2_overlapped_len]
+                        # Remove the overlapped record from the max_unassigned_indexes list
+                        max_unassigned_indexes = [max_unassigned_index for max_unassigned_index in max_unassigned_indexes if not (max_unassigned_index['I1'][:idx1_overlapped_len] == idx1_overlapped_seq and max_unassigned_index['I2'][:idx2_overlapped_len] == idx2_overlapped_seq)]
+            # Append to the aggregated_unassigned_indexes list
+            aggregated_unassigned_indexes += max_unassigned_indexes
+        # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order
+        aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count'])))
+        # Write to a new UnassignedSequences.csv file under
+        aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv")
+        write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv)
 
     # Aggregate demux results
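The de-overlap logic in PATCH 059 trims both index reads to the shorter of the two demuxes' index lengths before comparing, so only the region both demultiplexings actually share is tested. Reduced to its core, with made-up records:

    def overlaps(long_rec: dict, short_rec: dict, len1: int, len2: int) -> bool:
        # Compare only the first len1 bases of I1 and the first len2 bases of I2,
        # i.e. the index region both sub-demultiplexings share
        return (long_rec["I1"][:len1] == short_rec["I1"][:len1]
                and long_rec["I2"][:len2] == short_rec["I2"][:len2])

    a = {"I1": "ACGTACGT", "I2": "TTGACCGT"}  # longer-index demux record
    b = {"I1": "ACGT", "I2": "TTGA"}          # shorter-index demux record
    print(overlaps(a, b, 4, 4))  # True: a matches b over the shared prefix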
From 5b4f94732cffebf3987952282da17e2a080330dd Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Mon, 23 Sep 2024 16:19:38 +0200
Subject: [PATCH 060/187] Fix IndexAssignment.csv

---
 taca/element/Element_Runs.py | 68 +++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 8 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index c757fc94..3391824f 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -563,6 +563,50 @@ def write_to_csv(data, filename):
             writer.writerows(data)
 
+    # Aggregate stats in IndexAssignment.csv
+    def aggregate_stats_assigned(self, demux_runmanifest):
+        aggregated_assigned_indexes = []
+        sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest)))
+        lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest)))
+        for sub_demux in sub_demux_list:
+            # Read in IndexAssignment.csv
+            assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv")
+            if os.path.exists(assigned_csv):
+                with open(assigned_csv, 'r') as assigned_file:
+                    reader = csv.DictReader(assigned_file)
+                    index_assignment = [row for row in reader]
+                for sample in index_assignment:
+                    if sample['Lane'] in lanes:
+                        sample['sub_demux_count'] = sub_demux
+                        aggregated_assigned_indexes.append(sample)
+            else:
+                logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.")
+        # Remove redundant rows for PhiX
+        aggregated_assigned_indexes_filtered = []
+        unique_phiX_combination = set()
+        for sample in aggregated_assigned_indexes:
+            if sample['SampleName'] == 'PhiX':
+                combination = (sample['I1'], sample['I2'], sample['Lane'])
+                if combination not in unique_phiX_combination:
+                    aggregated_assigned_indexes_filtered.append(sample)
+                    unique_phiX_combination.add(combination)
+            else:
+                aggregated_assigned_indexes_filtered.append(sample)
+        # Sort the list by Lane, SampleName and sub_demux_count
+        aggregated_assigned_indexes_filtered_sorted = sorted(aggregated_assigned_indexes_filtered, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count']))
+        # Fix new sample number based on SampleName and Lane
+        sample_count = 0
+        previous_samplename_lane = ('NA', 'NA')
+        for sample in aggregated_assigned_indexes_filtered_sorted:
+            if (sample['SampleName'], sample['Lane']) != previous_samplename_lane:
+                sample_count += 1
+                previous_samplename_lane = (sample['SampleName'], sample['Lane'])
+            sample['SampleNumber'] = sample_count
+        # Write to a new UnassignedSequences.csv file under demux_dir
+        aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv")
+        write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv)
+
 
     # Aggregate stats in UnassignedSequences.csv
     def aggregate_stats_unassigned(self, demux_runmanifest):
         aggregated_unassigned_indexes = []
@@ -577,9 +621,13 @@
             sub_demux_with_max_index_lens = sub_demux_list[0]
             # Start with the unassigned list with the longest index
             max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv")
-            with open(max_unassigned_csv, 'r') as max_unassigned_file:
-                reader = csv.DictReader(max_unassigned_file)
-                max_unassigned_indexes = [row for row in reader]
+            if os.path.exists(max_unassigned_csv):
+                with open(max_unassigned_csv, 'r') as max_unassigned_file:
+                    reader = csv.DictReader(max_unassigned_file)
+                    max_unassigned_indexes = [row for row in reader]
+            else:
+                logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}.")
+                break
             # Filter by lane
             max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane]
             # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise
Take the full list if there is only one sub-demux otherwise @@ -588,9 +636,13 @@ def aggregate_stats_unassigned(self, demux_runmanifest): sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv") - with open(unassigned_csv, 'r') as unassigned_file: - reader = csv.DictReader(unassigned_file) - unassigned_indexes = [row for row in reader] + if os.path.exists(unassigned_csv): + with open(unassigned_csv, 'r') as unassigned_file: + reader = csv.DictReader(unassigned_file) + unassigned_indexes = [row for row in reader] + else: + logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}.") + continue # Filter by lane unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] # Remove overlapped indexes from the list of max_unassigned_indexes @@ -607,7 +659,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes += max_unassigned_indexes # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) - # Write to a new UnassignedSequences.csv file under + # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) @@ -631,7 +683,7 @@ def aggregate_demux_results(self, demux_results_dirs): # Symplink the output FastQ files of undet only if a lane does not have multiple demux aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - TBD + aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv aggregate_stats_unassigned(demux_runmanifest) # Aggregate stats in Project_RunStats.json From fac9912c8af86273d44cb0eba2c964c535a5492e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 24 Sep 2024 11:35:42 +0200 Subject: [PATCH 061/187] Fix project run stats WIP --- taca/element/Element_Runs.py | 63 ++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 3391824f..b8cf07e3 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -466,6 +466,19 @@ def symlink_demux_dir(src_dir, dest_dir): print(f"Error linking {src_path} to {dest_path}: {e}") + # Write to csv + def write_to_csv(data, filename): + # Get the fieldnames from the keys of the first dictionary + fieldnames = data[0].keys() + # Open the file and write the CSV + with open(filename, mode='w', newline='') as file: + writer = csv.DictWriter(file, fieldnames=fieldnames) + # Write the header (fieldnames) + writer.writeheader() + # Write the data (rows) + writer.writerows(data) + + # Collect demux info into a list of dictionaries # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}] def collect_demux_runmanifest(self, demux_results_dirs): @@ -550,17 +563,38 @@ def aggregate_undet_fastq(self, demux_runmanifest): base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) - # Write to csv - def write_to_csv(data, filename): - # Get the fieldnames from the keys of the 
first dictionary - fieldnames = data[0].keys() - # Open the file and write the CSV - with open(filename, mode='w', newline='') as file: - writer = csv.DictWriter(file, fieldnames=fieldnames) - # Write the header (fieldnames) - writer.writeheader() - # Write the data (rows) - writer.writerows(data) + + # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean + # Note that Element promised that they would include these stats into IndexAssignment.csv + # But for now we have to do this by ourselves in this hard way + def get_project_runstats(self, sub_demux, demux_runmanifest): + project_runstats = [] + project_list = sorted(list(set(sample['Project'] for sample in demux_runmanifest if sample['sub_demux_count']==sub_demux))) + for project in project_list: + project_runstats_json_path = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "Samples", project, f"{project}_RunStats.json") + if os.path.exists(project_runstats_json_path): + with open(project_runstats_json_path) as stats_json: + project_runstats_json = json.load(stats_json) + for sample in project_runstats_json["SampleStats"]: + sample_name = sample["SampleName"] + for occurrence in sample["Occurrences"]: + lane = occurrence["Lane"] + expected_sequence = occurrence["ExpectedSequence"] + percentage_mismatch = occurrence["PercentMismatch"] + percentage_q30 = occurrence["PercentQ30"] + percentage_q40 = occurrence["PercentQ40"] + quality_score_mean = occurrence["QualityScoreMean"] + project_runstats.append({ "SampleName" : sample_name, + "Lane" : lane, + "ExpectedSequence" : expected_sequence, + "PercentMismatch" : percentage_mismatch, + "PercentQ30" : percentage_q30, + "PercentQ40" : percentage_q40, + "QualityScoreMean" : quality_score_mean + }) + else: + continue + return project_runstats # Aggregate stats in IndexAssignment.csv @@ -569,6 +603,10 @@ def aggregate_stats_assigned(self, demux_runmanifest): sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest))) lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) for sub_demux in sub_demux_list: + # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean + # Note that Element promised that they would include these stats into IndexAssignment.csv + # But for now we have to do this by ourselves in this hard way + project_runstats = get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") if os.path.exists(assigned_csv): @@ -686,9 +724,6 @@ def aggregate_demux_results(self, demux_results_dirs): aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv aggregate_stats_unassigned(demux_runmanifest) - # Aggregate stats in Project_RunStats.json - TBD - def upload_demux_results_to_statusdb(self): From 3a0b10a2c7cad860915fc8640492c30faf106bfc Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 24 Sep 2024 13:15:40 +0200 Subject: [PATCH 062/187] bug fixes --- taca/analysis/analysis_element.py | 4 ++-- taca/element/Element_Runs.py | 18 ++++++++++-------- taca/utils/statusdb.py | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 209ac163..9b0b62cf 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -134,7 +134,7 @@ def _process(run): return if given_run: - 
run = Aviti_Run(given_run, CONFIG.get("element_analysis")) + run = Aviti_Run(given_run, CONFIG) # TODO: Needs to change if more types of Element machines are aquired in the future _process(run) @@ -148,7 +148,7 @@ def _process(run): os.path.join(data_dir, "[1-9]*_*_*_*") ) # TODO: adapt to aviti format for run in runs: - runObj = Aviti_Run(run) + runObj = Aviti_Run(run, CONFIG) try: _process(runObj) except: # TODO: chatch error message and print it diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 3c87ba25..88ba3967 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -39,7 +39,7 @@ def __init__(self, run_dir, configuration): "RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( - self.CONFIG.get("Element", {}) + self.CONFIG.get("element_analysis").get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") ) # TODO: change and add to taca.yaml @@ -112,7 +112,8 @@ def parse_run_parameters(self) -> None: ) # Sequencing, wash or prime I believe? self.flowcell_id = run_parameters.get("FlowcellID") self.instrument_name = run_parameters.get("InstrumentName") - self.date = run_parameters.get("Date") + self.date = run_parameters.get("Date")[0:10].replace("-", "") + self.year = self.date[0:4] self.operator_name = run_parameters.get("OperatorName") self.run_parameters_parsed = True @@ -136,6 +137,7 @@ def to_doc_obj(self): instrument_generated_files[os.path.basename(file)] = None doc_obj = { + "name": self.NGI_run_id, "run_path": self.run_dir, "run_status": self.status, "NGI_run_id": self.NGI_run_id, @@ -158,7 +160,7 @@ def check_sequencing_status(self): def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" - demux_dirs = glob.glob(os.path.join(self.run_dir, "Delmultiplexing*")) + demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing*")) finished_count = 0 for demux_dir in demux_dirs: if os.path.exists(self.demux_dir) and not os.path.isfile( @@ -211,10 +213,10 @@ def get_lims_step_id(self) -> str | None: def find_manifest_zip(self): # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( - self.CONFIG.get("Element", {}) + self.CONFIG.get("element_analysis").get("Element", {}) .get(self.sequencer_type, {}) .get("manifest_zip_location"), # TODO: add to taca.yaml - datetime.now().year, + str(self.year), ) # Use LIMS step ID if available, else flowcell ID, to make a query pattern @@ -230,7 +232,7 @@ def find_manifest_zip(self): glob_pattern = f"{dir_to_search}/*{self.flowcell_id}*.zip" # Find paths matching the pattern - glob_results = glob(glob_pattern) + glob_results = glob.glob(glob_pattern) if len(glob_results) == 0: logger.warning( f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." 
@@ -383,7 +385,7 @@ def make_demux_manifests( def generate_demux_command(self, run_manifest, demux_dir): command = ( - f"{self.CONFIG.get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f"{self.CONFIG.get("element_analysis").get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f" {self.run_dir}" + f" {demux_dir}" + " -p 8" @@ -523,7 +525,7 @@ def make_transfer_indicator(self): def transfer(self): transfer_details = ( - self.CONFIG.get(self.sequencer_type).get("transfer_details") + self.CONFIG.get("element_analysis").get(self.sequencer_type).get("transfer_details") ) # TODO: Add section to taca.yaml command = ( "rsync" diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 47620ea2..a2920550 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -210,7 +210,7 @@ def update_doc(db, obj, over_write_db_entry=False): db.save(obj) logger.info("Saving {}".format(obj["name"])) else: - logger.warn("More than one row with name {} found".format(obj["name"])) + logger.warning("More than one row with name {} found".format(obj["name"])) def merge_dicts(d1, d2): From 9e80a4f6be4371d691284222e54045f867699ae3 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 24 Sep 2024 13:54:12 +0200 Subject: [PATCH 063/187] Fix getting demux status --- taca/element/Element_Runs.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 88ba3967..1606df50 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -36,7 +36,7 @@ def __init__(self, run_dir, configuration): self.demux_dir = os.path.join(self.run_dir, "Demultiplexing") self.final_sequencing_file = os.path.join(self.run_dir, "RunUploaded.json") self.demux_stats_file = ( - "RunStats.json" # Assumes demux is finished when this file is created + "*RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( self.CONFIG.get("element_analysis").get("Element", {}) @@ -160,20 +160,17 @@ def check_sequencing_status(self): def get_demultiplexing_status(self): if not os.path.exists(self.demux_dir): return "not started" - demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing*")) + sub_demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing_*")) finished_count = 0 - for demux_dir in demux_dirs: - if os.path.exists(self.demux_dir) and not os.path.isfile( - os.path.join(demux_dir, self.demux_stats_file) - ): + for demux_dir in sub_demux_dirs: + found_demux_stats_file = glob.glob(os.path.join(demux_dir, self.demux_stats_file)) + if not found_demux_stats_file: return "ongoing" - elif os.path.exists(self.demux_dir) and os.path.isfile( - os.path.join(demux_dir, self.demux_stats_file) - ): + elif found_demux_stats_file: finished_count += ( 1 # TODO: check exit status of demux in exit status file ) - if finished_count == len(demux_dirs): + if finished_count == len(sub_demux_dirs): return "finished" else: return "unknown" From feec3217e2f980c4afad5df9d5c39425fb1a964d Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 24 Sep 2024 14:18:42 +0200 Subject: [PATCH 064/187] Finalize scripts and fix bugs --- VERSIONLOG.md | 4 +++ taca/analysis/analysis_element.py | 2 +- taca/element/Element_Runs.py | 57 +++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 73323deb..d0563c9a 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 
20240924.1 + +Aggregate aviti demultiplexing results + ## 20240705.1 Add section header in samplesheet for run folder transfer diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 63cce774..867f13c6 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -72,7 +72,7 @@ def _process(run): transfer_status = run.get_transfer_status() if transfer_status == "not started": demux_results_dirs = glob.glob( - os.path.join(run.run_dir, "Delmultiplexing_*") + os.path.join(run.run_dir, "Demultiplexing_*") ) run.aggregate_demux_results(demux_results_dirs) run.upload_demux_results_to_statusdb() diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index b8cf07e3..fb5de39c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -484,7 +484,7 @@ def write_to_csv(data, filename): def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: - sub_demux_count = demux_dir.split('_')[1] + sub_demux_count = os.path.basename(demux_dir).split('_')[1] with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file: lines = file.readlines() sample_section = False @@ -527,23 +527,23 @@ def aggregate_sample_fastq(self, demux_runmanifest): for sample in demux_runmanifest: lanenr = sample['Lane'] project = sample['Project'] - sample = sample['SampleName'] + sample_name = sample['SampleName'] sub_demux_count = sample['sub_demux_count'] # Skip PhiX - if lanenr == lane and sample != "PhiX": - sample_tuple = (sample, sub_demux_count) + if lanenr == lane and sample_name != "PhiX": + sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample) + sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample_name) if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample, f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample_name, f"*L00{lane}*.fastq.gz")) for fastqfile in fastqfiles: old_name = os.path.basename(fastqfile) read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1) - new_name = "_".join([sample, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) + new_name = "_".join([sample_name, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) os.symlink(fastqfile, os.path.join(sample_dest, new_name)) unique_sample_demux.add(sample_tuple) sample_count += 1 @@ -558,7 +558,7 @@ def aggregate_undet_fastq(self, demux_runmanifest): project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined") if not os.path.exists(project_dest): os.makedirs(project_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", "*.fastq.gz")) + fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", f"*L00{lane}*.fastq.gz")) for fastqfile in fastqfiles: base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) @@ -585,7 +585,7 @@ def get_project_runstats(self, sub_demux, demux_runmanifest): percentage_q40 = 
occurrence["PercentQ40"] quality_score_mean = occurrence["QualityScoreMean"] project_runstats.append({ "SampleName" : sample_name, - "Lane" : lane, + "Lane" : str(lane), "ExpectedSequence" : expected_sequence, "PercentMismatch" : percentage_mismatch, "PercentQ30" : percentage_q30, @@ -615,7 +615,12 @@ def aggregate_stats_assigned(self, demux_runmanifest): index_assignment = [row for row in reader] for sample in index_assignment: if sample['Lane'] in lanes: + project_runstats_sample = [d for d in project_runstats if d['SampleName'] == sample['SampleName'] and d['Lane'] == sample['Lane'] and d['ExpectedSequence'] == sample['I1']+sample['I2']] sample['sub_demux_count'] = sub_demux + sample['PercentMismatch'] = project_runstats_sample[0]['PercentMismatch'] + sample['PercentQ30'] = project_runstats_sample[0]['PercentQ30'] + sample['PercentQ40'] = project_runstats_sample[0]['PercentQ40'] + sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] aggregated_assigned_indexes.append(sample) else: logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.") @@ -704,26 +709,20 @@ def aggregate_stats_unassigned(self, demux_runmanifest): # Aggregate demux results def aggregate_demux_results(self, demux_results_dirs): - # In case of single demux - if len(demux_results_dirs) == 1: - # TODO: Check NoIndex case. Can Base2Fastq generate FastQs for both reads and indexes for NoIndex sample? - # Otherwise just softlink contents of Demultplexing_0 into Demultiplexing - symlink_demux_dir(demux_results_dirs[0], os.path.join(self.run_dir, self.demux_dir)) - else: - # Ensure the destination directory exists - if not os.path.exists(os.path.join(self.run_dir, self.demux_dir): - os.makedirs(os.path.join(self.run_dir, self.demux_dir) - # Clear all content under dest_dir - clear_dir(os.path.join(self.run_dir, self.demux_dir) - demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) - # Aggregate the output FastQ files of samples from multiple demux - aggregate_sample_fastq(demux_runmanifest) - # Symplink the output FastQ files of undet only if a lane does not have multiple demux - aggregate_undet_fastq(demux_runmanifest) - # Aggregate stats in IndexAssignment.csv - aggregate_stats_assigned(demux_runmanifest) - # Aggregate stats in UnassignedSequences.csv - aggregate_stats_unassigned(demux_runmanifest) + # Ensure the destination directory exists + if not os.path.exists(os.path.join(self.run_dir, self.demux_dir)): + os.makedirs(os.path.join(self.run_dir, self.demux_dir)) + # Clear all content under dest_dir + clear_dir(os.path.join(self.run_dir, self.demux_dir)) + demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) + # Aggregate the output FastQ files of samples from multiple demux + aggregate_sample_fastq(demux_runmanifest) + # Symplink the output FastQ files of undet only if a lane does not have multiple demux + aggregate_undet_fastq(demux_runmanifest) + # Aggregate stats in IndexAssignment.csv + aggregate_stats_assigned(demux_runmanifest) + # Aggregate stats in UnassignedSequences.csv + aggregate_stats_unassigned(demux_runmanifest) def upload_demux_results_to_statusdb(self): From 23e9b6407627f0603d88da9da2002e191dfe0ef7 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 25 Sep 2024 09:37:21 +0200 Subject: [PATCH 065/187] Bug fixes --- taca/analysis/analysis_element.py | 6 +++--- taca/element/Element_Runs.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/taca/analysis/analysis_element.py 
From 23e9b6407627f0603d88da9da2002e191dfe0ef7 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Wed, 25 Sep 2024 09:37:21 +0200
Subject: [PATCH 065/187] Bug fixes

---
 taca/analysis/analysis_element.py | 6 +++---
 taca/element/Element_Runs.py      | 9 +++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 9b0b62cf..b469a5af 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -85,7 +85,7 @@ def _process(run):
             demux_results_dirs = glob.glob(
                 os.path.join(run.run_dir, "Delmultiplexing*")
             )
-            if len(demux_results_dirs > 1):
+            if len(demux_results_dirs) > 1:
                 run.aggregate_demux_results(demux_results_dirs)
             run.sync_metadata()
             run.make_transfer_indicator()
@@ -93,13 +93,13 @@ def _process(run):
         if run.status_changed:
             run.update_statusdb()
         # TODO: Also update statusdb with a timestamp of when the transfer started
-        run.transfer()  # I think this should be a detached command as well
+        run.transfer()
         return
     elif transfer_status == "ongoing":
         run.status = "transferring"
         if run.status_changed:
             run.update_statusdb()
-        logger.info(f"{run} is being transferred. Skipping.")
+        logger.info(f"{run} is being transferred. Skipping.")  # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred"
         return
     elif transfer_status == "rsync done":
         if run.rsync_successful():
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 1606df50..7efce962 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -441,7 +441,7 @@ def rsync_complete(self):
     def rsync_successful(self):
         with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file:
             rsync_exit_status = rsync_exit_file.readlines()
-        if rsync_exit_status[0].strip() == 0:
+        if rsync_exit_status[0].strip() == '0':
             return True
         else:
             return False
@@ -522,7 +522,7 @@ def make_transfer_indicator(self):
 
     def transfer(self):
         transfer_details = (
-            self.CONFIG.get("element_analysis").get(self.sequencer_type).get("transfer_details")
+            self.CONFIG.get("element_analysis").get("transfer_details")
         )  # TODO: Add section to taca.yaml
         command = (
             "rsync"
@@ -532,8 +532,8 @@ def transfer(self):
             + " --exclude BaseCalls"  # TODO: check that we actually want to exclude these
             + " --exclude Alignment"
             + f" {self.run_dir}"
-            + f" {transfer_details.get('user')@transfer_details.get('host')}:/aviti"
-            + "; echo $? > .rsync_exit_status"
+            + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/aviti"
+            + f"; echo $? > {os.path.join(self.run_dir, ".rsync_exit_status")}"
         )  # TODO: any other options?
try: p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) @@ -568,3 +568,4 @@ def archive(self): src = self.run_dir dst = os.path.join(self.run_dir, os.pardir, "nosync") shutil.move(src, dst) + self.run_dir = From ab99114bad809b76c287bd09d95695545174c640 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 08:49:51 +0200 Subject: [PATCH 066/187] Update taca/element/Element_Runs.py Co-authored-by: Johannes Alneberg --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 4dd1684b..58a6b646 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -567,7 +567,7 @@ def aggregate_sample_fastq(self, demux_runmanifest): sample_count += 1 - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) for lane in lanes: From cf64192b549440c43f2e6b54d79032855edae296 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 08:50:17 +0200 Subject: [PATCH 067/187] Update taca/element/Element_Runs.py Co-authored-by: Johannes Alneberg --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 58a6b646..e3d25989 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -735,7 +735,7 @@ def aggregate_demux_results(self, demux_results_dirs): demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) # Aggregate the output FastQ files of samples from multiple demux aggregate_sample_fastq(demux_runmanifest) - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv aggregate_stats_assigned(demux_runmanifest) From 2559eccae794230e602743311f0ac0533cbfdc01 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 08:52:31 +0200 Subject: [PATCH 068/187] Update taca/element/Element_Runs.py --- taca/element/Element_Runs.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e3d25989..ee9a2cc2 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -459,29 +459,6 @@ def clear_dir(dir): print(f"Failed to delete {file_path} Reason {e}") # Create symlink for a simple demultiplexing dir - def symlink_demux_dir(src_dir, dest_dir): - # Ensure the destination directory exists - if not os.path.exists(dest_dir): - os.makedirs(dest_dir) - # Clear all content under dest_dir - clear_dir(dest_dir) - # Loop through all files and directories in the source directory - for item in os.listdir(src_dir): - src_path = os.path.join(src_dir, item) - # Move content of Samples to the parental dir - if item == "Samples": - dest_path = dest_dir - else: - dest_path = os.path.join(dest_dir, item) - try: - # Create symbolic link only if it doesn't already exist - if not os.path.exists(dest_path): - os.symlink(src_path, dest_path) - print(f"Linked {src_path} to {dest_path}") - else: - print(f"{dest_path} already exists.") - except OSError as e: - print(f"Error linking {src_path} to 
{dest_path}: {e}") # Write to csv From 9627961ceb6ac295ce69fd1ff56744f197605920 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Thu, 26 Sep 2024 08:53:08 +0200 Subject: [PATCH 069/187] Remove unused function --- taca/element/Element_Runs.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index fb5de39c..ad977c9f 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -440,31 +440,6 @@ def clear_dir(dir): except Exception as e: print(f"Failed to delete {file_path} Reason {e}") - # Create symlink for a simple demultiplexing dir - def symlink_demux_dir(src_dir, dest_dir): - # Ensure the destination directory exists - if not os.path.exists(dest_dir): - os.makedirs(dest_dir) - # Clear all content under dest_dir - clear_dir(dest_dir) - # Loop through all files and directories in the source directory - for item in os.listdir(src_dir): - src_path = os.path.join(src_dir, item) - # Move content of Samples to the parental dir - if item == "Samples": - dest_path = dest_dir - else: - dest_path = os.path.join(dest_dir, item) - try: - # Create symbolic link only if it doesn't already exist - if not os.path.exists(dest_path): - os.symlink(src_path, dest_path) - print(f"Linked {src_path} to {dest_path}") - else: - print(f"{dest_path} already exists.") - except OSError as e: - print(f"Error linking {src_path} to {dest_path}: {e}") - # Write to csv def write_to_csv(data, filename): @@ -549,7 +524,7 @@ def aggregate_sample_fastq(self, demux_runmanifest): sample_count += 1 - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) for lane in lanes: @@ -717,7 +692,7 @@ def aggregate_demux_results(self, demux_results_dirs): demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) # Aggregate the output FastQ files of samples from multiple demux aggregate_sample_fastq(demux_runmanifest) - # Symplink the output FastQ files of undet only if a lane does not have multiple demux + # Symlink the output FastQ files of undet only if a lane does not have multiple demux aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv aggregate_stats_assigned(demux_runmanifest) From 8a9ce0f4dce83934511466eec234bc296e120dd6 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 10:12:01 +0200 Subject: [PATCH 070/187] fix references to functions --- taca/element/Element_Runs.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index ee9a2cc2..8dee9732 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -458,9 +458,6 @@ def clear_dir(dir): except Exception as e: print(f"Failed to delete {file_path} Reason {e}") - # Create symlink for a simple demultiplexing dir - - # Write to csv def write_to_csv(data, filename): # Get the fieldnames from the keys of the first dictionary @@ -601,7 +598,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this 
hard way - project_runstats = get_project_runstats(sub_demux, demux_runmanifest) + project_runstats = self.get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") if os.path.exists(assigned_csv): @@ -642,7 +639,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): sample['SampleNumber'] = sample_count # Write to a new UnassignedSequences.csv file under demux_dir aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv") - write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) + self.write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) # Aggregate stats in UnassignedSequences.csv @@ -699,7 +696,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") - write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) + self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) # Aggregate demux results @@ -708,16 +705,16 @@ def aggregate_demux_results(self, demux_results_dirs): if not os.path.exists(os.path.join(self.run_dir, self.demux_dir)): os.makedirs(os.path.join(self.run_dir, self.demux_dir)) # Clear all content under dest_dir - clear_dir(os.path.join(self.run_dir, self.demux_dir)) - demux_runmanifest = collect_demux_runmanifest(demux_results_dirs) + self.clear_dir(os.path.join(self.run_dir, self.demux_dir)) + demux_runmanifest = self.collect_demux_runmanifest(demux_results_dirs) # Aggregate the output FastQ files of samples from multiple demux - aggregate_sample_fastq(demux_runmanifest) + self.aggregate_sample_fastq(demux_runmanifest) # Symlink the output FastQ files of undet only if a lane does not have multiple demux - aggregate_undet_fastq(demux_runmanifest) + self.aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - aggregate_stats_assigned(demux_runmanifest) + self.aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv - aggregate_stats_unassigned(demux_runmanifest) + self.aggregate_stats_unassigned(demux_runmanifest) def upload_demux_results_to_statusdb(self): doc_obj = self.db.get_db_entry(self.NGI_run_id) @@ -829,4 +826,4 @@ def archive(self): src = self.run_dir dst = os.path.join(self.run_dir, os.pardir, "nosync") shutil.move(src, dst) - self.run_dir = + self.run_dir = os.path.join(dst, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb From 80d03c443061e0b07aaaad6072775012e7fb88cf Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 13:58:31 +0200 Subject: [PATCH 071/187] make config to use for tests --- tests/element/test_Element_Runs.py | 47 ++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 1100aff9..5196e678 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -7,6 +7,17 @@ from taca.element import Element_Runs as to_test +CONFIG = { + "element_analysis": { + "Element": { + "GenericElement": { + "demux_dir": "mock_demux_dir_path", + "transfer_log": 
"mock_transfer_log_file.log", + }, + }, + }, +} + def create_element_run_dir( tmp: tempfile.TemporaryDirectory, @@ -15,6 +26,7 @@ def create_element_run_dir( run_finished: bool = True, sync_finished: bool = True, demux_dir: bool = True, + n_demux_subdirs: int = 1, demux_done: bool = True, outcome_completed: bool = True, ) -> str: @@ -27,8 +39,12 @@ def create_element_run_dir( ├── RunParameters.json ├── RunUploaded.json ├── .sync_finished - └── Demultiplexing - └── RunStats.json + ├── Demultiplexing + ├── Demultiplexing_0 + | └── RunStats.json + ├── Demultiplexing_1 + | └── RunStats.json + └── ... """ @@ -53,9 +69,18 @@ def create_element_run_dir( if demux_dir: os.mkdir(os.path.join(run_path, "Demultiplexing")) - - if demux_done: - open(os.path.join(run_path, "Demultiplexing", "RunStats.json"), "w").close() + for i in range(n_demux_subdirs): + os.mkdir(os.path.join(run_path, "Demultiplexing", f"Demultiplexing_{i}")) + if demux_done: + open( + os.path.join( + run_path, + "Demultiplexing", + f"Demultiplexing_{i}", + "RunStats.json", + ), + "w", + ).close() return run_path @@ -66,7 +91,7 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs run_dir = create_element_run_dir(tmp) - run = to_test.Run(run_dir, {}) + run = to_test.Run(run_dir, CONFIG) assert run.run_dir == run_dir @pytest.mark.parametrize( @@ -92,7 +117,7 @@ def test_check_sequencing_status( run_finished=p["run_finished"], outcome_completed=p["outcome_completed"], ), - {}, + CONFIG, ) assert run.check_sequencing_status() is p["expected"] @@ -110,15 +135,13 @@ def test_get_demultiplexing_status( ): tmp: tempfile.TemporaryDirectory = create_dirs - if p["demux_dir"] and not p["demux_done"]: - run = to_test.Run( create_element_run_dir( tmp, demux_dir=p["demux_dir"], demux_done=p["demux_done"], ), - {}, + CONFIG, ) assert run.get_demultiplexing_status() == p["expected"] @@ -140,7 +163,7 @@ def test_manifest_exists( tmp, run_finished=p["run_finished"], ), - {}, + CONFIG, ) assert run.manifest_exists() == p["expected"] @@ -155,7 +178,7 @@ def test_start_demux(self, mock_db, create_dirs): "taca.element.Element_Runs.Run.generate_demux_command" ) as mock_command: mock_command.return_value = "test command" - run = to_test.Run(create_element_run_dir(create_dirs), {}) + run = to_test.Run(create_element_run_dir(create_dirs), CONFIG) run.start_demux() mock_command.assert_called_once() mock_call.assert_called_once_with( From 7936e8fb325a45466db030dd7fa1925c7b548744 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 14:12:18 +0200 Subject: [PATCH 072/187] add config --- tests/element/test_Aviti_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py index bf32089c..3c61276a 100644 --- a/tests/element/test_Aviti_Runs.py +++ b/tests/element/test_Aviti_Runs.py @@ -4,7 +4,7 @@ import pytest from taca.element import Aviti_Runs as to_test -from tests.element.test_Element_Runs import create_element_run_dir +from tests.element.test_Element_Runs import CONFIG, create_element_run_dir class TestAviti_Run: @@ -16,6 +16,6 @@ def test_init(self, create_dirs: pytest.fixture): mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") mock_db.start() - run = to_test.Aviti_Run(run_dir, {}) + run = to_test.Aviti_Run(run_dir, CONFIG) assert run.run_dir == run_dir assert run.sequencer_type == "Aviti" From 41dd4777a56e8d7978c1d98da69923cd85ab49ad Mon Sep 17 00:00:00 2001 From: kedhammar Date: 
Thu, 26 Sep 2024 14:14:01 +0200 Subject: [PATCH 073/187] bugfix --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 8dee9732..40beb05c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -382,7 +382,7 @@ def make_demux_manifests( def generate_demux_command(self, run_manifest, demux_dir): command = ( - f"{self.CONFIG.get("element_analysis").get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f"{self.CONFIG.get('element_analysis').get('bases2fastq')}" # TODO: add path to bases2fastq executable to config + f" {self.run_dir}" + f" {demux_dir}" + " -p 8" @@ -791,7 +791,7 @@ def transfer(self): + " --exclude Alignment" + f" {self.run_dir}" + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/aviti" - + f"; echo $? > {os.path.join(self.run_dir, ".rsync_exit_status")}" + + f"; echo $? > {os.path.join(self.run_dir, '.rsync_exit_status')}" ) # TODO: any other options? try: p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) From 48ec3438925e4c4cdfa625ac9c122067a8dd2d51 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Thu, 26 Sep 2024 15:24:42 +0200 Subject: [PATCH 074/187] Upload more stuff to statusdb --- taca/analysis/analysis_element.py | 1 - taca/element/Element_Runs.py | 126 ++++++++++++++++-------------- 2 files changed, 68 insertions(+), 59 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index ab142d69..1d2f4d3f 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -84,7 +84,6 @@ def _process(run): os.path.join(run.run_dir, "Delmultiplexing_*") ) run.aggregate_demux_results(demux_results_dirs) - run.upload_demux_results_to_statusdb() run.sync_metadata() run.make_transfer_indicator() run.status = "transferring" diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 8dee9732..0d48edf5 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -42,12 +42,12 @@ def __init__(self, run_dir, configuration): self.CONFIG.get("element_analysis").get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") - ) # TODO: change and add to taca.yaml + ) # TODO: add to taca.yaml self.rsync_exit_file = os.path.join(self.run_dir, ".rsync_exit_status") # Instrument generated files self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") - self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_stats_file = os.path.join(self.run_dir, "AvitiRunStats.json") self.run_manifest_file_from_instrument = os.path.join( self.run_dir, "RunManifest.json" ) @@ -135,13 +135,64 @@ def to_doc_obj(self): ) else: instrument_generated_files[os.path.basename(file)] = None + # Aggregated demux stats files + index_assignement_file = os.path.join( + self.run_dir, "Demultiplexing", "IndexAssignment.csv" + ) + if os.path.exists(index_assignement_file): + with open(index_assignement_file) as index_file: + reader = csv.DictReader(index_file) + index_assignments = [row for row in reader] + else: + index_assignments = None + + unassigned_sequences_file = os.path.join( + self.run_dir, "Demultiplexing", "UnassignedSequences.csv" + ) + if os.path.exists(unassigned_sequences_file): + with open(unassigned_sequences_file) as unassigned_file: + reader = csv.DictReader(unassigned_file) + unassigned_sequences = [row for row in reader] + else: + unassigned_sequences = None 
+        demultiplex_stats = {
+            "Demultiplex_Stats": {
+                "Index_Assignment": index_assignments,
+                "Unassigned_Sequences": unassigned_sequences,
+            }
+        }
+
+        demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command")
+        if os.path.exists(demux_command_file):
+            with open(demux_command_file) as command_file:
+                demux_command = command_file.readlines()[0]
+        else:
+            demux_command = None
+        demux_version_file = os.path.join(self.run_dir,"Demultiplexing_0", "RunStats.json")
+        if os.path.exists(demux_version_file):
+            with open(demux_version_file) as json_file:
+                demux_info = json.load(
+                    json_file
+                )
+            demux_version = demux_info.get("AnalysisVersion")
+        else:
+            demux_version = None
+
+        software_info = {
+            "Version": demux_version,
+            "bin": self.CONFIG.get("element_analysis").get("bases2fastq"),
+            "options": demux_command,
+        }
+
         doc_obj = {
             "name": self.NGI_run_id,
             "run_path": self.run_dir,
             "run_status": self.status,
             "NGI_run_id": self.NGI_run_id,
             "instrument_generated_files": instrument_generated_files,
+            "Element": demultiplex_stats,
+            "Software": software_info,
         }
 
         return doc_obj
@@ -390,6 +441,8 @@ def generate_demux_command(self, run_manifest, demux_dir):
             + " --legacy-fastq"  # TODO: except if Smart-seq3
             + " --force-index-orientation"
         )  # TODO: any other options?
+        with open(os.path.join(self.run_dir, '.bases2fastq_command'), 'w') as command_file:
+            command_file.write(command)
         return command
 
     def start_demux(self, run_manifest, demux_dir):
@@ -716,60 +769,6 @@ def aggregate_demux_results(self, demux_results_dirs):
         # Aggregate stats in UnassignedSequences.csv
         self.aggregate_stats_unassigned(demux_runmanifest)
 
-    def upload_demux_results_to_statusdb(self):
-        doc_obj = self.db.get_db_entry(self.NGI_run_id)
-        index_assignement_file = os.path.join(
-            self.run_dir, "Demultiplexing", "IndexAssignment.csv"
-        )
-        with open(index_assignement_file) as index_file:
-            reader = csv.DictReader(index_file)
-            index_assignments = [row for row in reader]
-        unassigned_sequences_file = os.path.join(
-            self.run_dir, "Demultiplexing", "UnassignedSequences.csv"
-        )
-        with open(unassigned_sequences_file) as unassigned_file:
-            reader = csv.DictReader(unassigned_file)
-            unassigned_sequences = [row for row in reader]
-        dirs = os.scandir("Demultiplexing")
-        project_dirs = []
-        for directory in dirs:
-            if os.path.isdir(directory.path) and "Unassigned" not in directory.path:
-                project_dirs.append(directory.path)
-        for project_dir in project_dirs:  # TODO: remove this block when q30 is added to IndexAssignment.csv by Element
-            run_stats_file = glob.glob(os.path.join(project_dir, "*_RunStats.json"))
-            with open(run_stats_file) as stats_json:
-                project_sample_stats_raw = json.load(stats_json)
-            collected_sample_stats = {}
-            for sample_stats in project_sample_stats_raw["SampleStats"]:
-                sample_name = sample_stats["SampleName"]
-                percent_q30 = sample_stats["PercentQ30"]
-                quality_score_mean = sample_stats["QualityScoreMean"]
-                percent_mismatch = sample_stats["PercentMismatch"]
-                collected_sample_stats[sample_name] = {
-                    "PercentQ30": percent_q30,
-                    "QualityScoreMean": quality_score_mean,
-                    "PercentMismatch": percent_mismatch,
-                }
-            for assignment in index_assignments:
-                sample = assignment.get("SampleName")
-                if sample != "PhiX":
-                    sample_stats_to_add = collected_sample_stats.get(sample)
-                    assignment["PercentQ30"] = sample_stats_to_add.get("PercentQ30")
-                    assignment["QualityScoreMean"] = sample_stats_to_add.get(
-                        "QualityScoreMean"
-                    )
-                    assignment["PercentMismatch"] = sample_stats_to_add.get(
-                        "PercentMismatch"
-                    )
-
-        
demultiplex_stats = { - "Demultiplex_Stats": { - "Index_Assignment": index_assignments, - "Unassigned_Sequences": unassigned_sequences, - } - } - doc_obj["Aviti"] = demultiplex_stats - self.db.upload_to_statusdb(doc_obj) - def sync_metadata(self): # TODO: copy metadata from demuxed run to ngi-nas-ns pass @@ -821,9 +820,20 @@ def update_transfer_log(self): logger.error(msg) raise OSError(msg) + def update_paths_after_archiving(self, new_location): + self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") + self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_manifest_file_from_instrument = os.path.join( + self.run_dir, "RunManifest.json" + ) + self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json") + # TODO: also update location of demux files? + def archive(self): """Move directory to nosync.""" src = self.run_dir - dst = os.path.join(self.run_dir, os.pardir, "nosync") + parent_dir = Path(self.run_dir).parent.absolute() + dst = os.path.join(parent_dir, "nosync") shutil.move(src, dst) - self.run_dir = os.path.join(dst, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.update_paths_after_archiving(dst) From d16ef8f0e94e88ad9ff3ff8dbf5c6d8c97820bf7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:28:58 +0200 Subject: [PATCH 075/187] fix formatting of .toml and scale down defaults args of pytest to enable IDE debugging --- pyproject.toml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cf0d04c8..f9ceff6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,3 @@ -title = "taca" - # === LINTING ================================================================ [tool.ruff] @@ -37,7 +35,12 @@ filterwarnings = [ 'ignore::DeprecationWarning:couchdb.*', 'ignore::DeprecationWarning:pkg_resources.*', ] -addopts = "--cov=./taca --cov-report term-missing -vv --cache-clear tests/" +# Default addopts +addopts = "--ignore tests_old/ " #--cov=./taca --cov-report=xml -vv" + +# CLI coverage reports, messes with IDE debugging +#addopts = "--ignore tests_old/ --cov=./taca --cov-report=xml -vv" + [tool.coverage.run] # The comment "# pragma: no cover" can be used to exclude a line from coverage From 04001fd5f4a82f609d78627d8a3eda2697e6d8ed Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:29:13 +0200 Subject: [PATCH 076/187] add aviti transfer log --- tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index c26d4c03..e9a3fd89 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ def create_dirs(): │ ├── transfer_minion_qc.tsv │ ├── transfer_minion.tsv │ └── transfer_promethion.tsv + │ └── transfer_aviti.tsv │ └── transfer.tsv │ └── taca.log ├── miarka @@ -95,6 +96,7 @@ def create_dirs(): open(f"{tmp.name}/log/transfer_promethion.tsv", "w").close() open(f"{tmp.name}/log/transfer_minion.tsv", "w").close() open(f"{tmp.name}/log/transfer_minion_qc.tsv", "w").close() + open(f"{tmp.name}/log/transfer_aviti.tsv", "w").close() open(f"{tmp.name}/log/transfer.tsv", "w").close() open(f"{tmp.name}/log/taca.log", "w").close() From bcec98fda552c2a98ebfcecf409d475d66bd3717 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:29:41 +0200 Subject: [PATCH 077/187] build 
config from tmp dir and update tests to pass --- tests/element/test_Aviti_Runs.py | 4 +- tests/element/test_Element_Runs.py | 67 +++++++++++++++++------------- 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/tests/element/test_Aviti_Runs.py b/tests/element/test_Aviti_Runs.py index 3c61276a..62d142bc 100644 --- a/tests/element/test_Aviti_Runs.py +++ b/tests/element/test_Aviti_Runs.py @@ -4,7 +4,7 @@ import pytest from taca.element import Aviti_Runs as to_test -from tests.element.test_Element_Runs import CONFIG, create_element_run_dir +from tests.element.test_Element_Runs import create_element_run_dir, get_config class TestAviti_Run: @@ -16,6 +16,6 @@ def test_init(self, create_dirs: pytest.fixture): mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") mock_db.start() - run = to_test.Aviti_Run(run_dir, CONFIG) + run = to_test.Aviti_Run(run_dir, get_config(tmp)) assert run.run_dir == run_dir assert run.sequencer_type == "Aviti" diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 5196e678..af373891 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -7,16 +7,19 @@ from taca.element import Element_Runs as to_test -CONFIG = { - "element_analysis": { - "Element": { - "GenericElement": { - "demux_dir": "mock_demux_dir_path", - "transfer_log": "mock_transfer_log_file.log", + +def get_config(tmp: tempfile.TemporaryDirectory) -> dict: + config = { + "element_analysis": { + "Element": { + "GenericElement": { + "manifest_zip_location": f"{tmp}/ngi-nas-ns/samplesheets/Aviti", + "transfer_log": f"{tmp}/log/transfer_aviti.tsv", + }, }, }, - }, -} + } + return config def create_element_run_dir( @@ -40,11 +43,11 @@ def create_element_run_dir( ├── RunUploaded.json ├── .sync_finished ├── Demultiplexing - ├── Demultiplexing_0 - | └── RunStats.json - ├── Demultiplexing_1 - | └── RunStats.json - └── ... + ├── Demultiplexing_0 + | └── RunStats.json + ├── Demultiplexing_1 + | └── RunStats.json + └── ... 
""" @@ -69,13 +72,21 @@ def create_element_run_dir( if demux_dir: os.mkdir(os.path.join(run_path, "Demultiplexing")) + if demux_done: + open( + os.path.join( + run_path, + f"Demultiplexing", + "RunStats.json", + ), + "w", + ).close() for i in range(n_demux_subdirs): - os.mkdir(os.path.join(run_path, "Demultiplexing", f"Demultiplexing_{i}")) + os.mkdir(os.path.join(run_path, f"Demultiplexing_{i}")) if demux_done: open( os.path.join( run_path, - "Demultiplexing", f"Demultiplexing_{i}", "RunStats.json", ), @@ -91,7 +102,7 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): tmp: tempfile.TemporaryDirectory = create_dirs run_dir = create_element_run_dir(tmp) - run = to_test.Run(run_dir, CONFIG) + run = to_test.Run(run_dir, get_config(tmp)) assert run.run_dir == run_dir @pytest.mark.parametrize( @@ -117,7 +128,7 @@ def test_check_sequencing_status( run_finished=p["run_finished"], outcome_completed=p["outcome_completed"], ), - CONFIG, + get_config(tmp), ) assert run.check_sequencing_status() is p["expected"] @@ -141,10 +152,12 @@ def test_get_demultiplexing_status( demux_dir=p["demux_dir"], demux_done=p["demux_done"], ), - CONFIG, + get_config(tmp), ) + assert run.get_demultiplexing_status() == p["expected"] + @pytest.mark.skip(reason="Not implemented yet") @pytest.mark.parametrize( "p", [ @@ -163,8 +176,9 @@ def test_manifest_exists( tmp, run_finished=p["run_finished"], ), - CONFIG, + get_config(tmp), ) + assert run.manifest_exists() == p["expected"] @pytest.mark.skip(reason="Not implemented yet") @@ -172,18 +186,15 @@ def test_generate_demux_command(self, mock_db): pass def test_start_demux(self, mock_db, create_dirs): - with mock.patch( - "taca.utils.misc.call_external_command_detached" - ) as mock_call, mock.patch( + tmp: tempfile.TemporaryDirectory = create_dirs + with mock.patch("subprocess.Popen") as mock_Popen, mock.patch( "taca.element.Element_Runs.Run.generate_demux_command" ) as mock_command: mock_command.return_value = "test command" - run = to_test.Run(create_element_run_dir(create_dirs), CONFIG) - run.start_demux() - mock_command.assert_called_once() - mock_call.assert_called_once_with( - "test command", with_log_files=True, prefix="demux_" - ) + run = to_test.Run(create_element_run_dir(create_dirs), get_config(tmp)) + run.start_demux("mock_run_manifest", "mock_demux_dir") + mock_command.assert_called_once_with("mock_run_manifest", "mock_demux_dir") + mock_Popen.assert_called_once() @pytest.mark.skip(reason="Not implemented yet") def test_is_transferred(self, mock_db, create_dirs): From eadc072738901898947c62bdfbbacd0934ecae5f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:30:33 +0200 Subject: [PATCH 078/187] remove comment --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f9ceff6f..4c9bfa38 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ filterwarnings = [ 'ignore::DeprecationWarning:pkg_resources.*', ] # Default addopts -addopts = "--ignore tests_old/ " #--cov=./taca --cov-report=xml -vv" +addopts = "--ignore tests_old/" # CLI coverage reports, messes with IDE debugging #addopts = "--ignore tests_old/ --cov=./taca --cov-report=xml -vv" From 4e1c568359c6f5db33e2010159d2f8e6c1a31c71 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 15:33:04 +0200 Subject: [PATCH 079/187] fix full command with inline results --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 
4c9bfa38..d5d152b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ filterwarnings = [ addopts = "--ignore tests_old/" # CLI coverage reports, messes with IDE debugging -#addopts = "--ignore tests_old/ --cov=./taca --cov-report=xml -vv" +# pytest --ignore tests_old/ --cov=./taca --cov-report term-missing -vv [tool.coverage.run] From a0696b15b9d908ab5efc0c922962eb4151e68ddf Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 26 Sep 2024 17:14:52 +0200 Subject: [PATCH 080/187] bugfix --- tests/element/test_Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index af373891..9eb264b1 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -13,8 +13,8 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: "element_analysis": { "Element": { "GenericElement": { - "manifest_zip_location": f"{tmp}/ngi-nas-ns/samplesheets/Aviti", - "transfer_log": f"{tmp}/log/transfer_aviti.tsv", + "manifest_zip_location": f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti", + "transfer_log": f"{tmp.name}/log/transfer_aviti.tsv", }, }, }, From 891595d66cbdc7ef86e69d40c2e87341a7dbb26c Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 27 Sep 2024 15:56:34 +0200 Subject: [PATCH 081/187] Add project name in IndexAssignment; Correct index percentage in undet --- VERSIONLOG.md | 5 +++++ taca/analysis/analysis_element.py | 6 +++--- taca/element/Element_Runs.py | 21 ++++++++++++++++++--- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index d0563c9a..bc899c3c 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,10 @@ # TACA Version Log +## 20240927.1 + +Add project name in IndexAssignment; +Correct index percentage in undet + ## 20240924.1 Aggregate aviti demultiplexing results diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 1d2f4d3f..7109a027 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -70,7 +70,7 @@ def _process(run): if run.status_changed: run.update_statusdb() return - + elif demultiplexing_status != "finished": logger.warning( f"Unknown demultiplexing status {demultiplexing_status} of run {run}. 
Please investigate"
         )
@@ -81,7 +81,7 @@ def _process(run):
         transfer_status = run.get_transfer_status()
         if transfer_status == "not started":
             demux_results_dirs = glob.glob(
-                os.path.join(run.run_dir, "Delmultiplexing_*")
+                os.path.join(run.run_dir, "Demultiplexing_*")
            )
             run.aggregate_demux_results(demux_results_dirs)
             run.sync_metadata()
             run.make_transfer_indicator()
@@ -107,7 +107,7 @@ def _process(run):
             run.update_statusdb()
         run.archive()
         run.status = "archived"
-
+
     if run.status_changed:
         run.update_statusdb()
     else:
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 397ab0ee..eaab13f6 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -162,7 +162,7 @@ def to_doc_obj(self):
                 "Unassigned_Sequences": unassigned_sequences,
             }
         }
-
+
         demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command")
         if os.path.exists(demux_command_file):
             with open(demux_command_file) as command_file:
@@ -184,7 +184,7 @@ def to_doc_obj(self):
             "bin": self.CONFIG.get("element_analysis").get("bases2fastq"),
             "options": demux_command,
         }
-
+
         doc_obj = {
             "name": self.NGI_run_id,
             "run_path": self.run_dir,
@@ -257,7 +257,7 @@ def get_lims_step_id(self) -> str | None:
                     lims_step_id = line.split(",")[1]
                     return lims_step_id
         return None
-
+
     def find_manifest_zip(self):
         # Specify dir in which LIMS drop the manifest zip files
         dir_to_search = os.path.join(
@@ -674,6 +674,8 @@ def aggregate_stats_assigned(self, demux_runmanifest):
         aggregated_assigned_indexes_filtered = []
         unique_phiX_combination = set()
         for sample in aggregated_assigned_indexes:
+            # Add project name
+            sample['Project'] = [d for d in demux_runmanifest if d['SampleName'] == sample['SampleName']][0]['Project']
             if sample['SampleName'] == 'PhiX':
                 combination = (sample['I1'], sample['I2'], sample['Lane'])
                 if combination not in unique_phiX_combination:
@@ -748,6 +750,19 @@ def aggregate_stats_unassigned(self, demux_runmanifest):
             aggregated_unassigned_indexes += max_unassigned_indexes
         # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order
         aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count'])))
+        # Fetch PFCount for each lane
+        pfcount_lane = {}
+        aviti_runstats_json = os.path.join(self.run_dir, "AvitiRunStats.json")
+        if os.path.exists(aviti_runstats_json):
+            with open(aviti_runstats_json) as stats_json:
+                aviti_runstats_json = json.load(stats_json)
+            for lane_stats in aviti_runstats_json["LaneStats"]:
+                pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"])
+        else:
+            logger.warning(f"No AvitiRunStats.json file found for the run.")
+        # Modify the % Polonies values based on PFCount for each lane
+        for unassigned_index in aggregated_unassigned_indexes:
+            unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100
         # Write to a new UnassignedSequences.csv file under demux_dir
         aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv")
         self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv)
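A note on the % Polonies recalculation added in the patch above: each unassigned index count is re-expressed as a percentage of the lane's pass-filter count from AvitiRunStats.json, so that values aggregated from different sub-demultiplexings are comparable, i.e. % Polonies = 100 * Count / PFCount(lane). A minimal sketch of the same normalization with the lookup guarded (illustrative shapes, not code from these patches):

    def normalize_percent_polonies(unassigned_rows, pfcount_by_lane):
        # Illustrative sketch, not from the TACA patches; names are hypothetical.
        # unassigned_rows: dicts with string 'Lane' and 'Count' fields (from CSV)
        # pfcount_by_lane: mapping of lane number (str) to PFCount (float)
        for row in unassigned_rows:
            pf_count = pfcount_by_lane.get(row["Lane"])
            if pf_count:  # leave the value untouched for unknown lanes
                row["% Polonies"] = float(row["Count"]) / pf_count * 100
        return unassigned_rows

The unguarded loop added in this patch raises a KeyError when a lane is missing from pfcount_lane; the refactor in PATCH 083 below moves the loop inside the os.path.exists branch and adds exactly this kind of pfcount_lane.get() guard.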
@@ -580,7 +580,7 @@ def aggregate_sample_fastq(self, demux_runmanifest): sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample_name) + sample_dest = os.path.join(self.run_dir, self.demux_dir, project, f"Sample_{sample_name}") if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): From 8a1a4981469f0ed23d6aa2563e2b13e7604b35f7 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Mon, 30 Sep 2024 08:51:45 +0200 Subject: [PATCH 083/187] Refactors based on comments from SS --- taca/element/Element_Runs.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 794f6892..833f0460 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -752,17 +752,18 @@ def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) # Fetch PFCount for each lane pfcount_lane = {} - aviti_runstats_json = os.path.join(self.run_dir, "AvitiRunStats.json") - if os.path.exists(aviti_runstats_json): - with open(aviti_runstats_json) as stats_json: + if os.path.exists(self.run_stats_file): + with open(self.run_stats_file) as stats_json: aviti_runstats_json = json.load(stats_json) for lane_stats in aviti_runstats_json["LaneStats"]: pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"]) + # Modify the % Polonies values based on PFCount for each lane + for unassigned_index in aggregated_unassigned_indexes: + if pfcount_lane.get(unassigned_index["Lane"]): + unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 else: logger.warning(f"No AvitiRunStats.json file found for the run.") - # Modify the % Polonies values based on PFCount for each lane - for unassigned_index in aggregated_unassigned_indexes: - unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 + # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) @@ -839,7 +840,7 @@ def update_transfer_log(self): def update_paths_after_archiving(self, new_location): self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") - self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") + self.run_stats_file = os.path.join(self.run_dir, "AvitiRunStats.json") self.run_manifest_file_from_instrument = os.path.join( self.run_dir, "RunManifest.json" ) From 9a1c25924abae4f6a38e04c83c3b456262dbb05a Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Mon, 30 Sep 2024 09:08:11 +0200 Subject: [PATCH 084/187] Replace hard-coded file names --- taca/element/Element_Runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 833f0460..3c4816bc 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -669,7 +669,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): 
sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] aggregated_assigned_indexes.append(sample) else: - logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning(f"No {os.path.basename(assigned_csv)} file found for sub-demultiplexing {sub_demux}.") # Remove redundant rows for PhiX aggregated_assigned_indexes_filtered = [] unique_phiX_combination = set() @@ -717,7 +717,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") + logger.warning(f"No {os.path.basename(max_unassigned_csv)} file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") break # Filter by lane max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane] @@ -732,7 +732,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning(f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}.") continue # Filter by lane unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] @@ -762,7 +762,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): if pfcount_lane.get(unassigned_index["Lane"]): unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 else: - logger.warning(f"No AvitiRunStats.json file found for the run.") + logger.warning(f"No {os.path.basename(self.run_stats_file)} file found for the run.") # Write to a new UnassignedSequences.csv file under demux_dir aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") From 8c0263ef88c9594396dd68e2ce245e1cf1c0f814 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 10:10:22 +0200 Subject: [PATCH 085/187] Small fixes --- taca/analysis/analysis_element.py | 23 +++++++---------------- taca/element/Element_Runs.py | 8 +++----- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 7109a027..72c3c74d 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -47,7 +47,7 @@ def _process(run): run_manifests = glob.glob( os.path.join( run.run_dir, "RunManifest_*.csv" - ) # TODO: is this filename right? + ) ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): @@ -117,38 +117,29 @@ def _process(run): ) # TODO: email warning to operator return - elif transfer_status == "unknown": - logger.warning( - f"The run {run} has already been transferred but has not been archived. Please investigate" - ) - # TODO: email operator warning - return else: - # TODO Merge with the one above? logger.warning( f"Unknown transfer status {transfer_status} of run {run}. 
Please investigate" - ) + ) # TODO: email warning to operator return if given_run: run = Aviti_Run(given_run, CONFIG) - # TODO: Needs to change if more types of Element machines are aquired in the future - _process(run) else: data_dirs = CONFIG.get("element_analysis").get( "data_dirs" ) # TODO: add to config - for data_dir in data_dirs: # TODO: make sure to look in both side A and B - # Run folder looks like DATE_*_*_*, the last section is the FC name. + for data_dir in data_dirs: + # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name runs = glob.glob( - os.path.join(data_dir, "[1-9]*_*_*_*") - ) # TODO: adapt to aviti format + os.path.join(data_dir, "[1-9]*_*_*") + ) for run in runs: runObj = Aviti_Run(run, CONFIG) try: _process(runObj) - except: # TODO: chatch error message and print it + except: # This function might throw and exception, # it is better to continue processing other runs logger.warning(f"There was an error processing the run {run}") diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 3c4816bc..e1923b27 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -285,7 +285,7 @@ def find_manifest_zip(self): logger.warning( f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." ) - return False # TODO determine whether to raise an error here instead + return False # TODO: determine whether to raise an error here instead elif len(glob_results) > 1: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." @@ -448,7 +448,6 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) - # TODO: handle multiple composite manifests for demux try: p_handle = subprocess.Popen( cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir @@ -803,12 +802,12 @@ def transfer(self): + " -rLav" + f" --chown={transfer_details.get('owner')}" + f" --chmod={transfer_details.get('permissions')}" - + " --exclude BaseCalls" # TODO: check that we actually want to exclude these + + " --exclude BaseCalls" + " --exclude Alignment" + f" {self.run_dir}" + f" {transfer_details.get('user')}@{transfer_details.get('host')}:/aviti" + f"; echo $? > {os.path.join(self.run_dir, '.rsync_exit_status')}" - ) # TODO: any other options? + ) try: p_handle = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) logger.info( @@ -845,7 +844,6 @@ def update_paths_after_archiving(self, new_location): self.run_dir, "RunManifest.json" ) self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json") - # TODO: also update location of demux files? 
def archive(self): """Move directory to nosync.""" From c86b6bf6d617fab4ab0041a0a30518f8908571e3 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 10:30:41 +0200 Subject: [PATCH 086/187] Sync metadata --- taca/element/Element_Runs.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e1923b27..1e67c4ba 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -786,8 +786,15 @@ def aggregate_demux_results(self, demux_results_dirs): self.aggregate_stats_unassigned(demux_runmanifest) def sync_metadata(self): - # TODO: copy metadata from demuxed run to ngi-nas-ns - pass + files_to_copy = [self.run_stats_file, + os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), + os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv") + ] + metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to aca.yaml + dest = os.path.join(metadata_archive, self.NGI_run_id) + os.makedirs(dest) + for f in files_to_copy: + shutil.copy(f, dest) def make_transfer_indicator(self): transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") From f221c984cb66c9030c3d96ba596cbb985bc16bdb Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 11:13:50 +0200 Subject: [PATCH 087/187] Also sync RunParameters.json --- taca/element/Element_Runs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 1e67c4ba..543f9b55 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -437,6 +437,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + f" {self.run_dir}" + f" {demux_dir}" + " -p 8" + + " --num-unassigned 500" + f" -r {run_manifest}" + " --legacy-fastq" # TODO: except if Smart-seq3 + " --force-index-orientation" @@ -788,7 +789,8 @@ def aggregate_demux_results(self, demux_results_dirs): def sync_metadata(self): files_to_copy = [self.run_stats_file, os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), - os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv") + os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), + self.run_parameters_file, ] metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to aca.yaml dest = os.path.join(metadata_archive, self.NGI_run_id) From da9222ac36c98f87a0c260479d6f98f03ed104a5 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 30 Sep 2024 11:36:05 +0200 Subject: [PATCH 088/187] Cleanup --- taca/element/Element_Runs.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 543f9b55..d6dd0b3a 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -106,7 +106,7 @@ def parse_run_parameters(self) -> None: "runID" ) # Unique hash that we don't really use self.side = run_parameters.get("Side") # SideA or SideB - self.side_letter = self.side[-1] # A or B + self.side_letter = self.side[-1] # A or B TODO: compare side letter with manually entered letter in run name self.run_type = run_parameters.get( "RunType" ) # Sequencing, wash or prime I believe? 
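# The TODO added above (comparing the parsed side letter against the letter entered
# manually in the run name) could be resolved along these lines. A sketch only:
# side_letter_matches_run_name is a hypothetical helper, not part of this changeset.
# Run directories are named <date>_<instrument>_<side+flowcell>, e.g.
# 20240910_AV242106_B2403418431, so the first character of the last
# underscore-separated field should equal the letter parsed from "Side".
def side_letter_matches_run_name(run_name: str, side_letter: str) -> bool:
    flowcell_field = run_name.split("_")[-1]
    return bool(flowcell_field) and flowcell_field[0] == side_letter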
@@ -299,7 +299,6 @@ def find_manifest_zip(self): def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" - # TODO: test me zip_src_path = self.find_manifest_zip() # Make a run subdir named after the zip file and extract manifests there zip_name = os.path.basename(zip_src_path) @@ -439,9 +438,9 @@ def generate_demux_command(self, run_manifest, demux_dir): + " -p 8" + " --num-unassigned 500" + f" -r {run_manifest}" - + " --legacy-fastq" # TODO: except if Smart-seq3 + + " --legacy-fastq" + " --force-index-orientation" - ) # TODO: any other options? + ) with open(os.path.join(self.run_dir, '.bases2fastq_command')) as command_file: command_file.write(command) return command @@ -792,7 +791,7 @@ def sync_metadata(self): os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), self.run_parameters_file, ] - metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to aca.yaml + metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to taca.yaml dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: From 58b72529ee8f58002c74ed5870d3df0d747fd934 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:26:28 +0200 Subject: [PATCH 089/187] bugfix --- tests/element/test_Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 9eb264b1..0471b0a4 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -12,7 +12,7 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: config = { "element_analysis": { "Element": { - "GenericElement": { + "Aviti": { "manifest_zip_location": f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti", "transfer_log": f"{tmp.name}/log/transfer_aviti.tsv", }, From 3b87b45d7bde6428640cf9d42ded0981dcc6c0db Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:26:56 +0200 Subject: [PATCH 090/187] ruff --- taca/element/Element_Runs.py | 348 +++++++++++++++++++++++++---------- 1 file changed, 249 insertions(+), 99 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6faf5a53..610f77fc 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -39,7 +39,8 @@ def __init__(self, run_dir, configuration): "*RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") ) # TODO: add to taca.yaml @@ -162,19 +163,19 @@ def to_doc_obj(self): "Unassigned_Sequences": unassigned_sequences, } } - + demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command") if os.path.exists(demux_command_file): with open(demux_command_file) as command_file: demux_command = command_file.readlines()[0] else: demux_command = None - demux_version_file = os.path.join(self.run_dir,"Demultiplexing_0", "RunStats.json") + demux_version_file = os.path.join( + self.run_dir, "Demultiplexing_0", "RunStats.json" + ) if os.path.exists(demux_version_file): with open(demux_version_file) as json_file: - demux_info = json.load( - json_file - ) + demux_info = json.load(json_file) demux_version = demux_info.get("AnalysisVersion") else: demux_version = None @@ -184,7 +185,7 @@ def to_doc_obj(self): "bin": 
self.CONFIG.get("element_analysis").get("bases2fastq"), "options": demux_command, } - + doc_obj = { "name": self.NGI_run_id, "run_path": self.run_dir, @@ -214,7 +215,9 @@ def get_demultiplexing_status(self): sub_demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing_*")) finished_count = 0 for demux_dir in sub_demux_dirs: - found_demux_stats_file = glob.glob(os.path.join(demux_dir, self.demux_stats_file)) + found_demux_stats_file = glob.glob( + os.path.join(demux_dir, self.demux_stats_file) + ) if not found_demux_stats_file: return "ongoing" elif found_demux_stats_file: @@ -257,11 +260,12 @@ def get_lims_step_id(self) -> str | None: lims_step_id = line.split(",")[1] return lims_step_id return None - + def find_manifest_zip(self): # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("manifest_zip_location"), # TODO: add to taca.yaml str(self.year), @@ -296,7 +300,6 @@ def find_manifest_zip(self): zip_src_path = glob_results[0] return zip_src_path - def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" # TODO: test me @@ -441,7 +444,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --legacy-fastq" # TODO: except if Smart-seq3 + " --force-index-orientation" ) # TODO: any other options? - with open(os.path.join(self.run_dir, '.bases2fastq_command')) as command_file: + with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: command_file.write(command) return command @@ -494,7 +497,7 @@ def rsync_complete(self): def rsync_successful(self): with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file: rsync_exit_status = rsync_exit_file.readlines() - if rsync_exit_status[0].strip() == '0': + if rsync_exit_status[0].strip() == "0": return True else: return False @@ -516,32 +519,33 @@ def write_to_csv(data, filename): # Get the fieldnames from the keys of the first dictionary fieldnames = data[0].keys() # Open the file and write the CSV - with open(filename, mode='w', newline='') as file: + with open(filename, mode="w", newline="") as file: writer = csv.DictWriter(file, fieldnames=fieldnames) # Write the header (fieldnames) writer.writeheader() # Write the data (rows) writer.writerows(data) - # Collect demux info into a list of dictionaries # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}] def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: - sub_demux_count = os.path.basename(demux_dir).split('_')[1] - with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file: + sub_demux_count = os.path.basename(demux_dir).split("_")[1] + with open( + os.path.join(self.run_dir, demux_dir, "RunManifest.csv"), "r" + ) as file: lines = file.readlines() sample_section = False headers = [] # Loop through each line for line in lines: # Check if we reached the "[SAMPLES]" section - if '[SAMPLES]' in line: + if "[SAMPLES]" in line: sample_section = True continue # Exit the sample section if another section is encountered - if sample_section and line.startswith('['): + if sample_section and line.startswith("["): break # If in the sample section, process the sample lines if sample_section: @@ -552,71 +556,124 @@ def 
collect_demux_runmanifest(self, demux_results_dirs): continue # Get the headers from the first line if not headers: - headers = line.split(',') + headers = line.split(",") else: # Parse sample data - values = line.split(',') + values = line.split(",") sample_dict = dict(zip(headers, values)) - sample_dict['sub_demux_count'] = sub_demux_count + sample_dict["sub_demux_count"] = sub_demux_count demux_runmanifest.append(sample_dict) - sorted_demux_runmanifest = sorted(demux_runmanifest, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + sorted_demux_runmanifest = sorted( + demux_runmanifest, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) return sorted_demux_runmanifest - # Aggregate the output FastQ files of samples from multiple demux def aggregate_sample_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) unique_sample_demux = set() for lane in lanes: sample_count = 1 for sample in demux_runmanifest: - lanenr = sample['Lane'] - project = sample['Project'] - sample_name = sample['SampleName'] - sub_demux_count = sample['sub_demux_count'] + lanenr = sample["Lane"] + project = sample["Project"] + sample_name = sample["SampleName"] + sub_demux_count = sample["sub_demux_count"] # Skip PhiX if lanenr == lane and sample_name != "PhiX": sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: - project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, sample_name) + project_dest = os.path.join( + self.run_dir, self.demux_dir, project + ) + sample_dest = os.path.join( + self.run_dir, self.demux_dir, project, sample_name + ) if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample_name, f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux_count}", + "Samples", + project, + sample_name, + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: old_name = os.path.basename(fastqfile) - read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1) - new_name = "_".join([sample_name, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) + read_label = re.search( + rf"L00{lane}_(.*?)_001", old_name + ).group(1) + new_name = "_".join( + [ + sample_name, + f"S{sample_count}", + f"L00{lane}", + read_label, + "001.fastq.gz", + ] + ) os.symlink(fastqfile, os.path.join(sample_dest, new_name)) unique_sample_demux.add(sample_tuple) sample_count += 1 - # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: - sub_demux = list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane)) + sub_demux = list( + set( + sample["sub_demux_count"] + for sample in demux_runmanifest + if sample["Lane"] == lane + ) + ) if len(sub_demux) == 1: - project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined") + project_dest = os.path.join( + self.run_dir, self.demux_dir, "Undetermined" + ) if not 
os.path.exists(project_dest): os.makedirs(project_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux[0]}", + "Samples", + "Undetermined", + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) - # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this hard way def get_project_runstats(self, sub_demux, demux_runmanifest): project_runstats = [] - project_list = sorted(list(set(sample['Project'] for sample in demux_runmanifest if sample['sub_demux_count']==sub_demux))) + project_list = sorted( + list( + set( + sample["Project"] + for sample in demux_runmanifest + if sample["sub_demux_count"] == sub_demux + ) + ) + ) for project in project_list: - project_runstats_json_path = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "Samples", project, f"{project}_RunStats.json") + project_runstats_json_path = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "Samples", + project, + f"{project}_RunStats.json", + ) if os.path.exists(project_runstats_json_path): with open(project_runstats_json_path) as stats_json: project_runstats_json = json.load(stats_json) @@ -629,129 +686,220 @@ def get_project_runstats(self, sub_demux, demux_runmanifest): percentage_q30 = occurrence["PercentQ30"] percentage_q40 = occurrence["PercentQ40"] quality_score_mean = occurrence["QualityScoreMean"] - project_runstats.append({ "SampleName" : sample_name, - "Lane" : str(lane), - "ExpectedSequence" : expected_sequence, - "PercentMismatch" : percentage_mismatch, - "PercentQ30" : percentage_q30, - "PercentQ40" : percentage_q40, - "QualityScoreMean" : quality_score_mean - }) + project_runstats.append( + { + "SampleName": sample_name, + "Lane": str(lane), + "ExpectedSequence": expected_sequence, + "PercentMismatch": percentage_mismatch, + "PercentQ30": percentage_q30, + "PercentQ40": percentage_q40, + "QualityScoreMean": quality_score_mean, + } + ) else: continue return project_runstats - # Aggregate stats in IndexAssignment.csv def aggregate_stats_assigned(self, demux_runmanifest): aggregated_assigned_indexes = [] - sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest))) - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + sub_demux_list = sorted( + list(set(sample["sub_demux_count"] for sample in demux_runmanifest)) + ) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for sub_demux in sub_demux_list: # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this hard way project_runstats = self.get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv - assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") + assigned_csv = os.path.join( + self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv" + ) if os.path.exists(assigned_csv): - with open(assigned_csv, 'r') as assigned_file: + 
with open(assigned_csv, "r") as assigned_file: reader = csv.DictReader(assigned_file) index_assignment = [row for row in reader] for sample in index_assignment: - if sample['Lane'] in lanes: - project_runstats_sample = [d for d in project_runstats if d['SampleName'] == sample['SampleName'] and d['Lane'] == sample['Lane'] and d['ExpectedSequence'] == sample['I1']+sample['I2']] - sample['sub_demux_count'] = sub_demux - sample['PercentMismatch'] = project_runstats_sample[0]['PercentMismatch'] - sample['PercentQ30'] = project_runstats_sample[0]['PercentQ30'] - sample['PercentQ40'] = project_runstats_sample[0]['PercentQ40'] - sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] + if sample["Lane"] in lanes: + project_runstats_sample = [ + d + for d in project_runstats + if d["SampleName"] == sample["SampleName"] + and d["Lane"] == sample["Lane"] + and d["ExpectedSequence"] == sample["I1"] + sample["I2"] + ] + sample["sub_demux_count"] = sub_demux + sample["PercentMismatch"] = project_runstats_sample[0][ + "PercentMismatch" + ] + sample["PercentQ30"] = project_runstats_sample[0]["PercentQ30"] + sample["PercentQ40"] = project_runstats_sample[0]["PercentQ40"] + sample["QualityScoreMean"] = project_runstats_sample[0][ + "QualityScoreMean" + ] aggregated_assigned_indexes.append(sample) else: - logger.warning(f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No IndexAssignment.csv file found for sub-demultiplexing {sub_demux}." + ) # Remove redundant rows for PhiX aggregated_assigned_indexes_filtered = [] unique_phiX_combination = set() for sample in aggregated_assigned_indexes: - if sample['SampleName'] == 'PhiX': - combination = (sample['I1'], sample['I2'], sample['Lane']) + if sample["SampleName"] == "PhiX": + combination = (sample["I1"], sample["I2"], sample["Lane"]) if combination not in unique_phiX_combination: aggregated_assigned_indexes_filtered.append(sample) unique_phiX_combination.add(combination) else: aggregated_assigned_indexes_filtered.append(sample) # Sort the list by Lane, SampleName and sub_demux_count - aggregated_assigned_indexes_filtered_sorted = sorted(aggregated_assigned_indexes_filtered, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + aggregated_assigned_indexes_filtered_sorted = sorted( + aggregated_assigned_indexes_filtered, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) # Fix new sample number based on SampleName and Lane sample_count = 0 - previous_samplename_lane = ('NA', 'NA') + previous_samplename_lane = ("NA", "NA") for sample in aggregated_assigned_indexes_filtered_sorted: - if (sample['SampleName'], sample['Lane']) != previous_samplename_lane: + if (sample["SampleName"], sample["Lane"]) != previous_samplename_lane: sample_count += 1 - previous_samplename_lane = (sample['SampleName'], sample['Lane']) - sample['SampleNumber'] = sample_count + previous_samplename_lane = (sample["SampleName"], sample["Lane"]) + sample["SampleNumber"] = sample_count # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv") - self.write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) - + aggregated_assigned_indexes_csv = os.path.join( + self.run_dir, self.demux_dir, "IndexAssignment.csv" + ) + self.write_to_csv( + aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv + ) # Aggregate stats in UnassignedSequences.csv 
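    # Strategy of the aggregation below, per lane: the UnassignedSequences.csv of
    # the sub-demultiplexing with the longest index combination serves as the
    # baseline; for every sub-demultiplexing with shorter indexes, index pairs are
    # compared over the shared prefix lengths (the minimum of the two I1 lengths
    # and of the two I2 lengths), matching baseline rows are dropped, and the
    # remaining rows are added to the aggregate, which is finally sorted by lane
    # and by descending read count.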
def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = [] - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: sub_demux_index_lens = set() for sample in demux_runmanifest: - if sample['Lane'] == lane: - sub_demux_index_lens.add((sample['sub_demux_count'], (len(sample.get("Index1", "")), len(sample.get("Index2", ""))))) + if sample["Lane"] == lane: + sub_demux_index_lens.add( + ( + sample["sub_demux_count"], + ( + len(sample.get("Index1", "")), + len(sample.get("Index2", "")), + ), + ) + ) # List of sub-demux with a decreasing order of index lengths - sub_demux_list = [x[0] for x in sorted(sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True)] + sub_demux_list = [ + x[0] + for x in sorted( + sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True + ) + ] sub_demux_with_max_index_lens = sub_demux_list[0] # Start with the unassigned list with the longest index - max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv") + max_unassigned_csv = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux_with_max_index_lens}", + "UnassignedSequences.csv", + ) if os.path.exists(max_unassigned_csv): - with open(max_unassigned_csv, 'r') as max_unassigned_file: + with open(max_unassigned_csv, "r") as max_unassigned_file: reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") + logger.warning( + f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux_with_max_index_lens}." + ) break # Filter by lane - max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane] + max_unassigned_indexes = [ + idx for idx in max_unassigned_indexes if idx["Lane"] == lane + ] # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise if len(sub_demux_list) > 1: # Order: from longer to shorter indexes sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: - unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv") + unassigned_csv = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "UnassignedSequences.csv", + ) if os.path.exists(unassigned_csv): - with open(unassigned_csv, 'r') as unassigned_file: + with open(unassigned_csv, "r") as unassigned_file: reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: - logger.warning(f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No UnassignedSequences.csv file found for sub-demultiplexing {sub_demux}." 
+ ) continue # Filter by lane - unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] + unassigned_indexes = [ + unassigned_index + for unassigned_index in unassigned_indexes + if unassigned_index["Lane"] == lane + ] # Remove overlapped indexes from the list of max_unassigned_indexes - idx1_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][0], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][0]) - idx2_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][1], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][1]) + idx1_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][0], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][0], + ) + idx2_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][1], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][1], + ) for unassigned_index in unassigned_indexes: - idx1_overlapped_seq = unassigned_index['I1'][:idx1_overlapped_len] - idx2_overlapped_seq = unassigned_index['I2'][:idx2_overlapped_len] + idx1_overlapped_seq = unassigned_index["I1"][ + :idx1_overlapped_len + ] + idx2_overlapped_seq = unassigned_index["I2"][ + :idx2_overlapped_len + ] # Remove the overlapped record from the max_unassigned_indexes list - max_unassigned_indexes = [max_unassigned_index for max_unassigned_index in max_unassigned_indexes if not (max_unassigned_index['I1'][:idx1_overlapped_len] == idx1_overlapped_seq and max_unassigned_index['I2'][:idx2_overlapped_len] == idx2_overlapped_seq)] + max_unassigned_indexes = [ + max_unassigned_index + for max_unassigned_index in max_unassigned_indexes + if not ( + max_unassigned_index["I1"][:idx1_overlapped_len] + == idx1_overlapped_seq + and max_unassigned_index["I2"][:idx2_overlapped_len] + == idx2_overlapped_seq + ) + ] # Append to the aggregated_unassigned_indexes list aggregated_unassigned_indexes += max_unassigned_indexes # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order - aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) + aggregated_unassigned_indexes = sorted( + aggregated_unassigned_indexes, key=lambda x: (x["Lane"], -int(x["Count"])) + ) # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") + aggregated_unassigned_csv = os.path.join( + self.run_dir, self.demux_dir, "UnassignedSequences.csv" + ) self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) - # Aggregate demux results def aggregate_demux_results(self, demux_results_dirs): # Ensure the destination directory exists @@ -778,8 +926,8 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - transfer_details = ( - self.CONFIG.get("element_analysis").get("transfer_details") + transfer_details = 
self.CONFIG.get("element_analysis").get( + "transfer_details" ) # TODO: Add section to taca.yaml command = ( "rsync" @@ -806,7 +954,7 @@ def transfer(self): return def remove_transfer_indicator(self): - transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing') + transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") Path(transfer_indicator).unlink() def update_transfer_log(self): @@ -821,7 +969,9 @@ def update_transfer_log(self): raise OSError(msg) def update_paths_after_archiving(self, new_location): - self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.run_dir = os.path.join( + new_location, self.NGI_run_id + ) # Needs to be redirected to new location so that TACA can find files to upload to statusdb self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") self.run_stats_file = os.path.join(self.run_dir, "RunStats.json") self.run_manifest_file_from_instrument = os.path.join( From e245d1c718ace0ec41197934662dafb97e3aa9f9 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:43:34 +0200 Subject: [PATCH 091/187] add status section to mock config --- tests/element/test_Element_Runs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 0471b0a4..23914a7d 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -18,6 +18,7 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: }, }, }, + "statusdb": {}, } return config From e772189ac8daee7782c64f50a89a18e9fc8e1738 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:43:55 +0200 Subject: [PATCH 092/187] start analysis functional test --- tests/analysis/test_analysis_element.py | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/analysis/test_analysis_element.py diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py new file mode 100644 index 00000000..49067ca4 --- /dev/null +++ b/tests/analysis/test_analysis_element.py @@ -0,0 +1,35 @@ +from tempfile import TemporaryDirectory +from unittest.mock import patch + +import pytest + +from tests.element.test_Element_Runs import create_element_run_dir, get_config + + +@pytest.mark.skip(reason="Not implemented yet") +def test_run_preprocessing(create_dirs): + tmp: TemporaryDirectory = create_dirs + + # Mock config + config = get_config(tmp) + mock_config = patch("taca.utils.config.CONFIG", new=config) + mock_config.start() + + # Mock DB + mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") + mock_db.start() + + # Import module to test + from taca.analysis import analysis_element as to_test + + run_dir = create_element_run_dir( + tmp=tmp, + nosync=False, + run_finished=False, + sync_finished=False, + demux_dir=False, + demux_done=False, + outcome_completed=False, + ) + + to_test.run_preprocessing(run_dir) From f87487187183be3e99fa7953be531be72f817a15 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 09:48:06 +0200 Subject: [PATCH 093/187] attr for empty manifest --- taca/element/Element_Runs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5fdde1a3..d41635a6 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -321,6 +321,9 @@ def copy_manifests(self) -> bool: self.lims_start_manifest = [ m for m in manifests if 
re.match(r".*_trimmed\.csv$", m) ][0] + self.lims_empty_manifest = [ + m for m in manifests if re.match(r".*_empty\.csv$", m) + ][0] self.lims_demux_manifests = [ m for m in manifests if re.match(r".*_\d+\.csv$", m) ] From 7958caa4285815d33009725044b0cb408736dbb9 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 11:18:46 +0200 Subject: [PATCH 094/187] wip --- taca/element/Element_Runs.py | 74 +++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d41635a6..b42677da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -18,6 +18,69 @@ logger = logging.getLogger(__name__) +def get_mask(seq: str, mask_type: str, which_index: int) -> str: + """Example usage: + + get_mask("ACGTACGTNNNNNNNN", "umi", 1) -> 'I1:N8Y8' + get_mask("ACGTACGTNNNNNNNN", "index", 2) -> 'I2:Y8N8' + """ + + # Input assertions + assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" + assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" + assert which_index in [1, 2], "Index number must be 1 or 2" + + # Define dict to convert base to mask classifier + base2mask = ( + { + "N": "N", + "A": "Y", + "C": "Y", + "G": "Y", + "T": "Y", + } + if mask_type == "index" + else { + "N": "Y", + "A": "N", + "C": "N", + "G": "N", + "T": "N", + } + ) + + # Dynamically build the mask sequence + mask_seq = "I1:" if which_index == 1 else "I2:" + current_group = "" + current_group_len = 0 + for letter in seq: + if base2mask[letter] == current_group: + current_group_len += 1 + else: + mask_seq += ( + f"{current_group}{current_group_len}" if current_group_len > 0 else "" + ) + current_group = base2mask[letter] + current_group_len = 1 + mask_seq += f"{current_group}{current_group_len}" + + # Use the worlds ugliest string parsing to check that the mask length matches the input sequence length + assert sum( + [ + int(n) + for n in mask_seq[3:] + .replace("N", "-") + .replace("Y", "-") + .strip("-") + .split("-") + ] + ) == len( + seq + ), f"Length of mask '{mask_seq}' does not match length of input seq '{seq}'" + + return mask_seq + + class Run: """Defines an Element run""" @@ -344,7 +407,7 @@ def make_demux_manifests( manifest_contents = f.read() # Get '[SAMPLES]' section - split_contents = "[SAMPLES]".split(manifest_contents) + split_contents = manifest_contents.split("[SAMPLES]") assert ( len(split_contents) == 2 ), f"Could not split sample rows out of manifest {manifest_contents}" @@ -375,6 +438,15 @@ def make_demux_manifests( manifest_root_name = f"{self.NGI_run_id}_demux" + # Address UMI masks + for n in [1, 2]: + df_samples[f"I{n}Mask"] = df_samples[f"Index{n}"].apply( + lambda seq: get_mask(seq, "umi", n) + ) + df_samples["UmiMask"] = df_samples[f"Index{n}"].apply( + lambda seq: get_mask(seq, "umi", n) + ) + # Get idx lengths for calculations df_samples.loc[:, "len_idx1"] = df["Index1"].apply(len) df_samples.loc[:, "len_idx2"] = df["Index2"].apply(len) From 962a19cd68183dc1bbb0f4897610c2c0394fd086 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 13:09:49 +0200 Subject: [PATCH 095/187] build manifests based on masks --- taca/element/Element_Runs.py | 75 +++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index b42677da..6f0ecd01 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -396,12 +396,10 @@ def 
copy_manifests(self) -> bool: def make_demux_manifests( self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None ) -> list[os.PathLike]: - """Derive composite demultiplexing manifests (grouped by index duplicity and lengths) + """Derive composite demultiplexing manifests from a single information-rich manifest. """ - # TODO test me - # Read specified manifest with open(manifest_to_split) as f: manifest_contents = f.read() @@ -411,7 +409,7 @@ def make_demux_manifests( assert ( len(split_contents) == 2 ), f"Could not split sample rows out of manifest {manifest_contents}" - sample_section = split_contents[1].split("\n") + sample_section = split_contents[1].strip().split("\n") # Split into header and rows header = sample_section[0] @@ -435,36 +433,40 @@ def make_demux_manifests( outdir = self.run_dir ## Build composite manifests - manifest_root_name = f"{self.NGI_run_id}_demux" - # Address UMI masks + # Bool indicating whether UMI is present + df_samples["has_umi"] = df_samples["Index2"].str.contains("N") + + # Add cols denoting idx and umi masks for n in [1, 2]: df_samples[f"I{n}Mask"] = df_samples[f"Index{n}"].apply( - lambda seq: get_mask(seq, "umi", n) - ) - df_samples["UmiMask"] = df_samples[f"Index{n}"].apply( - lambda seq: get_mask(seq, "umi", n) + lambda seq: get_mask(seq, "index", n) ) + df_samples["UmiMask"] = df_samples["Index2"].apply( + lambda seq: get_mask(seq, "umi", 2) + ) - # Get idx lengths for calculations - df_samples.loc[:, "len_idx1"] = df["Index1"].apply(len) - df_samples.loc[:, "len_idx2"] = df["Index2"].apply(len) + # Re-make idx col without Ns + df_samples["Index2_umi"] = df_samples["Index2"] + df_samples.loc[:, "Index2"] = df_samples["Index2"].apply( + lambda x: x.replace("N", "") + ) - # Break down by index lengths and lane, creating composite manifests + # Break down by masks and lane, creating composite manifests manifests = [] n = 0 - for (len_idx1, len_idx2, lane), group in df_samples.groupby( - ["len_idx1", "len_idx2", "Lane"] - ): + grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Lane"]) + for (I1Mask, I2Mask, UmiMask, lane), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" + runValues_section = "\n".join( [ "[RUNVALUES]", "KeyName, Value", f'manifest_file, "{file_name}"', - f"manifest_group, {n+1}/{len(df.groupby(['len_idx1', 'len_idx2', 'Lane']))}", - f"grouped_by, len_idx1:{len_idx1} len_idx2:{len_idx2} lane:{lane}", + f"manifest_group, {n+1}/{len(grouped_df)}", + f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane}", ] ) @@ -472,24 +474,35 @@ def make_demux_manifests( [ "[SETTINGS]", "SettingName, Value", + f"I1Mask, {I1Mask}", + f"I2Mask, {I2Mask}", ] ) + if group["has_umi"].all(): + settings_section += "\n" + "\n".join( + [ + f"UmiMask, {UmiMask}", + "UmiFastQ, True", + ] + ) + # Add PhiX stratified by index length - if group["phix_loaded"].any(): - # Subset controls by lane - group_controls = df_controls[df_controls["Lane"] == lane].copy() + # Subset controls by lane + group_controls = df_controls[df_controls["Lane"] == lane].copy() - # Trim PhiX indexes to match group - group_controls.loc[:, "Index1"] = group_controls.loc[:, "Index1"].apply( - lambda x: x[:len_idx1] - ) - group_controls.loc[:, "Index2"] = group_controls.loc[:, "Index2"].apply( - lambda x: x[:len_idx2] - ) + # Trim PhiX indexes to match group + i1_len = group["Index1"].apply(len).max() + group_controls.loc[:, "Index1"] = group_controls.loc[:, "Index1"].apply( + lambda x: x[:i1_len] + ) + i2_len = 
group["Index2"].apply(len).max() + group_controls.loc[:, "Index2"] = group_controls.loc[:, "Index2"].apply( + lambda x: x[:i2_len] + ) - # Add PhiX to group - group = pd.concat([group, group_controls], axis=0, ignore_index=True) + # Add PhiX to group + group = pd.concat([group, group_controls], axis=0, ignore_index=True) samples_section = ( f"[SAMPLES]\n{group.iloc[:, 0:6].to_csv(index=None, header=True)}" From bbc844087f5d63dbbc543e94dc6daeb19b2075c1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 1 Oct 2024 13:45:18 +0200 Subject: [PATCH 096/187] ruff formatting --- taca/analysis/analysis_element.py | 14 +- taca/element/Element_Runs.py | 377 +++++++++++++++++++++--------- 2 files changed, 276 insertions(+), 115 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 72c3c74d..eb15a8a4 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -45,9 +45,7 @@ def _process(run): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( - os.path.join( - run.run_dir, "RunManifest_*.csv" - ) + os.path.join(run.run_dir, "RunManifest_*.csv") ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): @@ -96,7 +94,9 @@ def _process(run): run.status = "transferring" if run.status_changed: run.update_statusdb() - logger.info(f"{run} is being transferred. Skipping.") # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" + logger.info( + f"{run} is being transferred. Skipping." + ) # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" return elif transfer_status == "rsync done": if run.rsync_successful(): @@ -120,7 +120,7 @@ def _process(run): else: logger.warning( f"Unknown transfer status {transfer_status} of run {run}. Please investigate" - ) # TODO: email warning to operator + ) # TODO: email warning to operator return if given_run: @@ -132,9 +132,7 @@ def _process(run): ) # TODO: add to config for data_dir in data_dirs: # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name - runs = glob.glob( - os.path.join(data_dir, "[1-9]*_*_*") - ) + runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*")) for run in runs: runObj = Aviti_Run(run, CONFIG) try: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d6dd0b3a..e6264396 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -39,7 +39,8 @@ def __init__(self, run_dir, configuration): "*RunStats.json" # Assumes demux is finished when this file is created ) self.transfer_file = ( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("transfer_log") ) # TODO: add to taca.yaml @@ -106,7 +107,9 @@ def parse_run_parameters(self) -> None: "runID" ) # Unique hash that we don't really use self.side = run_parameters.get("Side") # SideA or SideB - self.side_letter = self.side[-1] # A or B TODO: compare side letter with manually entered letter in run name + self.side_letter = self.side[ + -1 + ] # A or B TODO: compare side letter with manually entered letter in run name self.run_type = run_parameters.get( "RunType" ) # Sequencing, wash or prime I believe? 
@@ -169,12 +172,12 @@ def to_doc_obj(self): demux_command = command_file.readlines()[0] else: demux_command = None - demux_version_file = os.path.join(self.run_dir,"Demultiplexing_0", "RunStats.json") + demux_version_file = os.path.join( + self.run_dir, "Demultiplexing_0", "RunStats.json" + ) if os.path.exists(demux_version_file): with open(demux_version_file) as json_file: - demux_info = json.load( - json_file - ) + demux_info = json.load(json_file) demux_version = demux_info.get("AnalysisVersion") else: demux_version = None @@ -214,7 +217,9 @@ def get_demultiplexing_status(self): sub_demux_dirs = glob.glob(os.path.join(self.run_dir, "Demultiplexing_*")) finished_count = 0 for demux_dir in sub_demux_dirs: - found_demux_stats_file = glob.glob(os.path.join(demux_dir, self.demux_stats_file)) + found_demux_stats_file = glob.glob( + os.path.join(demux_dir, self.demux_stats_file) + ) if not found_demux_stats_file: return "ongoing" elif found_demux_stats_file: @@ -261,7 +266,8 @@ def get_lims_step_id(self) -> str | None: def find_manifest_zip(self): # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( - self.CONFIG.get("element_analysis").get("Element", {}) + self.CONFIG.get("element_analysis") + .get("Element", {}) .get(self.sequencer_type, {}) .get("manifest_zip_location"), # TODO: add to taca.yaml str(self.year), @@ -296,7 +302,6 @@ def find_manifest_zip(self): zip_src_path = glob_results[0] return zip_src_path - def copy_manifests(self) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" zip_src_path = self.find_manifest_zip() @@ -441,7 +446,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --legacy-fastq" + " --force-index-orientation" ) - with open(os.path.join(self.run_dir, '.bases2fastq_command')) as command_file: + with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: command_file.write(command) return command @@ -493,7 +498,7 @@ def rsync_complete(self): def rsync_successful(self): with open(os.path.join(self.run_dir, ".rsync_exit_status")) as rsync_exit_file: rsync_exit_status = rsync_exit_file.readlines() - if rsync_exit_status[0].strip() == '0': + if rsync_exit_status[0].strip() == "0": return True else: return False @@ -510,38 +515,36 @@ def clear_dir(dir): except Exception as e: print(f"Failed to delete {file_path} Reason {e}") - # Write to csv def write_to_csv(data, filename): # Get the fieldnames from the keys of the first dictionary fieldnames = data[0].keys() # Open the file and write the CSV - with open(filename, mode='w', newline='') as file: + with open(filename, mode="w", newline="") as file: writer = csv.DictWriter(file, fieldnames=fieldnames) # Write the header (fieldnames) writer.writeheader() # Write the data (rows) writer.writerows(data) - # Collect demux info into a list of dictionaries # Structure: [{'sub_demux_count':XXX, 'SampleName':XXX, 'Index1':XXX, 'Index2':XXX, 'Lane':XXX, 'Project':XXX, 'Recipe':XXX}] def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: - sub_demux_count = os.path.basename(demux_dir).split('_')[1] - with open(os.path.join(self.run_dir, demux_dir, 'RunManifest.csv'), 'r') as file: + sub_demux_count = os.path.basename(demux_dir).split("_")[1] + with open(os.path.join(self.run_dir, demux_dir, "RunManifest.csv")) as file: lines = file.readlines() sample_section = False headers = [] # Loop through each line for line in lines: # Check if we reached the 
"[SAMPLES]" section - if '[SAMPLES]' in line: + if "[SAMPLES]" in line: sample_section = True continue # Exit the sample section if another section is encountered - if sample_section and line.startswith('['): + if sample_section and line.startswith("["): break # If in the sample section, process the sample lines if sample_section: @@ -552,71 +555,127 @@ def collect_demux_runmanifest(self, demux_results_dirs): continue # Get the headers from the first line if not headers: - headers = line.split(',') + headers = line.split(",") else: # Parse sample data - values = line.split(',') + values = line.split(",") sample_dict = dict(zip(headers, values)) - sample_dict['sub_demux_count'] = sub_demux_count + sample_dict["sub_demux_count"] = sub_demux_count demux_runmanifest.append(sample_dict) - sorted_demux_runmanifest = sorted(demux_runmanifest, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + sorted_demux_runmanifest = sorted( + demux_runmanifest, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) return sorted_demux_runmanifest - # Aggregate the output FastQ files of samples from multiple demux def aggregate_sample_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) unique_sample_demux = set() for lane in lanes: sample_count = 1 for sample in demux_runmanifest: - lanenr = sample['Lane'] - project = sample['Project'] - sample_name = sample['SampleName'] - sub_demux_count = sample['sub_demux_count'] + lanenr = sample["Lane"] + project = sample["Project"] + sample_name = sample["SampleName"] + sub_demux_count = sample["sub_demux_count"] # Skip PhiX if lanenr == lane and sample_name != "PhiX": sample_tuple = (sample_name, sub_demux_count) if sample_tuple not in unique_sample_demux: - project_dest = os.path.join(self.run_dir, self.demux_dir, project) - sample_dest = os.path.join(self.run_dir, self.demux_dir, project, f"Sample_{sample_name}") + project_dest = os.path.join( + self.run_dir, self.demux_dir, project + ) + sample_dest = os.path.join( + self.run_dir, + self.demux_dir, + project, + f"Sample_{sample_name}", + ) if not os.path.exists(project_dest): os.makedirs(project_dest) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_count}", "Samples", project, sample_name, f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux_count}", + "Samples", + project, + sample_name, + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: old_name = os.path.basename(fastqfile) - read_label = re.search(rf"L00{lane}_(.*?)_001", old_name).group(1) - new_name = "_".join([sample_name, f"S{sample_count}", f"L00{lane}", read_label, "001.fastq.gz"]) + read_label = re.search( + rf"L00{lane}_(.*?)_001", old_name + ).group(1) + new_name = "_".join( + [ + sample_name, + f"S{sample_count}", + f"L00{lane}", + read_label, + "001.fastq.gz", + ] + ) os.symlink(fastqfile, os.path.join(sample_dest, new_name)) unique_sample_demux.add(sample_tuple) sample_count += 1 - # Symlink the output FastQ files of undet only if a lane does not have multiple demux def aggregate_undet_fastq(self, demux_runmanifest): - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: - sub_demux = 
list(set(sample['sub_demux_count'] for sample in demux_runmanifest if sample['Lane']==lane)) + sub_demux = list( + set( + sample["sub_demux_count"] + for sample in demux_runmanifest + if sample["Lane"] == lane + ) + ) if len(sub_demux) == 1: - project_dest = os.path.join(self.run_dir, self.demux_dir, "Undetermined") + project_dest = os.path.join( + self.run_dir, self.demux_dir, "Undetermined" + ) if not os.path.exists(project_dest): os.makedirs(project_dest) - fastqfiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{sub_demux[0]}", "Samples", "Undetermined", f"*L00{lane}*.fastq.gz")) + fastqfiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux[0]}", + "Samples", + "Undetermined", + f"*L00{lane}*.fastq.gz", + ) + ) for fastqfile in fastqfiles: base_name = os.path.basename(fastqfile) os.symlink(fastqfile, os.path.join(project_dest, base_name)) - # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in this hard way def get_project_runstats(self, sub_demux, demux_runmanifest): project_runstats = [] - project_list = sorted(list(set(sample['Project'] for sample in demux_runmanifest if sample['sub_demux_count']==sub_demux))) + project_list = sorted( + list( + set( + sample["Project"] + for sample in demux_runmanifest + if sample["sub_demux_count"] == sub_demux + ) + ) + ) for project in project_list: - project_runstats_json_path = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "Samples", project, f"{project}_RunStats.json") + project_runstats_json_path = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "Samples", + project, + f"{project}_RunStats.json", + ) if os.path.exists(project_runstats_json_path): with open(project_runstats_json_path) as stats_json: project_runstats_json = json.load(stats_json) @@ -629,126 +688,218 @@ def get_project_runstats(self, sub_demux, demux_runmanifest): percentage_q30 = occurrence["PercentQ30"] percentage_q40 = occurrence["PercentQ40"] quality_score_mean = occurrence["QualityScoreMean"] - project_runstats.append({ "SampleName" : sample_name, - "Lane" : str(lane), - "ExpectedSequence" : expected_sequence, - "PercentMismatch" : percentage_mismatch, - "PercentQ30" : percentage_q30, - "PercentQ40" : percentage_q40, - "QualityScoreMean" : quality_score_mean - }) + project_runstats.append( + { + "SampleName": sample_name, + "Lane": str(lane), + "ExpectedSequence": expected_sequence, + "PercentMismatch": percentage_mismatch, + "PercentQ30": percentage_q30, + "PercentQ40": percentage_q40, + "QualityScoreMean": quality_score_mean, + } + ) else: continue return project_runstats - # Aggregate stats in IndexAssignment.csv def aggregate_stats_assigned(self, demux_runmanifest): aggregated_assigned_indexes = [] - sub_demux_list = sorted(list(set(sample['sub_demux_count'] for sample in demux_runmanifest))) - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + sub_demux_list = sorted( + list(set(sample["sub_demux_count"] for sample in demux_runmanifest)) + ) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for sub_demux in sub_demux_list: # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean # Note that Element promised that they would include these stats into IndexAssignment.csv # But for now we have to do this by ourselves in 
this hard way project_runstats = self.get_project_runstats(sub_demux, demux_runmanifest) # Read in IndexAssignment.csv - assigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv") + assigned_csv = os.path.join( + self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv" + ) if os.path.exists(assigned_csv): - with open(assigned_csv, 'r') as assigned_file: + with open(assigned_csv) as assigned_file: reader = csv.DictReader(assigned_file) index_assignment = [row for row in reader] for sample in index_assignment: - if sample['Lane'] in lanes: - project_runstats_sample = [d for d in project_runstats if d['SampleName'] == sample['SampleName'] and d['Lane'] == sample['Lane'] and d['ExpectedSequence'] == sample['I1']+sample['I2']] - sample['sub_demux_count'] = sub_demux - sample['PercentMismatch'] = project_runstats_sample[0]['PercentMismatch'] - sample['PercentQ30'] = project_runstats_sample[0]['PercentQ30'] - sample['PercentQ40'] = project_runstats_sample[0]['PercentQ40'] - sample['QualityScoreMean'] = project_runstats_sample[0]['QualityScoreMean'] + if sample["Lane"] in lanes: + project_runstats_sample = [ + d + for d in project_runstats + if d["SampleName"] == sample["SampleName"] + and d["Lane"] == sample["Lane"] + and d["ExpectedSequence"] == sample["I1"] + sample["I2"] + ] + sample["sub_demux_count"] = sub_demux + sample["PercentMismatch"] = project_runstats_sample[0][ + "PercentMismatch" + ] + sample["PercentQ30"] = project_runstats_sample[0]["PercentQ30"] + sample["PercentQ40"] = project_runstats_sample[0]["PercentQ40"] + sample["QualityScoreMean"] = project_runstats_sample[0][ + "QualityScoreMean" + ] aggregated_assigned_indexes.append(sample) else: - logger.warning(f"No {os.path.basename(assigned_csv)} file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No {os.path.basename(assigned_csv)} file found for sub-demultiplexing {sub_demux}." 
+ ) # Remove redundant rows for PhiX aggregated_assigned_indexes_filtered = [] unique_phiX_combination = set() for sample in aggregated_assigned_indexes: # Add project name - sample['Project'] = [d for d in demux_runmanifest if d['SampleName'] == sample['SampleName']][0]['Project'] - if sample['SampleName'] == 'PhiX': - combination = (sample['I1'], sample['I2'], sample['Lane']) + sample["Project"] = [ + d for d in demux_runmanifest if d["SampleName"] == sample["SampleName"] + ][0]["Project"] + if sample["SampleName"] == "PhiX": + combination = (sample["I1"], sample["I2"], sample["Lane"]) if combination not in unique_phiX_combination: aggregated_assigned_indexes_filtered.append(sample) unique_phiX_combination.add(combination) else: aggregated_assigned_indexes_filtered.append(sample) # Sort the list by Lane, SampleName and sub_demux_count - aggregated_assigned_indexes_filtered_sorted = sorted(aggregated_assigned_indexes_filtered, key=lambda x: (x['Lane'], x['SampleName'], x['sub_demux_count'])) + aggregated_assigned_indexes_filtered_sorted = sorted( + aggregated_assigned_indexes_filtered, + key=lambda x: (x["Lane"], x["SampleName"], x["sub_demux_count"]), + ) # Fix new sample number based on SampleName and Lane sample_count = 0 - previous_samplename_lane = ('NA', 'NA') + previous_samplename_lane = ("NA", "NA") for sample in aggregated_assigned_indexes_filtered_sorted: - if (sample['SampleName'], sample['Lane']) != previous_samplename_lane: + if (sample["SampleName"], sample["Lane"]) != previous_samplename_lane: sample_count += 1 - previous_samplename_lane = (sample['SampleName'], sample['Lane']) - sample['SampleNumber'] = sample_count + previous_samplename_lane = (sample["SampleName"], sample["Lane"]) + sample["SampleNumber"] = sample_count # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_assigned_indexes_csv = os.path.join(self.run_dir, self.demux_dir, "IndexAssignment.csv") - self.write_to_csv(aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv) - + aggregated_assigned_indexes_csv = os.path.join( + self.run_dir, self.demux_dir, "IndexAssignment.csv" + ) + self.write_to_csv( + aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv + ) # Aggregate stats in UnassignedSequences.csv def aggregate_stats_unassigned(self, demux_runmanifest): aggregated_unassigned_indexes = [] - lanes = sorted(list(set(sample['Lane'] for sample in demux_runmanifest))) + lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: sub_demux_index_lens = set() for sample in demux_runmanifest: - if sample['Lane'] == lane: - sub_demux_index_lens.add((sample['sub_demux_count'], (len(sample.get("Index1", "")), len(sample.get("Index2", ""))))) + if sample["Lane"] == lane: + sub_demux_index_lens.add( + ( + sample["sub_demux_count"], + ( + len(sample.get("Index1", "")), + len(sample.get("Index2", "")), + ), + ) + ) # List of sub-demux with a decreasing order of index lengths - sub_demux_list = [x[0] for x in sorted(sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True)] + sub_demux_list = [ + x[0] + for x in sorted( + sub_demux_index_lens, key=lambda x: sum(x[1]), reverse=True + ) + ] sub_demux_with_max_index_lens = sub_demux_list[0] # Start with the unassigned list with the longest index - max_unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux_with_max_index_lens}", "UnassignedSequences.csv") + max_unassigned_csv = os.path.join( + self.run_dir, + 
f"Demultiplexing_{sub_demux_with_max_index_lens}", + "UnassignedSequences.csv", + ) if os.path.exists(max_unassigned_csv): - with open(max_unassigned_csv, 'r') as max_unassigned_file: + with open(max_unassigned_csv) as max_unassigned_file: reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: - logger.warning(f"No {os.path.basename(max_unassigned_csv)} file found for sub-demultiplexing {sub_demux_with_max_index_lens}.") + logger.warning( + f"No {os.path.basename(max_unassigned_csv)} file found for sub-demultiplexing {sub_demux_with_max_index_lens}." + ) break # Filter by lane - max_unassigned_indexes = [idx for idx in max_unassigned_indexes if idx["Lane"] == lane] + max_unassigned_indexes = [ + idx for idx in max_unassigned_indexes if idx["Lane"] == lane + ] # Complicated case with multiple demuxes. Take the full list if there is only one sub-demux otherwise if len(sub_demux_list) > 1: # Order: from longer to shorter indexes sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: - unassigned_csv = os.path.join(self.run_dir, f"Demultiplexing_{sub_demux}", "UnassignedSequences.csv") + unassigned_csv = os.path.join( + self.run_dir, + f"Demultiplexing_{sub_demux}", + "UnassignedSequences.csv", + ) if os.path.exists(unassigned_csv): - with open(unassigned_csv, 'r') as unassigned_file: + with open(unassigned_csv) as unassigned_file: reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: - logger.warning(f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}.") + logger.warning( + f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}." + ) continue # Filter by lane - unassigned_indexes = [unassigned_index for unassigned_index in unassigned_indexes if unassigned_index["Lane"] == lane] + unassigned_indexes = [ + unassigned_index + for unassigned_index in unassigned_indexes + if unassigned_index["Lane"] == lane + ] # Remove overlapped indexes from the list of max_unassigned_indexes - idx1_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][0], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][0]) - idx2_overlapped_len = min([demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux][0][1], - [demux_lens_pair[1] for demux_lens_pair in sub_demux_index_lens if demux_lens_pair[0] == sub_demux_with_max_index_lens][0][1]) + idx1_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][0], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][0], + ) + idx2_overlapped_len = min( + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux + ][0][1], + [ + demux_lens_pair[1] + for demux_lens_pair in sub_demux_index_lens + if demux_lens_pair[0] == sub_demux_with_max_index_lens + ][0][1], + ) for unassigned_index in unassigned_indexes: - idx1_overlapped_seq = unassigned_index['I1'][:idx1_overlapped_len] - idx2_overlapped_seq = unassigned_index['I2'][:idx2_overlapped_len] + idx1_overlapped_seq = unassigned_index["I1"][ + :idx1_overlapped_len + ] + idx2_overlapped_seq = unassigned_index["I2"][ + :idx2_overlapped_len + ] # Remove 
the overlapped record from the max_unassigned_indexes list - max_unassigned_indexes = [max_unassigned_index for max_unassigned_index in max_unassigned_indexes if not (max_unassigned_index['I1'][:idx1_overlapped_len] == idx1_overlapped_seq and max_unassigned_index['I2'][:idx2_overlapped_len] == idx2_overlapped_seq)] + max_unassigned_indexes = [ + max_unassigned_index + for max_unassigned_index in max_unassigned_indexes + if not ( + max_unassigned_index["I1"][:idx1_overlapped_len] + == idx1_overlapped_seq + and max_unassigned_index["I2"][:idx2_overlapped_len] + == idx2_overlapped_seq + ) + ] # Append to the aggregated_unassigned_indexes list aggregated_unassigned_indexes += max_unassigned_indexes # Sort aggregated_unassigned_indexes list first by lane and then by Count in the decreasing order - aggregated_unassigned_indexes = sorted(aggregated_unassigned_indexes, key=lambda x: (x['Lane'], -int(x['Count']))) + aggregated_unassigned_indexes = sorted( + aggregated_unassigned_indexes, key=lambda x: (x["Lane"], -int(x["Count"])) + ) # Fetch PFCount for each lane pfcount_lane = {} if os.path.exists(self.run_stats_file): @@ -759,15 +910,22 @@ def aggregate_stats_unassigned(self, demux_runmanifest): # Modify the % Polonies values based on PFCount for each lane for unassigned_index in aggregated_unassigned_indexes: if pfcount_lane.get(unassigned_index["Lane"]): - unassigned_index["% Polonies"] = float(unassigned_index["Count"])/pfcount_lane[unassigned_index["Lane"]]*100 + unassigned_index["% Polonies"] = ( + float(unassigned_index["Count"]) + / pfcount_lane[unassigned_index["Lane"]] + * 100 + ) else: - logger.warning(f"No {os.path.basename(self.run_stats_file)} file found for the run.") + logger.warning( + f"No {os.path.basename(self.run_stats_file)} file found for the run." 
+ ) # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_unassigned_csv = os.path.join(self.run_dir, self.demux_dir, "UnassignedSequences.csv") + aggregated_unassigned_csv = os.path.join( + self.run_dir, self.demux_dir, "UnassignedSequences.csv" + ) self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) - # Aggregate demux results def aggregate_demux_results(self, demux_results_dirs): # Ensure the destination directory exists @@ -786,12 +944,15 @@ def aggregate_demux_results(self, demux_results_dirs): self.aggregate_stats_unassigned(demux_runmanifest) def sync_metadata(self): - files_to_copy = [self.run_stats_file, - os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), - os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), - self.run_parameters_file, - ] - metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") # TODO: add to taca.yaml + files_to_copy = [ + self.run_stats_file, + os.path.join(self.run_dir, "Demultiplexing", "IndexAssignment.csv"), + os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), + self.run_parameters_file, + ] + metadata_archive = self.CONFIG.get("element_analysis").get( + "metadata_location" + ) # TODO: add to taca.yaml dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: @@ -802,8 +963,8 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - transfer_details = ( - self.CONFIG.get("element_analysis").get("transfer_details") + transfer_details = self.CONFIG.get("element_analysis").get( + "transfer_details" ) # TODO: Add section to taca.yaml command = ( "rsync" @@ -830,7 +991,7 @@ def transfer(self): return def remove_transfer_indicator(self): - transfer_indicator = os.path.join(self.run_dir, '.rsync_ongoing') + transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") Path(transfer_indicator).unlink() def update_transfer_log(self): @@ -845,7 +1006,9 @@ def update_transfer_log(self): raise OSError(msg) def update_paths_after_archiving(self, new_location): - self.run_dir = os.path.join(new_location, self.NGI_run_id) # Needs to be redirected to new location so that TACA can find files to upload to statusdb + self.run_dir = os.path.join( + new_location, self.NGI_run_id + ) # Needs to be redirected to new location so that TACA can find files to upload to statusdb self.run_parameters_file = os.path.join(self.run_dir, "RunParameters.json") self.run_stats_file = os.path.join(self.run_dir, "AvitiRunStats.json") self.run_manifest_file_from_instrument = os.path.join( From 43d8316680c55eb695a7e98bf173f85a91dc96c9 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 1 Oct 2024 14:06:20 +0200 Subject: [PATCH 097/187] Add cycles in runparameters --- taca/element/Element_Runs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e6264396..40aa0854 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -114,6 +114,7 @@ def parse_run_parameters(self) -> None: "RunType" ) # Sequencing, wash or prime I believe? 
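# For orientation, a minimal standalone sketch of the "Cycles" lookup this
# patch adds. The key names ("Cycles", "R1", "R2", "I1", "I2") are taken from
# the diff itself; the file path is hypothetical, and the zero-cycle defaults
# mirror the fallback used in parse_run_parameters() above.
import json

with open("RunParameters.json") as json_file:  # hypothetical path inside an Aviti run dir
    run_parameters = json.load(json_file)

# Fall back to zero cycles per read segment if the instrument did not record them.
cycles = run_parameters.get("Cycles", {"R1": 0, "R2": 0, "I1": 0, "I2": 0})
print(cycles["I1"])  # e.g. 8 for an 8 nt index read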
self.flowcell_id = run_parameters.get("FlowcellID") + self.cycles = run_parameters.get("Cycles", {'R1': 0, 'R2': 0, 'I1': 0, 'I2': 0}) self.instrument_name = run_parameters.get("InstrumentName") self.date = run_parameters.get("Date")[0:10].replace("-", "") self.year = self.date[0:4] From 13340433a8c6dbda3ad690315a41d2950329e3a0 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:23:50 +0200 Subject: [PATCH 098/187] polishing, adapt masks to true cycles --- taca/element/Element_Runs.py | 71 +++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6f0ecd01..df48bdca 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -18,17 +18,27 @@ logger = logging.getLogger(__name__) -def get_mask(seq: str, mask_type: str, which_index: int) -> str: +def get_mask( + seq: str, + mask_type: str, + prefix: str, + cycles_used: int | None = None, +) -> str: """Example usage: - get_mask("ACGTACGTNNNNNNNN", "umi", 1) -> 'I1:N8Y8' - get_mask("ACGTACGTNNNNNNNN", "index", 2) -> 'I2:Y8N8' + get_mask("ACGTNNN", "umi", "I1:", None) -> 'I1:N4Y3' + get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N3N3' """ # Input assertions assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" - assert which_index in [1, 2], "Index number must be 1 or 2" + assert prefix in [ + "R1:", + "R2:", + "I1:", + "I2:", + ], f"Mask prefix {prefix} not recognized" # Define dict to convert base to mask classifier base2mask = ( @@ -50,7 +60,7 @@ def get_mask(seq: str, mask_type: str, which_index: int) -> str: ) # Dynamically build the mask sequence - mask_seq = "I1:" if which_index == 1 else "I2:" + mask_seq = prefix current_group = "" current_group_len = 0 for letter in seq: @@ -78,6 +88,11 @@ def get_mask(seq: str, mask_type: str, which_index: int) -> str: seq ), f"Length of mask '{mask_seq}' does not match length of input seq '{seq}'" + # TODO update this when we get the actual cycles used from the run parameters + if cycles_used is not None: + if cycles_used > len(mask_seq): + mask_seq += f"N{cycles_used-len(mask_seq)}" + return mask_seq @@ -428,23 +443,20 @@ def make_demux_manifests( df_samples = df[df["Project"] != "Control"].copy() df_controls = df[df["Project"] == "Control"].copy() - # Apply default dir path for output - if outdir is None: - outdir = self.run_dir - - ## Build composite manifests - manifest_root_name = f"{self.NGI_run_id}_demux" - # Bool indicating whether UMI is present df_samples["has_umi"] = df_samples["Index2"].str.contains("N") # Add cols denoting idx and umi masks - for n in [1, 2]: - df_samples[f"I{n}Mask"] = df_samples[f"Index{n}"].apply( - lambda seq: get_mask(seq, "index", n) - ) + df_samples["I1Mask"] = df_samples[ + "Index1" + ].apply( # TODO get cycles from run parameters + lambda seq: get_mask(seq, "index", "I1:", None) + ) + df_samples["I2Mask"] = df_samples["Index2"].apply( + lambda seq: get_mask(seq, "index", "I2:", None) + ) df_samples["UmiMask"] = df_samples["Index2"].apply( - lambda seq: get_mask(seq, "umi", 2) + lambda seq: get_mask(seq, "umi", "I2:", None) ) # Re-make idx col without Ns @@ -453,11 +465,20 @@ def make_demux_manifests( lambda x: x.replace("N", "") ) - # Break down by masks and lane, creating composite manifests + # Apply default dir path for output + if outdir is None: + outdir = self.run_dir + + # Break down into groups by non-consolable 
properties + grouped_df = df_samples.groupby( + ["I1Mask", "I2Mask", "UmiMask", "Lane", "Recipe"] + ) + + # Iterate over groups to build composite manifests + manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Lane"]) - for (I1Mask, I2Mask, UmiMask, lane), group in grouped_df: + for (I1Mask, I2Mask, UmiMask, lane, recipe), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( @@ -466,16 +487,22 @@ def make_demux_manifests( "KeyName, Value", f'manifest_file, "{file_name}"', f"manifest_group, {n+1}/{len(grouped_df)}", - f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane}", + f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane} recipe:'{recipe}'", ] ) + recipe_split = recipe.split("-") + R1Mask = f"R1:Y{recipe_split[0]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters + R2Mask = f"R2:Y{recipe_split[3]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters + settings_section = "\n".join( [ "[SETTINGS]", "SettingName, Value", + f"R1Mask, {R1Mask}", f"I1Mask, {I1Mask}", f"I2Mask, {I2Mask}", + f"R2Mask, {R2Mask}", ] ) @@ -483,7 +510,7 @@ def make_demux_manifests( settings_section += "\n" + "\n".join( [ f"UmiMask, {UmiMask}", - "UmiFastQ, True", + "UmiFastQ, TRUE", ] ) From a1c7cceed5c05c3d0337a16004437365de897642 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:32:06 +0200 Subject: [PATCH 099/187] ruff --- taca/analysis/analysis_element.py | 14 ++++++-------- taca/element/Aviti_Runs.py | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 72c3c74d..eb15a8a4 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -45,9 +45,7 @@ def _process(run): os.mkdir(run.demux_dir) run.copy_manifests() run_manifests = glob.glob( - os.path.join( - run.run_dir, "RunManifest_*.csv" - ) + os.path.join(run.run_dir, "RunManifest_*.csv") ) sub_demux_count = 0 for run_manifest in run_manifests.sort(): @@ -96,7 +94,9 @@ def _process(run): run.status = "transferring" if run.status_changed: run.update_statusdb() - logger.info(f"{run} is being transferred. Skipping.") # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" + logger.info( + f"{run} is being transferred. Skipping." + ) # TODO: fix formatting, currently prints "ElementRun(20240910_AV242106_B2403418431) is being transferred" return elif transfer_status == "rsync done": if run.rsync_successful(): @@ -120,7 +120,7 @@ def _process(run): else: logger.warning( f"Unknown transfer status {transfer_status} of run {run}. 
Please investigate" - ) # TODO: email warning to operator + ) # TODO: email warning to operator return if given_run: @@ -132,9 +132,7 @@ def _process(run): ) # TODO: add to config for data_dir in data_dirs: # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name - runs = glob.glob( - os.path.join(data_dir, "[1-9]*_*_*") - ) + runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*")) for run in runs: runObj = Aviti_Run(run, CONFIG) try: diff --git a/taca/element/Aviti_Runs.py b/taca/element/Aviti_Runs.py index cbe923ae..63b01bf7 100644 --- a/taca/element/Aviti_Runs.py +++ b/taca/element/Aviti_Runs.py @@ -5,4 +5,4 @@ class Aviti_Run(Run): def __init__(self, run_dir, configuration): self.sequencer_type = "Aviti" self.demux_dir = "Demultiplexing" - super().__init__(run_dir, configuration) \ No newline at end of file + super().__init__(run_dir, configuration) From 9197ea4cc3611d66c8f7d409a91107fd308cc06b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:34:40 +0200 Subject: [PATCH 100/187] ruff check --- taca/element/Element_Runs.py | 12 ++++++------ tests/element/test_Element_Runs.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index df48bdca..63ff4c4f 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -575,6 +575,7 @@ def start_demux(self, run_manifest, demux_dir): logger.info( "Bases2Fastq conversion and demultiplexing " f"started for run {self} on {datetime.now()}" + f"with p_handle {p_handle}" ) except subprocess.CalledProcessError: logger.warning( @@ -648,9 +649,7 @@ def collect_demux_runmanifest(self, demux_results_dirs): demux_runmanifest = [] for demux_dir in demux_results_dirs: sub_demux_count = os.path.basename(demux_dir).split("_")[1] - with open( - os.path.join(self.run_dir, demux_dir, "RunManifest.csv"), "r" - ) as file: + with open(os.path.join(self.run_dir, demux_dir, "RunManifest.csv")) as file: lines = file.readlines() sample_section = False headers = [] @@ -834,7 +833,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): self.run_dir, f"Demultiplexing_{sub_demux}", "IndexAssignment.csv" ) if os.path.exists(assigned_csv): - with open(assigned_csv, "r") as assigned_file: + with open(assigned_csv) as assigned_file: reader = csv.DictReader(assigned_file) index_assignment = [row for row in reader] for sample in index_assignment: @@ -924,7 +923,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): "UnassignedSequences.csv", ) if os.path.exists(max_unassigned_csv): - with open(max_unassigned_csv, "r") as max_unassigned_file: + with open(max_unassigned_csv) as max_unassigned_file: reader = csv.DictReader(max_unassigned_file) max_unassigned_indexes = [row for row in reader] else: @@ -947,7 +946,7 @@ def aggregate_stats_unassigned(self, demux_runmanifest): "UnassignedSequences.csv", ) if os.path.exists(unassigned_csv): - with open(unassigned_csv, "r") as unassigned_file: + with open(unassigned_csv) as unassigned_file: reader = csv.DictReader(unassigned_file) unassigned_indexes = [row for row in reader] else: @@ -1072,6 +1071,7 @@ def transfer(self): logger.info( "Transfer to analysis cluster " f"started for run {self} on {datetime.now()}" + f"with p_handle {p_handle}" ) except subprocess.CalledProcessError: logger.warning( diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 23914a7d..2ebd777f 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ 
-77,7 +77,7 @@ def create_element_run_dir( open( os.path.join( run_path, - f"Demultiplexing", + "Demultiplexing", "RunStats.json", ), "w", From afe2af67134d1dd08c0b9716fd2fd611e089beb2 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 1 Oct 2024 14:47:39 +0200 Subject: [PATCH 101/187] mypy fix --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 90dc6c33..ce7c1c39 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -410,7 +410,7 @@ def copy_manifests(self) -> bool: def make_demux_manifests( self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None - ) -> list[os.PathLike]: + ) -> list[str]: """Derive composite demultiplexing manifests from a single information-rich manifest. """ From 4b1610c590e52a64645ca43c06a15de069f3f7a5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 2 Oct 2024 11:55:47 +0200 Subject: [PATCH 102/187] use cycles from runparam for mask generation --- taca/element/Element_Runs.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e5867c4b..42947355 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -192,7 +192,7 @@ def parse_run_parameters(self) -> None: "RunType" ) # Sequencing, wash or prime I believe? self.flowcell_id = run_parameters.get("FlowcellID") - self.cycles = run_parameters.get("Cycles", {'R1': 0, 'R2': 0, 'I1': 0, 'I2': 0}) + self.cycles = run_parameters.get("Cycles", {"R1": 0, "R2": 0, "I1": 0, "I2": 0}) self.instrument_name = run_parameters.get("InstrumentName") self.date = run_parameters.get("Date")[0:10].replace("-", "") self.year = self.date[0:4] @@ -448,16 +448,29 @@ def make_demux_manifests( df_samples["has_umi"] = df_samples["Index2"].str.contains("N") # Add cols denoting idx and umi masks - df_samples["I1Mask"] = df_samples[ - "Index1" - ].apply( # TODO get cycles from run parameters - lambda seq: get_mask(seq, "index", "I1:", None) + df_samples["I1Mask"] = df_samples["Index1"].apply( + lambda seq: get_mask( + seq=seq, + mask_type="index", + prefix="I1:", + cycles_used=self.cycles["I1"], + ) ) df_samples["I2Mask"] = df_samples["Index2"].apply( - lambda seq: get_mask(seq, "index", "I2:", None) + lambda seq: get_mask( + seq=seq, + mask_type="index", + prefix="I2:", + cycles_used=self.cycles["I2"], + ) ) df_samples["UmiMask"] = df_samples["Index2"].apply( - lambda seq: get_mask(seq, "umi", "I2:", None) + lambda seq: get_mask( + seq=seq, + mask_type="umi", + prefix="I2:", + cycles_used=self.cycles["I2"], + ) ) # Re-make idx col without Ns From 298bcbee6a055e8fcf93cc34a5682686e289b359 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 2 Oct 2024 12:18:11 +0200 Subject: [PATCH 103/187] readability improvements, use runparam cycles for r1/r2 masks --- taca/element/Element_Runs.py | 71 ++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 42947355..07efea97 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -22,12 +22,12 @@ def get_mask( seq: str, mask_type: str, prefix: str, - cycles_used: int | None = None, + cycles_used: int, ) -> str: """Example usage: get_mask("ACGTNNN", "umi", "I1:", None) -> 'I1:N4Y3' - get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N3N3' + get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N6' """ # 
Input assertions @@ -59,41 +59,50 @@ def get_mask( } ) - # Dynamically build the mask sequence - mask_seq = prefix + # Instantiate the mask string + mask = "" + # Add the prefix + mask += prefix + # Loop through the input sequence and dynamically add mask groups current_group = "" current_group_len = 0 - for letter in seq: - if base2mask[letter] == current_group: + mask_len = 0 + for base in seq: + mask_len += 1 + if base2mask[base] == current_group: current_group_len += 1 else: - mask_seq += ( + mask += ( f"{current_group}{current_group_len}" if current_group_len > 0 else "" ) - current_group = base2mask[letter] + current_group = base2mask[base] current_group_len = 1 - mask_seq += f"{current_group}{current_group_len}" - - # Use the worlds ugliest string parsing to check that the mask length matches the input sequence length - assert sum( - [ - int(n) - for n in mask_seq[3:] - .replace("N", "-") - .replace("Y", "-") - .strip("-") - .split("-") - ] - ) == len( - seq - ), f"Length of mask '{mask_seq}' does not match length of input seq '{seq}'" - - # TODO update this when we get the actual cycles used from the run parameters - if cycles_used is not None: - if cycles_used > len(mask_seq): - mask_seq += f"N{cycles_used-len(mask_seq)}" + # For the last mask group, check if we need to pad with Ns to match the number of cycles used + if cycles_used > mask_len: + diff = cycles_used - mask_len + if current_group == "N": + current_group_len += diff + mask += f"{current_group}{current_group_len}" + else: + mask += f"{current_group}{current_group_len}" + mask += f"N{diff}" + + # Parse mask string to check that it matches the number of cycles used + assert ( + sum( + [ + int(n) + for n in mask[3:] + .replace("N", "-") + .replace("Y", "-") + .strip("-") + .split("-") + ] + ) + == cycles_used + ), f"Length of mask '{mask}' does not match number of cycles used '{cycles_used}'." 
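    # Worked example of the grouping and padding logic above, traced by hand
    # for the docstring case get_mask("ACGTNNN", "index", "I2:", 10):
    #   A, C, G, T map to "Y" -> group "Y4"; N, N, N map to "N" -> group "N3",
    #   so after the loop mask == "I2:Y4N3" and mask_len == 7.
    #   cycles_used (10) > mask_len (7) and the last group is "N", so that
    #   group is padded by the difference: "N3" becomes "N6", giving "I2:Y4N6".
    # The assert then re-derives 4 + 6 == 10 from the mask string itself.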
- return mask_seq + return mask class Run: @@ -506,8 +515,8 @@ def make_demux_manifests( ) recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters - R2Mask = f"R2:Y{recipe_split[3]}N*" # TODO remove asterisk by getting de-facto cycles from run parameters + R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles["R1"]}" + R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles["R2"]}" settings_section = "\n".join( [ From 95a32580bbcb7d033d3e135e0ad3fe541a05c084 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 2 Oct 2024 12:19:48 +0200 Subject: [PATCH 104/187] syntax fix --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 07efea97..c2ac77c8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -515,8 +515,8 @@ def make_demux_manifests( ) recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles["R1"]}" - R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles["R2"]}" + R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles['R1']}" + R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles['R2']}" settings_section = "\n".join( [ From 8a28e89c6ab1e5660b631707ccc2b06bcffdf786 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 2 Oct 2024 14:14:05 +0200 Subject: [PATCH 105/187] Email warnings to operator --- taca/analysis/analysis_element.py | 33 ++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index eb15a8a4..d18e71f1 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -6,6 +6,7 @@ from taca.element.Aviti_Runs import Aviti_Run from taca.utils.config import CONFIG +from taca.utils.misc import send_mail logger = logging.getLogger(__name__) @@ -27,11 +28,16 @@ def _process(run): logger.warning( f"Cannot reliably set NGI_run_id for {run} due to missing RunParameters.json. Aborting run processing" ) + email_subject = f"Issues processing {run}" + email_message = ( + f"RunParameters.json missing for {run}. Processing was aborted." + ) + send_mail(email_subject, email_message, CONFIG["mail"]["recipients"]) raise #### Sequencing status #### sequencing_done = run.check_sequencing_status() - if not sequencing_done: # Sequencing ongoing + if not sequencing_done: run.status = "sequencing" if run.status_changed: run.update_statusdb() @@ -40,7 +46,6 @@ def _process(run): #### Demultiplexing status #### demultiplexing_status = run.get_demultiplexing_status() if demultiplexing_status == "not started": - # Sequencing done. Start demux if run.manifest_exists(): os.mkdir(run.demux_dir) run.copy_manifests() @@ -61,7 +66,11 @@ def _process(run): logger.warning( f"Run manifest is missing for {run}, demultiplexing aborted" ) - # TODO: email operator warning + email_subject = f"Issues processing {run}" + email_message = ( + f"Run manifest is missing for {run}, demultiplexing aborted" + ) + send_mail(email_subject, email_message, CONFIG["mail"]["recipients"]) return elif demultiplexing_status == "ongoing": run.status = "demultiplexing" @@ -71,8 +80,13 @@ def _process(run): elif demultiplexing_status != "finished": logger.warning( - f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate" + f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." 
             )
+            email_subject = f"Issues processing {run}"
+            email_message = (
+                f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate."
+            )
+            send_mail(email_subject, email_message, CONFIG["mail"]["recipients"])
             return
 
         #### Transfer status ####
@@ -115,12 +129,17 @@ def _process(run):
             logger.warning(
                 f"An issue occurred while transferring {run} to the analysis cluster."
             )
-            # TODO: email warning to operator
+            email_subject = f"Issues processing {run}"
+            email_message = f"An issue occurred while transferring {run} to the analysis cluster."
+            send_mail(email_subject, email_message, CONFIG["mail"]["recipients"])
             return
         else:
             logger.warning(
-                f"Unknown transfer status {transfer_status} of run {run}. Please investigate"
-            )  # TODO: email warning to operator
+                f"Unknown transfer status {transfer_status} of run {run}, please investigate."
+            )
+            email_subject = f"Issues processing {run}"
+            email_message = f"Unknown transfer status {transfer_status} of run {run}, please investigate."
+            send_mail(email_subject, email_message, CONFIG["mail"]["recipients"])
             return
 
     if given_run:
From 1e168a2505d797392b3f315a3762a9c5575a59fc Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo <sara.sjunnebo@scilifelab.se>
Date: Thu, 3 Oct 2024 09:56:03 +0200
Subject: [PATCH 106/187] Cleanup comments

---
 taca/analysis/analysis_element.py |  2 +-
 taca/element/Element_Runs.py      | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index d18e71f1..10b2adf1 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -148,7 +148,7 @@ def _process(run):
         else:
             data_dirs = CONFIG.get("element_analysis").get(
                 "data_dirs"
-            )  # TODO: add to config
+            )
         for data_dir in data_dirs:
             # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name
             runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*"))
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index c2ac77c8..9e51ae3e 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -130,7 +130,7 @@ def __init__(self, run_dir, configuration):
             .get("Element", {})
             .get(self.sequencer_type, {})
             .get("transfer_log")
-        )  # TODO: add to taca.yaml
+        )
         self.rsync_exit_file = os.path.join(self.run_dir, ".rsync_exit_status")
 
         # Instrument generated files
@@ -340,7 +340,7 @@ def get_lims_step_id(self) -> str | None:
         the ID of the LIMS step can be extracted from it.
         """
 
-        # TODO test me
+        # TODO: test me
         assert self.manifest_exists(), "Run manifest not found"
 
         with open(self.run_manifest_file_from_instrument) as csv_file:
@@ -357,7 +357,7 @@ def find_manifest_zip(self):
             self.CONFIG.get("element_analysis")
             .get("Element", {})
             .get(self.sequencer_type, {})
-            .get("manifest_zip_location"),  # TODO: add to taca.yaml
+            .get("manifest_zip_location"),
             str(self.year),
         )
@@ -575,7 +575,7 @@ def make_demux_manifests(
 
     def generate_demux_command(self, run_manifest, demux_dir):
         command = (
-            f"{self.CONFIG.get('element_analysis').get('bases2fastq')}"  # TODO: add path to bases2fastq executable to config
+            f"{self.CONFIG.get('element_analysis').get('bases2fastq')}"
             + f" {self.run_dir}"
             + f" {demux_dir}"
             + " -p 8"
             + f" -r {run_manifest}"
             + " --legacy-fastq"
             + " --force-index-orientation"
-        )  # TODO: any other options?
+ ) with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: command_file.write(command) return command @@ -1091,7 +1091,7 @@ def sync_metadata(self): ] metadata_archive = self.CONFIG.get("element_analysis").get( "metadata_location" - ) # TODO: add to taca.yaml + ) dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: @@ -1104,7 +1104,7 @@ def make_transfer_indicator(self): def transfer(self): transfer_details = self.CONFIG.get("element_analysis").get( "transfer_details" - ) # TODO: Add section to taca.yaml + ) command = ( "rsync" + " -rLav" From ec7b39035aeb649659ae767d47c76098c4c9d302 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 15:16:31 +0200 Subject: [PATCH 107/187] typo --- taca/element/Element_Runs.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 9e51ae3e..50a7e6c7 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -191,7 +191,7 @@ def parse_run_parameters(self) -> None: self.run_name = run_parameters.get("RunName") self.run_id = run_parameters.get( - "runID" + "RunID" ) # Unique hash that we don't really use self.side = run_parameters.get("Side") # SideA or SideB self.side_letter = self.side[ @@ -1089,9 +1089,7 @@ def sync_metadata(self): os.path.join(self.run_dir, "Demultiplexing", "UnassignedSequences.csv"), self.run_parameters_file, ] - metadata_archive = self.CONFIG.get("element_analysis").get( - "metadata_location" - ) + metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") dest = os.path.join(metadata_archive, self.NGI_run_id) os.makedirs(dest) for f in files_to_copy: @@ -1102,9 +1100,7 @@ def make_transfer_indicator(self): Path(transfer_indicator).touch() def transfer(self): - transfer_details = self.CONFIG.get("element_analysis").get( - "transfer_details" - ) + transfer_details = self.CONFIG.get("element_analysis").get("transfer_details") command = ( "rsync" + " -rLav" From 178683d05295638a7e12cf056a1323f2c2b7e255 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 15:59:05 +0200 Subject: [PATCH 108/187] wip --- taca/analysis/analysis_element.py | 13 +++++------- taca/element/Element_Runs.py | 34 ++++++++++++------------------- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 10b2adf1..dc0a202c 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -46,9 +46,10 @@ def _process(run): #### Demultiplexing status #### demultiplexing_status = run.get_demultiplexing_status() if demultiplexing_status == "not started": - if run.manifest_exists(): + lims_zip_path = run.find_lims_zip() + if lims_zip_path is not None: os.mkdir(run.demux_dir) - run.copy_manifests() + run.copy_manifests(lims_zip_path) run_manifests = glob.glob( os.path.join(run.run_dir, "RunManifest_*.csv") ) @@ -83,9 +84,7 @@ def _process(run): f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." ) email_subject = f"Issues processing {run}" - email_message = ( - f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." - ) + email_message = f"Unknown demultiplexing status {demultiplexing_status} of run {run}. Please investigate." 
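# send_mail here comes from taca.utils.misc and is called as
# send_mail(subject, message, recipients). As a rough, hypothetical
# illustration of what such a helper does (not TACA's actual implementation),
# a minimal smtplib-based sketch could look like this:
import smtplib
from email.message import EmailMessage


def send_mail_sketch(subject: str, message: str, recipients) -> None:
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = "taca@example.org"  # hypothetical sender address
    # recipients may be a single address or a list, as in the config above.
    msg["To"] = recipients if isinstance(recipients, str) else ", ".join(recipients)
    msg.set_content(message)
    # Assumes a mail relay listening on localhost.
    with smtplib.SMTP("localhost") as smtp:
        smtp.send_message(msg)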
send_mail(email_subject, email_message, CONFIG["mail"]["recipients"]) return @@ -146,9 +145,7 @@ def _process(run): run = Aviti_Run(given_run, CONFIG) _process(run) else: - data_dirs = CONFIG.get("element_analysis").get( - "data_dirs" - ) + data_dirs = CONFIG.get("element_analysis").get("data_dirs") for data_dir in data_dirs: # Run folder looks like DATE_*_*, the last section is the FC side (A/B) and name runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*")) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 50a7e6c7..9bcc2c7e 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -331,27 +331,19 @@ def update_statusdb(self): doc_obj = self.to_doc_obj() self.db.upload_to_statusdb(doc_obj) - def manifest_exists(self): - zip_src_path = self.find_manifest_zip() - return os.path.isfile(zip_src_path) - def get_lims_step_id(self) -> str | None: """If the run was started using a LIMS-generated manifest, the ID of the LIMS step can be extracted from it. """ - # TODO: test me + with open(self.run_manifest_file_from_instrument) as json_file: + manifest_json = json.load(json_file) + + lims_step_id = manifest_json.get("RunValues").get("lims_step_id") - assert self.manifest_exists(), "Run manifest not found" - with open(self.run_manifest_file_from_instrument) as csv_file: - manifest_lines = csv_file.readlines() - for line in manifest_lines: - if "lims_step_id" in line: - lims_step_id = line.split(",")[1] - return lims_step_id - return None + return lims_step_id - def find_manifest_zip(self): + def find_lims_zip(self) -> str | None: # Specify dir in which LIMS drop the manifest zip files dir_to_search = os.path.join( self.CONFIG.get("element_analysis") @@ -362,7 +354,8 @@ def find_manifest_zip(self): ) # Use LIMS step ID if available, else flowcell ID, to make a query pattern - if self.lims_step_id: + self.lims_step_id = self.get_lims_step_id() + if self.lims_step_id is not None: logging.info( f"Using LIMS step ID '{self.lims_step_id}' to find LIMS run manifests." ) @@ -379,20 +372,19 @@ def find_manifest_zip(self): logger.warning( f"No manifest found for run '{self.run_dir}' with pattern '{glob_pattern}'." ) - return False # TODO: determine whether to raise an error here instead + return None elif len(glob_results) > 1: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." 
) glob_results.sort() - zip_src_path = glob_results[-1] + lims_zip_src_path = glob_results[-1] else: - zip_src_path = glob_results[0] - return zip_src_path + lims_zip_src_path = glob_results[0] + return lims_zip_src_path - def copy_manifests(self) -> bool: + def copy_manifests(self, zip_src_path) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" - zip_src_path = self.find_manifest_zip() # Make a run subdir named after the zip file and extract manifests there zip_name = os.path.basename(zip_src_path) zip_dst_path = os.path.join(self.run_dir, zip_name) From 602b14f522541f32477781404e26a7f8223b4277 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 16:57:15 +0200 Subject: [PATCH 109/187] troubleshooting --- taca/analysis/analysis_element.py | 8 +++---- taca/element/Element_Runs.py | 37 ++++++++++++++----------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index dc0a202c..6e4c0a71 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -50,14 +50,14 @@ def _process(run): if lims_zip_path is not None: os.mkdir(run.demux_dir) run.copy_manifests(lims_zip_path) - run_manifests = glob.glob( - os.path.join(run.run_dir, "RunManifest_*.csv") + demux_manifests = run.make_demux_manifests( + manifest_to_split=run.lims_manifest ) sub_demux_count = 0 - for run_manifest in run_manifests.sort(): + for demux_manifest in demux_manifests.sort(): demux_dir = f"Demultiplexing_{sub_demux_count}" os.mkdir(demux_dir) - run.start_demux(run_manifest, demux_dir) + run.start_demux(demux_manifest, demux_dir) sub_demux_count += 1 run.status = "demultiplexing" if run.status_changed: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 9bcc2c7e..bf804045 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -86,6 +86,8 @@ def get_mask( else: mask += f"{current_group}{current_group_len}" mask += f"N{diff}" + else: + mask += f"{current_group}{current_group_len}" # Parse mask string to check that it matches the number of cycles used assert ( @@ -386,29 +388,24 @@ def find_lims_zip(self) -> str | None: def copy_manifests(self, zip_src_path) -> bool: """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" # Make a run subdir named after the zip file and extract manifests there - zip_name = os.path.basename(zip_src_path) - zip_dst_path = os.path.join(self.run_dir, zip_name) - os.mkdir(zip_dst_path) + # Extract the contents of the zip file into the destination directory + unzipped_manifests = [] with zipfile.ZipFile(zip_src_path, "r") as zip_ref: - zip_ref.extractall(zip_dst_path) - - # Set the paths of the different manifests as attributes - manifests = os.listdir(zip_dst_path) - self.lims_full_manifest = [ - m for m in manifests if re.match(r".*_untrimmed\.csv$", m) - ][0] - self.lims_start_manifest = [ - m for m in manifests if re.match(r".*_trimmed\.csv$", m) - ][0] - self.lims_empty_manifest = [ - m for m in manifests if re.match(r".*_empty\.csv$", m) + for member in zip_ref.namelist(): + # Extract each file individually into the destination directory + filename = os.path.basename(member) + if filename: # Skip directories + source = zip_ref.open(member) + target = open(os.path.join(self.run_dir, filename), "wb") + unzipped_manifests.append(target.name) + with source, target: + target.write(source.read()) + + # Pick out the manifest to use + 
self.lims_manifest = [ + m for m in unzipped_manifests if re.match(r".*_untrimmed\.csv$", m) ][0] - self.lims_demux_manifests = [ - m for m in manifests if re.match(r".*_\d+\.csv$", m) - ] - - return True def make_demux_manifests( self, manifest_to_split: os.PathLike, outdir: os.PathLike | None = None From 102f223f05b9f621935065f602f53584580e211b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 3 Oct 2024 17:02:01 +0200 Subject: [PATCH 110/187] mypy --- taca/element/Element_Runs.py | 2 +- tests/element/test_Element_Runs.py | 24 ------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index bf804045..e8d41dba 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -385,7 +385,7 @@ def find_lims_zip(self) -> str | None: lims_zip_src_path = glob_results[0] return lims_zip_src_path - def copy_manifests(self, zip_src_path) -> bool: + def copy_manifests(self, zip_src_path): """Fetch the LIMS-generated run manifests from ngi-nas-ns and unzip them into a run subdir.""" # Make a run subdir named after the zip file and extract manifests there diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 2ebd777f..7963761d 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -158,30 +158,6 @@ def test_get_demultiplexing_status( assert run.get_demultiplexing_status() == p["expected"] - @pytest.mark.skip(reason="Not implemented yet") - @pytest.mark.parametrize( - "p", - [ - {"run_finished": True, "expected": True}, - {"run_finished": False, "expected": False}, - ], - ids=["exists", "does not exist"], - ) - def test_manifest_exists( - self, mock_db: mock.Mock, create_dirs: pytest.fixture, p: pytest.fixture - ): - tmp: tempfile.TemporaryDirectory = create_dirs - - run = to_test.Run( - create_element_run_dir( - tmp, - run_finished=p["run_finished"], - ), - get_config(tmp), - ) - - assert run.manifest_exists() == p["expected"] - @pytest.mark.skip(reason="Not implemented yet") def test_generate_demux_command(self, mock_db): pass From b57306d4298072a12d50bbc9ea9cea1d7fe776ba Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 4 Oct 2024 09:26:12 +0200 Subject: [PATCH 111/187] Don't group by lane when creating submanifests. Add sanity check for sample grouping. 
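
The grouping below keys submanifests on the index masks, UMI mask, and recipe
rather than on lane. A minimal sketch of the idea on a toy table (column names
taken from this diff; the data itself is made up):

    import pandas as pd

    df_samples = pd.DataFrame(
        {
            "SampleName": ["S1", "S2", "S3"],
            "I1Mask": ["I1:Y8", "I1:Y8", "I1:Y10N2"],
            "I2Mask": ["I2:Y8", "I2:Y8", "I2:Y10N2"],
            "UmiMask": ["I2:N8", "I2:N8", "I2:N12"],
            "Recipe": ["151-8-8-151"] * 3,
        }
    )

    grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Recipe"])

    # Sanity check: every sample must fall into exactly one group, so the
    # group sizes sum to the number of input rows.
    assert sum(len(group) for _, group in grouped_df) == len(df_samples)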
--- taca/element/Element_Runs.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index e8d41dba..97a9d2da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -442,10 +442,10 @@ def make_demux_manifests( df_samples = df[df["Project"] != "Control"].copy() df_controls = df[df["Project"] == "Control"].copy() - # Bool indicating whether UMI is present + # Add bool indicating whether UMI is present df_samples["has_umi"] = df_samples["Index2"].str.contains("N") - # Add cols denoting idx and umi masks + # Add masks for indices and UMIs df_samples["I1Mask"] = df_samples["Index1"].apply( lambda seq: get_mask( seq=seq, @@ -471,7 +471,7 @@ def make_demux_manifests( ) ) - # Re-make idx col without Ns + # Re-make Index2 column without any Ns df_samples["Index2_umi"] = df_samples["Index2"] df_samples.loc[:, "Index2"] = df_samples["Index2"].apply( lambda x: x.replace("N", "") @@ -482,15 +482,21 @@ def make_demux_manifests( outdir = self.run_dir # Break down into groups by non-consolable properties - grouped_df = df_samples.groupby( - ["I1Mask", "I2Mask", "UmiMask", "Lane", "Recipe"] - ) + grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Recipe"]) + + # Sanity check + if sum([len(group) for _, group in grouped_df]) < len(df_samples): + msg = "Some samples were not included in any submanifest." + logging.error(msg) + raise AssertionError(msg) + elif sum([len(group) for _, group in grouped_df]) > len(df_samples): + logging.warning("Some samples were included in multiple submanifests.") # Iterate over groups to build composite manifests manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - for (I1Mask, I2Mask, UmiMask, lane, recipe), group in grouped_df: + for (I1Mask, I2Mask, UmiMask, recipe), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( @@ -499,7 +505,7 @@ def make_demux_manifests( "KeyName, Value", f'manifest_file, "{file_name}"', f"manifest_group, {n+1}/{len(grouped_df)}", - f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' lane:{lane} recipe:'{recipe}'", + f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' recipe:'{recipe}'", ] ) @@ -527,8 +533,9 @@ def make_demux_manifests( ) # Add PhiX stratified by index length - # Subset controls by lane - group_controls = df_controls[df_controls["Lane"] == lane].copy() + group_controls = df_controls[ + df_controls["Lane"].isin(group["Lane"].unique()) + ].copy() # Trim PhiX indexes to match group i1_len = group["Index1"].apply(len).max() From ec786f15919d96f050e17dd3407ca25cee9a6b59 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 10:18:46 +0200 Subject: [PATCH 112/187] Fix method definitions --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 97a9d2da..ff1fc0ef 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -639,7 +639,7 @@ def rsync_successful(self): return False # Clear all content under a dir - def clear_dir(dir): + def clear_dir(self, dir): for filename in os.listdir(dir): file_path = os.path.join(dir, filename) try: @@ -651,7 +651,7 @@ def clear_dir(dir): print(f"Failed to delete {file_path} Reason {e}") # Write to csv - def write_to_csv(data, filename): + def write_to_csv(self, data, filename): # Get the fieldnames from the keys of the first 
dictionary fieldnames = data[0].keys() # Open the file and write the CSV From 533f448095ad76a52b3453fdcf1aefa9f06e63eb Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 10:49:07 +0200 Subject: [PATCH 113/187] Handle masking when no R2 --- taca/element/Element_Runs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index ff1fc0ef..34cea966 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -31,6 +31,8 @@ def get_mask( """ # Input assertions + if not seq: + return None assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" assert prefix in [ @@ -519,10 +521,12 @@ def make_demux_manifests( "SettingName, Value", f"R1Mask, {R1Mask}", f"I1Mask, {I1Mask}", - f"I2Mask, {I2Mask}", f"R2Mask, {R2Mask}", ] ) + + if I2Mask: + settings_section += f"\nI2Mask, {I2Mask}" if group["has_umi"].all(): settings_section += "\n" + "\n".join( From 0b030b13f7968a8b278b70b5a8196fe0f8ad85ec Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 12:02:03 +0200 Subject: [PATCH 114/187] Return empty string instead of None for missing I2 --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 34cea966..6550756c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -32,7 +32,7 @@ def get_mask( # Input assertions if not seq: - return None + return "" assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" assert prefix in [ @@ -525,7 +525,7 @@ def make_demux_manifests( ] ) - if I2Mask: + if I2Mask != "": settings_section += f"\nI2Mask, {I2Mask}" if group["has_umi"].all(): From f41778867d1314496ebc5dfeeee8711682b84881 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 13:54:04 +0200 Subject: [PATCH 115/187] fix sorting issue --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 6e4c0a71..95bacaac 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -54,7 +54,7 @@ def _process(run): manifest_to_split=run.lims_manifest ) sub_demux_count = 0 - for demux_manifest in demux_manifests.sort(): + for demux_manifest in sorted(demux_manifests): demux_dir = f"Demultiplexing_{sub_demux_count}" os.mkdir(demux_dir) run.start_demux(demux_manifest, demux_dir) From a0ec98dc6da0b37acf423f6d050a8935742291ce Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 15:27:29 +0200 Subject: [PATCH 116/187] Fixes for masks --- taca/analysis/analysis_element.py | 8 +++++--- taca/element/Element_Runs.py | 21 +++++++++++++-------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 95bacaac..f0274e47 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -55,9 +55,11 @@ def _process(run): ) sub_demux_count = 0 for demux_manifest in sorted(demux_manifests): - demux_dir = f"Demultiplexing_{sub_demux_count}" - os.mkdir(demux_dir) - run.start_demux(demux_manifest, demux_dir) + sub_demux_dir = os.path.join( + run.run_dir, f"Demultiplexing_{sub_demux_count}" + ) + 
os.mkdir(sub_demux_dir) + run.start_demux(demux_manifest, sub_demux_dir) sub_demux_count += 1 run.status = "demultiplexing" if run.status_changed: diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 6550756c..d1cd15a8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -30,9 +30,14 @@ def get_mask( get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N6' """ + if not seq and prefix == "I2:": + mask = "I2:N*" + return mask + if not seq and mask_type == "umi": + mask = "I2:Y*" + return mask + # Input assertions - if not seq: - return "" assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" assert prefix in [ @@ -519,14 +524,12 @@ def make_demux_manifests( [ "[SETTINGS]", "SettingName, Value", - f"R1Mask, {R1Mask}", + f"R1FastqMask, {R1Mask}", f"I1Mask, {I1Mask}", - f"R2Mask, {R2Mask}", + f"I2Mask, {I2Mask}" + f"R2FastqMask, {R2Mask}", ] ) - - if I2Mask != "": - settings_section += f"\nI2Mask, {I2Mask}" if group["has_umi"].all(): settings_section += "\n" + "\n".join( @@ -584,7 +587,9 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --legacy-fastq" + " --force-index-orientation" ) - with open(os.path.join(self.run_dir, ".bases2fastq_command")) as command_file: + with open( + os.path.join(self.run_dir, ".bases2fastq_command"), "w" + ) as command_file: command_file.write(command) return command From ec28bb74a24da82883a845d2d6025b6f65c01073 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Mon, 7 Oct 2024 15:39:34 +0200 Subject: [PATCH 117/187] Fixes to masks --- taca/element/Element_Runs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d1cd15a8..38d3516a 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -517,8 +517,8 @@ def make_demux_manifests( ) recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}N{self.cycles['R1']}" - R2Mask = f"R2:Y{recipe_split[3]}N{self.cycles['R2']}" + R1Mask = f"R1:Y{recipe_split[0]}" + R2Mask = f"R2:Y{recipe_split[3]}" settings_section = "\n".join( [ @@ -526,7 +526,7 @@ def make_demux_manifests( "SettingName, Value", f"R1FastqMask, {R1Mask}", f"I1Mask, {I1Mask}", - f"I2Mask, {I2Mask}" + f"I2Mask, {I2Mask}", f"R2FastqMask, {R2Mask}", ] ) From 2874583e865c8eff7b3308519fbb842a9873a84c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 7 Oct 2024 17:37:51 +0200 Subject: [PATCH 118/187] Clarify get_mask() and also use it to generate R1/R2 masks --- taca/element/Element_Runs.py | 73 ++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 38d3516a..5f9bff2c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -20,26 +20,25 @@ def get_mask( seq: str, - mask_type: str, + keep_Ns: bool, prefix: str, cycles_used: int, ) -> str: - """Example usage: - - get_mask("ACGTNNN", "umi", "I1:", None) -> 'I1:N4Y3' - get_mask("ACGTNNN", "index", "I2:", 10) -> 'I2:Y4N6' + """ + Inputs: + seq Sequence string to make mask from + keep_Ns Whether Ns should be "Y" or "N" in the mask, vice versa for ACGT + prefix Prefix to add to the mask + cycles_used Number of cycles used in the sequencing run + + Example usage: + get_mask( "ACGTNNN", True, "I1:", 7 ) -> 'I1:N4Y3' + get_mask( "ACGTNNN", False, "I2:", 10 ) -> 'I2:Y4N6' """ - if not seq and prefix == "I2:": - 
mask = "I2:N*" - return mask - if not seq and mask_type == "umi": - mask = "I2:Y*" - return mask - # Input assertions - assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" - assert mask_type in ["umi", "index"], "Mask type must be 'umi' or 'index'" + if seq != "": + assert re.match(r"^[ACGTN]+$", seq), f"Index '{seq}' has non-ACGTN characters" assert prefix in [ "R1:", "R2:", @@ -47,6 +46,11 @@ def get_mask( "I2:", ], f"Mask prefix {prefix} not recognized" + # Handle no-input cases + if seq == "": + mask = f"{prefix}N{cycles_used}" + return mask + # Define dict to convert base to mask classifier base2mask = ( { @@ -56,7 +60,7 @@ def get_mask( "G": "Y", "T": "Y", } - if mask_type == "index" + if keep_Ns is False else { "N": "Y", "A": "N", @@ -452,11 +456,11 @@ def make_demux_manifests( # Add bool indicating whether UMI is present df_samples["has_umi"] = df_samples["Index2"].str.contains("N") - # Add masks for indices and UMIs + # Add masks df_samples["I1Mask"] = df_samples["Index1"].apply( lambda seq: get_mask( seq=seq, - mask_type="index", + keep_Ns=False, prefix="I1:", cycles_used=self.cycles["I1"], ) @@ -464,7 +468,7 @@ def make_demux_manifests( df_samples["I2Mask"] = df_samples["Index2"].apply( lambda seq: get_mask( seq=seq, - mask_type="index", + keep_Ns=False, prefix="I2:", cycles_used=self.cycles["I2"], ) @@ -472,14 +476,30 @@ def make_demux_manifests( df_samples["UmiMask"] = df_samples["Index2"].apply( lambda seq: get_mask( seq=seq, - mask_type="umi", + keep_Ns=True, prefix="I2:", cycles_used=self.cycles["I2"], ) ) + df_samples["R1Mask"] = df_samples["Recipe"].apply( + lambda recipe: get_mask( + seq="N" * int(recipe.split("-")[0]), + keep_Ns=True, + prefix="R1:", + cycles_used=self.cycles["R1"], + ) + ) + df_samples["R2Mask"] = df_samples["Recipe"].apply( + lambda recipe: get_mask( + seq="N" * int(recipe.split("-")[3]), + keep_Ns=True, + prefix="R2:", + cycles_used=self.cycles["R2"], + ) + ) # Re-make Index2 column without any Ns - df_samples["Index2_umi"] = df_samples["Index2"] + df_samples["Index2_with_Ns"] = df_samples["Index2"] df_samples.loc[:, "Index2"] = df_samples["Index2"].apply( lambda x: x.replace("N", "") ) @@ -489,7 +509,9 @@ def make_demux_manifests( outdir = self.run_dir # Break down into groups by non-consolable properties - grouped_df = df_samples.groupby(["I1Mask", "I2Mask", "UmiMask", "Recipe"]) + grouped_df = df_samples.groupby( + ["I1Mask", "I2Mask", "UmiMask", "R1Mask", "R2Mask", "Recipe"] + ) # Sanity check if sum([len(group) for _, group in grouped_df]) < len(df_samples): @@ -503,23 +525,18 @@ def make_demux_manifests( manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - for (I1Mask, I2Mask, UmiMask, recipe), group in grouped_df: + for (I1Mask, I2Mask, UmiMask, R1Mask, R2Mask, recipe), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( [ "[RUNVALUES]", "KeyName, Value", - f'manifest_file, "{file_name}"', + f"manifest_file, {file_name}", f"manifest_group, {n+1}/{len(grouped_df)}", - f"grouped_by, I1Mask:'{I1Mask}' I2Mask:'{I2Mask}' UmiMask:'{UmiMask}' recipe:'{recipe}'", ] ) - recipe_split = recipe.split("-") - R1Mask = f"R1:Y{recipe_split[0]}" - R2Mask = f"R2:Y{recipe_split[3]}" - settings_section = "\n".join( [ "[SETTINGS]", From 920cc27023113948640f5f4740abb09470a27d3e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Tue, 8 Oct 2024 11:07:16 +0200 Subject: [PATCH 119/187] Fix bug that not all FastQ files are symplinked --- taca/element/Element_Runs.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5f9bff2c..50696c11 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -732,8 +732,8 @@ def collect_demux_runmanifest(self, demux_results_dirs): # Aggregate the output FastQ files of samples from multiple demux def aggregate_sample_fastq(self, demux_runmanifest): lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) - unique_sample_demux = set() for lane in lanes: + unique_sample_demux = set() sample_count = 1 for sample in demux_runmanifest: lanenr = sample["Lane"] From 6582eb9fc9a6d4d5d3e3ede66932d7332b2bdec1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 12:25:18 +0200 Subject: [PATCH 120/187] check if dir exists before creating one --- taca/element/Element_Runs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5f9bff2c..a973679d 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1113,7 +1113,8 @@ def sync_metadata(self): ] metadata_archive = self.CONFIG.get("element_analysis").get("metadata_location") dest = os.path.join(metadata_archive, self.NGI_run_id) - os.makedirs(dest) + if not os.path.exists(dest): + os.makedirs(dest) for f in files_to_copy: shutil.copy(f, dest) From 160b204421a00c04e7c9a93f35a526a6da6a2de8 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 13:49:53 +0200 Subject: [PATCH 121/187] catch stderr from bases2fastq --- taca/element/Element_Runs.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f7d0b39b..a3b8419d 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -613,14 +613,19 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) + stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" try: - p_handle = subprocess.Popen( - cmd, stdout=subprocess.PIPE, shell=True, cwd=self.run_dir - ) + with open(stderr_abspath, "w") as stderr: + process = subprocess.Popen( + cmd, + shell=True, + cwd=self.run_dir, + stderr=stderr, + ) logger.info( "Bases2Fastq conversion and demultiplexing " f"started for run {self} on {datetime.now()}" - f"with p_handle {p_handle}" + f"with p_handle {process}" ) except subprocess.CalledProcessError: logger.warning( From 401c3a9bf10b4c0971613ddc837b4353bc3d14af Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 14:14:52 +0200 Subject: [PATCH 122/187] cleanup and versioning --- VERSIONLOG.md | 9 ++------- taca/__init__.py | 2 +- taca/element/Element_Runs.py | 8 +++++--- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index bc899c3c..0832b02d 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,13 +1,8 @@ # TACA Version Log -## 20240927.1 +## 20241008.1 -Add project name in IndexAssignment; -Correct index percentage in undet - -## 20240924.1 - -Aggregate aviti demultiplexing results +Add support for processing Element Aviti data ## 20240705.1 diff --git a/taca/__init__.py b/taca/__init__.py index b85b2cf5..c516d006 100644 --- a/taca/__init__.py +++ b/taca/__init__.py @@ -1,3 +1,3 @@ """Main TACA module""" -__version__ = "1.0.0" +__version__ = "1.1.0" diff --git a/taca/element/Element_Runs.py 
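
The stderr-capture pattern the patch above introduces, reduced to a runnable sketch; the command and paths here are invented stand-ins for the real bases2fastq invocation.

import os
import subprocess

run_dir = "/tmp/example_run"  # invented path
os.makedirs(run_dir, exist_ok=True)
cmd = "echo 'demux failed' >&2"  # placeholder for the bases2fastq command line

with open(f"{run_dir}/bases2fastq_stderr.txt", "w") as stderr:
    # Popen returns immediately; the demux keeps running in the background
    # while everything it prints to stderr streams into the log file.
    process = subprocess.Popen(cmd, shell=True, cwd=run_dir, stderr=stderr)

process.wait()  # for this example only; the patched code does not block here
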
b/taca/element/Element_Runs.py index a3b8419d..38603e8c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -390,7 +390,7 @@ def find_lims_zip(self) -> str | None: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." ) - glob_results.sort() + glob_results.sort() # TODO: add CLI option to specify manifest for re-demux lims_zip_src_path = glob_results[-1] else: lims_zip_src_path = glob_results[0] @@ -621,7 +621,7 @@ def start_demux(self, run_manifest, demux_dir): shell=True, cwd=self.run_dir, stderr=stderr, - ) + ) logger.info( "Bases2Fastq conversion and demultiplexing " f"started for run {self} on {datetime.now()}" @@ -817,7 +817,9 @@ def aggregate_undet_fastq(self, demux_runmanifest): ) ) for fastqfile in fastqfiles: - base_name = os.path.basename(fastqfile) + base_name = os.path.basename( + fastqfile + ) # TODO: Make symlinks relative instead of absolute to maintain them after archiving os.symlink(fastqfile, os.path.join(project_dest, base_name)) # Read in each Project_RunStats.json to fetch PercentMismatch, PercentQ30, PercentQ40 and QualityScoreMean From 41b90eb2f53bcb5abce5424d4c9009cb238db2df Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 8 Oct 2024 14:18:15 +0200 Subject: [PATCH 123/187] spaaace --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 38603e8c..2a16c14c 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -390,7 +390,7 @@ def find_lims_zip(self) -> str | None: logger.warning( f"Multiple manifests found for run '{self.run_dir}' with pattern '{glob_pattern}', using latest one." ) - glob_results.sort() # TODO: add CLI option to specify manifest for re-demux + glob_results.sort() # TODO: add CLI option to specify manifest for re-demux lims_zip_src_path = glob_results[-1] else: lims_zip_src_path = glob_results[0] From e81460d2b6467f21b0042beee1c3810d2e019642 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 8 Oct 2024 16:38:42 +0200 Subject: [PATCH 124/187] Parse UMI masks for both I1 and I2, can only use one though --- taca/element/Element_Runs.py | 38 ++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 2a16c14c..65c41a6b 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -453,9 +453,6 @@ def make_demux_manifests( df_samples = df[df["Project"] != "Control"].copy() df_controls = df[df["Project"] == "Control"].copy() - # Add bool indicating whether UMI is present - df_samples["has_umi"] = df_samples["Index2"].str.contains("N") - # Add masks df_samples["I1Mask"] = df_samples["Index1"].apply( lambda seq: get_mask( @@ -473,7 +470,15 @@ def make_demux_manifests( cycles_used=self.cycles["I2"], ) ) - df_samples["UmiMask"] = df_samples["Index2"].apply( + df_samples["I1UmiMask"] = df_samples["Index1"].apply( + lambda seq: get_mask( + seq=seq, + keep_Ns=True, + prefix="I1:", + cycles_used=self.cycles["I1"], + ) + ) + df_samples["I2UmiMask"] = df_samples["Index2"].apply( lambda seq: get_mask( seq=seq, keep_Ns=True, @@ -510,7 +515,7 @@ def make_demux_manifests( # Break down into groups by non-consolable properties grouped_df = df_samples.groupby( - ["I1Mask", "I2Mask", "UmiMask", "R1Mask", "R2Mask", "Recipe"] + ["I1Mask", "I2Mask", "I1UmiMask", "I2UmiMask", "R1Mask", "R2Mask", "Recipe"] ) # Sanity check @@ 
-525,7 +530,15 @@ def make_demux_manifests( manifest_root_name = f"{self.NGI_run_id}_demux" manifests = [] n = 0 - for (I1Mask, I2Mask, UmiMask, R1Mask, R2Mask, recipe), group in grouped_df: + for ( + I1Mask, + I2Mask, + I1UmiMask, + I2UmiMask, + R1Mask, + R2Mask, + recipe, + ), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" runValues_section = "\n".join( @@ -548,13 +561,22 @@ def make_demux_manifests( ] ) - if group["has_umi"].all(): + if "Y" in I1UmiMask: settings_section += "\n" + "\n".join( [ - f"UmiMask, {UmiMask}", + f"UmiMask, {I1UmiMask}", "UmiFastQ, TRUE", ] ) + elif "Y" in I2UmiMask: + settings_section += "\n" + "\n".join( + [ + f"UmiMask, {I2UmiMask}", + "UmiFastQ, TRUE", + ] + ) + else: + pass # Add PhiX stratified by index length group_controls = df_controls[ From 34e260f5d92ed58682b5222556940c4c08719766 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 8 Oct 2024 16:40:47 +0200 Subject: [PATCH 125/187] stricter check --- taca/element/Element_Runs.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 65c41a6b..233249d8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -561,22 +561,24 @@ def make_demux_manifests( ] ) - if "Y" in I1UmiMask: + if "Y" in I1UmiMask and "Y" not in I2UmiMask: settings_section += "\n" + "\n".join( [ f"UmiMask, {I1UmiMask}", "UmiFastQ, TRUE", ] ) - elif "Y" in I2UmiMask: + elif "Y" in I2UmiMask and "Y" not in I1UmiMask: settings_section += "\n" + "\n".join( [ f"UmiMask, {I2UmiMask}", "UmiFastQ, TRUE", ] ) - else: + elif "Y" not in I1UmiMask and "Y" not in I2UmiMask: pass + else: + raise AssertionError("Both I1 and I2 appear to contain UMIs.") # Add PhiX stratified by index length group_controls = df_controls[ From ae961e94e500963fff9e284e3eb9ccc8172807ec Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 9 Oct 2024 13:17:50 +0200 Subject: [PATCH 126/187] overwrite group settings by settings row specified in lims manifest --- taca/element/Element_Runs.py | 60 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 233249d8..0b32accb 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -515,7 +515,15 @@ def make_demux_manifests( # Break down into groups by non-consolable properties grouped_df = df_samples.groupby( - ["I1Mask", "I2Mask", "I1UmiMask", "I2UmiMask", "R1Mask", "R2Mask", "Recipe"] + [ + "I1Mask", + "I2Mask", + "I1UmiMask", + "I2UmiMask", + "R1Mask", + "R2Mask", + "settings", + ] ) # Sanity check @@ -537,7 +545,7 @@ def make_demux_manifests( I2UmiMask, R1Mask, R2Mask, - recipe, + settings, ), group in grouped_df: file_name = f"{manifest_root_name}_{n}.csv" @@ -547,39 +555,43 @@ def make_demux_manifests( "KeyName, Value", f"manifest_file, {file_name}", f"manifest_group, {n+1}/{len(grouped_df)}", + f"built_from, {manifest_to_split}", ] ) - settings_section = "\n".join( - [ - "[SETTINGS]", - "SettingName, Value", - f"R1FastqMask, {R1Mask}", - f"I1Mask, {I1Mask}", - f"I2Mask, {I2Mask}", - f"R2FastqMask, {R2Mask}", - ] - ) + # Instantiate settings + settings_kvs = { + "R1FastqMask": R1Mask, + "I1Mask": I1Mask, + "I2Mask": I2Mask, + "R2FastqMask": R2Mask, + } + # Add UMI settings if "Y" in I1UmiMask and "Y" not in I2UmiMask: - settings_section += "\n" + "\n".join( - [ - f"UmiMask, {I1UmiMask}", - "UmiFastQ, TRUE", - ] - ) + settings_kvs["UmiMask"] = I1UmiMask + 
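
The UMI-selection branching above condenses to a small pure function. This is an illustrative sketch under the same rule (a UMI may sit in I1 or I2, never both); the function name and mask strings are invented.

def pick_umi_mask(I1UmiMask: str, I2UmiMask: str) -> str | None:
    """Return the UMI mask to use, or None when neither index read has a UMI."""
    if "Y" in I1UmiMask and "Y" not in I2UmiMask:
        return I1UmiMask
    elif "Y" in I2UmiMask and "Y" not in I1UmiMask:
        return I2UmiMask
    elif "Y" not in I1UmiMask and "Y" not in I2UmiMask:
        return None
    else:
        raise AssertionError("Both I1 and I2 appear to contain UMIs.")

assert pick_umi_mask("I1:N8", "I2:N10Y14") == "I2:N10Y14"  # UMI in I2 only
assert pick_umi_mask("I1:N8", "I2:N24") is None            # no UMI at all
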
settings_kvs["UmiFastQ"] = "TRUE" elif "Y" in I2UmiMask and "Y" not in I1UmiMask: - settings_section += "\n" + "\n".join( - [ - f"UmiMask, {I2UmiMask}", - "UmiFastQ, TRUE", - ] - ) + settings_kvs["UmiMask"] = I2UmiMask + settings_kvs["UmiFastQ"] = "TRUE" elif "Y" not in I1UmiMask and "Y" not in I2UmiMask: pass else: raise AssertionError("Both I1 and I2 appear to contain UMIs.") + # Unpack settings from LIMS manifest + for kv in settings.split(" "): + k, v = kv.split(":") + settings_kvs[k] = v + + settings_section = "\n".join( + [ + "[SETTINGS]", + "SettingName, Value", + ] + + [f"{k}, {v}" for k, v in settings.items()] + ) + # Add PhiX stratified by index length group_controls = df_controls[ df_controls["Lane"].isin(group["Lane"].unique()) From 01dd52fd7ac215785062266047a3299f94f5f634 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 9 Oct 2024 13:20:14 +0200 Subject: [PATCH 127/187] fix ref --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 0b32accb..bfb8bfc7 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -589,7 +589,7 @@ def make_demux_manifests( "[SETTINGS]", "SettingName, Value", ] - + [f"{k}, {v}" for k, v in settings.items()] + + [f"{k}, {v}" for k, v in settings_kvs.items()] ) # Add PhiX stratified by index length From f20c3d7b9fc10ab7d86ff543ab532376fc5758e7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 11:25:10 +0200 Subject: [PATCH 128/187] use tree output for docs --- tests/conftest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e9a3fd89..a4945938 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,31 +14,33 @@ def create_dirs(): │ ├── Chromium_10X_indexes.txt │ └── Smart-seq3_v1.5.csv ├── log - │ ├── transfer_minion_qc.tsv + │ ├── taca.log + │ ├── transfer.tsv + │ ├── transfer_aviti.tsv │ ├── transfer_minion.tsv + │ ├── transfer_minion_qc.tsv │ └── transfer_promethion.tsv - │ └── transfer_aviti.tsv - │ └── transfer.tsv - │ └── taca.log ├── miarka │ ├── minion │ │ └── qc │ └── promethion ├── minknow_reports ├── ngi-nas-ns + │ ├── Aviti_data │ ├── NextSeq_data │ ├── NovaSeqXPlus_data │ ├── NovaSeq_data │ ├── minion_data │ ├── miseq_data │ ├── promethion_data - │ ├── Aviti_data │ └── samplesheets + │ ├── Aviti │ ├── NovaSeqXPlus │ └── anglerfish - │ └── Aviti └── ngi_data └── sequencing + ├── AV242106 + │ └── nosync ├── MiSeq │ └── nosync ├── NextSeq @@ -52,8 +54,6 @@ def create_dirs(): │ └── qc │ └── nosync └── promethion - │ └── nosync - └── AV242106 └── nosync --> Return the the temporary directory object From 1d48b47cd493c46ce9a83d5b097e5f1745e13965 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 11:25:21 +0200 Subject: [PATCH 129/187] add files for func test --- tests/element/test_Element_Runs.py | 356 ++++++++++++++++++++++++++++- 1 file changed, 347 insertions(+), 9 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7963761d..c7dffc18 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -1,6 +1,6 @@ -import json import os import tempfile +import zipfile from unittest import mock import pytest @@ -25,7 +25,8 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: def create_element_run_dir( tmp: tempfile.TemporaryDirectory, - run_name: str = "20240716_AV242106_testrun", + run_name: str = 
"20240926_AV242106_A2349523513", + lims_manifest: bool = True, nosync: bool = False, run_finished: bool = True, sync_finished: bool = True, @@ -35,9 +36,11 @@ def create_element_run_dir( outcome_completed: bool = True, ) -> str: """ - Conditionally build a file tree for an Element run. + Build a run dir for an Element run for test purposes. - . + Some file contents are replaced with "MOCK" to shorten them. + + 20240926_AV242106_A2349523513 ├── RunManifest.csv ├── RunManifest.json ├── RunParameters.json @@ -59,14 +62,349 @@ def create_element_run_dir( run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" os.mkdir(run_path) + # Create LIMS manifest + if lims_manifest: + manifest_root_name = "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson" + manifest_pdir = f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti/2024" + + csv_path = f"{manifest_pdir}/{manifest_root_name}_untrimmed.csv" + zip_path = f"{manifest_pdir}/{manifest_root_name}.zip" + + with open(csv_path, "w") as stream: + stream.write("""[RUNVALUES] +KeyName, Value +lims_step_name, "Load to Flowcell (AVITI) v1.0" +lims_step_id, "24-1061390" +manifest_file, "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson_untrimmed.csv" + +[SETTINGS] +SettingName, Value + +[SAMPLES] +SampleName,Index1,Index2,Lane,Project,Recipe +P32105_1001,AAAGCATA,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1001,CTGCAGCC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1001,GCCTTTAT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1001,TGTAGCGG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,ATTGGACG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,CAGCTTAC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,GGCAAGGA,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1002,TCATCCTT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,ACGTTACA,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,CGTAGGTT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,GACGACGG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1003,TTACCTAC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,ACTTCACT,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,CGAAGTTG,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,GAGCACGC,,1,I__Adameyko_24_06,50-8-24-49 +P32105_1004,TTCGTGAA,,1,I__Adameyko_24_06,50-8-24-49 +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,1,Control,0-0 +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,1,Control,0-0 +PhiX_Adept,GCACATAGTC,GACTACTAGC,1,Control,0-0 +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,1,Control,0-0 +P32105_1001,AAAGCATA,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1001,CTGCAGCC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1001,GCCTTTAT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1001,TGTAGCGG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,ATTGGACG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,CAGCTTAC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,GGCAAGGA,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1002,TCATCCTT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,ACGTTACA,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,CGTAGGTT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,GACGACGG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1003,TTACCTAC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,ACTTCACT,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,CGAAGTTG,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,GAGCACGC,,2,I__Adameyko_24_06,50-8-24-49 +P32105_1004,TTCGTGAA,,2,I__Adameyko_24_06,50-8-24-49 +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,2,Control,0-0 +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,2,Control,0-0 +PhiX_Adept,GCACATAGTC,GACTACTAGC,2,Control,0-0 +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,2,Control,0-0 +""") + + with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf: + # Add the CSV 
file to the zip file + zipf.write(csv_path, os.path.basename(csv_path)) + # Populate run dir with files and folders if run_finished: - open(f"{run_path}/RunManifest.csv", "w").close() - open(f"{run_path}/RunManifest.json", "w").close() - open(f"{run_path}/RunParameters.json", "w").close() - with open(f"{run_path}/RunUploaded.json", "w") as f: + with open(f"{run_path}/RunManifest.json", "w") as stream: + stream.write("""{ + "KitConfiguration": { + "MaxCycles": 334, + "DefaultR1Cycles": 151, + "DefaultR2Cycles": 151, + "DefaultI1Cycles": -1, + "DefaultI2Cycles": -1, + "MinimumR1Cycles": 5, + "MinimumR2Cycles": 0, + "MinimumI1Cycles": 0, + "MinimumI2Cycles": 0, + "DefaultI1FastQ": false, + "DefaultI2FastQ": false, + "DefaultUMIFastQ": false, + "DefaultI1Mask": "I1:Y*", + "DefaultI2Mask": "I2:Y*", + "DefaultUmiMask": "I1:N*", + "DefaultR1FastQMask": "R1:Y*N", + "DefaultR2FastQMask": "R2:Y*N", + "DefaultI1MaskRead": "I1", + "DefaultI2MaskRead": "I2", + "DefaultUmiMaskRead": "I1", + "DefaultR1FastQMaskRead": "R1", + "DefaultR2FastQMaskRead": "R2", + "DefaultR1Adapter": "", + "DefaultR2Adapter": "", + "DefaultR1AdapterTrim": false, + "DefaultR2AdapterTrim": false, + "DefaultR1AdapterNMask": false, + "DefaultR2AdapterNMask": false, + "DefaultR1AdapterMinimumTrimmedLength": 16, + "DefaultR2AdapterMinimumTrimmedLength": 16, + "DefaultR1AdapterMinimumStringency": 0.9, + "DefaultR2AdapterMinimumStringency": 0.9, + "DefaultR1AdapterMinimumOverlap": 3, + "DefaultR2AdapterMinimumOverlap": 3, + "DefaultAdapterTrimType": "Paired-End" + }, + "RunParameters": { + "PreparationWorkflow": "Adept", + "KitConfiguration": "300Cycles", + "ChemistryVersion": "Cloudbreak", + "LowDiversity": false, + "I1Cycles": 8, + "I2Cycles": 24, + "R1Cycles": 50, + "R2Cycles": 49 + }, + "RunValues": { + "lims_step_id": "24-1061390", + "lims_step_name": "Load to Flowcell (AVITI) v1.0", + "manifest_file": "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson_trimmed.csv" + }, + "Settings": [ + { + "Lane": 1, + "I1MismatchThreshold": 1, + "I2MismatchThreshold": 1, + "R1Adapter": [], + "R2Adapter": [], + "I1MaskManifest": "I1:N*", + "I1Mask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "I1FastQ": false, + "I2MaskManifest": "I2:N*", + "I2Mask": [ + { + "Read": "I2", + "Cycles": [] + } + ], + "I2FastQ": false, + "UmiMaskManifest": "I1:N*", + "UmiMask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "UmiFastQ": false, + "R1FastQMaskManifest": "R1:Y*N", + "R1FastQMask": [ + { + "Read": "R1", + "Cycles": "MOCK" + } + ], + "R2FastQMaskManifest": "R2:Y*N", + "R2FastQMask": [ + { + "Read": "R2", + "Cycles": "MOCK" + } + ], + "SpikeInAsUnassigned": true, + "R1AdapterTrim": false, + "R2AdapterTrim": false, + "R1AdapterNMask": false, + "R2AdapterNMask": false, + "R1AdapterMinimumTrimmedLength": 16, + "R2AdapterMinimumTrimmedLength": 16, + "R1AdapterMinimumStringency": 0.9, + "R2AdapterMinimumStringency": 0.9, + "R1AdapterMinimumOverlap": 3, + "R2AdapterMinimumOverlap": 3, + "AdapterTrimType": "Paired-End" + }, + { + "Lane": 2, + "I1MismatchThreshold": 1, + "I2MismatchThreshold": 1, + "R1Adapter": [], + "R2Adapter": [], + "I1MaskManifest": "I1:N*", + "I1Mask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "I1FastQ": false, + "I2MaskManifest": "I2:N*", + "I2Mask": [ + { + "Read": "I2", + "Cycles": [] + } + ], + "I2FastQ": false, + "UmiMaskManifest": "I1:N*", + "UmiMask": [ + { + "Read": "I1", + "Cycles": [] + } + ], + "UmiFastQ": false, + "R1FastQMaskManifest": "R1:Y*N", + "R1FastQMask": [ + { + "Read": "R1", + 
"Cycles": "MOCK" + } + ], + "R2FastQMaskManifest": "R2:Y*N", + "R2FastQMask": [ + { + "Read": "R2", + "Cycles": "MOCK" + } + ], + "SpikeInAsUnassigned": true, + "R1AdapterTrim": false, + "R2AdapterTrim": false, + "R1AdapterNMask": false, + "R2AdapterNMask": false, + "R1AdapterMinimumTrimmedLength": 16, + "R2AdapterMinimumTrimmedLength": 16, + "R1AdapterMinimumStringency": 0.9, + "R2AdapterMinimumStringency": 0.9, + "R1AdapterMinimumOverlap": 3, + "R2AdapterMinimumOverlap": 3, + "AdapterTrimType": "Paired-End" + } + ], + "Samples": [ + { + "SampleName": "DefaultSample", + "SampleNumber": 1, + "ExternalId": "", + "Indexes": [ + { + "Lane": 1, + "Index1": "", + "Index2": "" + }, + { + "Lane": 2, + "Index1": "", + "Index2": "" + } + ], + "CustomMetadata": {}, + "Project": "DefaultProject" + } + ] +} +""") + with open(f"{run_path}/RunParameters.json", "w") as stream: + stream.write("""{ + "FileVersion": "5.0.0", + "RunName": "A2349523513", + "RecipeExecutionID": "rec.9590c80c95fc4eee8b3eb10c31251915", + "RunID": "seq_66f5837f1ae1a35f10a2e594", + "RunType": "Sequencing", + "RunDescription": "", + "Side": "SideA", + "FlowcellID": "2349523513", + "Date": "2024-09-26T16:34:55.978072698Z", + "InstrumentName": "AV242106", + "OperatorName": "christian.natanael@scilifelab.se ", + "RunFolderName": "20240926_AV242106_A2349523513", + "Tiles": "MOCK", + "Cycles": { + "R1": 50, + "R2": 49, + "I1": 8, + "I2": 24 + }, + "ReadOrder": "I1,I2,R1,R2", + "ThroughputSelection": "High", + "KitConfiguration": "300Cycles", + "PreparationWorkflow": "Adept", + "ChemistryVersion": "Cloudbreak", + "LowDiversity": false, + "PlatformVersion": "2.6.2", + "AnalysisLanes": "1+2", + "StorageConnectionID": "local:66866355d07c3234c01b67b1", + "PMGMask": "P1:Y4N*", + "Consumables": { + "Flowcell": { + "SerialNumber": "2349523513", + "PartNumber": "810-00002", + "LotNumber": "2405300233", + "Expiration": "2025-05-31T00:00:00Z", + "ExpirationStr": "20250531", + "BarcodeStr": "2349523513,810-00002,2405300233,20250531" + }, + "SequencingCartridge": { + "SerialNumber": "24062600390028", + "PartNumber": "820-00013", + "LotNumber": "2406260039", + "Expiration": "2025-05-22T00:00:00Z", + "ExpirationStr": "20250522", + "BarcodeStr": "24062600390028,820-00013,2406260039,20250522" + }, + "Buffer": { + "SerialNumber": "24062400390041", + "PartNumber": "820-00002", + "LotNumber": "2406240039", + "Expiration": "2026-06-25T00:00:00Z", + "ExpirationStr": "20260625", + "BarcodeStr": "24062400390041,820-00002,2406240039,20260625" + } + }, + "LibraryType": "Linear", + "RecipeValues": [ + { + "Name": "filterMask", + "Value": "R1:Y15N*-R2:Y15N*" + } + ], + "AdvancedSettings": { + "PolonyDensity": "HighDensity" + } +} +""") + with open(f"{run_path}/RunUploaded.json", "w") as stream: outcome = "OutcomeCompleted" if outcome_completed else "OutcomeFailed" - f.write(json.dumps({"outcome": outcome})) + stream.write( + "{" + + '"version":"1.0.0",' + + '"instrument":"AV242106",' + + '"instrumentId":"0000024023696901c5621014",' + + '"runType":"Sequencing",' + + '"recipeExecutionId":"rec.9590c80c95fc4eee8b3eb10c31251915",' + + '"runID":"seq_66f5837f1ae1a35f10a2e594",' + + f'"outcome":"{outcome}"' + + "}" + ) if sync_finished: open(f"{run_path}/.sync_finished", "w").close() From 76da6a5ff70e1b89d2d4392fb289e3e93e79c616 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 14:52:44 +0200 Subject: [PATCH 130/187] fix presumed method call bug --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index f0274e47..915fcfa7 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -62,7 +62,7 @@ def _process(run): run.start_demux(demux_manifest, sub_demux_dir) sub_demux_count += 1 run.status = "demultiplexing" - if run.status_changed: + if run.status_changed(): run.update_statusdb() return else: From 2fb014f936d5aa983b2d998fc00f9a615a1924a7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 14:54:05 +0200 Subject: [PATCH 131/187] reset kwargs, update lims manifest, fix bugs, remove csv after zipping it --- tests/element/test_Element_Runs.py | 103 +++++++++++++++-------------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index c7dffc18..63e8fe89 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -27,13 +27,13 @@ def create_element_run_dir( tmp: tempfile.TemporaryDirectory, run_name: str = "20240926_AV242106_A2349523513", lims_manifest: bool = True, - nosync: bool = False, run_finished: bool = True, - sync_finished: bool = True, - demux_dir: bool = True, - n_demux_subdirs: int = 1, - demux_done: bool = True, outcome_completed: bool = True, + sync_finished: bool = True, + demux_dir: bool = False, + n_demux_subdirs: int = 0, + demux_done: bool = False, + nosync: bool = False, ) -> str: """ Build a run dir for an Element run for test purposes. @@ -67,66 +67,71 @@ def create_element_run_dir( manifest_root_name = "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson" manifest_pdir = f"{tmp.name}/ngi-nas-ns/samplesheets/Aviti/2024" + os.mkdir(manifest_pdir) + csv_path = f"{manifest_pdir}/{manifest_root_name}_untrimmed.csv" zip_path = f"{manifest_pdir}/{manifest_root_name}.zip" with open(csv_path, "w") as stream: + # This run manifest was generated after the sequencing run, + # and is different from what it's file name implies. 
stream.write("""[RUNVALUES] KeyName, Value -lims_step_name, "Load to Flowcell (AVITI) v1.0" -lims_step_id, "24-1061390" -manifest_file, "AVITI_run_manifest_2349523513_24-1061390_240926_171138_ChristianNatanaelsson_untrimmed.csv" +lims_step_name, Load to Flowcell (AVITI) v1.0 +lims_step_id, 24-1061411 +manifest_file, AVITI_run_manifest_2349523513_24-1061411_241011_142515_AlfredKedhammar_untrimmed.csv [SETTINGS] SettingName, Value [SAMPLES] -SampleName,Index1,Index2,Lane,Project,Recipe -P32105_1001,AAAGCATA,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1001,CTGCAGCC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1001,GCCTTTAT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1001,TGTAGCGG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,ATTGGACG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,CAGCTTAC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,GGCAAGGA,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1002,TCATCCTT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,ACGTTACA,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,CGTAGGTT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,GACGACGG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1003,TTACCTAC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,ACTTCACT,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,CGAAGTTG,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,GAGCACGC,,1,I__Adameyko_24_06,50-8-24-49 -P32105_1004,TTCGTGAA,,1,I__Adameyko_24_06,50-8-24-49 -PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,1,Control,0-0 -PhiX_Adept,CACAGATCGT,ACGAGAGTCT,1,Control,0-0 -PhiX_Adept,GCACATAGTC,GACTACTAGC,1,Control,0-0 -PhiX_Adept,TGTGTCGACA,TGTCTGACAG,1,Control,0-0 -P32105_1001,AAAGCATA,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1001,CTGCAGCC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1001,GCCTTTAT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1001,TGTAGCGG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,ATTGGACG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,CAGCTTAC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,GGCAAGGA,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1002,TCATCCTT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,ACGTTACA,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,CGTAGGTT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,GACGACGG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1003,TTACCTAC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,ACTTCACT,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,CGAAGTTG,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,GAGCACGC,,2,I__Adameyko_24_06,50-8-24-49 -P32105_1004,TTCGTGAA,,2,I__Adameyko_24_06,50-8-24-49 -PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,2,Control,0-0 -PhiX_Adept,CACAGATCGT,ACGAGAGTCT,2,Control,0-0 -PhiX_Adept,GCACATAGTC,GACTACTAGC,2,Control,0-0 -PhiX_Adept,TGTGTCGACA,TGTCTGACAG,2,Control,0-0 +SampleName,Index1,Index2,Lane,Project,Recipe,lims_label,settings +P32105_1001,AAAGCATA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,CTGCAGCC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,GCCTTTAT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,TGTAGCGG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1002,ATTGGACG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,CAGCTTAC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,GGCAAGGA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,TCATCCTT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True 
+P32105_1003,ACGTTACA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,CGTAGGTT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,GACGACGG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,TTACCTAC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1004,ACTTCACT,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,CGAAGTTG,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,GAGCACGC,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,TTCGTGAA,NNNNNNNNNNNNNNNNNNNNNNNN,1,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,1,Control,0-0,, +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,1,Control,0-0,, +PhiX_Adept,GCACATAGTC,GACTACTAGC,1,Control,0-0,, +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,1,Control,0-0,, +P32105_1001,AAAGCATA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,CTGCAGCC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,GCCTTTAT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1001,TGTAGCGG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-A3,I1Fastq:True +P32105_1002,ATTGGACG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,CAGCTTAC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,GGCAAGGA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1002,TCATCCTT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-B3,I1Fastq:True +P32105_1003,ACGTTACA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,CGTAGGTT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,GACGACGG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1003,TTACCTAC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-C3,I1Fastq:True +P32105_1004,ACTTCACT,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,CGAAGTTG,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,GAGCACGC,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +P32105_1004,TTCGTGAA,NNNNNNNNNNNNNNNNNNNNNNNN,2,I__Adameyko_24_06,50-8-24-49,SI-NA-D3,I1Fastq:True +PhiX_Adept,ATGTCGCTAG,CTAGCTCGTA,2,Control,0-0,, +PhiX_Adept,CACAGATCGT,ACGAGAGTCT,2,Control,0-0,, +PhiX_Adept,GCACATAGTC,GACTACTAGC,2,Control,0-0,, +PhiX_Adept,TGTGTCGACA,TGTCTGACAG,2,Control,0-0,, """) with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf: # Add the CSV file to the zip file zipf.write(csv_path, os.path.basename(csv_path)) + os.remove(csv_path) # Populate run dir with files and folders if run_finished: From 120f9475f4bd3364e7e22a9b2948442283815d19 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 14:54:43 +0200 Subject: [PATCH 132/187] mock subprocess, use default kwargs --- tests/analysis/test_analysis_element.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 49067ca4..ff4d380b 100644 --- a/tests/analysis/test_analysis_element.py +++ 
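
The fixture's zip-then-remove step in isolation, with invented paths and a trimmed-down CSV body; only the zip is left behind, which is the form the manifest-lookup code expects to find.

import os
import zipfile

csv_path = "/tmp/manifest_untrimmed.csv"  # invented paths
zip_path = "/tmp/manifest.zip"

with open(csv_path, "w") as stream:
    stream.write("[SAMPLES]\nSampleName,Index1\nP0_1001,AAAGCATA\n")

with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
    # Use the basename as the archive member name so the zip has no parent dirs
    zipf.write(csv_path, os.path.basename(csv_path))

os.remove(csv_path)  # leave only the zip behind
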
b/tests/analysis/test_analysis_element.py
@@ -1,12 +1,9 @@
 from tempfile import TemporaryDirectory
 from unittest.mock import patch

-import pytest
-
 from tests.element.test_Element_Runs import create_element_run_dir, get_config


-@pytest.mark.skip(reason="Not implemented yet")
 def test_run_preprocessing(create_dirs):
     tmp: TemporaryDirectory = create_dirs

@@ -19,17 +16,14 @@ def test_run_preprocessing(create_dirs):
     mock_db = patch("taca.element.Element_Runs.ElementRunsConnection")
     mock_db.start()

+    # Mock subprocess
+    mock_subprocess = patch("subprocess.Popen")
+    mock_subprocess.start()
+
+    # Create run dir and associated LIMS manifest
+    run_dir = create_element_run_dir(tmp=tmp)
+
     # Import module to test
     from taca.analysis import analysis_element as to_test

-    run_dir = create_element_run_dir(
-        tmp=tmp,
-        nosync=False,
-        run_finished=False,
-        sync_finished=False,
-        demux_dir=False,
-        demux_done=False,
-        outcome_completed=False,
-    )
-
     to_test.run_preprocessing(run_dir)

From dd4917744e2dd4e719d5f6a38ee826eeda9d31b6 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Fri, 11 Oct 2024 15:01:11 +0200
Subject: [PATCH 133/187] add stop mocks

---
 tests/analysis/test_analysis_element.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index ff4d380b..41e7f305 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -26,4 +26,10 @@ def test_run_preprocessing(create_dirs):
     # Import module to test
     from taca.analysis import analysis_element as to_test

+    # Test
     to_test.run_preprocessing(run_dir)
+
+    # Stop mocks
+    mock_config.stop()
+    mock_db.stop()
+    mock_subprocess.stop()

From e27bc66353bd4b716157ed7e6d4f301d5499a579 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Fri, 11 Oct 2024 15:01:15 +0200
Subject: [PATCH 134/187] Fix issue with 0 lane number; Add percentage in total unassigned

---
 taca/element/Element_Runs.py | 49 ++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index bfb8bfc7..d7fb42c7 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -986,8 +986,11 @@ def aggregate_stats_assigned(self, demux_runmanifest):
             aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv
         )

+        yield aggregated_assigned_indexes_filtered_sorted
+
     # Aggregate stats in UnassignedSequences.csv
-    def aggregate_stats_unassigned(self, demux_runmanifest):
+    def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_indexes_filtered_sorted):
         aggregated_unassigned_indexes = []
         lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest)))
         for lane in lanes:
@@ -1105,12 +1108,37 @@ def aggregate_stats_unassigned(self, demux_runmanifest):
             aggregated_unassigned_indexes, key=lambda x: (x["Lane"], -int(x["Count"]))
         )
         # Fetch PFCount for each lane
+        # to calculate % of unassigned index in total lane PF polonies
         pfcount_lane = {}
         if os.path.exists(self.run_stats_file):
             with open(self.run_stats_file) as stats_json:
                 aviti_runstats_json = json.load(stats_json)
-            for lane_stats in aviti_runstats_json["LaneStats"]:
-                pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"])
+            # Check whether the lane numbers match between the run stat json and run manifests
+            if len(aviti_runstats_json["LaneStats"]) != len(lanes):
+                logger.warning(
+                    f"Inconsistent lane numbers between the {os.path.basename(self.run_stats_file)} file and
run manifests!" + ) + else: + # When there is no RunManifest uploaded at the sequencer, the lane numbers will all be 0 + # In this case we assume that the lanes are ordered by their numbers + if all(lane_stats["Lane"] == 0 for lane_stats in aviti_runstats_json["LaneStats"]): + lane_counter = 1 + for lane_stats in aviti_runstats_json["LaneStats"]: + pfcount_lane[str(lane_counter)] = float(lane_stats["PFCount"]) + lane_counter += 1 + # Otherwise we parse the PF counts by matching the lane numbers + else: + for lane_stats in aviti_runstats_json["LaneStats"]: + pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"]) + # Prepare the dict for pf assigned coutn for each lane + pf_assigned_lane = {} + for sample in aggregated_assigned_indexes_filtered_sorted: + lane = sample['Lane'] + num_polonies_assigned = int(sample['NumPoloniesAssigned']) + if lane in pf_assigned_lane: + pf_assigned_lane[lane] += num_polonies_assigned + else: + pf_assigned_lane[lane] = num_polonies_assigned # Modify the % Polonies values based on PFCount for each lane for unassigned_index in aggregated_unassigned_indexes: if pfcount_lane.get(unassigned_index["Lane"]): @@ -1119,6 +1147,17 @@ def aggregate_stats_unassigned(self, demux_runmanifest): / pfcount_lane[unassigned_index["Lane"]] * 100 ) + # Calculate the % Polonies values in the total unassigned for each lane + if pf_assigned_lane.get(unassigned_index["Lane"]): + unassigned_index["% Unassigned"] = ( + float(unassigned_index["Count"]) + / (pfcount_lane[unassigned_index["Lane"]] - pf_assigned_lane[unassigned_index["Lane"]]) + * 100 + ) + else: + unassigned_index["% Unassigned"] = 0 + else: + unassigned_index["% Polonies"] = 0 else: logger.warning( f"No {os.path.basename(self.run_stats_file)} file found for the run." @@ -1143,9 +1182,9 @@ def aggregate_demux_results(self, demux_results_dirs): # Symlink the output FastQ files of undet only if a lane does not have multiple demux self.aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - self.aggregate_stats_assigned(demux_runmanifest) + aggregated_assigned_indexes_filtered_sorted = self.aggregate_stats_assigned(demux_runmanifest) # Aggregate stats in UnassignedSequences.csv - self.aggregate_stats_unassigned(demux_runmanifest) + self.aggregate_stats_unassigned(demux_runmanifest, aggregated_assigned_indexes_filtered_sorted) def sync_metadata(self): files_to_copy = [ From 447045f5fe9cbe718d036945ac0ff4f10c82287e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 11 Oct 2024 15:02:55 +0200 Subject: [PATCH 135/187] Fix bug' --- taca/element/Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index d7fb42c7..f74a30ee 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -986,7 +986,7 @@ def aggregate_stats_assigned(self, demux_runmanifest): aggregated_assigned_indexes_filtered_sorted, aggregated_assigned_indexes_csv ) - yield aggregated_assigned_indexes_filtered_sorted + return aggregated_assigned_indexes_filtered_sorted # Aggregate stats in UnassignedSequences.csv From cfcef6834152eac61aea719b00a9b326f4aeba3e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 11 Oct 2024 15:04:52 +0200 Subject: [PATCH 136/187] ruff format --- taca/element/Element_Runs.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index f74a30ee..be0fce49 100644 --- 
a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -988,9 +988,10 @@ def aggregate_stats_assigned(self, demux_runmanifest): return aggregated_assigned_indexes_filtered_sorted - # Aggregate stats in UnassignedSequences.csv - def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_indexes_filtered_sorted): + def aggregate_stats_unassigned( + self, demux_runmanifest, aggregated_assigned_indexes_filtered_sorted + ): aggregated_unassigned_indexes = [] lanes = sorted(list(set(sample["Lane"] for sample in demux_runmanifest))) for lane in lanes: @@ -1121,7 +1122,10 @@ def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_inde else: # When there is no RunManifest uploaded at the sequencer, the lane numbers will all be 0 # In this case we assume that the lanes are ordered by their numbers - if all(lane_stats["Lane"] == 0 for lane_stats in aviti_runstats_json["LaneStats"]): + if all( + lane_stats["Lane"] == 0 + for lane_stats in aviti_runstats_json["LaneStats"] + ): lane_counter = 1 for lane_stats in aviti_runstats_json["LaneStats"]: pfcount_lane[str(lane_counter)] = float(lane_stats["PFCount"]) @@ -1129,12 +1133,14 @@ def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_inde # Otherwise we parse the PF counts by matching the lane numbers else: for lane_stats in aviti_runstats_json["LaneStats"]: - pfcount_lane[str(lane_stats["Lane"])] = float(lane_stats["PFCount"]) + pfcount_lane[str(lane_stats["Lane"])] = float( + lane_stats["PFCount"] + ) # Prepare the dict for pf assigned coutn for each lane pf_assigned_lane = {} for sample in aggregated_assigned_indexes_filtered_sorted: - lane = sample['Lane'] - num_polonies_assigned = int(sample['NumPoloniesAssigned']) + lane = sample["Lane"] + num_polonies_assigned = int(sample["NumPoloniesAssigned"]) if lane in pf_assigned_lane: pf_assigned_lane[lane] += num_polonies_assigned else: @@ -1151,7 +1157,10 @@ def aggregate_stats_unassigned(self, demux_runmanifest, aggregated_assigned_inde if pf_assigned_lane.get(unassigned_index["Lane"]): unassigned_index["% Unassigned"] = ( float(unassigned_index["Count"]) - / (pfcount_lane[unassigned_index["Lane"]] - pf_assigned_lane[unassigned_index["Lane"]]) + / ( + pfcount_lane[unassigned_index["Lane"]] + - pf_assigned_lane[unassigned_index["Lane"]] + ) * 100 ) else: @@ -1182,9 +1191,13 @@ def aggregate_demux_results(self, demux_results_dirs): # Symlink the output FastQ files of undet only if a lane does not have multiple demux self.aggregate_undet_fastq(demux_runmanifest) # Aggregate stats in IndexAssignment.csv - aggregated_assigned_indexes_filtered_sorted = self.aggregate_stats_assigned(demux_runmanifest) + aggregated_assigned_indexes_filtered_sorted = self.aggregate_stats_assigned( + demux_runmanifest + ) # Aggregate stats in UnassignedSequences.csv - self.aggregate_stats_unassigned(demux_runmanifest, aggregated_assigned_indexes_filtered_sorted) + self.aggregate_stats_unassigned( + demux_runmanifest, aggregated_assigned_indexes_filtered_sorted + ) def sync_metadata(self): files_to_copy = [ From 9e8b375bce9facac8491d19307e3d44f0b973872 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 15:10:25 +0200 Subject: [PATCH 137/187] fix tests --- tests/element/test_Element_Runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 63e8fe89..eb4a714c 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -31,7 
+31,7 @@ def create_element_run_dir( outcome_completed: bool = True, sync_finished: bool = True, demux_dir: bool = False, - n_demux_subdirs: int = 0, + n_demux_subdirs: int = 2, demux_done: bool = False, nosync: bool = False, ) -> str: From bd6ac211acebe8ba7b9f0a93908d25e86cb7d1f8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 15:22:45 +0200 Subject: [PATCH 138/187] stop mocks --- tests/analysis/test_analysis_element.py | 4 +--- tests/analysis/test_analysis_nanopore.py | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 41e7f305..bc3bdd8b 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -30,6 +30,4 @@ def test_run_preprocessing(create_dirs): to_test.run_preprocessing(run_dir) # Stop mocks - mock_config.stop() - mock_db.stop() - mock_subprocess.stop() + patch.stopall() diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index 01fec070..bd4499ca 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -147,3 +147,6 @@ def side_effect(*args, **kwargs): # Start testing analysis_nanopore.ont_transfer(run_abspath=None, qc=False) + + # Stop mocks + patch.stopall() From 9b1a4953799e27f9fffc858cc17c90c8158ee933 Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Fri, 11 Oct 2024 15:49:12 +0200 Subject: [PATCH 139/187] Refactor based on comments from JoA; Fix VERSIONLOG --- VERSIONLOG.md | 4 ++++ taca/element/Element_Runs.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 80a77861..2b5fed5c 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 20241011.1 + +Fix issue with 0 lane number; Add percentage of unassigned in total unassigned per lane + ## 20241008.1 Add support for processing Element Aviti data diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index be0fce49..4473b9a8 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1136,7 +1136,7 @@ def aggregate_stats_unassigned( pfcount_lane[str(lane_stats["Lane"])] = float( lane_stats["PFCount"] ) - # Prepare the dict for pf assigned coutn for each lane + # Prepare the dict for pf assigned count for each lane pf_assigned_lane = {} for sample in aggregated_assigned_indexes_filtered_sorted: lane = sample["Lane"] From 81b91ea4922f88231a5119a8557a4807f18622b3 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 16:05:36 +0200 Subject: [PATCH 140/187] fix faulty annotation --- tests/analysis/test_analysis_nanopore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index bd4499ca..e06059cc 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -15,7 +15,7 @@ ) -def build_run_properties() -> dict: +def build_run_properties() -> list[dict]: """In order to parametrize the test in a comprehensive way, the parametrization is tabulated as a string here. 
""" From fb9218903f782738a8938d9c816f3390b460b83e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 16:13:21 +0200 Subject: [PATCH 141/187] parametrization --- tests/analysis/test_analysis_element.py | 33 +++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index bc3bdd8b..8f12bf10 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -1,10 +1,39 @@ +from io import StringIO from tempfile import TemporaryDirectory from unittest.mock import patch +import pandas as pd +import pytest + from tests.element.test_Element_Runs import create_element_run_dir, get_config -def test_run_preprocessing(create_dirs): +def get_run_kwargs() -> list[dict]: + parameter_string_table = """ +lims_manifest run_finished outcome_completed sync_finished demux_dir demux_done nosync +False False False False False False False +True False False False False False False +True True False False False False False +True True True False False False False +True True True True False False False +True True True True True False False +True True True True True True False +True True True True True True True +""" + # Turn string table to datastream + data = StringIO(parameter_string_table) + + # Read data, trimming whitespace + df = pd.read_csv(data, sep=r"\s+") + + # Compile into list of parameters to use + run_kwargs = df.to_dict(orient="records") + + return run_kwargs + + +@pytest.mark.parametrize("run_kwargs", get_run_kwargs()) +def test_run_preprocessing(create_dirs, run_kwargs): tmp: TemporaryDirectory = create_dirs # Mock config @@ -21,7 +50,7 @@ def test_run_preprocessing(create_dirs): mock_subprocess.start() # Create run dir and associated LIMS manifest - run_dir = create_element_run_dir(tmp=tmp) + run_dir = create_element_run_dir(tmp=tmp, **run_kwargs) # Import module to test from taca.analysis import analysis_element as to_test From 47e912d27ab9fe3b21a0b0d816608d8e03c62e9e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 16:53:40 +0200 Subject: [PATCH 142/187] test polishing --- tests/analysis/test_analysis_element.py | 35 ++++++++++++++----------- tests/element/test_Element_Runs.py | 24 +++++++++++------ 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 8f12bf10..4fb5acbe 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -8,31 +8,34 @@ from tests.element.test_Element_Runs import create_element_run_dir, get_config -def get_run_kwargs() -> list[dict]: - parameter_string_table = """ -lims_manifest run_finished outcome_completed sync_finished demux_dir demux_done nosync -False False False False False False False -True False False False False False False -True True False False False False False -True True True False False False False -True True True True False False False -True True True True True False False -True True True True True True False -True True True True True True True -""" +def parametrize_testruns() -> list[dict]: + """Helper function to build test parametrization from a friendly string table.""" + + testrun_descs = ["ready to demux"] + + kwarg_table = """ + lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync + True True True True False False False None False + """ + # Turn string table to datastream - 
data = StringIO(parameter_string_table) + data = StringIO(kwarg_table) # Read data, trimming whitespace df = pd.read_csv(data, sep=r"\s+") # Compile into list of parameters to use - run_kwargs = df.to_dict(orient="records") + testrun_kwargs = df.to_dict(orient="records") + + assert len(testrun_descs) == len(testrun_kwargs) + + return testrun_kwargs, testrun_descs + - return run_kwargs +testrun_kwargs, testrun_descs = parametrize_testruns() -@pytest.mark.parametrize("run_kwargs", get_run_kwargs()) +@pytest.mark.parametrize("run_kwargs", testrun_kwargs, ids=testrun_descs) def test_run_preprocessing(create_dirs, run_kwargs): tmp: TemporaryDirectory = create_dirs diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index eb4a714c..7ba86e9f 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -26,13 +26,15 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: def create_element_run_dir( tmp: tempfile.TemporaryDirectory, run_name: str = "20240926_AV242106_A2349523513", - lims_manifest: bool = True, - run_finished: bool = True, - outcome_completed: bool = True, - sync_finished: bool = True, + metadata_files: bool = False, + lims_manifest: bool = False, + run_finished: bool = False, + outcome_completed: bool = False, demux_dir: bool = False, n_demux_subdirs: int = 2, demux_done: bool = False, + rsync_ongoing: bool = False, + rsync_exit_status: int | None = None, nosync: bool = False, ) -> str: """ @@ -134,7 +136,7 @@ def create_element_run_dir( os.remove(csv_path) # Populate run dir with files and folders - if run_finished: + if metadata_files: with open(f"{run_path}/RunManifest.json", "w") as stream: stream.write("""{ "KitConfiguration": { @@ -397,6 +399,8 @@ def create_element_run_dir( } } """) + + if run_finished: with open(f"{run_path}/RunUploaded.json", "w") as stream: outcome = "OutcomeCompleted" if outcome_completed else "OutcomeFailed" stream.write( @@ -411,8 +415,12 @@ def create_element_run_dir( + "}" ) - if sync_finished: - open(f"{run_path}/.sync_finished", "w").close() + if rsync_ongoing: + open(f"{run_path}/.rsync_ongoing", "w").close() + + if rsync_exit_status is not None: + with open(f"{run_path}/.rsync_exit_status", "w") as stream: + stream.write(str(rsync_exit_status)) if demux_dir: os.mkdir(os.path.join(run_path, "Demultiplexing")) @@ -469,7 +477,7 @@ def test_check_sequencing_status( run = to_test.Run( create_element_run_dir( tmp, - run_finished=p["run_finished"], + metadata_files=p["run_finished"], outcome_completed=p["outcome_completed"], ), get_config(tmp), From b16423bcc6f985d1ab172422a1e4b523c9f9b40c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:02:03 +0200 Subject: [PATCH 143/187] test troubleshooting --- tests/element/test_Element_Runs.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7ba86e9f..f0e3e0c6 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -460,9 +460,24 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): @pytest.mark.parametrize( "p", [ - {"run_finished": True, "outcome_completed": True, "expected": True}, - {"run_finished": True, "outcome_completed": False, "expected": False}, - {"run_finished": False, "outcome_completed": False, "expected": False}, + { + "run_finished": True, + "metadata_files": True, + "outcome_completed": True, + "expected": True, + }, + { + "run_finished": 
True, + "metadata_files": True, + "outcome_completed": False, + "expected": False, + }, + { + "run_finished": False, + "metadata_files ": False, + "outcome_completed": False, + "expected": False, + }, ], ids=["success", "failure", "ongoing"], ) From 81dcc40ae2ef4ea03f92941d1484c3bc4021f72b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:11:22 +0200 Subject: [PATCH 144/187] all tests functional --- tests/element/test_Element_Runs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index f0e3e0c6..2accd282 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -474,7 +474,7 @@ def test_init(self, mock_db: mock.Mock, create_dirs: pytest.fixture): }, { "run_finished": False, - "metadata_files ": False, + "metadata_files": False, "outcome_completed": False, "expected": False, }, @@ -489,15 +489,15 @@ def test_check_sequencing_status( ): tmp: tempfile.TemporaryDirectory = create_dirs + expected_outcome = p.pop("expected") run = to_test.Run( create_element_run_dir( tmp, - metadata_files=p["run_finished"], - outcome_completed=p["outcome_completed"], + **p, ), get_config(tmp), ) - assert run.check_sequencing_status() is p["expected"] + assert run.check_sequencing_status() is expected_outcome @pytest.mark.parametrize( "p", From b0593da06117e85033f7df794d229793f65e5a9f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:15:47 +0200 Subject: [PATCH 145/187] add some parametrization --- tests/analysis/test_analysis_element.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 4fb5acbe..1657667f 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -11,11 +11,12 @@ def parametrize_testruns() -> list[dict]: """Helper function to build test parametrization from a friendly string table.""" - testrun_descs = ["ready to demux"] + testrun_descs = ["ready to demux", "demux_ongoing"] kwarg_table = """ lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync True True True True False False False None False + True True True True True False False None False """ # Turn string table to datastream From b290bb53c8904655398046857161c3370322cd94 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:23:24 +0200 Subject: [PATCH 146/187] mypy --- tests/analysis/test_analysis_element.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 1657667f..96a3817e 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -8,10 +8,10 @@ from tests.element.test_Element_Runs import create_element_run_dir, get_config -def parametrize_testruns() -> list[dict]: +def parametrize_testruns() -> tuple[list[dict], list[str]]: """Helper function to build test parametrization from a friendly string table.""" - testrun_descs = ["ready to demux", "demux_ongoing"] + testrun_descs: list[str] = ["ready to demux", "demux_ongoing"] kwarg_table = """ lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync @@ -26,7 +26,7 @@ def parametrize_testruns() -> list[dict]: df = pd.read_csv(data, sep=r"\s+") # Compile into list of parameters to use - 
testrun_kwargs = df.to_dict(orient="records") + testrun_kwargs: list[dict] = df.to_dict(orient="records") assert len(testrun_descs) == len(testrun_kwargs) From 35248bd33d5332f3d94f369ed7b414e1e77d77e1 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 11 Oct 2024 17:24:04 +0200 Subject: [PATCH 147/187] remove empty, skipped tests --- tests/element/test_Element_Runs.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 2accd282..f558c194 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -524,10 +524,6 @@ def test_get_demultiplexing_status( assert run.get_demultiplexing_status() == p["expected"] - @pytest.mark.skip(reason="Not implemented yet") - def test_generate_demux_command(self, mock_db): - pass - def test_start_demux(self, mock_db, create_dirs): tmp: tempfile.TemporaryDirectory = create_dirs with mock.patch("subprocess.Popen") as mock_Popen, mock.patch( @@ -538,11 +534,3 @@ def test_start_demux(self, mock_db, create_dirs): run.start_demux("mock_run_manifest", "mock_demux_dir") mock_command.assert_called_once_with("mock_run_manifest", "mock_demux_dir") mock_Popen.assert_called_once() - - @pytest.mark.skip(reason="Not implemented yet") - def test_is_transferred(self, mock_db, create_dirs): - pass - - @pytest.mark.skip(reason="Not implemented yet") - def test_parse_rundir(self, mock_db, create_dirs): - pass From bb888bcd7f206911b0b299d01510f1e4e6872cb1 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 14 Oct 2024 11:46:39 +0200 Subject: [PATCH 148/187] Reconfigure pytest arguments w / wo CI --- .github/workflows/test-code.yml | 4 ++-- pyproject.toml | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index 1ff360e3..eec2d5fb 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -21,8 +21,8 @@ jobs: - name: Install TACA run: pip install -e . 
- name: pytest - # Options are configured in pyproject.toml - run: pytest --cov=genologics --cov-report=xml + # Default options are configured in pyproject.toml + run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv - name: CodeCov uses: codecov/codecov-action@v4 with: diff --git a/pyproject.toml b/pyproject.toml index d5d152b2..15c6c907 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,10 +38,6 @@ filterwarnings = [ # Default addopts addopts = "--ignore tests_old/" -# CLI coverage reports, messes with IDE debugging -# pytest --ignore tests_old/ --cov=./taca --cov-report term-missing -vv - - [tool.coverage.run] # The comment "# pragma: no cover" can be used to exclude a line from coverage source = ["taca"] From fd54a125cbab15caf397a1e97b40954399fa78a1 Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Tue, 15 Oct 2024 14:00:42 +0200 Subject: [PATCH 149/187] Small fixes --- taca/element/Element_Runs.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 4473b9a8..02ca8810 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -270,9 +270,9 @@ def to_doc_obj(self): demux_command_file = os.path.join(self.run_dir, ".bases2fastq_command") if os.path.exists(demux_command_file): with open(demux_command_file) as command_file: - demux_command = command_file.readlines()[0] + demux_commands = command_file.readlines() else: - demux_command = None + demux_commands = None demux_version_file = os.path.join( self.run_dir, "Demultiplexing_0", "RunStats.json" ) @@ -286,7 +286,7 @@ def to_doc_obj(self): software_info = { "Version": demux_version, "bin": self.CONFIG.get("element_analysis").get("bases2fastq"), - "options": demux_command, + "options": demux_commands, } doc_obj = { @@ -580,9 +580,10 @@ def make_demux_manifests( raise AssertionError("Both I1 and I2 appear to contain UMIs.") # Unpack settings from LIMS manifest - for kv in settings.split(" "): - k, v = kv.split(":") - settings_kvs[k] = v + if settings: + for kv in settings.split(" "): + k, v = kv.split(":") + settings_kvs[k] = v settings_section = "\n".join( [ @@ -641,7 +642,7 @@ def generate_demux_command(self, run_manifest, demux_dir): + " --force-index-orientation" ) with open( - os.path.join(self.run_dir, ".bases2fastq_command"), "w" + os.path.join(self.run_dir, ".bases2fastq_command"), "a" ) as command_file: command_file.write(command) return command From 5f46a2fdb86119372097d403af58cc1cab822f6f Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Wed, 16 Oct 2024 10:01:54 +0200 Subject: [PATCH 150/187] Fix bug with empty aggregated_unassigned_indexes --- taca/element/Element_Runs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 02ca8810..afeb2777 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1174,10 +1174,11 @@ def aggregate_stats_unassigned( ) # Write to a new UnassignedSequences.csv file under demux_dir - aggregated_unassigned_csv = os.path.join( - self.run_dir, self.demux_dir, "UnassignedSequences.csv" - ) - self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) + if aggregated_unassigned_indexes: + aggregated_unassigned_csv = os.path.join( + self.run_dir, self.demux_dir, "UnassignedSequences.csv" + ) + self.write_to_csv(aggregated_unassigned_indexes, aggregated_unassigned_csv) # Aggregate demux results def aggregate_demux_results(self, 
demux_results_dirs): From 340de287301c7a7b5e1dbe86cd7016cfdafba30e Mon Sep 17 00:00:00 2001 From: chuan-wang Date: Wed, 16 Oct 2024 10:36:54 +0200 Subject: [PATCH 151/187] Fix wrong logic for collecting unassigned indexes --- VERSIONLOG.md | 4 ++++ taca/element/Element_Runs.py | 30 ++++++++---------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 2b5fed5c..2e6d992b 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 20241016.1 + +Fix wrong logic for collecting unassigned indexes + ## 20241011.1 Fix issue with 0 lane number; Add percentage of unassigned in total unassigned per lane diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index afeb2777..fb2ce2b2 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -1040,25 +1040,11 @@ def aggregate_stats_unassigned( # Order: from longer to shorter indexes sub_demux_with_shorter_index_lens = sub_demux_list[1:] for sub_demux in sub_demux_with_shorter_index_lens: - unassigned_csv = os.path.join( - self.run_dir, - f"Demultiplexing_{sub_demux}", - "UnassignedSequences.csv", - ) - if os.path.exists(unassigned_csv): - with open(unassigned_csv) as unassigned_file: - reader = csv.DictReader(unassigned_file) - unassigned_indexes = [row for row in reader] - else: - logger.warning( - f"No {os.path.basename(unassigned_csv)} file found for sub-demultiplexing {sub_demux}." - ) - continue - # Filter by lane - unassigned_indexes = [ - unassigned_index - for unassigned_index in unassigned_indexes - if unassigned_index["Lane"] == lane + sub_demux_assigned_indexes = [ + sub_demux_assigned_index + for sub_demux_assigned_index in aggregated_assigned_indexes_filtered_sorted + if sub_demux_assigned_index["sub_demux_count"] == sub_demux + and sub_demux_assigned_index["Lane"] == lane ] # Remove overlapped indexes from the list of max_unassigned_indexes idx1_overlapped_len = min( @@ -1085,11 +1071,11 @@ def aggregate_stats_unassigned( if demux_lens_pair[0] == sub_demux_with_max_index_lens ][0][1], ) - for unassigned_index in unassigned_indexes: - idx1_overlapped_seq = unassigned_index["I1"][ + for sub_demux_assigned_index in sub_demux_assigned_indexes: + idx1_overlapped_seq = sub_demux_assigned_index["I1"][ :idx1_overlapped_len ] - idx2_overlapped_seq = unassigned_index["I2"][ + idx2_overlapped_seq = sub_demux_assigned_index["I2"][ :idx2_overlapped_len ] # Remove the overlapped record from the max_unassigned_indexes list From 8134461c45ba0a86756466eebad8ad7da9c7bc1e Mon Sep 17 00:00:00 2001 From: Sara Sjunnebo Date: Wed, 16 Oct 2024 13:22:30 +0200 Subject: [PATCH 152/187] Warn about missing files --- taca/element/Element_Runs.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index fb2ce2b2..28dc81da 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -650,7 +650,7 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) - stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" + stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" #TODO: individual files for each sub-demux try: with open(stderr_abspath, "w") as stderr: process = subprocess.Popen( @@ -1198,8 +1198,11 @@ def sync_metadata(self): dest = os.path.join(metadata_archive, self.NGI_run_id) if not 
os.path.exists(dest): os.makedirs(dest) - for f in files_to_copy: - shutil.copy(f, dest) + for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case + if os.path.exists(f): + shutil.copy(f, dest) + else: + logger.warning(f"File {f} missing for run {self.run}") def make_transfer_indicator(self): transfer_indicator = os.path.join(self.run_dir, ".rsync_ongoing") From ad2d01515b731b4b798559ea1bf9d74c4f608ac3 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 15:20:21 +0200 Subject: [PATCH 153/187] modernize ont test parametrization --- tests/analysis/test_analysis_nanopore.py | 80 ++++++++++-------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index e06059cc..142d1a38 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -15,70 +15,58 @@ ) -def build_run_properties() -> list[dict]: +def parametrize_testruns() -> list[dict]: """In order to parametrize the test in a comprehensive way, the parametrization is tabulated as a string here. """ - col_names = [ - "instrument", - "qc", - "run_finished", - "sync_finished", - "raw_dirs", - "fastq_dirs", - "barcode_dirs", - "anglerfish_samplesheets", - "anglerfish_ongoing", - "anglerfish_exit", - ] - parameter_string_table = """ - promethion False False False False False False False False NA - promethion False True False False False False False False NA - promethion False True True False False False False False NA - promethion False True True True False False False False NA - promethion False True True True True False False False NA - promethion False True True True True True False False NA - minion False False False False False False False False NA - minion False True False False False False False False NA - minion False True True False False False False False NA - minion False True True True False False False False NA - minion False True True True True False False False NA - minion False True True True True True False False NA - minion True False False False False False False False NA - minion True True False False False False False False NA - minion True True True False False False False False NA - minion True True True True False False False False NA - minion True True True True True False False False NA - minion True True True True True True False False NA - minion True True True True True True True False NA - minion True True True True True True True True NA - minion True True True True True True True False 0 + desc instrument qc run_finished sync_finished raw_dirs fastq_dirs barcode_dirs anglerfish_samplesheets anglerfish_ongoing anglerfish_exit + prom_ongoing promethion False False False False False False False False NA + prom_done promethion False True False False False False False False NA + prom_synced promethion False True True False False False False False NA + prom_reads promethion False True True True False False False False NA + prom_fastq promethion False True True True True False False False NA + prom_bcs promethion False True True True True True False False NA + min_ongoing minion False False False False False False False False NA + min_done minion False True False False False False False False NA + min_synced minion False True True False False False False False NA + min_reads minion False True True True False False False False NA + min_fastq minion False True True True True False False False NA + min_bcs minion False True True True True True False False 
NA + min_qc_ongoing minion True False False False False False False False NA + min_qc_done minion True True False False False False False False NA + min_qc_synced minion True True True False False False False False NA + min_qc_reads minion True True True True False False False False NA + min_qc_fastq minion True True True True True False False False NA + min_qc_bcs minion True True True True True True False False NA + min_qc_ang_ss minion True True True True True True True False NA + min_qc_ang_run minion True True True True True True True True NA + min_qc_ang_done minion True True True True True True True False 0 """ + # Turn string table to datastream data = StringIO(parameter_string_table) # Read data, trimming whitespace - df = pd.read_csv(data, header=None, sep=r"\s+") - assert len(df.columns) == len(col_names) - df.columns = col_names + df = pd.read_csv(data, sep=r"\s+") # Replace nan(s) with None(s) df = df.replace(np.nan, None) - # Convert to dict - run_properties = df.to_dict("records") + # Drop the "desc" column and retain it as a list + testrun_descs = df.pop("desc").tolist() + + # Compile into list of parameters to use + testrun_kwargs: list[dict] = df.to_dict(orient="records") + + return testrun_kwargs, testrun_descs - # Convert float exit codes to ints - for d in run_properties: - if d["anglerfish_exit"] == 0.0: - d["anglerfish_exit"] = int(d["anglerfish_exit"]) - return run_properties +testrun_kwargs, testrun_descs = parametrize_testruns() -@pytest.mark.parametrize("run_properties", build_run_properties()) +@pytest.mark.parametrize("run_properties", testrun_kwargs, ids=testrun_descs) def test_ont_transfer(create_dirs, run_properties, caplog): """Test the "taca analaysis ont-transfer" subcommand automation from start to finish for a variety of runs. 
From 554d38a2ef650fac8d97f921961df8b214430868 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 15:20:47 +0200 Subject: [PATCH 154/187] remove rsync options causing vscode pytest crash due to outdated rsync version --- tests/nanopore/test_ONT_run_classes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/nanopore/test_ONT_run_classes.py b/tests/nanopore/test_ONT_run_classes.py index a91a2f97..99cd558f 100644 --- a/tests/nanopore/test_ONT_run_classes.py +++ b/tests/nanopore/test_ONT_run_classes.py @@ -52,8 +52,6 @@ def make_ONT_test_config(tmp: tempfile.TemporaryDirectory) -> dict: minknow_reports_dir: {tmp.name}/minknow_reports/ rsync_options: '-Lav': None - '--chown': ':ngi2016003' - '--chmod': 'Dg+s,g+rw' '-r': None '--exclude': ['work']""" From 4b2f8d8d5a1d268f5ef7804a809002661e7ea842 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:33:46 +0200 Subject: [PATCH 155/187] add mock mail to config and overwrite functionality to element dir --- tests/element/test_Element_Runs.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index f558c194..7a54bd56 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -18,6 +18,9 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: }, }, }, + "mail": { + "recipients": ["mock@mock.com"], + }, "statusdb": {}, } return config @@ -25,6 +28,7 @@ def get_config(tmp: tempfile.TemporaryDirectory) -> dict: def create_element_run_dir( tmp: tempfile.TemporaryDirectory, + overwrite: bool = False, run_name: str = "20240926_AV242106_A2349523513", metadata_files: bool = False, lims_manifest: bool = False, @@ -62,6 +66,11 @@ def create_element_run_dir( run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/nosync/{run_name}" else: run_path = f"{tmp.name}/ngi_data/sequencing/AV242106/{run_name}" + if os.path.exists(run_path): + if overwrite: + os.rmdir(run_path) + else: + raise FileExistsError(f"Directory {run_path} already exists.") os.mkdir(run_path) # Create LIMS manifest From 70d51bcb57cf6b972341ab423c438563da21fe40 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:35:43 +0200 Subject: [PATCH 156/187] Add conftest fixture for logging when testing. Send to stdout and specified log file in tempdir. 
--- tests/conftest.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index a4945938..8b53b4e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import logging import os import shutil import tempfile @@ -115,3 +116,46 @@ def create_dirs(): yield tmp tmp.cleanup() + + +@pytest.fixture(autouse=True) +def configure_logging(create_dirs): + """Configure logging for the entire test session.""" + + # Use fixture + tmp = create_dirs + + # Specify log file path + log_file = os.path.join(tmp.name, "log", "taca.log") + assert os.path.exists(log_file) + + # Get the root logger + logger = logging.getLogger() + + # Clear any existing handlers to avoid duplicate logs + if logger.hasHandlers(): + logger.handlers.clear() + + # Configure logging + file_handler = logging.FileHandler(log_file) + stream_handler = logging.StreamHandler() + + # Set a common formatter + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + file_handler.setFormatter(formatter) + stream_handler.setFormatter(formatter) + + # Add handlers to the root logger + logger.addHandler(file_handler) + logger.addHandler(stream_handler) + + # Set log level + logger.setLevel(logging.INFO) + + # Log to confirm the logger is working + logger.info(f"Logging is set up. Logs will be stored in {log_file}.") + + # Return the log file path to use in tests if needed + return log_file From 825182e77ecac7dfb777b3b1a57d9d981936c601 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:35:56 +0200 Subject: [PATCH 157/187] start work on incremental test function --- tests/analysis/test_analysis_element.py | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 96a3817e..760ab1be 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -1,3 +1,4 @@ +import logging from io import StringIO from tempfile import TemporaryDirectory from unittest.mock import patch @@ -64,3 +65,53 @@ def test_run_preprocessing(create_dirs, run_kwargs): # Stop mocks patch.stopall() + + +def test_incremental(create_dirs, caplog): + # Create tempdir + tmp: TemporaryDirectory = create_dirs + + # Capture log + caplog.at_level(logging.INFO) + + # Mock config + config = get_config(tmp) + mock_config = patch("taca.utils.config.CONFIG", new=config) + mock_config.start() + + # Mock DB + mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") + mock_db.start() + + # Mock send mail + mock_mail = patch("taca.analysis.analysis_element.send_mail").start() + + # Mock subprocess + mock_subprocess = patch("subprocess.Popen") + mock_subprocess.start() + + # Import module to test + from taca.analysis import analysis_element as to_test + + # Test: Empty dir, should raise error and send mail + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=False, + metadata_files=False, + run_finished=False, + outcome_completed=False, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + with pytest.raises(FileNotFoundError): + to_test.run_preprocessing(run_dir) + + mock_mail.assert_called_once() + assert "Run parameters file not found" in caplog.text + + # Stop mocks + patch.stopall() From f69beea89bc76ee9b444575b9b08e0e45d69dc06 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 22 Oct 2024 17:58:46 +0200 Subject: [PATCH 
158/187] add package for checking dir hashes, add 2nd test increment --- requirements-dev.txt | 16 +++++++------- tests/analysis/test_analysis_element.py | 28 ++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 0ef1b795..8126d039 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,13 +1,13 @@ -r requirements.txt - -nose +dirhash +ipdb +ipython mock -sphinx -sphinx-rtd-theme +mypy +nose +pipreqs pytest pytest-cov -ipython -ipdb ruff -mypy -pipreqs +sphinx +sphinx-rtd-theme diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 760ab1be..4b924ef8 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -5,6 +5,7 @@ import pandas as pd import pytest +from dirhash import dirhash from tests.element.test_Element_Runs import create_element_run_dir, get_config @@ -93,7 +94,9 @@ def test_incremental(create_dirs, caplog): # Import module to test from taca.analysis import analysis_element as to_test - # Test: Empty dir, should raise error and send mail + ### Test: Empty dir, should raise error and send mail + + # Create dir run_dir = create_element_run_dir( tmp=tmp, lims_manifest=False, @@ -107,11 +110,34 @@ def test_incremental(create_dirs, caplog): nosync=False, ) + # Run code (1) with pytest.raises(FileNotFoundError): to_test.run_preprocessing(run_dir) + # Assertions mock_mail.assert_called_once() assert "Run parameters file not found" in caplog.text + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + overwrite=True, + lims_manifest=False, + metadata_files=True, + run_finished=False, + outcome_completed=False, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + # Run code (2) with snapshots + before = dirhash(run_dir, "md5") + to_test.run_preprocessing(run_dir) + after = dirhash(run_dir, "md5") + assert before == after + # Stop mocks patch.stopall() From f383665f4aa2ba0b90ef2e321bdd6830845abbf7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:09:30 +0200 Subject: [PATCH 159/187] suspected bug fix --- taca/analysis/analysis_element.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py index 915fcfa7..984c2a7c 100755 --- a/taca/analysis/analysis_element.py +++ b/taca/analysis/analysis_element.py @@ -39,7 +39,7 @@ def _process(run): sequencing_done = run.check_sequencing_status() if not sequencing_done: run.status = "sequencing" - if run.status_changed: + if run.status_changed(): run.update_statusdb() return From f659fa6f06f6f2264efad5c58e44e372482f1978 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:16:45 +0200 Subject: [PATCH 160/187] start working on nicer tests --- tests/analysis/test_analysis_element.py | 34 +++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 4b924ef8..b05109c8 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -68,7 +68,8 @@ def test_run_preprocessing(create_dirs, run_kwargs): patch.stopall() -def test_incremental(create_dirs, caplog): +@pytest.fixture +def aviti_fixture(create_dirs, caplog): # Create tempdir tmp: TemporaryDirectory = create_dirs @@ -77,12 +78,10 @@ def test_incremental(create_dirs, caplog): # Mock 
config config = get_config(tmp) - mock_config = patch("taca.utils.config.CONFIG", new=config) - mock_config.start() + mock_config = patch("taca.utils.config.CONFIG", new=config).start() # Mock DB - mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") - mock_db.start() + mock_db = patch("taca.element.Element_Runs.ElementRunsConnection").start() # Mock send mail mock_mail = patch("taca.analysis.analysis_element.send_mail").start() @@ -94,7 +93,16 @@ def test_incremental(create_dirs, caplog): # Import module to test from taca.analysis import analysis_element as to_test - ### Test: Empty dir, should raise error and send mail + # Yield fixtures + yield to_test, tmp, mock_mail, mock_db, caplog + + # Stop mocks + patch.stopall() + + +def test_process_empty_dir(aviti_fixture): + to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture + """Should raise FileNotFoundError when no files are present in the run dir and send mail.""" # Create dir run_dir = create_element_run_dir( @@ -110,7 +118,6 @@ def test_incremental(create_dirs, caplog): nosync=False, ) - # Run code (1) with pytest.raises(FileNotFoundError): to_test.run_preprocessing(run_dir) @@ -118,6 +125,10 @@ def test_incremental(create_dirs, caplog): mock_mail.assert_called_once() assert "Run parameters file not found" in caplog.text + +def test_process_dir_metadata(aviti_fixture): + to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture + # Add metadata files run_dir = create_element_run_dir( tmp=tmp, @@ -133,11 +144,8 @@ def test_incremental(create_dirs, caplog): nosync=False, ) - # Run code (2) with snapshots - before = dirhash(run_dir, "md5") to_test.run_preprocessing(run_dir) - after = dirhash(run_dir, "md5") - assert before == after - # Stop mocks - patch.stopall() + assert mock_db.upload_to_statusdb.called + + print(caplog.text) From 2ef5bae5f4a0f5c281e4b1029515dd9d1f3a24ff Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:17:13 +0200 Subject: [PATCH 161/187] syntax fix --- tests/analysis/test_analysis_element.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index b05109c8..18408143 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -87,8 +87,7 @@ def aviti_fixture(create_dirs, caplog): mock_mail = patch("taca.analysis.analysis_element.send_mail").start() # Mock subprocess - mock_subprocess = patch("subprocess.Popen") - mock_subprocess.start() + mock_subprocess = patch("subprocess.Popen").start() # Import module to test from taca.analysis import analysis_element as to_test From 6db637889e1cbe21c11a0dca70ad32164b321743 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 13:47:02 +0200 Subject: [PATCH 162/187] add db mock assertions! 
--- tests/analysis/test_analysis_element.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 18408143..2b3ca0fe 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -81,7 +81,10 @@ def aviti_fixture(create_dirs, caplog): mock_config = patch("taca.utils.config.CONFIG", new=config).start() # Mock DB - mock_db = patch("taca.element.Element_Runs.ElementRunsConnection").start() + mock_db = patch( + "taca.element.Element_Runs.ElementRunsConnection", autospec=True + ).start() + print("BOOYAH", mock_db) # Mock send mail mock_mail = patch("taca.analysis.analysis_element.send_mail").start() @@ -128,6 +131,10 @@ def test_process_empty_dir(aviti_fixture): def test_process_dir_metadata(aviti_fixture): to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture + # Sub-mock configuration + mock_db.return_value.check_db_run_status.return_value = "ongoing" + mock_db.return_value.upload_to_statusdb.return_value = None + # Add metadata files run_dir = create_element_run_dir( tmp=tmp, @@ -145,6 +152,6 @@ def test_process_dir_metadata(aviti_fixture): to_test.run_preprocessing(run_dir) - assert mock_db.upload_to_statusdb.called + assert mock_db.return_value.upload_to_statusdb.called print(caplog.text) From 4b649ebdfb0a3206e12b9324cacc6fcea6561aa3 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:04:43 +0200 Subject: [PATCH 163/187] use dict of mocks for flexibility --- tests/analysis/test_analysis_element.py | 45 +++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 2b3ca0fe..c7cc81e0 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -76,34 +76,28 @@ def aviti_fixture(create_dirs, caplog): # Capture log caplog.at_level(logging.INFO) - # Mock config - config = get_config(tmp) - mock_config = patch("taca.utils.config.CONFIG", new=config).start() - - # Mock DB - mock_db = patch( - "taca.element.Element_Runs.ElementRunsConnection", autospec=True - ).start() - print("BOOYAH", mock_db) - - # Mock send mail - mock_mail = patch("taca.analysis.analysis_element.send_mail").start() - - # Mock subprocess - mock_subprocess = patch("subprocess.Popen").start() + # Mocks + mocks = { + "mock_config": patch("taca.utils.config.CONFIG", new=get_config(tmp)).start(), + "mock_db": patch( + "taca.element.Element_Runs.ElementRunsConnection", autospec=True + ).start(), + "mock_mail": patch("taca.analysis.analysis_element.send_mail").start(), + "mock_subprocess": patch("subprocess.Popen").start(), + } # Import module to test from taca.analysis import analysis_element as to_test # Yield fixtures - yield to_test, tmp, mock_mail, mock_db, caplog + yield to_test, tmp, caplog, mocks # Stop mocks patch.stopall() -def test_process_empty_dir(aviti_fixture): - to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture +def test_process_on_empty_dir(aviti_fixture): + to_test, tmp, caplog, mocks = aviti_fixture """Should raise FileNotFoundError when no files are present in the run dir and send mail.""" # Create dir @@ -124,16 +118,17 @@ def test_process_empty_dir(aviti_fixture): to_test.run_preprocessing(run_dir) # Assertions - mock_mail.assert_called_once() + mocks["mock_mail"].assert_called_once() assert "Run parameters file not found" in caplog.text -def test_process_dir_metadata(aviti_fixture): - 
to_test, tmp, mock_mail, mock_db, caplog = aviti_fixture +def test_process_on_dir_w_metadata(aviti_fixture): + """Should update statusdb.""" + to_test, tmp, caplog, mocks = aviti_fixture # Sub-mock configuration - mock_db.return_value.check_db_run_status.return_value = "ongoing" - mock_db.return_value.upload_to_statusdb.return_value = None + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None # Add metadata files run_dir = create_element_run_dir( @@ -152,6 +147,4 @@ def test_process_dir_metadata(aviti_fixture): to_test.run_preprocessing(run_dir) - assert mock_db.return_value.upload_to_statusdb.called - - print(caplog.text) + assert mocks["mock_db"].return_value.upload_to_statusdb.called From ddfb36536c3ac667958b40eeefd01114336e5e4d Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:18:56 +0200 Subject: [PATCH 164/187] prep for merge --- tests/analysis/test_analysis_element.py | 55 ++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index c7cc81e0..939b132e 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -133,7 +133,6 @@ def test_process_on_dir_w_metadata(aviti_fixture): # Add metadata files run_dir = create_element_run_dir( tmp=tmp, - overwrite=True, lims_manifest=False, metadata_files=True, run_finished=False, @@ -148,3 +147,57 @@ def test_process_on_dir_w_metadata(aviti_fixture): to_test.run_preprocessing(run_dir) assert mocks["mock_db"].return_value.upload_to_statusdb.called + + +@pytest.skip("Not implemented") +def test_process_on_failed_run(aviti_fixture): + """""" + to_test, tmp, caplog, mocks = aviti_fixture + + # Sub-mock configuration + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None + + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=False, + metadata_files=True, + run_finished=True, + outcome_completed=False, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + to_test.run_preprocessing(run_dir) + + +def test_process_on_finished_run_wo_lims_manifest(aviti_fixture): + """Should fail to find LIMS run manifest and send mail.""" + to_test, tmp, caplog, mocks = aviti_fixture + + # Sub-mock configuration + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None + + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=False, + metadata_files=True, + run_finished=True, + outcome_completed=True, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + to_test.run_preprocessing(run_dir) + + assert "No manifest found for run" in caplog.text + mocks["mock_mail"].assert_called_once() From 3527588262e15ea58c1a25e8b8168bbab9156be8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:58:36 +0200 Subject: [PATCH 165/187] add mock bases2fastq executable --- tests/element/test_Element_Runs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/element/test_Element_Runs.py b/tests/element/test_Element_Runs.py index 7a54bd56..37a5c9a4 100644 --- a/tests/element/test_Element_Runs.py +++ b/tests/element/test_Element_Runs.py @@ -17,6 +17,7 @@ def 
get_config(tmp: tempfile.TemporaryDirectory) -> dict: "transfer_log": f"{tmp.name}/log/transfer_aviti.tsv", }, }, + "bases2fastq": "mock_bases2fastq_path", }, "mail": { "recipients": ["mock@mock.com"], From 8aef1c95b47eb5f32c03854fcddb5e95e3819b3c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 14:59:07 +0200 Subject: [PATCH 166/187] add demux test w assertions --- tests/analysis/test_analysis_element.py | 46 +++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 939b132e..7202c589 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -83,7 +83,7 @@ def aviti_fixture(create_dirs, caplog): "taca.element.Element_Runs.ElementRunsConnection", autospec=True ).start(), "mock_mail": patch("taca.analysis.analysis_element.send_mail").start(), - "mock_subprocess": patch("subprocess.Popen").start(), + "mock_popen": patch("subprocess.Popen").start(), } # Import module to test @@ -149,9 +149,8 @@ def test_process_on_dir_w_metadata(aviti_fixture): assert mocks["mock_db"].return_value.upload_to_statusdb.called -@pytest.skip("Not implemented") +@pytest.mark.skip("Currently a failed run is treated as an ongoing run.") def test_process_on_failed_run(aviti_fixture): - """""" to_test, tmp, caplog, mocks = aviti_fixture # Sub-mock configuration @@ -201,3 +200,44 @@ def test_process_on_finished_run_wo_lims_manifest(aviti_fixture): assert "No manifest found for run" in caplog.text mocks["mock_mail"].assert_called_once() + + +def test_process_on_finished_run(aviti_fixture): + """Should start demux.""" + to_test, tmp, caplog, mocks = aviti_fixture + + # Sub-mock configuration + mocks["mock_db"].return_value.check_db_run_status.return_value = "ongoing" + mocks["mock_db"].return_value.upload_to_statusdb.return_value = None + + # Add metadata files + run_dir = create_element_run_dir( + tmp=tmp, + lims_manifest=True, + metadata_files=True, + run_finished=True, + outcome_completed=True, + demux_dir=False, + demux_done=False, + rsync_ongoing=False, + rsync_exit_status=None, + nosync=False, + ) + + to_test.run_preprocessing(run_dir) + + expected_call = " ".join( + [ + "mock_bases2fastq_path", + f"{tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513", + f"{tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513/Demultiplexing_0", + "-p 8", + "--num-unassigned 500", + f"-r {tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513/20240926_AV242106_A2349523513_demux_0.csv", + "--legacy-fastq", + "--force-index-orientation", + ] + ) + assert mocks["mock_popen"].call_args.args[0] == expected_call + assert "Bases2Fastq conversion and demultiplexing started for run " in caplog.text + assert mocks["mock_db"].return_value.upload_to_statusdb.called From fa506dd317259ee973c3744a1e528e1557bf97bc Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 15:33:31 +0200 Subject: [PATCH 167/187] formatting --- taca/element/Element_Runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 28dc81da..eaa700fe 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -650,7 +650,7 @@ def generate_demux_command(self, run_manifest, demux_dir): def start_demux(self, run_manifest, demux_dir): with chdir(self.run_dir): cmd = self.generate_demux_command(run_manifest, demux_dir) - stderr_abspath = 
f"{self.run_dir}/bases2fastq_stderr.txt" #TODO: individual files for each sub-demux + stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" # TODO: individual files for each sub-demux try: with open(stderr_abspath, "w") as stderr: process = subprocess.Popen( @@ -1198,7 +1198,7 @@ def sync_metadata(self): dest = os.path.join(metadata_archive, self.NGI_run_id) if not os.path.exists(dest): os.makedirs(dest) - for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case + for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case if os.path.exists(f): shutil.copy(f, dest) else: From e208826aa69379a8967e99bbee302a51b65fa594 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 16:54:30 +0200 Subject: [PATCH 168/187] IT'S ALIVE --- tests/analysis/test_analysis_element.py | 66 ++----------------------- 1 file changed, 5 insertions(+), 61 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 7202c589..b5f442ad 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -1,74 +1,15 @@ import logging -from io import StringIO +import sys from tempfile import TemporaryDirectory from unittest.mock import patch -import pandas as pd import pytest from dirhash import dirhash from tests.element.test_Element_Runs import create_element_run_dir, get_config -def parametrize_testruns() -> tuple[list[dict], list[str]]: - """Helper function to build test parametrization from a friendly string table.""" - - testrun_descs: list[str] = ["ready to demux", "demux_ongoing"] - - kwarg_table = """ - lims_manifest metadata_files run_finished outcome_completed demux_dir demux_done rsync_ongoing rsync_exit_status nosync - True True True True False False False None False - True True True True True False False None False - """ - - # Turn string table to datastream - data = StringIO(kwarg_table) - - # Read data, trimming whitespace - df = pd.read_csv(data, sep=r"\s+") - - # Compile into list of parameters to use - testrun_kwargs: list[dict] = df.to_dict(orient="records") - - assert len(testrun_descs) == len(testrun_kwargs) - - return testrun_kwargs, testrun_descs - - -testrun_kwargs, testrun_descs = parametrize_testruns() - - -@pytest.mark.parametrize("run_kwargs", testrun_kwargs, ids=testrun_descs) -def test_run_preprocessing(create_dirs, run_kwargs): - tmp: TemporaryDirectory = create_dirs - - # Mock config - config = get_config(tmp) - mock_config = patch("taca.utils.config.CONFIG", new=config) - mock_config.start() - - # Mock DB - mock_db = patch("taca.element.Element_Runs.ElementRunsConnection") - mock_db.start() - - # Mock subprocess - mock_subprocess = patch("subprocess.Popen") - mock_subprocess.start() - - # Create run dir and associated LIMS manifest - run_dir = create_element_run_dir(tmp=tmp, **run_kwargs) - - # Import module to test - from taca.analysis import analysis_element as to_test - - # Test - to_test.run_preprocessing(run_dir) - - # Stop mocks - patch.stopall() - - -@pytest.fixture +@pytest.fixture() def aviti_fixture(create_dirs, caplog): # Create tempdir tmp: TemporaryDirectory = create_dirs @@ -95,6 +36,9 @@ def aviti_fixture(create_dirs, caplog): # Stop mocks patch.stopall() + # Purge module + del sys.modules["taca.analysis.analysis_element"] + def test_process_on_empty_dir(aviti_fixture): to_test, tmp, caplog, mocks = aviti_fixture From efffd89847106da8cadeb2d0597bb7524a0dc869 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 16:57:03 +0200 Subject: 
[PATCH 169/187] ruff fix --- tests/analysis/test_analysis_element.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index b5f442ad..905ca963 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -4,10 +4,11 @@ from unittest.mock import patch import pytest -from dirhash import dirhash from tests.element.test_Element_Runs import create_element_run_dir, get_config +# from dirhash import dirhash TODO this might be useful for validating dir tree snapshots + @pytest.fixture() def aviti_fixture(create_dirs, caplog): From 4807f747a0aafcdc7a7e5313f580f844a9d4c601 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 17:03:08 +0200 Subject: [PATCH 170/187] mypy fix --- pyproject.toml | 1 + tests/analysis/test_analysis_nanopore.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 15c6c907..0fc1fcb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ ignore = [ [tool.mypy] ignore_missing_imports = true follow_imports = 'skip' +exclude = "build" # === Testing ================================================================ diff --git a/tests/analysis/test_analysis_nanopore.py b/tests/analysis/test_analysis_nanopore.py index 142d1a38..30a7105a 100644 --- a/tests/analysis/test_analysis_nanopore.py +++ b/tests/analysis/test_analysis_nanopore.py @@ -15,7 +15,7 @@ ) -def parametrize_testruns() -> list[dict]: +def parametrize_testruns(): """In order to parametrize the test in a comprehensive way, the parametrization is tabulated as a string here. """ From 636dcddbd3effab8a5dce53893f56ce0891e2693 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 23 Oct 2024 17:06:39 +0200 Subject: [PATCH 171/187] try to placate GHA --- tests/analysis/test_analysis_element.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 905ca963..3cbdbfb5 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -38,7 +38,13 @@ def aviti_fixture(create_dirs, caplog): patch.stopall() # Purge module - del sys.modules["taca.analysis.analysis_element"] + try: + del sys.modules["taca.analysis.analysis_element"] + except KeyError: + try: + del sys.modules["to_test"] + except KeyError: + pass def test_process_on_empty_dir(aviti_fixture): From 9130d43e2fcb6c28fdb5d71a19303d299185644b Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 24 Oct 2024 12:44:19 +0200 Subject: [PATCH 172/187] try different way of extracting mock call args --- tests/analysis/test_analysis_element.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index 3cbdbfb5..0d4e328b 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -177,7 +177,7 @@ def test_process_on_finished_run(aviti_fixture): to_test.run_preprocessing(run_dir) - expected_call = " ".join( + expected_cmd = " ".join( [ "mock_bases2fastq_path", f"{tmp.name}/ngi_data/sequencing/AV242106/20240926_AV242106_A2349523513", @@ -189,6 +189,6 @@ def test_process_on_finished_run(aviti_fixture): "--force-index-orientation", ] ) - assert mocks["mock_popen"].call_args.args[0] == expected_call + assert mocks["mock_popen"].call_args_list[0].args[0] == expected_cmd assert "Bases2Fastq conversion 
and demultiplexing started for run " in caplog.text
     assert mocks["mock_db"].return_value.upload_to_statusdb.called

From a91d868efb0503bb0c9bb4360352b43b2120919d Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 12:55:28 +0200
Subject: [PATCH 173/187] potential fix

---
 tests/analysis/test_analysis_element.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 0d4e328b..1be0860f 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -189,6 +189,9 @@ def test_process_on_finished_run(aviti_fixture):
             "--force-index-orientation",
         ]
     )
-    assert mocks["mock_popen"].call_args_list[0].args[0] == expected_cmd
+    assert any(
+        expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list
+ ), f"Expected command '{expected_cmd}' not found in any Popen calls: {debug_msg}" assert "Bases2Fastq conversion and demultiplexing started for run " in caplog.text assert mocks["mock_db"].return_value.upload_to_statusdb.called From 63495ef10d765bf301df13db7383167d16fee6a7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 24 Oct 2024 13:08:08 +0200 Subject: [PATCH 176/187] try getting python ver up to date and decreasing debugging resolution --- .github/workflows/lint-code.yml | 8 ++++---- .github/workflows/test-code.yml | 2 +- tests/analysis/test_analysis_element.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index 59536920..bbb74445 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -11,7 +11,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -29,7 +29,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -46,7 +46,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip @@ -67,7 +67,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index eec2d5fb..7ac1ed0d 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -12,7 +12,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.10" + python-version: "3.11.5" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py index beb4fbc8..91398db2 100644 --- a/tests/analysis/test_analysis_element.py +++ b/tests/analysis/test_analysis_element.py @@ -190,7 +190,7 @@ def test_process_on_finished_run(aviti_fixture): ] ) - debug_msg = "\n".join([call.args[0] for call in mocks["mock_popen"].call_args_list]) + debug_msg = "\n".join([call for call in mocks["mock_popen"].call_args_list]) assert any( expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list From 0a4fd33ffaf75722457fd6dda9a2e4e575f5a741 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 24 Oct 2024 13:11:54 +0200 Subject: [PATCH 177/187] more debugging --- .github/workflows/test-code.yml | 2 +- tests/analysis/test_analysis_element.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index 7ac1ed0d..4248a17d 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -22,7 +22,7 @@ jobs: run: pip install -e . - name: pytest # Default options are configured in pyproject.toml - run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv + run: pytest -s . 
From 63495ef10d765bf301df13db7383167d16fee6a7 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:08:08 +0200
Subject: [PATCH 176/187] try getting python ver up to date and decreasing
 debugging resolution

---
 .github/workflows/lint-code.yml         | 8 ++++----
 .github/workflows/test-code.yml         | 2 +-
 tests/analysis/test_analysis_element.py | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml
index 59536920..bbb74445 100644
--- a/.github/workflows/lint-code.yml
+++ b/.github/workflows/lint-code.yml
@@ -11,7 +11,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -29,7 +29,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -46,7 +46,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
        with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -67,7 +67,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
 
diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index eec2d5fb..7ac1ed0d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.11.5"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index beb4fbc8..91398db2 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -190,7 +190,7 @@ def test_process_on_finished_run(aviti_fixture):
         ]
     )
 
-    debug_msg = "\n".join([call.args[0] for call in mocks["mock_popen"].call_args_list])
+    debug_msg = "\n".join([call for call in mocks["mock_popen"].call_args_list])
 
     assert any(
         expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list

From 0a4fd33ffaf75722457fd6dda9a2e4e575f5a741 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:11:54 +0200
Subject: [PATCH 177/187] more debugging

---
 .github/workflows/test-code.yml         | 2 +-
 tests/analysis/test_analysis_element.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index 7ac1ed0d..4248a17d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -22,7 +22,7 @@ jobs:
         run: pip install -e .
       - name: pytest
         # Default options are configured in pyproject.toml
-        run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv
+        run: pytest -s .
       - name: CodeCov
         uses: codecov/codecov-action@v4
         with:
diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 91398db2..506bd538 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -190,11 +190,12 @@ def test_process_on_finished_run(aviti_fixture):
         ]
     )
 
-    debug_msg = "\n".join([call for call in mocks["mock_popen"].call_args_list])
+    for call in mocks["mock_popen"].call_args_list:
+        print(call)
 
     assert any(
         expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list
-    ), f"Expected command '{expected_cmd}' not found in any Popen calls: {debug_msg}"
+    ), f"Expected command '{expected_cmd}' not found in any Popen calls."
 
     assert "Bases2Fastq conversion and demultiplexing started for run " in caplog.text
     assert mocks["mock_db"].return_value.upload_to_statusdb.called

From 60ed258b35eba1b8b53027189dce954f4fed6f89 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:16:07 +0200
Subject: [PATCH 178/187] last commit worked! Try re-instating cov pytest

---
 .github/workflows/test-code.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index 4248a17d..f0f9af7d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -22,7 +22,7 @@ jobs:
         run: pip install -e .
       - name: pytest
         # Default options are configured in pyproject.toml
-        run: pytest -s .
+        run: pytest -s --cov=./taca --cov-report=xml --cov-report term-missing -vv
       - name: CodeCov
         uses: codecov/codecov-action@v4
         with:

From e6c9043f8f7edef4a217fed5bcf26d9c1bc101a6 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:18:47 +0200
Subject: [PATCH 179/187] last commit worked, try not capturing pytest output

---
 .github/workflows/test-code.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
index f0f9af7d..7ac1ed0d 100644
--- a/.github/workflows/test-code.yml
+++ b/.github/workflows/test-code.yml
@@ -22,7 +22,7 @@ jobs:
         run: pip install -e .
       - name: pytest
         # Default options are configured in pyproject.toml
-        run: pytest -s --cov=./taca --cov-report=xml --cov-report term-missing -vv
+        run: pytest --cov=./taca --cov-report=xml --cov-report term-missing -vv
       - name: CodeCov
         uses: codecov/codecov-action@v4
         with:

From 221a57a29eba4a0a9521c4feda297c60262bcd0c Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:20:49 +0200
Subject: [PATCH 180/187] last commit worked, try removing debug statement

---
 tests/analysis/test_analysis_element.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 506bd538..0b783875 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -190,9 +190,6 @@ def test_process_on_finished_run(aviti_fixture):
         ]
     )
 
-    for call in mocks["mock_popen"].call_args_list:
-        print(call)
-
     assert any(
         expected_cmd in call.args[0] for call in mocks["mock_popen"].call_args_list
     ), f"Expected command '{expected_cmd}' not found in any Popen calls."
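
The workflow back-and-forth above hinges on pytest's output capturing:
captured stdout is only echoed in the report of a failing test, whereas -s
(shorthand for --capture=no) streams print() output straight to the CI log. A
tiny hypothetical test file that makes the difference observable:

    # Save as demo_capture.py (illustrative name) and compare:
    #   pytest demo_capture.py      -> the print is captured and hidden
    #   pytest -s demo_capture.py   -> the print streams to the terminal
    def test_capture_demo():
        print("visible live only with -s / --capture=no")
        assert True
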
From 80c687b39967e3c4612e1ebcd9c177e95c2486ab Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 13:27:36 +0200
Subject: [PATCH 181/187] try cleaner module purge

---
 tests/analysis/test_analysis_element.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/analysis/test_analysis_element.py b/tests/analysis/test_analysis_element.py
index 0b783875..356a695a 100644
--- a/tests/analysis/test_analysis_element.py
+++ b/tests/analysis/test_analysis_element.py
@@ -38,13 +38,7 @@ def aviti_fixture(create_dirs, caplog):
     patch.stopall()
 
     # Purge module
-    try:
-        del sys.modules["taca.analysis.analysis_element"]
-    except KeyError:
-        try:
-            del sys.modules["to_test"]
-        except KeyError:
-            pass
+    del sys.modules["taca.analysis.analysis_element"]
 
 
 def test_process_on_empty_dir(aviti_fixture):

From db2b99e3bf39ea4c4a14da1367228e6b090b1b05 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Thu, 24 Oct 2024 15:44:03 +0200
Subject: [PATCH 182/187] Fix issue with redundant PhiX record

---
 taca/element/Element_Runs.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 28dc81da..6aed0ebb 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -953,19 +953,39 @@ def aggregate_stats_assigned(self, demux_runmanifest):
         )
         # Remove redundant rows for PhiX
         aggregated_assigned_indexes_filtered = []
-        unique_phiX_combination = set()
+        phix_filtered = []
         for sample in aggregated_assigned_indexes:
             # Add project name
             sample["Project"] = [
                 d for d in demux_runmanifest if d["SampleName"] == sample["SampleName"]
             ][0]["Project"]
+            # Get the PhiX with the longest index combination.
             if sample["SampleName"] == "PhiX":
-                combination = (sample["I1"], sample["I2"], sample["Lane"])
-                if combination not in unique_phiX_combination:
-                    aggregated_assigned_indexes_filtered.append(sample)
-                    unique_phiX_combination.add(combination)
+                lane = sample["Lane"]
+                idx1 = sample["I1"]
+                idx2 = sample["I2"]
+                num_polonies_assigned = sample["NumPoloniesAssigned"]
+                if not phix_filtered:
+                    phix_filtered.append(sample)
+                else:
+                    found_flag = False
+                    for phix_record in phix_filtered:
+                        if lane == phix_record["Lane"]:
+                            idx1_shorter_len = min(len(idx1), len(phix_record["I1"]))
+                            idx2_shorter_len = min(len(idx2), len(phix_record["I2"]))
+                            if idx1[:idx1_shorter_len] == phix_record["I1"][:idx1_shorter_len] and idx2[:idx2_shorter_len] == phix_record["I2"][:idx2_shorter_len]:
+                                found_flag = True
+                                # When the new record has a longer index combination length, take the new record and remove the old one
+                                # When the index combination length happens to be the same, keep the one with the higher polonies assigned
+                                if len(idx1)+len(idx2) > len(phix_record["I1"])+len(phix_record["I2"]) or (len(idx1)+len(idx2) == len(phix_record["I1"])+len(phix_record["I2"]) and num_polonies_assigned >= phix_record["NumPoloniesAssigned"]):
+                                    phix_filtered.remove(phix_record)
+                                    phix_filtered.append(sample)
+                    if not found_flag:
+                        phix_filtered.append(sample)
             else:
                 aggregated_assigned_indexes_filtered.append(sample)
+        # Combine the list of samples and PhiX
+        aggregated_assigned_indexes_filtered += phix_filtered
         # Sort the list by Lane, SampleName and sub_demux_count
         aggregated_assigned_indexes_filtered_sorted = sorted(
             aggregated_assigned_indexes_filtered,
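
The de-duplication introduced above keeps, per lane, one PhiX record among
those whose I1/I2 agree over their shared prefix: the longest combined index
wins, and the polony count breaks ties. A condensed sketch of the same rule on
invented records (not the TACA data structures themselves, which carry more
fields):

    phix = [
        {"Lane": "1", "I1": "ATGC", "I2": "CGTA", "NumPoloniesAssigned": 100},
        {"Lane": "1", "I1": "ATGCAC", "I2": "CGTAGT", "NumPoloniesAssigned": 90},
        {"Lane": "2", "I1": "ATGC", "I2": "CGTA", "NumPoloniesAssigned": 80},
    ]

    def same_phix(a, b):
        # Same lane, and both indexes agree over the shared prefix lengths.
        n1 = min(len(a["I1"]), len(b["I1"]))
        n2 = min(len(a["I2"]), len(b["I2"]))
        return (
            a["Lane"] == b["Lane"]
            and a["I1"][:n1] == b["I1"][:n1]
            and a["I2"][:n2] == b["I2"][:n2]
        )

    def rank(record):
        # Longest combined index first; polony count as tie-breaker.
        return (len(record["I1"]) + len(record["I2"]), record["NumPoloniesAssigned"])

    kept = []
    for rec in phix:
        match = next((k for k in kept if same_phix(k, rec)), None)
        if match is None:
            kept.append(rec)
        elif rank(rec) >= rank(match):
            kept.remove(match)
            kept.append(rec)

    # Lane 1 collapses to the six-base record; lane 2 is untouched.
    assert [r["I1"] for r in kept] == ["ATGCAC", "ATGC"]
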
From 2de49941256cc02b1ea315b964f787a09fcbd330 Mon Sep 17 00:00:00 2001
From: chuan-wang
Date: Thu, 24 Oct 2024 16:13:11 +0200
Subject: [PATCH 183/187] ruff format change

---
 taca/element/Element_Runs.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 6aed0ebb..8b246858 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -650,7 +650,7 @@ def generate_demux_command(self, run_manifest, demux_dir):
     def start_demux(self, run_manifest, demux_dir):
         with chdir(self.run_dir):
             cmd = self.generate_demux_command(run_manifest, demux_dir)
-            stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt" #TODO: individual files for each sub-demux
+            stderr_abspath = f"{self.run_dir}/bases2fastq_stderr.txt"  # TODO: individual files for each sub-demux
            try:
                 with open(stderr_abspath, "w") as stderr:
                     process = subprocess.Popen(
@@ -973,11 +973,23 @@ def aggregate_stats_assigned(self, demux_runmanifest):
                         if lane == phix_record["Lane"]:
                             idx1_shorter_len = min(len(idx1), len(phix_record["I1"]))
                             idx2_shorter_len = min(len(idx2), len(phix_record["I2"]))
-                            if idx1[:idx1_shorter_len] == phix_record["I1"][:idx1_shorter_len] and idx2[:idx2_shorter_len] == phix_record["I2"][:idx2_shorter_len]:
+                            if (
+                                idx1[:idx1_shorter_len]
+                                == phix_record["I1"][:idx1_shorter_len]
+                                and idx2[:idx2_shorter_len]
+                                == phix_record["I2"][:idx2_shorter_len]
+                            ):
                                 found_flag = True
                                 # When the new record has a longer index combination length, take the new record and remove the old one
                                 # When the index combination length happens to be the same, keep the one with the higher polonies assigned
-                                if len(idx1)+len(idx2) > len(phix_record["I1"])+len(phix_record["I2"]) or (len(idx1)+len(idx2) == len(phix_record["I1"])+len(phix_record["I2"]) and num_polonies_assigned >= phix_record["NumPoloniesAssigned"]):
+                                if len(idx1) + len(idx2) > len(phix_record["I1"]) + len(
+                                    phix_record["I2"]
+                                ) or (
+                                    len(idx1) + len(idx2)
+                                    == len(phix_record["I1"]) + len(phix_record["I2"])
+                                    and num_polonies_assigned
+                                    >= phix_record["NumPoloniesAssigned"]
+                                ):
                                     phix_filtered.remove(phix_record)
                                     phix_filtered.append(sample)
                                 if not found_flag:
@@ -1218,7 +1230,7 @@ def sync_metadata(self):
         dest = os.path.join(metadata_archive, self.NGI_run_id)
         if not os.path.exists(dest):
             os.makedirs(dest)
-        for f in files_to_copy: # UnassignedSequences.csv missing in NoIndex case
+        for f in files_to_copy:  # UnassignedSequences.csv missing in NoIndex case
             if os.path.exists(f):
                 shutil.copy(f, dest)
             else:

From 05979c7208b377d60d569a3a79512639ea84e53f Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 16:34:47 +0200
Subject: [PATCH 184/187] propagate bugfix

---
 taca/analysis/analysis_element.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 984c2a7c..6937b0c5 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -77,7 +77,7 @@ def _process(run):
                 return
             elif demultiplexing_status == "ongoing":
                 run.status = "demultiplexing"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
                 return
 
@@ -100,14 +100,14 @@ def _process(run):
                 run.sync_metadata()
                 run.make_transfer_indicator()
                 run.status = "transferring"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
                     # TODO: Also update statusdb with a timestamp of when the transfer started
                 run.transfer()
                 return
             elif transfer_status == "ongoing":
                 run.status = "transferring"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
                 logger.info(
                     f"{run} is being transferred. Skipping."
@@ -118,12 +118,12 @@ def _process(run):
                 run.remove_transfer_indicator()
                 run.update_transfer_log()
                 run.status = "transferred"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
 
                 run.archive()
                 run.status = "archived"
-                if run.status_changed:
+                if run.status_changed():
                     run.update_statusdb()
             else:
                 run.status = "transfer failed"
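
The "propagate bugfix" commit above deserves a note: without parentheses,
run.status_changed evaluates the bound method object, which is always truthy,
so every branch would have written to statusdb regardless of whether the
status actually changed. A minimal reproduction of the pitfall:

    class Run:
        def status_changed(self) -> bool:
            return False

    run = Run()

    assert bool(run.status_changed) is True  # the method object is always truthy
    assert run.status_changed() is False     # the call returns the real answer
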
From 5599bed29ecda24d44b7de1fbc52f5f0b00b21cf Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 16:42:01 +0200
Subject: [PATCH 185/187] set required python version, same as for GHA build

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index e278a522..da2b5026 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,7 @@
     keywords="bioinformatics",
     author="NGI-stockholm",
     author_email="ngi_pipeline_operators@scilifelab.se",
+    python_requires=">=3.11.5",
     url="http://taca.readthedocs.org/en/latest/",
     license="MIT",
     packages=find_packages(exclude=["ez_setup", "examples", "tests"]),

From 488cb39f01e2d2806ae1a5c351f0b16d6fb07cdb Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Thu, 24 Oct 2024 16:43:50 +0200
Subject: [PATCH 186/187] propagate python version explication to Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 93fd631b..5f9e41f0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.10 AS base
+FROM python:3.11.5 AS base
 
 # Update pip to latest version
 RUN python -m pip install --upgrade pip

From 6b6e05631563a9db41840a395538c833f906c3c3 Mon Sep 17 00:00:00 2001
From: Sara Sjunnebo
Date: Fri, 25 Oct 2024 08:52:43 +0200
Subject: [PATCH 187/187] Rename archiving

---
 taca/analysis/analysis_element.py | 4 ++--
 taca/element/Element_Runs.py      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/taca/analysis/analysis_element.py b/taca/analysis/analysis_element.py
index 6937b0c5..9f71efb5 100755
--- a/taca/analysis/analysis_element.py
+++ b/taca/analysis/analysis_element.py
@@ -120,8 +120,8 @@ def _process(run):
                 run.status = "transferred"
                 if run.status_changed():
                     run.update_statusdb()
 
-                run.archive()
-                run.status = "archived"
+                run.move_to_nosync()
+                run.status = "processed"
                 if run.status_changed():
                     run.update_statusdb()
diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py
index 8b246858..3fa697a0 100644
--- a/taca/element/Element_Runs.py
+++ b/taca/element/Element_Runs.py
@@ -1293,7 +1293,7 @@ def update_paths_after_archiving(self, new_location):
         )
         self.run_uploaded_file = os.path.join(self.run_dir, "RunUploaded.json")
 
-    def archive(self):
+    def move_to_nosync(self):
         """Move directory to nosync."""
         src = self.run_dir
         parent_dir = Path(self.run_dir).parent.absolute()
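
The closing rename makes the intent explicit: the run directory is moved into
a sibling nosync folder rather than archived. As a rough illustration of that
kind of move (assumed layout and names; not the TACA implementation beyond
what the diff shows):

    import shutil
    from pathlib import Path

    def move_to_nosync(run_dir: str) -> Path:
        """Move a finished run directory into its parent's nosync folder."""
        src = Path(run_dir).absolute()
        nosync_dir = src.parent / "nosync"
        nosync_dir.mkdir(exist_ok=True)
        return Path(shutil.move(str(src), str(nosync_dir)))
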