This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Improve UX & handle config files better #22

Closed · wants to merge 26 commits
26 commits
2c378c0
Add logic to load matlab files
lauraporta Dec 20, 2022
7d7c9df
Change to h5py
lauraporta Jan 6, 2023
558aed3
Add style changes
lauraporta Jan 6, 2023
6c27ab6
Merge branch 'developement' into load-data
lauraporta Jan 6, 2023
6629190
Add logic to read HDF5 data, i.e. read 7.3 matlab complex files
lauraporta Jan 9, 2023
eda6a8e
Add docstrings
lauraporta Jan 9, 2023
cf84da1
Merge commit 'd7338f16187d2ab121528255c51128c7042cafed' into load-data
lauraporta Jan 10, 2023
6fb4983
Change how to handle config path
lauraporta Jan 10, 2023
4ec4a94
Improvements to DataRaw, logging, classmethods and repr obj
lauraporta Jan 10, 2023
fb42944
Fix some errors in reading hdf5 files
lauraporta Jan 10, 2023
0095870
Add integration test with mock for loading hdf5 data
lauraporta Jan 10, 2023
2178039
Add h5py dependency to pyproject.toml
lauraporta Jan 10, 2023
c5db159
Update doctrings
lauraporta Jan 10, 2023
8b1011c
Add usage of numpy squeeeze
lauraporta Jan 11, 2023
96b0485
Split tests
lauraporta Jan 11, 2023
bfd99d8
Add missing squeeze
lauraporta Jan 11, 2023
d3c8d4c
Add pyproject.toml dependency
lauraporta Jan 23, 2023
3b0b91b
Add instructions to handle configs
lauraporta Jan 23, 2023
3229083
Add improvements to load data
lauraporta Jan 23, 2023
d00926c
Add edits to DataRaw according to suggestions
lauraporta Jan 23, 2023
9656f22
Invert order of methods again
lauraporta Jan 23, 2023
b331b76
Improve tests
lauraporta Jan 23, 2023
6ebad71
Remove unnecessary tox edit
lauraporta Jan 23, 2023
e09d394
Add better example explanation
lauraporta Jan 23, 2023
c546648
Add example in folder name request
lauraporta Jan 24, 2023
3819bd5
Add first textual app
lauraporta Jan 24, 2023

3 changes: 3 additions & 0 deletions .gitignore
@@ -85,3 +85,6 @@ venv/
# config specific
**/config.yml
.env

# used in testing
*.hdf5
54 changes: 25 additions & 29 deletions README.md
@@ -8,35 +8,31 @@
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)

# load-suite2p
This package focuses on loading 2p / 3p data generated by suite2p or registers2p.
Part of a bigger project to reorganize, improve and expand the code contained in [ctsitou_rsp_vision](https://github.com/SainsburyWellcomeCentre/ctsitou_rsp_vision).
# MultiPhoton RSP Analysis
TODO: Add a description of the project

Could work as a package to be called in a Python script or as a CLI application. Its main purpose is to create an object to be read by the `analysis` package.
## Structure of the package folder `load_suite2p`:
## Demo Usage
To test the functionality implemented so far, start `python3` from the command line and run the following commands:
```python
from load_suite2p.main import main
main()
```
__init__.py
config/
    config.yml
folder_naming_specs.py
formatted_data.py
main.py
parsers/
    __init__.py
    chryssanthi.py
    parser.py
read_config.py
utils.py
This script will call the `main()` method and ask you for the name of the folder containing the data you want to analyse. Be sure to have the data locally or to have access to the folder on the shared server. If you don't, I can share the data with you privately.

The precise path to the data should be stored in a `.env` file, which you can create in bash with `touch .env` and then edit with `nano .env`. The `.env` file could contain the following line: `CONFIG_PATH=config/config.yml`, which specifies that the config file, containing the path information, is located in the `config` folder, a subdirectory of the `load_suite2p` folder.
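
For reference, this is roughly how the lookup resolves at runtime; a minimal sketch mirroring the pattern used in `load_data.py` below (it assumes the `python-decouple` package, which provides the `decouple` import):

```python
from pathlib import Path

from decouple import config  # from the python-decouple package

# Read CONFIG_PATH from the .env file (or the environment)
CONFIG_PATH = config("CONFIG_PATH")  # e.g. "config/config.yml"

# Resolve it relative to the load_suite2p package folder
config_path = Path(__file__).parents[1] / CONFIG_PATH
```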

Here is an example of the content of the config file:

```yaml
parser: Parser2pRSP

paths:
imaging: '/path/to/imaging'
allen-dff: '/path/to/imaging/allen_dff'
serial2p: '/path/to/imaging/serial2p'
stimulus-ai-schedule: '/path/to/imaging/stimulus_AI_schedule_files'

use-allen-dff: true
analysis-type: 'sf_tf'

```
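
The `read` method in `read_config.py` is not shown in this diff; as a rough sketch (an assumption on my part, using PyYAML, which is already a dependency), it only needs to deserialize the YAML:

```python
from pathlib import Path
from typing import Union

import yaml


def read(config_path: Union[Path, str]) -> dict:
    """Deserialize the YAML config file into a plain dictionary."""
    with open(config_path, "r") as f:
        return yaml.safe_load(f)
```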
## Description:
### load_suite2p
* `main()` in `main.py` can be used as the entry point of a CLI application. It takes care of logging with `fancy_log`. It is currently used only for testing purposes; it will be expanded in the future.
* `read_config.py` contains a method to read YAML configs stored in the `config/` folder. It will be expanded as the config file becomes more complex.
* `utils.py` contains helper methods that I could use in multiple locations; I might move them to a separate repository in the future. It contains a helper method for `fancy_log`, two methods to check the connection to winstor (`can_ping_swc_server`, `is_winstor_mounted`), and `exception_handler`, an exception wrapper for `main.py`.
* `formatted_data.py` is a draft of the class describing the object to be saved.
* `folder_naming_specs.py` contains a class that holds the name of the folder in which experimental data is saved, details on the experiment extracted from the folder name, and the relevant paths. It receives the folder name as input and checks whether it is valid and whether the data can be read. It calls a `parser` to extract the details from the folder name, which are specific to the scientist/project.
* `parsers/` contains the `parser` class and a parser tailor-made for Chryssanthi's folder structure. The `parser` class is an abstract class that defines the methods a parser should have. The `chryssanthi` parser is the only one implemented so far; it extracts the details from the folder name, taking into account various exceptions in the formatting. It is called by `folder_naming_specs.py`. A rough sketch of the abstract interface follows this list.
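
The abstract interface might look roughly like this (a hypothetical sketch; the method names are assumptions, not the repository's actual API):

```python
from abc import ABC, abstractmethod


class Parser(ABC):
    """Hypothetical sketch of the abstract folder-name parser."""

    def __init__(self, folder_name: str):
        self.folder_name = folder_name

    @abstractmethod
    def parse(self) -> dict:
        """Extract experiment details (mouse id, brain region, ...)
        from the folder name."""
```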

### tests
Some tests have been implemented, especially integration tests covering the Parser and FolderNamingSpecs classes. They are located in the `tests/` folder and can be run with `pytest` or `python -m pytest` from the root of the repository.
There is not yet 100% coverage, but I am working on it.
13 changes: 13 additions & 0 deletions load_suite2p/app.py
@@ -0,0 +1,13 @@
from textual.app import App, ComposeResult
from textual.widgets import Button, Static


class MultiPhoton_RSP_Vision(App):
def compose(self) -> ComposeResult:
yield Static("Welcome to Multi Photon RSP Vision!")
yield Button("Great!", id="yes", variant="primary")


if __name__ == "__main__":
app = MultiPhoton_RSP_Vision()
app.run()
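
As a side note, Textual dispatches button clicks to an `on_button_pressed` handler; a hypothetical extension of the app above could react to the click like this (the handler is an assumption, not part of this PR):

```python
from textual.app import App, ComposeResult
from textual.widgets import Button, Static


class MultiPhoton_RSP_Vision(App):
    def compose(self) -> ComposeResult:
        yield Static("Welcome to Multi Photon RSP Vision!")
        yield Button("Great!", id="yes", variant="primary")

    # Hypothetical handler: exit the app once the user confirms
    def on_button_pressed(self, event: Button.Pressed) -> None:
        if event.button.id == "yes":
            self.exit()
```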
56 changes: 35 additions & 21 deletions load_suite2p/load/load_data.py
@@ -1,15 +1,20 @@
import logging
from pathlib import Path
from typing import Tuple

import h5py
from decouple import config

from ..objects.data_raw import DataRaw
from ..objects.enums import AnalysisType, DataType
from ..objects.specifications import Specifications
from .read_config import read

CONFIG_PATH = config("CONFIG_PATH")
config_path = Path(__file__).parents[1] / CONFIG_PATH


def load_data(folder_name: str) -> Tuple[list, Specifications]:
def load_data(folder_name: str) -> Tuple[DataRaw, Specifications]:
"""Creates the configuration object and loads the data.

Parameters
@@ -46,26 +51,35 @@ def get_specifications(folder_name: str) -> Specifications:
Specifications object
"""
""""""

specs = Specifications(read_configurations(), folder_name)
return specs


def load(specs: Specifications) -> list:
raise NotImplementedError("TODO")


def read_configurations() -> dict:
"""Read configurations regarding experiment and analysis.

Returns
-------
dict
dictionary with configurations
"""

logging.debug("Reading configurations")
config = read(CONFIG_PATH)
config = read(config_path)
logging.debug(f"Configurations read: {config}")
specs = Specifications(config, folder_name)
return specs


return config
def load(specs: Specifications) -> DataRaw:
if specs.config["use-allen-dff"]:
if specs.config["analysis-type"] == "sf_tf":
allen_data_files = [
file
for file in specs.folder_naming.all_files
if file.datatype == DataType.ALLEN_DFF
and file.analysistype == AnalysisType.SF_TF
]
if len(allen_data_files) == 1:
with h5py.File(allen_data_files[0].path, "r") as h5py_file:
data_raw = DataRaw(h5py_file, is_allen=True)

logging.info(f"Allen data loaded: {data_raw}")
return data_raw
else:
raise ValueError(
"There is more than one Allen file for sf_tf analysis"
)
else:
raise NotImplementedError(
"Only sf_tf analysis is implemented for Allen data"
)
else:
raise NotImplementedError("Only loading for Allen data is implemented")
8 changes: 7 additions & 1 deletion load_suite2p/main.py
@@ -14,7 +14,13 @@ def main():
start_logging()

# TODO: add TUI or GUI functionality to get input from user
folder_name = Prompt.ask("Please provide the folder name")
folder_name = Prompt.ask(
" \
Please provide the experimental folder name.\n \
Format: Mouse_Id_Hemisphere_BrainRegion_Monitor_position.\n \
Example: AK_1111739_hL_RSPd_monitor_front.\n \
📁"
)

# load data
data, specs = load_data(folder_name)
166 changes: 166 additions & 0 deletions load_suite2p/objects/data_raw.py
@@ -0,0 +1,166 @@
import logging
from typing import Union

import h5py
import numpy as np


class DataRaw:
"""Class to load and contain the raw data.
It can load data from Allen or from
a list of Paths. Only the Allen case is implemented so far.
"""

def __init__(self, data: dict, is_allen: bool = True):
if is_allen:
logging.info("Loading Allen data, starting to unpack...")

self.day = self._unpack_data(data["day"], data)
logging.info("Unpacked day")

self.imaging = self._unpack_data(data["imaging"], data)
logging.info("Unpacked imaging")

self.frames = self._unpack_data(data["f"], data)
logging.info("Unpacked f")

self.is_cell = self._unpack_data(data["is_cell"], data)
logging.info("Unpacked is_cell")

self.neuropil_coeficient = self._unpack_data(data["r_neu"], data)
logging.info("Unpacked r_neu")

self.stim = self._unpack_data(data["stim"], data)
logging.info("Unpacked stim")

self.trig = self._unpack_data(data["trig"], data)
logging.info("Unpacked trig")
else:
self.day = data["day"]
self.imaging = data["imaging"]
self.frames = data["f"]
self.is_cell = data["is_cell"]
self.neuropil_coeficient = data["r_neu"]
self.stim = data["stim"]
self.trig = data["trig"]

def __repr__(self) -> str:
return f"DataRaw(day={self.day}, imaging={self.imaging}, \
f={self.frames}, is_cell={self.is_cell}, \
r_neu={self.neuropil_coeficient}, stim={self.stim}, \
trig={self.trig})"

@classmethod
def _unpack_data(
cls,
element: Union[h5py._hl.dataset.Dataset, h5py._hl.group.Group],
parent: Union[h5py.File, h5py._hl.group.Group],
) -> Union[np.ndarray, dict]:
"""This method unpack a complex MATLAB datastructure and returns a
nested dictionary or numpy array. Only the relevant subset (Dataset
and Groups) of the possible datastructures is implemented.
Datasets can be mapped to arrays. Groups can be mapped to
dictionaries, and each entry can be a Dataset or another Group.
An array might contain numbers or point to other Arrays or Groups
through References.
References are a HDF5 type that can point either to an array or
to a group.
They need to be resolved in order to get the data. They are resolved
by calling the methods ref_dataset_to_array.
If element is a Group, its content is unpacked recursively.


Example of folder structure:
. (root)
├── dataset_01 (contains numbers, -> array)
├── dataset_02 (contains references to datasets, -> array)
├── dataset_03 (contains references to groups, -> array of dict)
├── group_01 (contains datasets and groups, never references
-> dict of arrays and dicts)

A specific example:
`data["day"]` is a group containing datasets or groups. It is
mappable to a dictionary.
When `unpack_data()` is called on `data["day"]`, `isinstance(element,
h5py._hl.group.Group)` will be true and the method will call itself
recursively on each element in the group, unpacking datasets and groups
until it reaches the bottom of the tree.
This is one of the most complicated matlab `struct` to unpack,
together with `data["stim"]`.

Args:
element Union[h5py._hl.dataset.Dataset, h5py._hl.group.Group]:
is either a h5py Group or Dataset.
It is what we want to unpack.
parent Union[h5py.File, h5py._hl.group.Group]:
is the object that contains the element.
It is used to resolve references.

Returns:
Union[np.ndarray, dict]:
is either a numpy array or a nested dictionary.
"""
if isinstance(element, h5py._hl.dataset.Dataset):
if element.dtype == h5py.special_dtype(ref=h5py.Reference):
return cls._ref_dataset_to_array(element, parent)
else:
return np.squeeze(element[:])
elif isinstance(element, h5py._hl.group.Group):
dict = {}
for key in element:
dict[key] = cls._unpack_data(element[key], element)
return dict
else:
return None

@classmethod
def _ref_dataset_to_array(
cls,
dataset: h5py._hl.dataset.Dataset,
parent: Union[h5py._hl.group.Group, h5py.File],
) -> np.ndarray:
"""Takes a Dataset that contains references to other Datasets or
Groups and resolves its content.

Args:
dataset (h5py._hl.dataset.Dataset):
HDF5 Dataset containing references
parent_container (Union[h5py._hl.group.Group, h5py.File]):
is the object that contains the element.
It is used to resolve references.

Returns:
np.ndarray: an array of numbers or an array of dictionaries
"""
array = np.zeros((dataset.shape[0], dataset.shape[1]), dtype=object)

for i in range(dataset.shape[0]):
for j in range(dataset.shape[1]):
ref = dataset[i][j]
if isinstance(parent[ref], h5py._hl.group.Group):
array[i, j] = cls._group_to_dict_recursive(parent[ref])
else:
array[i, j] = np.squeeze(parent[ref][:])

return np.squeeze(array)

@classmethod
def _group_to_dict_recursive(cls, group: h5py._hl.group.Group) -> dict:
"""Takes a Group and resolves its content. If the Group contains
other Groups, it calls itself recursively.
It assumes there are no more References.

Args:
group (h5py._hl.group.Group):
HDF5 Group containing references

Returns:
dict: the resolved dictionary
"""
dict = {}
for key in group:
if isinstance(group[key], h5py._hl.group.Group):
dict[key] = cls._group_to_dict_recursive(group[key])
else:
dict[key] = np.squeeze(group[key][:])
return dict
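
A usage sketch for `DataRaw`, assuming a local MATLAB v7.3 (HDF5) file with the expected keys (the file name is hypothetical):

```python
import h5py

from load_suite2p.objects.data_raw import DataRaw

# Hypothetical MATLAB v7.3 file saved as HDF5
with h5py.File("allen_dff_example.hdf5", "r") as f:
    data = DataRaw(f, is_allen=True)

# Datasets are unpacked into (squeezed) numpy arrays
print(data.frames.shape)
```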
5 changes: 3 additions & 2 deletions load_suite2p/objects/photon_data.py
@@ -1,10 +1,11 @@
from .data_raw import DataRaw
from .specifications import Specifications


class PhotonData:
"""Class to load the formatted data from suite2p and registers2p."""

def __init__(self, data_raw: list, specs: Specifications):
def __init__(self, data_raw: DataRaw, specs: Specifications):

self.response_matrix = self.get_response_matrix()
self.preprocess(data_raw, specs)
@@ -20,7 +21,7 @@ def __init__(self, data_raw: list, specs: Specifications):
def get_response_matrix(self):
raise NotImplementedError("This method is not implemented yet")

def preprocess(self, data_raw: list, specs: Specifications):
def preprocess(self, data_raw: DataRaw, specs: Specifications):
raise NotImplementedError("This method is not implemented yet")

def reorder(self):
6 changes: 3 additions & 3 deletions load_suite2p/objects/specifications.py
@@ -8,8 +8,8 @@ class Specifications:
to be loaded."""

def __init__(self, config: dict, folder_name: str):
self.base_paths: dict = config["paths"]
self.config: dict = config
self.folder_name = folder_name
self.folder_naming_specs = FolderNamingSpecs(folder_name, config)
self.folder_naming_specs.extract_all_file_names()
self.folder_naming = FolderNamingSpecs(folder_name, config)
self.folder_naming.extract_all_file_names()
self.options = Options(config)
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -24,6 +24,8 @@ dependencies = [
"fancylog",
"PyYAML",
"types-PyYAML",
"h5py",
"decouple",
]

[project.urls]