Load data #16
Changes from 16 commits
@@ -85,3 +85,6 @@ venv/
# config specific
**/config.yml
.env
+
+# used in testing
+*.hdf5
@@ -1,15 +1,20 @@
import logging
from pathlib import Path
from typing import Tuple

import h5py
from decouple import config

from ..objects.data_raw import DataRaw
from ..objects.enums import AnalysisType, DataType
from ..objects.specifications import Specifications
from .read_config import read

CONFIG_PATH = config("CONFIG_PATH")
Comment: This python-decouple module is very cool, I have not seen it before. As a completely naive user, I tried playing around with this just in the console and got 'CONFIG_PATH not found. Declare it as envvar or define a default value.', and I can't quite figure out how it works. Is it possible to add these two lines of code to a function (e.g. a new function read_configs.get_config_path()) and add a docstring on how decouple works? Would it be possible / sensible to get config_path in main.py and pass it through load_data > get_specifications > read_configurations(), so that settings are displayed together in main.py for readability? But maybe this suggestion is not optimal for python-decouple.
Reply: I am using decouple as a way to not store the local paths of the config in the repo. It is set up in a way that it takes for granted the existence of an .env file.
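A minimal sketch of the helper the comment asks for (the name get_config_path and the example CONFIG_PATH value are illustrative, not part of this PR):

from pathlib import Path

from decouple import config


def get_config_path() -> Path:
    """Return the path of the config file.

    config("CONFIG_PATH") is resolved by python-decouple: it first looks for
    a CONFIG_PATH entry in a local .env (or settings.ini) file and falls back
    to an environment variable of the same name. One of the two must exist,
    e.g. a .env file containing CONFIG_PATH=config/config.yml.
    """
    return Path(__file__).parents[1] / config("CONFIG_PATH")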
config_path = Path(__file__).parents[1] / CONFIG_PATH

-def load_data(folder_name: str) -> Tuple[list, Specifications]:
+def load_data(folder_name: str) -> Tuple[DataRaw, Specifications]:
Comment: Is folder_name the top-level folder of the experimental data directory? This is probably obvious as I can't recall how they lay out their data, but a quick example would help, e.g. if the path is /windows/something/myproject, is the name myproject? How does the loader know the full path to the project directory? This all seems to be handled very gracefully; maybe just some docs for the naive user in the main() docstring would be useful.
Reply: The path is stored in the config file that was hidden from you, sorry.
    """Creates the configuration object and loads the data.

    Parameters

@@ -51,8 +56,31 @@ def get_specifications(folder_name: str) -> Specifications:
    return specs
Comment: I was a little confused about the difference between 'specifications' vs. 'configurations'. Instead of 'specifications', could this be a subset of configurations, e.g. project_configs (maybe later there will be other types of configs)?
Reply: Yes, this is very confusing, because also the

-def load(specs: Specifications) -> list:
-    raise NotImplementedError("TODO")
+def load(specs: Specifications) -> DataRaw:
    if specs.config["use-allen-dff"]:
        if specs.config["analysis-type"] == "sf_tf":
Comment: Is it possible to expand this abbreviation?
Reply: It is meant to be
Comment: maybe

            allen_data_files = [
                file
                for file in specs.folder_naming.all_files
                if file.datatype == DataType.ALLEN_DFF
                and file.analysistype == AnalysisType.SF_TF
            ]
            if len(allen_data_files) == 1:
                data_raw = DataRaw(
                    h5py.File(allen_data_files[0].path, "r"), is_allen=True
Comment: This may already be happening in DataRaw, but if not it might be worth opening the file in a context manager that will close it automatically in case of an exception during loading, e.g. with h5py.File(...) as f: (a sketch appears after load() below).
Reply: Yes, it makes sense, thanks!
                )
                logging.info(f"Allen data loaded: {data_raw}")
                return data_raw
            else:
                raise ValueError(
                    "There is more than one Allen file for sf_tf analysis"
                )
        else:
            raise NotImplementedError(
                "Only sf_tf analysis is implemented for Allen data"
            )
    else:
        raise NotImplementedError("Only loading for Allen data is implemented")

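A sketch of the context-manager approach suggested in the comment above (hedged: DataRaw appears to unpack everything eagerly in __init__, so closing the handle right after construction is assumed to be safe):

            if len(allen_data_files) == 1:
                # Open the HDF5 file in a context manager so it is closed even
                # if unpacking raises an exception while DataRaw is built.
                with h5py.File(allen_data_files[0].path, "r") as handle:
                    data_raw = DataRaw(handle, is_allen=True)
                logging.info(f"Allen data loaded: {data_raw}")
                return data_raw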

def read_configurations() -> dict:
Comment: This function is similar to read_config.read() but with an extra layer of logging. Could these two functions be combined? (Just a suggestion; this layering of logging could be very useful.)
Reply: You are right, it is quite useless.
(A sketch of a merged version appears after this function.)

@@ -65,7 +93,7 @@ def read_configurations() -> dict:
    """

    logging.debug("Reading configurations")
-    config = read(CONFIG_PATH)
+    config = read(config_path)
    logging.debug(f"Configurations read: {config}")

    return config
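A possible merged version, as discussed above (a sketch only: it assumes read_config.read() takes a path and parses the YAML config with PyYAML, which is listed as a dependency):

import logging
from pathlib import Path

import yaml


def read(config_path: Path) -> dict:
    """Read the YAML config and log what was loaded."""
    logging.debug("Reading configurations")
    with open(config_path, "r") as stream:
        config = yaml.safe_load(stream)
    logging.debug(f"Configurations read: {config}")
    return config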
@@ -0,0 +1,146 @@
import logging
from typing import Union

import h5py
import numpy as np


class DataRaw:
    """Class to load and contain the raw data.
    It can load data from Allen or from
    a list of Paths. Only the Allen case is implemented so far.
    """

    def __init__(self, data, is_allen: bool = True):
Comment: data does not have a type annotation (but maybe it is not possible).
Reply: It can, I forgot to add it!
        if is_allen:
            logging.info("Loading Allen data, starting to unpack...")

            self.day = self.unpack_data(data["day"], data)
Comment: Is self.unpack_data private or also planned to be called from outside the class?
Comment: Similar with group_to_dict_recursive and ref_dataset_to_array.

            logging.info("Unpacked day")

            self.imaging = self.unpack_data(data["imaging"], data)
            logging.info("Unpacked imaging")

            self.f = self.unpack_data(data["f"], data)
Comment: Could the field "f" be un-abbreviated?
Reply: It could, it is

            logging.info("Unpacked f")

            self.is_cell = self.unpack_data(data["is_cell"], data)
            logging.info("Unpacked is_cell")

            self.r_neu = self.unpack_data(data["r_neu"], data)
Comment: Maybe this is a candidate for expanding the field name too (but maybe it is obvious in the field).
Reply: OK, I'll unpack it.

            logging.info("Unpacked r_neu")

            self.stim = self.unpack_data(data["stim"], data)
            logging.info("Unpacked stim")

            self.trig = self.unpack_data(data["trig"], data)
            logging.info("Unpacked trig")
        else:
            self.day = data["day"]
            self.imaging = data["imaging"]
            self.f = data["f"]
            self.is_cell = data["is_cell"]
            self.r_neu = data["r_neu"]
            self.stim = data["stim"]
            self.trig = data["trig"]

    def __repr__(self) -> str:
        return f"DataRaw(day={self.day}, imaging={self.imaging}, f={self.f}, \
Comment: This is cool.
            is_cell={self.is_cell}, r_neu={self.r_neu}, stim={self.stim}, \
            trig={self.trig})"

    @classmethod
    def group_to_dict_recursive(cls, group: h5py._hl.group.Group) -> dict:
Comment: I'm not sure if it's worth it, but it might be nice to have an example folder directory tree, just to get a handle on how the underlying data looks and help interpret the functions.
Reply: I do have it... I could add this overall info in the class docstring.
Reply: I added two examples in the class docstring.
        """Takes a Group and resolves its content. If the Group contains
        other Groups, it calls itself recursively.
        It assumes there are no more References.

        Args:
            group (h5py._hl.group.Group):
                HDF5 Group containing references

        Returns:
            dict: the resolved dictionary
        """
        dict = {}
        for key in group:
            if isinstance(group[key], h5py._hl.group.Group):
                dict[key] = cls.group_to_dict_recursive(group[key])
            else:
                dict[key] = np.squeeze(group[key][:])
        return dict

    @classmethod
    def ref_dataset_to_array(
Comment: Could / should the order of these functions be re-arranged for readability? If I understand correctly, unpack_data calls ref_dataset_to_array, which calls group_to_dict_recursive.
Reply: OK, I can re-arrange them!
        cls,
        dataset: h5py._hl.dataset.Dataset,
        parent: Union[h5py._hl.group.Group, h5py.File],
    ) -> np.ndarray:
        """Takes a Dataset that contains references to other Datasets or
        Groups and resolves its content.

        Args:
            dataset (h5py._hl.dataset.Dataset):
                HDF5 Dataset containing references
            parent_container (Union[h5py._hl.group.Group, h5py.File]):
                is the object that contains the element.
                It is used to resolve references.

        Returns:
            np.ndarray: an array of numbers or an array of dictionaries
        """
        array = np.zeros((dataset.shape[0], dataset.shape[1]), dtype=object)

        for i in range(dataset.shape[0]):
            for j in range(dataset.shape[1]):
                ref = dataset[i][j]
                if isinstance(parent[ref], h5py._hl.group.Group):
                    array[i, j] = cls.group_to_dict_recursive(parent[ref])
                else:
                    array[i, j] = np.squeeze(parent[ref][:])

        return np.squeeze(array)

    @classmethod
    def unpack_data(
        cls,
        element: Union[h5py._hl.dataset.Dataset, h5py._hl.group.Group],
        parent: Union[h5py.File, h5py._hl.group.Group],
    ) -> Union[np.ndarray, dict]:
        """This method unpack a complex MATLAB datastructure and returns a
        nested dictionary or numpy array. Only the relevant subset (Dataset
        and Groups) of the possible datastructures is implemented.
        Datasets can be mapped to arrays. Groups can be mapped to
        dictionaires, and each entry can be a Dataset or another Group.
Comment: Typo (dictionaries).

        An array might contain numbers or point to other Arrays or Groups
        through References.
        References are a HDF5 type that can point either to an array or
        to a group.
        They need to be resolved in order to get the data. They are resolved
        by calling the methods ref_dataset_to_array.
        If element is a Group, its content is unpacked recursively.

        Args:
            element Union[h5py._hl.dataset.Dataset, h5py._hl.group.Group]:
                is either a h5py Group or Dataset.
                It is what we want to unpack.
            parent Union[h5py.File, h5py._hl.group.Group]:
                is the object that contains the element.
                It is used to resolve references.

        Returns:
            Union[np.ndarray, dict]:
                is either a numpy array or a nested dictionary.
        """
        if isinstance(element, h5py._hl.dataset.Dataset):
            if element.dtype == h5py.special_dtype(ref=h5py.Reference):
                return cls.ref_dataset_to_array(element, parent)
            else:
                return np.squeeze(element[:])
        elif isinstance(element, h5py._hl.group.Group):
            dict = {}
            for key in element:
                dict[key] = cls.unpack_data(element[key], element)
Comment: This recursive file / folder unpacking is handled very nicely.
            return dict
        else:
            return None
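For readers new to HDF5 object references, a minimal standalone illustration of what unpack_data has to resolve (not part of the PR; the file name and values are made up, but the h5py calls mirror the ones used in the tests below):

import h5py
import numpy as np

with h5py.File("references_demo.hdf5", "w") as f:
    # A normal dataset, plus a dataset whose elements are references to it.
    target = f.create_dataset("target", data=np.array([1.0, 2.0, 3.0]))
    f.create_dataset(
        "refs",
        data=[[target.ref]],
        dtype=h5py.special_dtype(ref=h5py.Reference),
    )

with h5py.File("references_demo.hdf5", "r") as f:
    ref = f["refs"][0][0]  # an h5py.Reference object, not the data itself
    resolved = f[ref][:]   # dereference against the parent, as in ref_dataset_to_array
    print(resolved)        # [1. 2. 3.]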
@@ -24,6 +24,7 @@ dependencies = [
    "fancylog",
    "PyYAML",
    "types-PyYAML",
+    "h5py",
]

[project.urls]
@@ -0,0 +1,86 @@
import h5py
import numpy as np

from load_suite2p.objects.data_raw import DataRaw

array0 = np.array([1, 2, 3, 4, 5])
Comment: Could use np.arange(1, 6) etc. as more concise, but less readable.
Reply: Yes, thanks!
array1 = np.array([3, 4, 5, 6, 7])
array2 = np.array([5, 6, 7, 8, 9])
array3 = np.array([7, 8, 9, 10, 11])
array4 = np.array([9, 10, 11, 12, 13])


def create_mock_hdf5_data():
Comment: A pytest fixture could be used in this case; it is a nice way to control the data that is passed to tests. The main benefits are:
a) Including the fixture as an argument to each test should trigger the dataset creation for each function. I think this will work, but maybe not, as the create function doesn't explicitly return anything; but that shouldn't matter.
b) Fixtures are nice because you can control the scope of the data. For example, with scope="function", create_mock_hdf5_data() will be called again for each function it is used in; with scope="class", the function will be called once and then re-used (nice to save time if you are definitely not changing the data). However, I would typically always use function scope just so you are 100% sure you are starting fresh for each test.
c) All code after the yield keyword is still run, which makes tearing down after tests easier. In this case I would suggest adding teardown code that deletes the written file.
I always think the pytest docs are a bit confusing, but this is nice: https://towardsdatascience.com/make-your-python-tests-efficient-with-pytest-fixtures-3d7a1892265f
Reply: Thank you very much, I like fixtures very much. I've just implemented better mocks!
Comment: pytest has an automatic fixture, tmp_path, that gives you access to a temporary directory that might be easier to write to rather than cwd (which, if I understand correctly, this does): https://docs.pytest.org/en/7.1.x/how-to/tmp_path.html
Reply: Very cool! I've implemented it.
(A sketch combining both suggestions appears after create_mock_hdf5_data below.)
    with h5py.File("mytestfile.hdf5", "w") as f:
Comment: Just personal preference, but I like to un-abbreviate f to file in this case, just to be super explicit.
        # create a file and add a simple dataset
        f.create_dataset("array0", data=array0, dtype="i")

        # create a file and add a group with a dataset inside
        grp = f.create_group("mygroup")
        grp.create_dataset("array1", data=array1, dtype="f")

        # create group with subgroup and a dataset
        subgroup = grp.create_group("subgroup")
        subgroup.create_dataset("array2", data=array2, dtype="f")

        # create a dataset with references of dataset
        dataset_to_be_referenced = f.create_dataset(
            "array3", data=array3, dtype="f"
        )
        ref = dataset_to_be_referenced.ref
        ref_array = [[ref, ref], [ref, ref]]
        f.create_dataset(
            "ref_dataset",
            data=ref_array,
            dtype=h5py.special_dtype(ref=h5py.Reference),
        )

        # create a dataset with references of group with subgroup
        group_to_be_referenced = f.create_group("#ref_group#")
        subgroup2 = group_to_be_referenced.create_group("subgroup2")
        subgroup2.create_dataset("array4", data=array4, dtype="f")
        ref2 = group_to_be_referenced.ref
        ref_array2 = [[ref2, ref2], [ref2, ref2]]
        f.create_dataset(
            "ref_dataset2",
            data=ref_array2,
            dtype=h5py.special_dtype(ref=h5py.Reference),
        )

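A hedged sketch of the fixture approach discussed above, combining the yield/teardown points with the tmp_path suggestion (the fixture name mock_hdf5_path is illustrative; the actual implementation added in later commits may differ):

import pytest


@pytest.fixture(scope="function")
def mock_hdf5_path(tmp_path):
    # Write the mock file into pytest's per-test temporary directory instead
    # of the current working directory.
    path = tmp_path / "mytestfile.hdf5"
    with h5py.File(path, "w") as file:
        file.create_dataset("array0", data=array0, dtype="i")
        # ... build the rest of the mock structure as in create_mock_hdf5_data ...
    yield path
    # Code placed after yield runs as teardown; tmp_path itself is cleaned up
    # by pytest, so no manual deletion is needed here.


def test_unpack_of_simple_dataset_with_fixture(mock_hdf5_path):
    with h5py.File(mock_hdf5_path, "r") as file:
        assert np.all(DataRaw.unpack_data(file["array0"], file) == array0)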

def test_unpack_of_simple_dataset():
    create_mock_hdf5_data()
    with h5py.File("mytestfile.hdf5", "r") as f:
        assert np.all(DataRaw.unpack_data(f["array0"], f) == array0)


def test_unpack_of_dataset_in_group():
    create_mock_hdf5_data()
    with h5py.File("mytestfile.hdf5", "r") as f:
        assert np.all(DataRaw.unpack_data(f["mygroup"]["array1"], f) == array1)


def test_unpack_of_dataset_in_subgroup():
    create_mock_hdf5_data()
    with h5py.File("mytestfile.hdf5", "r") as f:
        assert np.all(
            DataRaw.unpack_data(f["mygroup"]["subgroup"]["array2"], f)
            == array2
        )


def test_unpack_of_dataset_with_references_to_dataset():
    create_mock_hdf5_data()
    with h5py.File("mytestfile.hdf5", "r") as f:
        assert np.all(DataRaw.unpack_data(f["ref_dataset"], f)[0][0] == array3)


def test_unpack_of_dataset_with_references_to_group_with_subgroup():
Comment: Overall I think these tests look very nice. It may also be worth testing that the package handles any potentially common bad inputs the user might give. This can be done by checking that an error is raised (and its content) with pytest.raises() (a sketch appears after this test file).
Reply: I wonder what a bad input could be in this case, maybe if they give the wrong file type? Or a corrupted hdf5 🤔
Comment: Hmm yes, I am not too sure. Maybe bad paths in the config file? Although this is already quite obvious from the error. Another possibility is that they pass a .mat file that is not structured in the expected way, but I'm not sure if that is possible in practice. Another test possibility (although this may be overkill) is to go through one example .mat file and save all (or a subset) of the expected arrays into separate mat files (or csv or something). Then you can load the file into load-suite2p and test all arrays in python against matlab. This would be useful as a sanity check against a real dataset, and would highlight any small numerical changes that may occur during conversion (these are likely to be extremely small, if they exist at all).
Reply: Thank you Joe for the suggestions. I will open a separate issue to address these use cases, also because they seem related to a further expansion of the load function that I aim to develop later.
Reply: #21 here we go
    create_mock_hdf5_data()
    with h5py.File("mytestfile.hdf5", "r") as f:
        assert np.all(
            DataRaw.unpack_data(f["ref_dataset2"], f)[0][0]["subgroup2"][
                "array4"
            ]
            == array4
        )
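A minimal sketch of the kind of bad-input test discussed above (illustrative only; it relies on the fact that h5py raises OSError when asked to open a file that is not valid HDF5):

import pytest


def test_opening_a_non_hdf5_file_raises(tmp_path):
    # A plain text file has no HDF5 signature, so h5py should refuse to open it.
    bad_file = tmp_path / "not_really.hdf5"
    bad_file.write_text("this is not an HDF5 file")
    with pytest.raises(OSError):
        h5py.File(bad_file, "r")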
Comment: I think this needs to be added to pyproject.toml:
from decouple import config
Reply: OK, thanks!