From fd15fd52bd373bf151d6cd3bda8a8d8fee3736ac Mon Sep 17 00:00:00 2001
From: Duarte Folgado
Date: Tue, 2 Jul 2024 12:45:03 +0100
Subject: [PATCH] Add the parser for the UCI HAR dataset.

Returns train/test split inertial sensor raw data.
---
 .github/workflows/tests.yml               |   7 +-
 tests/test_dataset_loaders.py             |  14 +++
 tsfel/datasets/__init__.py                |   2 +-
 tsfel/datasets/_single_problem_loaders.py | 102 +++++++++++++++++-----
 4 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4dd6d06..5905103 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -39,6 +39,9 @@ jobs:
       run: |
         python -c "import numpy as np; np.show_config()"
         python -c "import scipy as sp; sp.show_config()"
-    - name: Run tests
+    - name: Test the input/output
       run: |
-        python tests/test_features.py
+        python tests/test_calc_features.py
+    - name: Test the data loaders
+      run: |
+        python tests/test_dataset_loaders.py
diff --git a/tests/test_dataset_loaders.py b/tests/test_dataset_loaders.py
index 925aa6a..9735592 100644
--- a/tests/test_dataset_loaders.py
+++ b/tests/test_dataset_loaders.py
@@ -18,6 +18,20 @@ def test_load_biopluxecg(self):
             "8e2a2c0f18860b23eb6ebb76b7ceff1cf1fab78f743345fab1f03d315dbc8e21",
         )
 
+    def test_load_ucihar_all(self):
+        X_train, y_train, X_test, y_test = tsfel.datasets.load_ucihar(use_cache=True)
+        np.testing.assert_equal(
+            (len(X_train), len(y_train), len(X_test), len(y_test), X_train[0].shape, X_test[0].shape),
+            (7352, 7352, 2947, 2947, (128, 9), (128, 9)),
+        )
+
+    def test_load_ucihar_single_data_modality(self):
+        X_train, y_train, X_test, y_test = tsfel.datasets.load_ucihar(use_cache=True, data_modality=["body_gyro"])
+        np.testing.assert_equal(
+            (len(X_train), len(y_train), len(X_test), len(y_test), X_train[0].shape, X_test[0].shape),
+            (7352, 7352, 2947, 2947, (128, 3), (128, 3)),
+        )
+
     @staticmethod
     def calculate_sha256_of_ndarray(array: np.ndarray) -> str:
         sha256_hash = hashlib.sha256()
diff --git a/tsfel/datasets/__init__.py b/tsfel/datasets/__init__.py
index f1b43b4..948c5f4 100644
--- a/tsfel/datasets/__init__.py
+++ b/tsfel/datasets/__init__.py
@@ -1 +1 @@
-from tsfel.datasets._single_problem_loaders import load_biopluxecg
+from tsfel.datasets._single_problem_loaders import load_biopluxecg, load_ucihar
diff --git a/tsfel/datasets/_single_problem_loaders.py b/tsfel/datasets/_single_problem_loaders.py
index 5d11f4a..bf0464d 100644
--- a/tsfel/datasets/_single_problem_loaders.py
+++ b/tsfel/datasets/_single_problem_loaders.py
@@ -4,6 +4,7 @@
 """
 
 import os
+import warnings
 import zipfile
 from pathlib import Path
 
@@ -15,6 +16,12 @@
 
 
 def _download_dataset(url, cache_dir, filename):
+    warnings.warn("Cache folder is empty. Downloading the dataset...", UserWarning)
+    Path(cache_dir).mkdir(
+        parents=True,
+        exist_ok=True,
+    )
+
     try:
         response = requests.get(
             url,
@@ -59,11 +66,6 @@ def load_biopluxecg(use_cache=True) -> pd.Series:
     cache_dir = os.path.join(CACHE_ROOT_DIR, "BioPluxECG")
 
     if not os.path.exists(cache_dir) or not os.listdir(cache_dir) or not use_cache:
-        print("Cache folder is empty. Downloading the dataset...")
-        Path(os.path.join(cache_dir)).mkdir(
-            parents=True,
-            exist_ok=True,
-        )
         _download_dataset(REF_URL, cache_dir, "biopluxecg.txt")
 
     X = np.loadtxt(os.path.join(cache_dir, "biopluxecg.txt"))[1]
@@ -72,45 +74,105 @@
     return X
 
 
+def _get_uci_train_test_splits(dataset_dir: str, data_modality: list, split: str) -> tuple[list[pd.DataFrame], np.ndarray]:
+    raw_signals_split_dir = os.path.join(dataset_dir, "UCI HAR Dataset", split, "Inertial Signals")
+    raw_signals_split_label_path = os.path.join(dataset_dir, "UCI HAR Dataset", split, f"y_{split}.txt")
+    _, _, filenames = next(os.walk(raw_signals_split_dir), (None, None, []))
+
+    columns = [
+        filename[: -len(f"{split}.txt")] if filename.endswith(f"{split}.txt") else filename
+        for filename in sorted(filenames)
+    ]
+    filtered_columns = [
+        col for col in columns if not data_modality or any(substring in col for substring in data_modality)
+    ]
+
+    data = {col: np.loadtxt(os.path.join(raw_signals_split_dir, f"{col}{split}.txt")) for col in filtered_columns}
+    X = [
+        pd.DataFrame({col: data[col][i] for col in filtered_columns}) for i in range(data[filtered_columns[0]].shape[0])
+    ]
+    y = np.loadtxt(raw_signals_split_label_path)
+
+    return X, y
+
+
+def get_uci_splits(cache_dir, data_modality):
+    return (_get_uci_train_test_splits(cache_dir, data_modality, split) for split in ["train", "test"])
+
+
-# TODO: Write a parser for this dataset.
-def load_ucihar(use_cache=True):
+def load_ucihar(use_cache=True, data_modality=None) -> tuple[list[pd.DataFrame], np.ndarray, list[pd.DataFrame], np.ndarray]:
     """Loads the Human Activity Recognition Using Smartphones dataset from the
-    UC Irvine Machine Learning Repository [1]_.
+    UC Irvine Machine Learning Repository [1]_. Retrieves the raw inertial data
+    for both the training and test sets.
 
     Parameters
     ----------
     use_cache: bool, default=True
         If True, caches a local copy of the dataset in the user's home directory.
+    data_modality: None or list of str, default=None
+        If set to None, all available data modalities are loaded. Otherwise,
+        only the specified modalities are loaded. The supported modalities
+        are "body_acc", "body_gyro", and "total_acc".
+
+    Returns
+    -------
+    X_train : list
+        A list of DataFrames containing windows of multivariate time series
+        from the training set. The number of channels (columns) in each
+        DataFrame depends on the `data_modality`.
+
+    y_train : ndarray
+        The corresponding labels for the training set.
+
+    X_test : list
+        A list of DataFrames containing windows of multivariate time series
+        from the test set. The number of channels (columns) in each
+        DataFrame depends on the `data_modality`.
+
+    y_test : ndarray
+        The corresponding labels for the test set.
 
     Notes
     -----
-    The signal is sampled at 100 Hz and its divided in short fixed-size windows.
+    The signals are sampled at 50 Hz and divided into short fixed-size
+    windows of 128 samples.
 
     .. versionadded:: 0.1.8
 
     Examples
     --------
     >>> from tsfel.datasets import load_ucihar
-    >>> X = load_ucihar()
+    >>> X_train, y_train, X_test, y_test = load_ucihar()
 
     References
     ----------
-    .. [1] Anguita, D., Ghio, A., Oneto, L., Parra, X., & Reyes-Ortiz, J.L. (2013). A Public Domain Dataset for Human Activity Recognition using Smartphones. The European Symposium on Artificial Neural Networks.
+    .. [1] Anguita, D., et al. (2013). A Public Domain Dataset for Human
+       Activity Recognition using Smartphones. The European Symposium on
+       Artificial Neural Networks.
""" - REF_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip" + data_modality = [] if data_modality is None else data_modality cache_dir = os.path.join(CACHE_ROOT_DIR, "UCIHAR") + REF_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip" + + if not isinstance(data_modality, list): + raise ValueError("data_modality must be a list of strings.") + + if not all( + any(element.startswith(prefix) for prefix in ["body_acc", "body_gyro", "total_acc"]) + for element in data_modality + ): + raise ValueError("Elements of the list should be `body_acc`, `body_gyro`, or `total_acc`") if not os.path.exists(cache_dir) or not os.listdir(cache_dir) or not use_cache: - print("Cache folder is empty. Downloading the dataset...") - Path(os.path.join(cache_dir)).mkdir( - parents=True, - exist_ok=True, - ) _download_dataset(REF_URL, cache_dir, "ucihar.zip") - zip_file_path = os.path.join(cache_dir, "ucihar.zip") - with zipfile.ZipFile(zip_file_path, "r") as zip_ref: - zip_ref.extractall(cache_dir) - os.remove(zip_file_path) + zip_file_path = os.path.join(cache_dir, "ucihar.zip") + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(cache_dir) + os.remove(zip_file_path) + + (X_train, y_train), (X_test, y_test) = get_uci_splits(cache_dir, data_modality) + + return X_train, y_train, X_test, y_test