From fd15fd52bd373bf151d6cd3bda8a8d8fee3736ac Mon Sep 17 00:00:00 2001
From: Duarte Folgado
Date: Tue, 2 Jul 2024 12:45:03 +0100
Subject: [PATCH] Add the parser for the UCI HAR dataset.

Returns train/test split inertial sensor raw data.
---
 .github/workflows/tests.yml               |   7 +-
 tests/test_dataset_loaders.py             |  14 +++
 tsfel/datasets/__init__.py                |   2 +-
 tsfel/datasets/_single_problem_loaders.py | 102 +++++++++++++++++-----
 4 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4dd6d06..5905103 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -39,6 +39,9 @@ jobs:
       run: |
         python -c "import numpy as np; np.show_config()"
         python -c "import scipy as sp; sp.show_config()"
-    - name: Run tests
+    - name: Test the input/output
       run: |
-        python tests/test_features.py
+        python tests/test_calc_features.py
+    - name: Test the data loaders
+      run: |
+        python tests/test_dataset_loaders.py
diff --git a/tests/test_dataset_loaders.py b/tests/test_dataset_loaders.py
index 925aa6a..9735592 100644
--- a/tests/test_dataset_loaders.py
+++ b/tests/test_dataset_loaders.py
@@ -18,6 +18,20 @@ def test_load_biopluxecg(self):
             "8e2a2c0f18860b23eb6ebb76b7ceff1cf1fab78f743345fab1f03d315dbc8e21",
         )
 
+    def test_load_ucihar_all(self):
+        X_train, y_train, X_test, y_test = tsfel.datasets.load_ucihar(use_cache=True)
+        np.testing.assert_equal(
+            (len(X_train), len(y_train), len(X_test), len(y_test), X_train[0].shape, X_test[0].shape),
+            (7352, 7352, 2947, 2947, (128, 9), (128, 9)),
+        )
+
+    def test_load_ucihar_single_data_modality(self):
+        X_train, y_train, X_test, y_test = tsfel.datasets.load_ucihar(use_cache=True, data_modality=["body_gyro"])
+        np.testing.assert_equal(
+            (len(X_train), len(y_train), len(X_test), len(y_test), X_train[0].shape, X_test[0].shape),
+            (7352, 7352, 2947, 2947, (128, 3), (128, 3)),
+        )
+
     @staticmethod
     def calculate_sha256_of_ndarray(array: np.ndarray) -> str:
         sha256_hash = hashlib.sha256()
diff --git a/tsfel/datasets/__init__.py b/tsfel/datasets/__init__.py
index f1b43b4..948c5f4 100644
--- a/tsfel/datasets/__init__.py
+++ b/tsfel/datasets/__init__.py
@@ -1 +1 @@
-from tsfel.datasets._single_problem_loaders import load_biopluxecg
+from tsfel.datasets._single_problem_loaders import load_biopluxecg, load_ucihar
diff --git a/tsfel/datasets/_single_problem_loaders.py b/tsfel/datasets/_single_problem_loaders.py
index 5d11f4a..bf0464d 100644
--- a/tsfel/datasets/_single_problem_loaders.py
+++ b/tsfel/datasets/_single_problem_loaders.py
@@ -4,6 +4,7 @@
 """
 
 import os
+import warnings
 import zipfile
 from pathlib import Path
 
@@ -15,6 +16,12 @@
 
 
 def _download_dataset(url, cache_dir, filename):
+    warnings.warn("Cache folder is empty. Downloading the dataset...", UserWarning)
+    Path(cache_dir).mkdir(
+        parents=True,
+        exist_ok=True,
+    )
+
     try:
         response = requests.get(
             url,
@@ -59,11 +66,6 @@ def load_biopluxecg(use_cache=True) -> pd.Series:
     cache_dir = os.path.join(CACHE_ROOT_DIR, "BioPluxECG")
 
     if not os.path.exists(cache_dir) or not os.listdir(cache_dir) or not use_cache:
-        print("Cache folder is empty. Downloading the dataset...")
-        Path(os.path.join(cache_dir)).mkdir(
-            parents=True,
-            exist_ok=True,
-        )
         _download_dataset(REF_URL, cache_dir, "biopluxecg.txt")
 
     X = np.loadtxt(os.path.join(cache_dir, "biopluxecg.txt"))[1]
@@ -72,45 +74,105 @@
     return X
 
 
+def _get_uci_train_test_splits(dataset_dir: str, data_modality: list, split: str) -> tuple[list[pd.DataFrame], np.ndarray]:
+    raw_signals_split_dir = os.path.join(dataset_dir, "UCI HAR Dataset", split, "Inertial Signals")
+    raw_signals_split_label_path = os.path.join(dataset_dir, "UCI HAR Dataset", split, f"y_{split}.txt")
+    _, _, filenames = next(os.walk(raw_signals_split_dir), (None, None, []))
+
+    columns = [
+        filename[: -len(f"{split}.txt")] if filename.endswith(f"{split}.txt") else filename
+        for filename in sorted(filenames)
+    ]
+    filtered_columns = [
+        col for col in columns if not data_modality or any(substring in col for substring in data_modality)
+    ]
+
+    data = {col: np.loadtxt(os.path.join(raw_signals_split_dir, f"{col}{split}.txt")) for col in filtered_columns}
+    X = [
+        pd.DataFrame({col: data[col][i] for col in filtered_columns}) for i in range(data[filtered_columns[0]].shape[0])
+    ]
+    y = np.loadtxt(raw_signals_split_label_path)
+
+    return X, y
+
+
+def get_uci_splits(cache_dir, data_modality):
+    return (_get_uci_train_test_splits(cache_dir, data_modality, split) for split in ["train", "test"])
+
+
-# TODO: Write a parser for this dataset.
-def load_ucihar(use_cache=True):
+def load_ucihar(use_cache=True, data_modality=None) -> tuple[list[pd.DataFrame], np.ndarray, list[pd.DataFrame], np.ndarray]:
     """Loads the Human Activity Recognition Using Smartphones dataset from the
-    UC Irvine Machine Learning Repository [1]_.
+    UC Irvine Machine Learning Repository [1]_. Retrieves the raw inertial data
+    for both the training and test sets.
 
     Parameters
     ----------
     use_cache: bool, default=True
         If True, caches a local copy of the dataset in the user's home directory.
+    data_modality: None or list of str, default=None
+        If set to None, all available data modalities are loaded. Otherwise,
+        only the specified modalities are loaded. The supported modalities
+        are "body_acc", "body_gyro", and "total_acc".
+
+    Returns
+    -------
+    X_train : list
+        A list of DataFrames containing windows of multivariate time series
+        from the training set. The number of channels (columns) in each
+        DataFrame depends on the `data_modality`.
+
+    y_train : ndarray
+        The corresponding labels for the training set.
+
+    X_test : list
+        A list of DataFrames containing windows of multivariate time series
+        from the test set. The number of channels (columns) in each
+        DataFrame depends on the `data_modality`.
+
+    y_test : ndarray
+        The corresponding labels for the test set.
 
     Notes
     -----
-    The signal is sampled at 100 Hz and its divided in short fixed-size windows.
+    The signals are sampled at 50 Hz and divided into short fixed-size
+    windows of 128 samples.
 
     .. versionadded:: 0.1.8
 
     Examples
     --------
     >>> from tsfel.datasets import load_ucihar
-    >>> X = load_ucihar()
+    >>> X_train, y_train, X_test, y_test = load_ucihar()
 
     References
     ----------
-    .. [1] Anguita, D., Ghio, A., Oneto, L., Parra, X., & Reyes-Ortiz, J.L. (2013). A Public Domain Dataset for Human Activity Recognition using Smartphones. The European Symposium on Artificial Neural Networks.
+    .. [1] Anguita, D., et al. (2013). A Public Domain Dataset for Human
+       Activity Recognition using Smartphones. The European Symposium on
+       Artificial Neural Networks.
""" - REF_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip" + data_modality = [] if data_modality is None else data_modality cache_dir = os.path.join(CACHE_ROOT_DIR, "UCIHAR") + REF_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip" + + if not isinstance(data_modality, list): + raise ValueError("data_modality must be a list of strings.") + + if not all( + any(element.startswith(prefix) for prefix in ["body_acc", "body_gyro", "total_acc"]) + for element in data_modality + ): + raise ValueError("Elements of the list should be `body_acc`, `body_gyro`, or `total_acc`") if not os.path.exists(cache_dir) or not os.listdir(cache_dir) or not use_cache: - print("Cache folder is empty. Downloading the dataset...") - Path(os.path.join(cache_dir)).mkdir( - parents=True, - exist_ok=True, - ) _download_dataset(REF_URL, cache_dir, "ucihar.zip") - zip_file_path = os.path.join(cache_dir, "ucihar.zip") - with zipfile.ZipFile(zip_file_path, "r") as zip_ref: - zip_ref.extractall(cache_dir) - os.remove(zip_file_path) + zip_file_path = os.path.join(cache_dir, "ucihar.zip") + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(cache_dir) + os.remove(zip_file_path) + + (X_train, y_train), (X_test, y_test) = get_uci_splits(cache_dir, data_modality) + + return X_train, y_train, X_test, y_test