Skip to content

Commit

Permalink
Add the parser for the UCI HAR dataset.
Browse files Browse the repository at this point in the history
Returns train/test split inertial sensor raw data.
  • Loading branch information
dmfolgado committed Jul 2, 2024
1 parent c5e4d77 commit fd15fd5
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 23 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ jobs:
run: |
python -c "import numpy as np; np.show_config()"
python -c "import scipy as sp; sp.show_config()"
- name: Run tests
- name: Test the input/output
run: |
python tests/test_features.py
python tests/test_calc_features.py
- name: Test the data loaders
run: |
python tests/dataset_loaders.py
14 changes: 14 additions & 0 deletions tests/test_dataset_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ def test_load_biopluxecg(self):
"8e2a2c0f18860b23eb6ebb76b7ceff1cf1fab78f743345fab1f03d315dbc8e21",
)

def test_load_ucihar_all(self):
    """Loading every modality must yield the canonical 7352/2947 split with 9-channel 128-sample windows."""
    train_X, train_y, test_X, test_y = tsfel.datasets.load_ucihar(use_cache=True)
    observed = (len(train_X), len(train_y), len(test_X), len(test_y), train_X[0].shape, test_X[0].shape)
    expected = (7352, 7352, 2947, 2947, (128, 9), (128, 9))
    np.testing.assert_equal(observed, expected)

def test_load_ucihar_single_data_modality(self):
    """Requesting only "body_gyro" must keep the split sizes but reduce each window to 3 channels."""
    train_X, train_y, test_X, test_y = tsfel.datasets.load_ucihar(use_cache=True, data_modality=["body_gyro"])
    observed = (len(train_X), len(train_y), len(test_X), len(test_y), train_X[0].shape, test_X[0].shape)
    expected = (7352, 7352, 2947, 2947, (128, 3), (128, 3))
    np.testing.assert_equal(observed, expected)

@staticmethod
def calculate_sha256_of_ndarray(array: np.ndarray) -> str:
sha256_hash = hashlib.sha256()
Expand Down
2 changes: 1 addition & 1 deletion tsfel/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from tsfel.datasets._single_problem_loaders import load_biopluxecg
from tsfel.datasets._single_problem_loaders import load_biopluxecg, load_ucihar
102 changes: 82 additions & 20 deletions tsfel/datasets/_single_problem_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import os
import warnings
import zipfile
from pathlib import Path

Expand All @@ -15,6 +16,12 @@


def _download_dataset(url, cache_dir, filename):
warnings.warn("Cache folder is empty. Downloading the dataset...", UserWarning)
Path(os.path.join(cache_dir)).mkdir(
parents=True,
exist_ok=True,
)

try:
response = requests.get(
url,
Expand Down Expand Up @@ -59,11 +66,6 @@ def load_biopluxecg(use_cache=True) -> pd.Series:
cache_dir = os.path.join(CACHE_ROOT_DIR, "BioPluxECG")

if not os.path.exists(cache_dir) or not os.listdir(cache_dir) or not use_cache:
print("Cache folder is empty. Downloading the dataset...")
Path(os.path.join(cache_dir)).mkdir(
parents=True,
exist_ok=True,
)
_download_dataset(REF_URL, cache_dir, "biopluxecg.txt")

X = np.loadtxt(os.path.join(cache_dir, "biopluxecg.txt"))[1]
Expand All @@ -72,45 +74,105 @@ def load_biopluxecg(use_cache=True) -> pd.Series:
return X


def _get_uci_train_test_splits(dataset_dir: str, data_modality: list, split: str) -> tuple[list[pd.DataFrame], np.ndarray]:
    """Load the raw inertial-signal windows and labels for one UCI HAR split.

    Parameters
    ----------
    dataset_dir : str
        Directory that contains the extracted "UCI HAR Dataset" folder.
    data_modality : list
        Substrings selecting signal files (e.g. "body_gyro"). An empty list
        selects every available signal file.
    split : str
        Which split to read, either "train" or "test".

    Returns
    -------
    X : list of pd.DataFrame
        One DataFrame per window; one column per selected signal file.
    y : ndarray
        The activity labels of the split.

    Raises
    ------
    ValueError
        If no signal file in the split matches the requested modalities.
    """
    signals_dir = os.path.join(dataset_dir, "UCI HAR Dataset", split, "Inertial Signals")
    labels_path = os.path.join(dataset_dir, "UCI HAR Dataset", split, f"y_{split}.txt")
    _, _, filenames = next(os.walk(signals_dir), (None, None, []))

    # Strip the trailing "<split>.txt" so e.g. "body_acc_x_train.txt" -> "body_acc_x_".
    columns = [
        filename[: -len(f"{split}.txt")] if filename.endswith(f"{split}.txt") else filename for filename in filenames
    ]
    filtered_columns = [
        col for col in columns if not data_modality or any(substring in col for substring in data_modality)
    ]
    if not filtered_columns:
        # Fail with a clear message instead of an IndexError further below.
        raise ValueError(f"No inertial signal files matching {data_modality} were found in {signals_dir}.")

    # Load only the requested modalities to avoid reading files that are filtered out anyway.
    data = {col: np.loadtxt(os.path.join(signals_dir, f"{col}{split}.txt")) for col in filtered_columns}
    n_windows = data[filtered_columns[0]].shape[0]
    X = [pd.DataFrame({col: data[col][i] for col in filtered_columns}) for i in range(n_windows)]
    y = np.loadtxt(labels_path)

    return X, y


def get_uci_splits(cache_dir, data_modality):
    """Lazily yield the (X, y) pair for each UCI HAR split, train first, then test."""
    split_names = ("train", "test")
    return (_get_uci_train_test_splits(cache_dir, data_modality, name) for name in split_names)


def load_ucihar(
    use_cache=True,
    data_modality=None,
) -> tuple[list[pd.DataFrame], np.ndarray, list[pd.DataFrame], np.ndarray]:
    """Loads the Human Activity Recognition Using Smartphones dataset from the
    UC Irvine Machine Learning Repository [1]_. Retrieves the raw inertial data
    for both the training and test sets.

    Parameters
    ----------
    use_cache : bool, default=True
        If True, caches a local copy of the dataset in the user's home
        directory.
    data_modality : None or list of str, default=None
        If set to None, all available data modalities are loaded. Otherwise,
        only the specified modalities are loaded. The supported modalities
        are "body_acc", "body_gyro", and "total_acc".

    Returns
    -------
    X_train : list
        A list of DataFrames containing windows of multivariate time series
        from the training set. The number of channels (columns) in each
        DataFrame depends on `data_modality`.
    y_train : ndarray
        The corresponding labels for the training set.
    X_test : list
        A list of DataFrames containing windows of multivariate time series
        from the test set. The number of channels (columns) in each
        DataFrame depends on `data_modality`.
    y_test : ndarray
        The corresponding labels for the test set.

    Notes
    -----
    The signals are sampled at 50 Hz and divided into short fixed-size
    windows of 128 samples (2.56 s).

    .. versionadded:: 0.1.8

    Examples
    --------
    >>> from tsfel.datasets import load_ucihar
    >>> X_train, y_train, X_test, y_test = load_ucihar()

    References
    ----------
    .. [1] Anguita, D., et al. (2013). A Public Domain Dataset for Human
       Activity Recognition using Smartphones. The European Symposium on
       Artificial Neural Networks.
    """
    data_modality = [] if data_modality is None else data_modality
    cache_dir = os.path.join(CACHE_ROOT_DIR, "UCIHAR")
    REF_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip"

    if not isinstance(data_modality, list):
        raise ValueError("data_modality must be a list of strings.")

    if not all(
        any(element.startswith(prefix) for prefix in ["body_acc", "body_gyro", "total_acc"])
        for element in data_modality
    ):
        raise ValueError("Elements of the list should be `body_acc`, `body_gyro`, or `total_acc`")

    # Download and extract only when the cache is missing, empty, or bypassed;
    # _download_dataset creates the cache directory itself.
    if not os.path.exists(cache_dir) or not os.listdir(cache_dir) or not use_cache:
        _download_dataset(REF_URL, cache_dir, "ucihar.zip")

        zip_file_path = os.path.join(cache_dir, "ucihar.zip")
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall(cache_dir)
        # Keep only the extracted tree in the cache.
        os.remove(zip_file_path)

    (X_train, y_train), (X_test, y_test) = get_uci_splits(cache_dir, data_modality)

    return X_train, y_train, X_test, y_test

0 comments on commit fd15fd5

Please sign in to comment.