pyg-team · qychen2001 · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -56,6 +56,7 @@ full=[
     "lightgbm",
     "datasets",
     "torchmetrics",
+    "openml",
 ]
 
 [project.urls]

diff --git a/test/datasets/test_data_frame_openml.py b/test/datasets/test_data_frame_openml.py
@@ -0,0 +1,20 @@
+import pytest
+
+from torch_frame.datasets import OpenMLDataset
+from torch_frame.typing import TaskType
+
+
+@pytest.mark.parametrize("dataset_id", [8, 31, 455])
+def test_data_frame_openml(dataset_id):
+    """Check task type, class count and target column per OpenML id."""
+    # dataset_id -> (task type, number of classes or None, target column)
+    expected = {
+        8: (TaskType.REGRESSION, None, "drinks"),
+        31: (TaskType.BINARY_CLASSIFICATION, 2, "class"),
+        455: (TaskType.MULTICLASS_CLASSIFICATION, 3, "origin"),
+    }
+    task_type, num_classes, target_col = expected[dataset_id]
+    dataset = OpenMLDataset(dataset_id)
+    assert dataset.task_type == task_type
+    assert dataset.target_col == target_col
+    assert num_classes is None or dataset.num_classes == num_classes
diff --git a/torch_frame/datasets/__init__.py b/torch_frame/datasets/__init__.py
@@ -19,6 +19,7 @@
 from .amazon_fine_food_reviews import AmazonFineFoodReviews
 from .diamond_images import DiamondImages
 from .huggingface_dataset import HuggingFaceDatasetDict
+from .openml_dataset import OpenMLDataset
 
 real_world_datasets = [
     'Titanic',
@@ -38,6 +39,7 @@
     'Movielens1M',
     'AmazonFineFoodReviews',
     'DiamondImages',
+    'OpenMLDataset',
 ]
 
 synthetic_datasets = [

diff --git a/torch_frame/datasets/openml_dataset.py b/torch_frame/datasets/openml_dataset.py
@@ -0,0 +1,92 @@
+import os
+from typing import Optional
+
+import openml
+import pandas as pd
+
+import torch_frame
+from torch_frame import stype
+from torch_frame.utils.infer_stype import infer_series_stype
+
+
+class OpenMLDataset(torch_frame.data.Dataset):
+    r"""A dataset loaded from OpenML (https://www.openml.org/) by its ID.
+
+    Args:
+        dataset_id (int): The ID of the dataset to be loaded from OpenML.
+        cache_dir (str, optional): The directory where the dataset is
+            cached. If None, the default cache directory is used.
+
+    Raises:
+        ValueError: If the dataset has no default target attribute, or if
+            its "NumberOfClasses" quality is inconsistent with the target.
+    """
+    def __init__(self, dataset_id: int, cache_dir: Optional[str] = None):
+        if cache_dir is not None:
+            openml.config.set_root_cache_directory(
+                os.path.expanduser(cache_dir))
+        self.dataset_id = dataset_id
+        self._openml_dataset = openml.datasets.get_dataset(
+            self.dataset_id,
+            download_data=True,
+            download_qualities=True,
+            download_features_meta_data=True,
+        )
+        # Get dataset info from OpenML
+        self.dataset_info = self._openml_dataset.qualities
+        target_col = self._openml_dataset.default_target_attribute
+        if target_col is None:
+            raise ValueError(f"Dataset {dataset_id} has no default target.")
+        X, y, self.categorical_indicator, _ = self._openml_dataset.get_data(
+            target=target_col)
+        df = pd.concat([X, y], axis=1)
+
+        # The column type can be inferred from the categorical_indicator
+        col_to_stype = {
+            col: stype.categorical if is_cat else stype.numerical
+            for col, is_cat in zip(X.columns, self.categorical_indicator)
+        }
+
+        # Validate with explicit raises: asserts are stripped by `python -O`
+        num_classes = self.dataset_info["NumberOfClasses"]
+        if infer_series_stype(df[target_col]) == torch_frame.categorical:
+            n_unique = df[target_col].nunique()
+            # `not (x > 0)` also rejects a NaN quality value
+            if not num_classes > 0 or n_unique != num_classes:
+                raise ValueError(
+                    f"Categorical target '{target_col}' has {n_unique} "
+                    f"unique values but NumberOfClasses={num_classes}.")
+            self._task_type: torch_frame.TaskType = (
+                torch_frame.TaskType.BINARY_CLASSIFICATION if num_classes == 2
+                else torch_frame.TaskType.MULTICLASS_CLASSIFICATION)
+            self._num_classes: int = int(num_classes)
+            col_to_stype[target_col] = torch_frame.categorical
+        else:
+            if num_classes != 0:
+                raise ValueError(f"Numerical target '{target_col}' but "
+                                 f"NumberOfClasses={num_classes}.")
+            self._task_type = torch_frame.TaskType.REGRESSION
+            self._num_classes = 0
+            col_to_stype[target_col] = torch_frame.numerical
+
+        super().__init__(df=df, col_to_stype=col_to_stype,
+                         target_col=target_col)
+
+    # NOTE: Overriding the `task_type` and `num_classes` properties
+    @property
+    def task_type(self) -> torch_frame.TaskType:
+        """Returns the task type of the dataset.
+
+        Returns:
+            torch_frame.TaskType: The task type of the dataset.
+        """
+        return self._task_type
+
+    @property
+    def num_classes(self) -> int:
+        """Returns the number of classes in the dataset.
+
+        Returns:
+            int: The number of classes in the dataset.
+        """
+        return self._num_classes