feat(python): lance write huggingface dataset directly #1882

Merged (7 commits) on Jan 30, 2024
16 changes: 16 additions & 0 deletions docs/integrations/huggingface.rst
@@ -0,0 +1,16 @@
HuggingFace
-----------

The Hugging Face Hub hosts a vast collection of pre-trained models and datasets.

Converting a Hugging Face dataset to a Lance dataset is straightforward:
Contributor:
Lance <3 HuggingFace

The HuggingFace Hub has become the go-to place for ML practitioners to find pre-trained models and useful datasets. HuggingFace datasets can be written directly into Lance format by using the lance.write_dataset method. You can write the entire dataset or a particular split. For example:


.. code-block:: python

    # Huggingface datasets
    import datasets
    import lance

    # Write the first 10 rows of the "train" split to a Lance dataset
    lance.write_dataset(
        datasets.load_dataset("poloclub/diffusiondb", split="train[:10]"),
        "diffusiondb_train.lance",
    )
1 change: 1 addition & 0 deletions docs/integrations/integrations.rst
@@ -3,4 +3,5 @@ Integrations

.. toctree::

Huggingface <./huggingface>
Tensorflow <./tensorflow>
17 changes: 6 additions & 11 deletions python/pyproject.toml
@@ -2,9 +2,7 @@
name = "pylance"
dependencies = ["pyarrow>=12", "numpy>=1.22"]
description = "python wrapper for Lance columnar format"
authors = [
{ name = "Lance Devs", email = "[email protected]" },
]
authors = [{ name = "Lance Devs", email = "[email protected]" }]
license = { file = "LICENSE" }
repository = "https://github.com/eto-ai/lance"
readme = "README.md"
@@ -48,20 +46,17 @@ build-backend = "maturin"

[project.optional-dependencies]
tests = [
"pandas",
"pytest",
"datasets",
"duckdb",
"ml_dtypes",
"pandas",
"polars[pyarrow,pandas]",
"pytest",
"tensorflow",
"tqdm",
]
benchmarks = [
"pytest-benchmark",
]
torch = [
"torch",
]
benchmarks = ["pytest-benchmark"]
torch = ["torch"]

[tool.ruff]
select = ["F", "E", "W", "I", "G", "TCH", "PERF", "CPY001"]
12 changes: 12 additions & 0 deletions python/python/lance/dataset.py
@@ -1960,6 +1960,7 @@ def write_dataset(
data_obj: Reader-like
The data to be written. Acceptable types are:
- Pandas DataFrame, Pyarrow Table, Dataset, Scanner, or RecordBatchReader
- Huggingface dataset
uri: str or Path
Where to write the dataset to (directory)
schema: Schema, optional
@@ -1988,6 +1989,17 @@
a custom class that defines hooks to be called when each fragment is
starting to write and finishing writing.
"""
try:
    # Huggingface datasets
    import datasets

    if isinstance(data_obj, datasets.Dataset):
        if schema is None:
            schema = data_obj.features.arrow_schema
        data_obj = data_obj.data.to_batches()
except ImportError:
    pass

Contributor:

is there a generic safe_import utility function in lance?

Contributor:

for the datasets that have embeddings, are they usually list or fsl (FixedSizeList)? do we need to check/cast or anything like that?

Contributor Author:

It is too smart at the lance level, though.
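The try/except pattern above can be illustrated in isolation. The sketch below (stdlib only; `maybe_convert` and `missing_optional_pkg` are hypothetical names, with the nonexistent module standing in for an uninstalled `datasets`) shows how the `except ImportError` branch silently leaves the input untouched:

```python
def maybe_convert(data_obj):
    """Mimics the dispatch above: try to import an optional dependency,
    convert recognized objects, and silently fall through otherwise."""
    try:
        # Stand-in for `import datasets`; this module does not exist,
        # so the ImportError branch below is always taken here.
        import missing_optional_pkg

        if isinstance(data_obj, missing_optional_pkg.Dataset):
            data_obj = list(data_obj)
    except ImportError:
        # Optional dependency is absent: data_obj passes through
        # unchanged, and any failure surfaces much later, far from here.
        pass
    return data_obj

print(maybe_convert([1, 2, 3]))  # the input comes back unchanged
```

This is exactly the failure mode discussed in the review: when `datasets` is missing, an unconverted HuggingFace object flows onward and the eventual error says nothing about the missing package.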
Contributor:

If it is a HF dataset but datasets is not installed, then the user will likely get a very cryptic error message, no?

Contributor Author:

Changed to use lazy loading.


Contributor:

Please make this import lazy. Right now it will import huggingface any time you call this function. It should only import if the object is from the huggingface datasets module.

For how we've done this for Polars and other modules, see: https://github.com/lancedb/lance/blob/main/python/python/lance/dependencies.py

Suggested change:

-try:
-    # Huggingface datasets
-    import datasets
-    if isinstance(data_obj, datasets.Dataset):
-        if schema is None:
-            schema = data_obj.features.arrow_schema
-        data_obj = data_obj.data.to_batches()
-except ImportError:
-    pass
+if _check_for_huggingface(data_obj):
+    # Huggingface datasets
+    import datasets
+    if isinstance(data_obj, datasets.Dataset):
+        if schema is None:
+            schema = data_obj.features.arrow_schema
+        data_obj = data_obj.data.to_batches()

Contributor Author:

Done

reader = _coerce_reader(data_obj, schema)
_validate_schema(reader.schema)
# TODO add support for passing in LanceDataset and LanceScanner here