From 61acc614d75e8d6324a7c8988459319de91617ab Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 18:37:50 -0800
Subject: [PATCH 1/7] add docs

---
 docs/integrations/integrations.rst |  1 +
 python/pyproject.toml              | 17 ++++++-----------
 python/python/lance/dataset.py     | 13 +++++++++++++
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/docs/integrations/integrations.rst b/docs/integrations/integrations.rst
index c6e85e7d63..a84ca4f9b8 100644
--- a/docs/integrations/integrations.rst
+++ b/docs/integrations/integrations.rst
@@ -3,4 +3,5 @@ Integrations
 
 .. toctree::
 
+   Huggingface <./huggingface>
    Tensorflow <./tensorflow>
\ No newline at end of file
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 6e2842d265..4c2861488c 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -2,9 +2,7 @@
 name = "pylance"
 dependencies = ["pyarrow>=12", "numpy>=1.22"]
 description = "python wrapper for Lance columnar format"
-authors = [
-    { name = "Lance Devs", email = "dev@lancedb.com" },
-]
+authors = [{ name = "Lance Devs", email = "dev@lancedb.com" }]
 license = { file = "LICENSE" }
 repository = "https://github.com/eto-ai/lance"
 readme = "README.md"
@@ -48,20 +46,17 @@ build-backend = "maturin"
 
 [project.optional-dependencies]
 tests = [
-    "pandas",
-    "pytest",
+    "datasets",
     "duckdb",
     "ml_dtypes",
+    "pandas",
     "polars[pyarrow,pandas]",
+    "pytest",
     "tensorflow",
     "tqdm",
 ]
-benchmarks = [
-    "pytest-benchmark",
-]
-torch = [
-    "torch",
-]
+benchmarks = ["pytest-benchmark"]
+torch = ["torch"]
 
 [tool.ruff]
 select = ["F", "E", "W", "I", "G", "TCH", "PERF", "CPY001"]
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index 520d37ae53..481f50ee0f 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -1960,6 +1960,7 @@ def write_dataset(
     data_obj: Reader-like
         The data to be written. Acceptable types are:
         - Pandas DataFrame, Pyarrow Table, Dataset, Scanner, or RecordBatchReader
+        - Huggingface dataset
     uri: str or Path
         Where to write the dataset to (directory)
     schema: Schema, optional
@@ -1988,6 +1989,18 @@ def write_dataset(
         a custom class that defines hooks to be called when each fragment is
         starting to write and finishing writing.
""" + try: + # Huggingface datasets + import datasets + + if isinstance(data_obj, datasets.Dataset): + if schema is None: + schema = data_obj.features.arrow_schema + data_obj = data_obj.data.to_batches() + except ImportError: + pass + print("Schema is: ", schema) + reader = _coerce_reader(data_obj, schema) _validate_schema(reader.schema) # TODO add support for passing in LanceDataset and LanceScanner here From 8b49cbf9772865d5fa803d5c7f80b17cbc57b4ff Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Mon, 29 Jan 2024 18:48:09 -0800 Subject: [PATCH 2/7] remove prints --- python/python/lance/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 481f50ee0f..76a0f03a08 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -1999,7 +1999,6 @@ def write_dataset( data_obj = data_obj.data.to_batches() except ImportError: pass - print("Schema is: ", schema) reader = _coerce_reader(data_obj, schema) _validate_schema(reader.schema) From 9ddacb0b65cee27834b10f3cbbf480609b59956c Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Mon, 29 Jan 2024 18:52:57 -0800 Subject: [PATCH 3/7] add missing huggingface rst --- docs/integrations/huggingface.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/integrations/huggingface.rst diff --git a/docs/integrations/huggingface.rst b/docs/integrations/huggingface.rst new file mode 100644 index 0000000000..d165707094 --- /dev/null +++ b/docs/integrations/huggingface.rst @@ -0,0 +1,16 @@ +HuggingFace +----------- + +The Hugging Face Hub has a great amount of pre-trained models and datasets available. + +It is easy to convert a Huggingface dataset to Lance dataset: + +.. code-block:: python + + # Huggingface datasets + import datasets + import lance + + lance.write(datasets.load_dataset( + "poloclub/diffusiondb", split="train[:10]" + ), "diffusiondb_train.lance") \ No newline at end of file From 995aab2da3ef0677f4fc0037a70762fff3a80e25 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Mon, 29 Jan 2024 19:01:47 -0800 Subject: [PATCH 4/7] use lazy loading --- python/python/lance/dataset.py | 13 ++++++----- python/python/lance/dependencies.py | 11 +++++++++ python/python/tests/test_huggingface.py | 30 +++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 python/python/tests/test_huggingface.py diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 76a0f03a08..9ad59721dd 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -42,7 +42,12 @@ import pyarrow.dataset from pyarrow import RecordBatch, Schema -from .dependencies import _check_for_numpy, _check_for_pandas, torch +from .dependencies import ( + _check_for_hugging_face, + _check_for_numpy, + _check_for_pandas, + torch, +) from .dependencies import numpy as np from .dependencies import pandas as pd from .fragment import FragmentMetadata, LanceFragment @@ -1989,16 +1994,14 @@ def write_dataset( a custom class that defines hooks to be called when each fragment is starting to write and finishing writing. 
""" - try: + if _check_for_hugging_face(data_obj): # Huggingface datasets - import datasets + from .dependencies import datasets if isinstance(data_obj, datasets.Dataset): if schema is None: schema = data_obj.features.arrow_schema data_obj = data_obj.data.to_batches() - except ImportError: - pass reader = _coerce_reader(data_obj, schema) _validate_schema(reader.schema) diff --git a/python/python/lance/dependencies.py b/python/python/lance/dependencies.py index f020d7a921..13c8c2156a 100644 --- a/python/python/lance/dependencies.py +++ b/python/python/lance/dependencies.py @@ -34,6 +34,7 @@ _PANDAS_AVAILABLE = True _POLARS_AVAILABLE = True _TORCH_AVAILABLE = True +_HUGGING_FACE_AVAILABLE = True class _LazyModule(ModuleType): @@ -164,6 +165,7 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]: if TYPE_CHECKING: + import datasets import numpy import pandas import polars @@ -174,6 +176,7 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]: pandas, _PANDAS_AVAILABLE = _lazy_import("pandas") polars, _POLARS_AVAILABLE = _lazy_import("polars") torch, _TORCH_AVAILABLE = _lazy_import("torch") + datasets, _HUGGING_FACE_AVAILABLE = _lazy_import("datasets") @lru_cache(maxsize=None) @@ -210,6 +213,12 @@ def _check_for_torch(obj: Any, *, check_type: bool = True) -> bool: ) +def _check_for_hugging_face(obj: Any, *, check_type: bool = True) -> bool: + return _HUGGING_FACE_AVAILABLE and _might_be( + cast(Hashable, type(obj) if check_type else obj), "datasets" + ) + + __all__ = [ # lazy-load third party libs "numpy", @@ -221,10 +230,12 @@ def _check_for_torch(obj: Any, *, check_type: bool = True) -> bool: "_check_for_pandas", "_check_for_polars", "_check_for_torch", + "_check_for_hugging_face", "_LazyModule", # exported flags/guards "_NUMPY_AVAILABLE", "_PANDAS_AVAILABLE", "_POLARS_AVAILABLE", "_TORCH_AVAILABLE", + "_HUGGING_FACE_AVAILABLE", ] diff --git a/python/python/tests/test_huggingface.py b/python/python/tests/test_huggingface.py new file mode 100644 index 0000000000..693465381d --- /dev/null +++ b/python/python/tests/test_huggingface.py @@ -0,0 +1,30 @@ +# Copyright 2023 Lance Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from pathlib import Path
+
+import lance
+import pytest
+
+datasets = pytest.importorskip("datasets")
+
+
+def test_write_hf_dataset(tmp_path: Path):
+    hf_ds = datasets.load_dataset(
+        "poloclub/diffusiondb", name="2m_first_1k", split="train[:50]"
+    )
+
+    ds = lance.write_dataset(hf_ds, tmp_path)
+    assert ds.count_rows() == 50
+
+    assert ds.schema == hf_ds.features.arrow_schema

From 24fddc466e1e119b8c7e1394ee03634e61012ab8 Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 23:01:27 -0800
Subject: [PATCH 5/7] doc

---
 docs/integrations/huggingface.rst | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/docs/integrations/huggingface.rst b/docs/integrations/huggingface.rst
index d165707094..f70edbef71 100644
--- a/docs/integrations/huggingface.rst
+++ b/docs/integrations/huggingface.rst
@@ -1,9 +1,11 @@
-HuggingFace
------------
+Lance ❤️ HuggingFace
+--------------------
 
-The Hugging Face Hub has a great amount of pre-trained models and datasets available.
+The HuggingFace Hub has become the go-to place for ML practitioners to find pre-trained models and useful datasets.
+
+HuggingFace datasets can be written directly into Lance format by using the
+:meth:`lance.write_dataset` method. You can write the entire dataset or a particular split. For example:
 
-It is easy to convert a Huggingface dataset to Lance dataset:
 
 .. code-block:: python
 
@@ -11,6 +13,6 @@ It is easy to convert a Huggingface dataset to Lance dataset:
     import datasets
     import lance
 
-    lance.write(datasets.load_dataset(
+    lance.write_dataset(datasets.load_dataset(
         "poloclub/diffusiondb", split="train[:10]"
     ), "diffusiondb_train.lance")
\ No newline at end of file

From e794b67d80b61a54da6c7b184a6e1e70e5abbebc Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 23:08:02 -0800
Subject: [PATCH 6/7] s

---
 docs/integrations/huggingface.rst       | 2 +-
 python/python/tests/test_huggingface.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/integrations/huggingface.rst b/docs/integrations/huggingface.rst
index f70edbef71..892e56be77 100644
--- a/docs/integrations/huggingface.rst
+++ b/docs/integrations/huggingface.rst
@@ -14,5 +14,5 @@ HuggingFace datasets can be written directly into Lance format by using the
     import lance
 
     lance.write_dataset(datasets.load_dataset(
-        "poloclub/diffusiondb", split="train[:10]"
+        "poloclub/diffusiondb", split="train[:10]",
     ), "diffusiondb_train.lance")
\ No newline at end of file
diff --git a/python/python/tests/test_huggingface.py b/python/python/tests/test_huggingface.py
index 693465381d..fd7a64167f 100644
--- a/python/python/tests/test_huggingface.py
+++ b/python/python/tests/test_huggingface.py
@@ -21,7 +21,10 @@
 def test_write_hf_dataset(tmp_path: Path):
     hf_ds = datasets.load_dataset(
-        "poloclub/diffusiondb", name="2m_first_1k", split="train[:50]"
+        "poloclub/diffusiondb",
+        name="2m_first_1k",
+        split="train[:50]",
+        trust_remote_code=True,
     )
 
     ds = lance.write_dataset(hf_ds, tmp_path)
     assert ds.count_rows() == 50

From 07f701e593fab5c2f815c89cf589984aaa0ffe23 Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 23:14:50 -0800
Subject: [PATCH 7/7] pillow

---
 python/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 4c2861488c..1282fcb51e 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -49,6 +49,7 @@ tests = [
     "datasets",
     "duckdb",
     "ml_dtypes",
+    "pillow",
     "pandas",
     "polars[pyarrow,pandas]",
     "pytest",
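
Taken together, the seven patches let ``lance.write_dataset`` accept a HuggingFace ``datasets.Dataset`` directly: the object is detected through the lazily imported ``datasets`` dependency check, and its ``features.arrow_schema`` is used when no schema is passed. Below is a minimal end-to-end sketch of the resulting workflow, assuming a pylance build that includes these changes plus the ``datasets`` package; the dataset and split mirror the docs and tests, while the output path is illustrative.

.. code-block:: python

    import datasets
    import lance

    # Load a small split, mirroring the split used in the docs and tests.
    hf_ds = datasets.load_dataset(
        "poloclub/diffusiondb",
        name="2m_first_1k",
        split="train[:50]",
        trust_remote_code=True,
    )

    # write_dataset recognizes the HuggingFace dataset via the lazy
    # dependency check and falls back to hf_ds.features.arrow_schema
    # when no schema is passed explicitly.
    ds = lance.write_dataset(hf_ds, "diffusiondb_train.lance")

    print(ds.count_rows())  # 50
    print(ds.schema)        # matches hf_ds.features.arrow_schema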