From 61acc614d75e8d6324a7c8988459319de91617ab Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 18:37:50 -0800
Subject: [PATCH 1/7] add docs

---
 docs/integrations/integrations.rst |  1 +
 python/pyproject.toml              | 17 ++++++-----------
 python/python/lance/dataset.py     | 13 +++++++++++++
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/docs/integrations/integrations.rst b/docs/integrations/integrations.rst
index c6e85e7d63..a84ca4f9b8 100644
--- a/docs/integrations/integrations.rst
+++ b/docs/integrations/integrations.rst
@@ -3,4 +3,5 @@ Integrations
 
 .. toctree::
 
+   Huggingface <./huggingface>
    Tensorflow <./tensorflow>
\ No newline at end of file
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 6e2842d265..4c2861488c 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -2,9 +2,7 @@
 name = "pylance"
 dependencies = ["pyarrow>=12", "numpy>=1.22"]
 description = "python wrapper for Lance columnar format"
-authors = [
-    { name = "Lance Devs", email = "dev@lancedb.com" },
-]
+authors = [{ name = "Lance Devs", email = "dev@lancedb.com" }]
 license = { file = "LICENSE" }
 repository = "https://github.com/eto-ai/lance"
 readme = "README.md"
@@ -48,20 +46,17 @@ build-backend = "maturin"
 
 [project.optional-dependencies]
 tests = [
-    "pandas",
-    "pytest",
+    "datasets",
     "duckdb",
     "ml_dtypes",
+    "pandas",
     "polars[pyarrow,pandas]",
+    "pytest",
     "tensorflow",
     "tqdm",
 ]
-benchmarks = [
-    "pytest-benchmark",
-]
-torch = [
-    "torch",
-]
+benchmarks = ["pytest-benchmark"]
+torch = ["torch"]
 
 [tool.ruff]
 select = ["F", "E", "W", "I", "G", "TCH", "PERF", "CPY001"]
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index 520d37ae53..481f50ee0f 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -1960,6 +1960,7 @@ def write_dataset(
     data_obj: Reader-like
         The data to be written. Acceptable types are:
         - Pandas DataFrame, Pyarrow Table, Dataset, Scanner, or RecordBatchReader
+        - Huggingface dataset
     uri: str or Path
         Where to write the dataset to (directory)
     schema: Schema, optional
@@ -1988,6 +1989,18 @@ def write_dataset(
         a custom class that defines hooks to be called when each fragment is
         starting to write and finishing writing.
""" + try: + # Huggingface datasets + import datasets + + if isinstance(data_obj, datasets.Dataset): + if schema is None: + schema = data_obj.features.arrow_schema + data_obj = data_obj.data.to_batches() + except ImportError: + pass + print("Schema is: ", schema) + reader = _coerce_reader(data_obj, schema) _validate_schema(reader.schema) # TODO add support for passing in LanceDataset and LanceScanner here From 8b49cbf9772865d5fa803d5c7f80b17cbc57b4ff Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Mon, 29 Jan 2024 18:48:09 -0800 Subject: [PATCH 2/7] remove prints --- python/python/lance/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 481f50ee0f..76a0f03a08 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -1999,7 +1999,6 @@ def write_dataset( data_obj = data_obj.data.to_batches() except ImportError: pass - print("Schema is: ", schema) reader = _coerce_reader(data_obj, schema) _validate_schema(reader.schema) From 9ddacb0b65cee27834b10f3cbbf480609b59956c Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Mon, 29 Jan 2024 18:52:57 -0800 Subject: [PATCH 3/7] add missing huggingface rst --- docs/integrations/huggingface.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/integrations/huggingface.rst diff --git a/docs/integrations/huggingface.rst b/docs/integrations/huggingface.rst new file mode 100644 index 0000000000..d165707094 --- /dev/null +++ b/docs/integrations/huggingface.rst @@ -0,0 +1,16 @@ +HuggingFace +----------- + +The Hugging Face Hub has a great amount of pre-trained models and datasets available. + +It is easy to convert a Huggingface dataset to Lance dataset: + +.. code-block:: python + + # Huggingface datasets + import datasets + import lance + + lance.write(datasets.load_dataset( + "poloclub/diffusiondb", split="train[:10]" + ), "diffusiondb_train.lance") \ No newline at end of file From 995aab2da3ef0677f4fc0037a70762fff3a80e25 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Mon, 29 Jan 2024 19:01:47 -0800 Subject: [PATCH 4/7] use lazy loading --- python/python/lance/dataset.py | 13 ++++++----- python/python/lance/dependencies.py | 11 +++++++++ python/python/tests/test_huggingface.py | 30 +++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 python/python/tests/test_huggingface.py diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 76a0f03a08..9ad59721dd 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -42,7 +42,12 @@ import pyarrow.dataset from pyarrow import RecordBatch, Schema -from .dependencies import _check_for_numpy, _check_for_pandas, torch +from .dependencies import ( + _check_for_hugging_face, + _check_for_numpy, + _check_for_pandas, + torch, +) from .dependencies import numpy as np from .dependencies import pandas as pd from .fragment import FragmentMetadata, LanceFragment @@ -1989,16 +1994,14 @@ def write_dataset( a custom class that defines hooks to be called when each fragment is starting to write and finishing writing. 
""" - try: + if _check_for_hugging_face(data_obj): # Huggingface datasets - import datasets + from .dependencies import datasets if isinstance(data_obj, datasets.Dataset): if schema is None: schema = data_obj.features.arrow_schema data_obj = data_obj.data.to_batches() - except ImportError: - pass reader = _coerce_reader(data_obj, schema) _validate_schema(reader.schema) diff --git a/python/python/lance/dependencies.py b/python/python/lance/dependencies.py index f020d7a921..13c8c2156a 100644 --- a/python/python/lance/dependencies.py +++ b/python/python/lance/dependencies.py @@ -34,6 +34,7 @@ _PANDAS_AVAILABLE = True _POLARS_AVAILABLE = True _TORCH_AVAILABLE = True +_HUGGING_FACE_AVAILABLE = True class _LazyModule(ModuleType): @@ -164,6 +165,7 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]: if TYPE_CHECKING: + import datasets import numpy import pandas import polars @@ -174,6 +176,7 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]: pandas, _PANDAS_AVAILABLE = _lazy_import("pandas") polars, _POLARS_AVAILABLE = _lazy_import("polars") torch, _TORCH_AVAILABLE = _lazy_import("torch") + datasets, _HUGGING_FACE_AVAILABLE = _lazy_import("datasets") @lru_cache(maxsize=None) @@ -210,6 +213,12 @@ def _check_for_torch(obj: Any, *, check_type: bool = True) -> bool: ) +def _check_for_hugging_face(obj: Any, *, check_type: bool = True) -> bool: + return _HUGGING_FACE_AVAILABLE and _might_be( + cast(Hashable, type(obj) if check_type else obj), "datasets" + ) + + __all__ = [ # lazy-load third party libs "numpy", @@ -221,10 +230,12 @@ def _check_for_torch(obj: Any, *, check_type: bool = True) -> bool: "_check_for_pandas", "_check_for_polars", "_check_for_torch", + "_check_for_hugging_face", "_LazyModule", # exported flags/guards "_NUMPY_AVAILABLE", "_PANDAS_AVAILABLE", "_POLARS_AVAILABLE", "_TORCH_AVAILABLE", + "_HUGGING_FACE_AVAILABLE", ] diff --git a/python/python/tests/test_huggingface.py b/python/python/tests/test_huggingface.py new file mode 100644 index 0000000000..693465381d --- /dev/null +++ b/python/python/tests/test_huggingface.py @@ -0,0 +1,30 @@ +# Copyright 2023 Lance Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from pathlib import Path
+
+import lance
+import pytest
+
+datasets = pytest.importorskip("datasets")
+
+
+def test_write_hf_dataset(tmp_path: Path):
+    hf_ds = datasets.load_dataset(
+        "poloclub/diffusiondb", name="2m_first_1k", split="train[:50]"
+    )
+
+    ds = lance.write_dataset(hf_ds, tmp_path)
+    assert ds.count_rows() == 50
+
+    assert ds.schema == hf_ds.features.arrow_schema

From 24fddc466e1e119b8c7e1394ee03634e61012ab8 Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 23:01:27 -0800
Subject: [PATCH 5/7] doc

---
 docs/integrations/huggingface.rst | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/docs/integrations/huggingface.rst b/docs/integrations/huggingface.rst
index d165707094..f70edbef71 100644
--- a/docs/integrations/huggingface.rst
+++ b/docs/integrations/huggingface.rst
@@ -1,9 +1,11 @@
-HuggingFace
------------
+Lance ❤️ HuggingFace
+--------------------
 
-The Hugging Face Hub has a great amount of pre-trained models and datasets available.
+The HuggingFace Hub has become the go-to place for ML practitioners to find pre-trained models and useful datasets.
+
+HuggingFace datasets can be written directly into Lance format by using the
+:meth:`lance.write_dataset` method. You can write the entire dataset or a particular split. For example:
 
-It is easy to convert a Huggingface dataset to Lance dataset:
 
 .. code-block:: python
 
@@ -11,6 +13,6 @@ It is easy to convert a Huggingface dataset to Lance dataset:
     import datasets
     import lance
 
-    lance.write(datasets.load_dataset(
+    lance.write_dataset(datasets.load_dataset(
         "poloclub/diffusiondb", split="train[:10]"
     ), "diffusiondb_train.lance")
\ No newline at end of file

From e794b67d80b61a54da6c7b184a6e1e70e5abbebc Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 23:08:02 -0800
Subject: [PATCH 6/7] s

---
 docs/integrations/huggingface.rst       | 2 +-
 python/python/tests/test_huggingface.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/integrations/huggingface.rst b/docs/integrations/huggingface.rst
index f70edbef71..892e56be77 100644
--- a/docs/integrations/huggingface.rst
+++ b/docs/integrations/huggingface.rst
@@ -14,5 +14,5 @@ HuggingFace datasets can be written directly into Lance format by using the
     import lance
 
     lance.write_dataset(datasets.load_dataset(
-        "poloclub/diffusiondb", split="train[:10]"
+        "poloclub/diffusiondb", split="train[:10]",
     ), "diffusiondb_train.lance")
\ No newline at end of file
diff --git a/python/python/tests/test_huggingface.py b/python/python/tests/test_huggingface.py
index 693465381d..fd7a64167f 100644
--- a/python/python/tests/test_huggingface.py
+++ b/python/python/tests/test_huggingface.py
@@ -21,7 +21,10 @@
 def test_write_hf_dataset(tmp_path: Path):
     hf_ds = datasets.load_dataset(
-        "poloclub/diffusiondb", name="2m_first_1k", split="train[:50]"
+        "poloclub/diffusiondb",
+        name="2m_first_1k",
+        split="train[:50]",
+        trust_remote_code=True,
     )
 
     ds = lance.write_dataset(hf_ds, tmp_path)
     assert ds.count_rows() == 50

From 07f701e593fab5c2f815c89cf589984aaa0ffe23 Mon Sep 17 00:00:00 2001
From: Lei Xu
Date: Mon, 29 Jan 2024 23:14:50 -0800
Subject: [PATCH 7/7] pillow

---
 python/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 4c2861488c..1282fcb51e 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -49,6 +49,7 @@ tests = [
     "datasets",
     "duckdb",
     "ml_dtypes",
+    "pillow",
     "pandas",
     "polars[pyarrow,pandas]",
     "pytest",
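
Taken together, the seven patches let ``lance.write_dataset`` accept a HuggingFace ``datasets.Dataset`` directly: the object is detected through the lazily imported ``datasets`` dependency check, and its ``features.arrow_schema`` is used when no schema is passed. Below is a minimal end-to-end sketch of the resulting workflow, assuming a pylance build that includes these changes plus the ``datasets`` package; the dataset and split mirror the docs and tests, while the output path is illustrative.

.. code-block:: python

    import datasets
    import lance

    # Load a small split, mirroring the split used in the docs and tests.
    hf_ds = datasets.load_dataset(
        "poloclub/diffusiondb",
        name="2m_first_1k",
        split="train[:50]",
        trust_remote_code=True,
    )

    # write_dataset recognizes the HuggingFace dataset via the lazy
    # dependency check and falls back to hf_ds.features.arrow_schema
    # when no schema is passed explicitly.
    ds = lance.write_dataset(hf_ds, "diffusiondb_train.lance")

    print(ds.count_rows())  # 50
    print(ds.schema)        # matches hf_ds.features.arrow_schema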