Skip to content

Commit

Permalink
Add python bindings for reading and writing lance v2 files
Browse files Browse the repository at this point in the history
  • Loading branch information
westonpace committed Apr 5, 2024
1 parent 4c6cb7d commit 137c004
Show file tree
Hide file tree
Showing 11 changed files with 825 additions and 44 deletions.
3 changes: 3 additions & 0 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,14 @@ pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
tokio = { version = "1.23", features = ["rt-multi-thread"] }
uuid = "1.3.0"
serde_json = "1"
serde = "1.0.197"
serde_yaml = "0.9.34"
num_cpus = "1"
snafu = "0.7.4"
tracing-chrome = "0.7.1"
tracing-subscriber = "0.3.17"
tracing = "0.1.37"
url = "2.5.0"

# Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] }
Expand Down
8 changes: 4 additions & 4 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ keywords = [
"data-science",
"machine-learning",
"arrow",
"data-analytics"
"data-analytics",
]
categories = [
"database-implementations",
"data-structures",
"development-tools",
"science"
"science",
]
classifiers = [
"Development Status :: 3 - Alpha",
Expand Down Expand Up @@ -48,7 +48,7 @@ build-backend = "maturin"
[project.optional-dependencies]
tests = [
"datasets",
"duckdb; python_version<'3.12'", # TODO: remove when duckdb supports 3.12
"duckdb; python_version<'3.12'", # TODO: remove when duckdb supports 3.12
"ml_dtypes",
"pillow",
"pandas",
Expand All @@ -65,7 +65,7 @@ torch = ["torch"]
lint.select = ["F", "E", "W", "I", "G", "TCH", "PERF", "CPY001", "B019"]

[tool.ruff.lint.per-file-ignores]
"*.pyi" = ["E302"]
"*.pyi" = ["E301", "E302"]

[tool.mypy]
python_version = "3.11"
Expand Down
188 changes: 188 additions & 0 deletions python/python/lance/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Copyright (c) 2023. Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union

import pyarrow as pa

from .lance import (
LanceBufferDescriptor,
LanceColumnMetadata,
LanceFileMetadata,
LancePageMetadata,
)
from .lance import (
LanceFileReader as _LanceFileReader,
)
from .lance import (
LanceFileWriter as _LanceFileWriter,
)


class ReaderResults:
    """
    Adapter over Lance's internal read output (a RecordBatchReader),
    letting the caller pick the final representation (streaming batches
    or a materialized pyarrow Table).
    """

    def __init__(self, reader: pa.RecordBatchReader):
        """
        Internal constructor; not intended to be called by users.
        """
        self.reader = reader

    def to_table(self) -> pa.Table:
        """
        Materialize all batches and return them as a pyarrow Table.
        """
        return self.reader.read_all()

    def to_batches(self) -> pa.RecordBatchReader:
        """
        Return the underlying pyarrow RecordBatchReader for streaming use.
        """
        return self.reader


class LanceFileReader:
    """
    Reader for a single Lance data file.

    Lance data files are a low-level container optimized for multi-modal
    tabular data.  Most users should work with LanceDataset instead; this
    class is for direct, file-level access.
    """

    # TODO: make schema optional
    def __init__(self, path: str, schema: pa.Schema):
        """
        Open ``path`` for reading.

        Parameters
        ----------
        path: str
            Local pathname or cloud-storage URI of the file to read.
        schema: pa.Schema
            The projection schema to apply while reading.
        """
        self._reader = _LanceFileReader(path, schema)

    def read_all(self, *, batch_size: int = 1024) -> ReaderResults:
        """
        Read every row in the file.

        Parameters
        ----------
        batch_size: int, default 1024
            Rows per emitted batch (the final batch may be smaller).
            Smaller batches lower memory use at the cost of a little
            extra per-batch overhead.
        """
        return ReaderResults(self._reader.read_all(batch_size))

    def read_range(
        self, start: int, num_rows: int, *, batch_size: int = 1024
    ) -> ReaderResults:
        """
        Read a contiguous range of rows.

        Parameters
        ----------
        start: int
            Offset of the first row to read.
        num_rows: int
            How many rows to read, beginning at ``start``.
        batch_size: int, default 1024
            Rows per emitted batch (the final batch may be smaller).
            Smaller batches lower memory use at the cost of a little
            extra per-batch overhead.
        """
        return ReaderResults(self._reader.read_range(start, num_rows, batch_size))

    def metadata(self) -> LanceFileMetadata:
        """
        Return the file-level metadata describing this file's contents.
        """
        return self._reader.metadata()


class LanceFileWriter:
    """
    A file writer for writing Lance data files

    This class is used to write Lance data files, a low level structure
    optimized for storing multi-modal tabular data. If you are working with
    Lance datasets then you should use the LanceDataset class instead.
    """

    def __init__(self, path: str, schema: pa.Schema, **kwargs):
        """
        Create a new LanceFileWriter to write to the given path

        Parameters
        ----------
        path: str
            The path to write to. Can be a pathname for local storage
            or a URI for remote storage.
        schema: pa.Schema
            The schema of data that will be written
        **kwargs
            Additional options forwarded to the underlying native writer.
        """
        self._writer = _LanceFileWriter(path, schema, **kwargs)
        # Tracks whether finish() has been called so close() is idempotent
        self.closed = False

    def write_batch(self, batch: Union[pa.RecordBatch, pa.Table]) -> None:
        """
        Write a batch of data to the file

        Parameters
        ----------
        batch: Union[pa.RecordBatch, pa.Table]
            The data to write to the file
        """
        if isinstance(batch, pa.Table):
            # A Table may span multiple record batches; write each one.
            # (use a distinct loop name so the parameter isn't shadowed)
            for record_batch in batch.to_batches():
                self._writer.write_batch(record_batch)
        else:
            self._writer.write_batch(batch)

    def close(self) -> None:
        """
        Write the file metadata and close the file

        Safe to call more than once; only the first call finishes the file.
        """
        if self.closed:
            return
        self.closed = True
        self._writer.finish()

    def __enter__(self) -> "LanceFileWriter":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Always finalize the file, even if the with-block raised
        self.close()


# Public API of this module.  The metadata classes are re-exported from the
# native extension so callers can import them from lance.file directly.
__all__ = [
    "LanceFileReader",
    "LanceFileWriter",
    "LanceFileMetadata",
    "LanceColumnMetadata",
    "LancePageMetadata",
    "LanceBufferDescriptor",
]
33 changes: 33 additions & 0 deletions python/python/lance/lance/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,36 @@ class CompactionMetrics:
fragments_added: int
files_removed: int
files_added: int

# Native writer for Lance data files.  The Python wrapper
# (lance.file.LanceFileWriter) forwards extra keyword options through to
# this constructor, so the stub must accept **kwargs to match.
class LanceFileWriter:
    def __init__(self, path: str, schema: pa.Schema, **kwargs): ...
    def write_batch(self, batch: pa.RecordBatch) -> None: ...
    def finish(self) -> None: ...

# Native reader for Lance data files.  The Python wrapper
# (lance.file.LanceFileReader) also calls metadata(), so the stub
# declares it to keep type checking accurate.
class LanceFileReader:
    def __init__(self, path: str, schema: pa.Schema): ...
    def read_all(self, batch_size: int) -> pa.RecordBatchReader: ...
    def read_range(
        self, start: int, num_rows: int, batch_size: int
    ) -> pa.RecordBatchReader: ...
    def metadata(self) -> "LanceFileMetadata": ...

# Describes a single buffer within a Lance file.
class LanceBufferDescriptor:
    # Byte offset of the buffer within the file
    position: int
    # Size of the buffer in bytes
    size: int

# Metadata for one page of a column in a Lance file.
class LancePageMetadata:
    # Buffers that make up this page
    buffers: List[LanceBufferDescriptor]
    # Description of the encoding used for this page
    encoding: str

# Metadata for one column in a Lance file.
class LanceColumnMetadata:
    # Column-level buffers (not tied to any single page)
    column_buffers: List[LanceBufferDescriptor]
    # Per-page metadata for this column
    pages: List[LancePageMetadata]

# File-level metadata describing the contents of a Lance file.
class LanceFileMetadata:
    # Schema of the data stored in the file
    schema: pa.Schema
    # Total number of rows in the file
    num_rows: int
    # Total bytes of data (page buffers)
    num_data_bytes: int
    # Total bytes used by column metadata
    num_column_metadata_bytes: int
    # Total bytes used by file-wide (global) buffers
    num_global_buffer_bytes: int
    # Descriptors for the file-wide buffers
    global_buffers: List[LanceBufferDescriptor]
    # Per-column metadata
    columns: List[LanceColumnMetadata]
70 changes: 70 additions & 0 deletions python/python/tests/test_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright (c) 2023. Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pyarrow as pa
from lance.file import LanceFileReader, LanceFileWriter


def test_file_writer(tmp_path):
    # Writing a single batch through the context manager should
    # produce a non-empty file on disk.
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(pa.table({"a": [1, 2, 3]}))
    assert len(dest.read_bytes()) > 0


def test_round_trip(tmp_path):
    # Data written by LanceFileWriter should read back unchanged.
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    expected = pa.table({"a": [1, 2, 3]})

    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(expected)

    reader = LanceFileReader(str(dest), schema)
    assert reader.read_all().to_table() == expected

    # TODO: Currently fails, need to fix reader
    # result = reader.read_range(1, 1).to_table()
    # assert result == pa.table({"a": [2]})

    # TODO: Test reading invalid ranges
    # TODO: Test invalid batch sizes


def test_metadata(tmp_path):
    # Verify the file/column/page metadata reported after writing a
    # small int64 table (3 rows * 8 bytes = 24 data bytes).
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    table = pa.table({"a": [1, 2, 3]})

    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(table)

    metadata = LanceFileReader(str(dest), schema).metadata()

    # File-level checks
    assert metadata.schema == schema
    assert metadata.num_rows == 3
    assert metadata.num_global_buffer_bytes > 0
    assert metadata.num_column_metadata_bytes > 0
    assert metadata.num_data_bytes == 24
    assert len(metadata.columns) == 1

    # Single column, no column-level buffers, one page
    (column,) = metadata.columns
    assert len(column.column_buffers) == 0
    assert len(column.pages) == 1

    # That page holds the full 24 bytes in one buffer at offset 0
    (page,) = column.pages
    assert len(page.buffers) == 1
    assert page.buffers[0].position == 0
    assert page.buffers[0].size == 24

    assert len(page.encoding) > 0
Loading

0 comments on commit 137c004

Please sign in to comment.