Skip to content

Commit

Permalink
Add python bindings for reading and writing lance v2 files
Browse files Browse the repository at this point in the history
  • Loading branch information
westonpace committed Apr 5, 2024
1 parent 4c6cb7d commit 137c004
Show file tree
Hide file tree
Showing 11 changed files with 825 additions and 44 deletions.
3 changes: 3 additions & 0 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,14 @@ pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
tokio = { version = "1.23", features = ["rt-multi-thread"] }
uuid = "1.3.0"
serde_json = "1"
serde = "1.0.197"
serde_yaml = "0.9.34"
num_cpus = "1"
snafu = "0.7.4"
tracing-chrome = "0.7.1"
tracing-subscriber = "0.3.17"
tracing = "0.1.37"
url = "2.5.0"

# Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] }
Expand Down
8 changes: 4 additions & 4 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ keywords = [
"data-science",
"machine-learning",
"arrow",
"data-analytics"
"data-analytics",
]
categories = [
"database-implementations",
"data-structures",
"development-tools",
"science"
"science",
]
classifiers = [
"Development Status :: 3 - Alpha",
Expand Down Expand Up @@ -48,7 +48,7 @@ build-backend = "maturin"
[project.optional-dependencies]
tests = [
"datasets",
"duckdb; python_version<'3.12'", # TODO: remove when duckdb supports 3.12
"duckdb; python_version<'3.12'", # TODO: remove when duckdb supports 3.12
"ml_dtypes",
"pillow",
"pandas",
Expand All @@ -65,7 +65,7 @@ torch = ["torch"]
lint.select = ["F", "E", "W", "I", "G", "TCH", "PERF", "CPY001", "B019"]

[tool.ruff.lint.per-file-ignores]
"*.pyi" = ["E302"]
"*.pyi" = ["E301", "E302"]

[tool.mypy]
python_version = "3.11"
Expand Down
188 changes: 188 additions & 0 deletions python/python/lance/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Copyright (c) 2023. Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union

import pyarrow as pa

from .lance import (
LanceBufferDescriptor,
LanceColumnMetadata,
LanceFileMetadata,
LancePageMetadata,
)
from .lance import (
LanceFileReader as _LanceFileReader,
)
from .lance import (
LanceFileWriter as _LanceFileWriter,
)


class ReaderResults:
    """
    Adapter over Lance's internal read output (a RecordBatchReader),
    letting the caller pick the final representation (streaming batches
    or a materialized pyarrow Table).
    """

    def __init__(self, reader: pa.RecordBatchReader):
        """
        Internal constructor; not intended to be called by users.
        """
        self.reader = reader

    def to_table(self) -> pa.Table:
        """
        Materialize all batches and return them as a pyarrow Table.
        """
        return self.reader.read_all()

    def to_batches(self) -> pa.RecordBatchReader:
        """
        Return the underlying pyarrow RecordBatchReader for streaming use.
        """
        return self.reader


class LanceFileReader:
    """
    Reader for a single Lance data file.

    Lance data files are a low-level container optimized for multi-modal
    tabular data.  Most users should work with LanceDataset instead; this
    class is for direct, file-level access.
    """

    # TODO: make schema optional
    def __init__(self, path: str, schema: pa.Schema):
        """
        Open ``path`` for reading.

        Parameters
        ----------
        path: str
            Local pathname or cloud-storage URI of the file to read.
        schema: pa.Schema
            The projection schema to apply while reading.
        """
        self._reader = _LanceFileReader(path, schema)

    def read_all(self, *, batch_size: int = 1024) -> ReaderResults:
        """
        Read every row in the file.

        Parameters
        ----------
        batch_size: int, default 1024
            Rows per emitted batch (the final batch may be smaller).
            Smaller batches lower memory use at the cost of a little
            extra per-batch overhead.
        """
        return ReaderResults(self._reader.read_all(batch_size))

    def read_range(
        self, start: int, num_rows: int, *, batch_size: int = 1024
    ) -> ReaderResults:
        """
        Read a contiguous range of rows.

        Parameters
        ----------
        start: int
            Offset of the first row to read.
        num_rows: int
            How many rows to read, beginning at ``start``.
        batch_size: int, default 1024
            Rows per emitted batch (the final batch may be smaller).
            Smaller batches lower memory use at the cost of a little
            extra per-batch overhead.
        """
        return ReaderResults(self._reader.read_range(start, num_rows, batch_size))

    def metadata(self) -> LanceFileMetadata:
        """
        Return the file-level metadata describing this file's contents.
        """
        return self._reader.metadata()


class LanceFileWriter:
    """
    A file writer for writing Lance data files

    This class is used to write Lance data files, a low level structure
    optimized for storing multi-modal tabular data. If you are working with
    Lance datasets then you should use the LanceDataset class instead.
    """

    def __init__(self, path: str, schema: pa.Schema, **kwargs):
        """
        Create a new LanceFileWriter to write to the given path

        Parameters
        ----------
        path: str
            The path to write to. Can be a pathname for local storage
            or a URI for remote storage.
        schema: pa.Schema
            The schema of data that will be written
        **kwargs
            Additional options forwarded to the underlying native writer.
        """
        self._writer = _LanceFileWriter(path, schema, **kwargs)
        # Tracks whether finish() has been called so close() is idempotent
        self.closed = False

    def write_batch(self, batch: Union[pa.RecordBatch, pa.Table]) -> None:
        """
        Write a batch of data to the file

        Parameters
        ----------
        batch: Union[pa.RecordBatch, pa.Table]
            The data to write to the file
        """
        if isinstance(batch, pa.Table):
            # A Table may span multiple record batches; write each one.
            # (use a distinct loop name so the parameter isn't shadowed)
            for record_batch in batch.to_batches():
                self._writer.write_batch(record_batch)
        else:
            self._writer.write_batch(batch)

    def close(self) -> None:
        """
        Write the file metadata and close the file

        Safe to call more than once; only the first call finishes the file.
        """
        if self.closed:
            return
        self.closed = True
        self._writer.finish()

    def __enter__(self) -> "LanceFileWriter":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Always finalize the file, even if the with-block raised
        self.close()


# Public API of this module.  The metadata classes are re-exported from the
# native extension so callers can import them from lance.file directly.
__all__ = [
    "LanceFileReader",
    "LanceFileWriter",
    "LanceFileMetadata",
    "LanceColumnMetadata",
    "LancePageMetadata",
    "LanceBufferDescriptor",
]
33 changes: 33 additions & 0 deletions python/python/lance/lance/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,36 @@ class CompactionMetrics:
fragments_added: int
files_removed: int
files_added: int

# Native writer for Lance data files.  The Python wrapper
# (lance.file.LanceFileWriter) forwards extra keyword options through to
# this constructor, so the stub must accept **kwargs to match.
class LanceFileWriter:
    def __init__(self, path: str, schema: pa.Schema, **kwargs): ...
    def write_batch(self, batch: pa.RecordBatch) -> None: ...
    def finish(self) -> None: ...

# Native reader for Lance data files.  The Python wrapper
# (lance.file.LanceFileReader) also calls metadata(), so the stub
# declares it to keep type checking accurate.
class LanceFileReader:
    def __init__(self, path: str, schema: pa.Schema): ...
    def read_all(self, batch_size: int) -> pa.RecordBatchReader: ...
    def read_range(
        self, start: int, num_rows: int, batch_size: int
    ) -> pa.RecordBatchReader: ...
    def metadata(self) -> "LanceFileMetadata": ...

# Describes a single buffer within a Lance file.
class LanceBufferDescriptor:
    # Byte offset of the buffer within the file
    position: int
    # Size of the buffer in bytes
    size: int

# Metadata for one page of a column in a Lance file.
class LancePageMetadata:
    # Buffers that make up this page
    buffers: List[LanceBufferDescriptor]
    # Description of the encoding used for this page
    encoding: str

# Metadata for one column in a Lance file.
class LanceColumnMetadata:
    # Column-level buffers (not tied to any single page)
    column_buffers: List[LanceBufferDescriptor]
    # Per-page metadata for this column
    pages: List[LancePageMetadata]

# File-level metadata describing the contents of a Lance file.
class LanceFileMetadata:
    # Schema of the data stored in the file
    schema: pa.Schema
    # Total number of rows in the file
    num_rows: int
    # Total bytes of data (page buffers)
    num_data_bytes: int
    # Total bytes used by column metadata
    num_column_metadata_bytes: int
    # Total bytes used by file-wide (global) buffers
    num_global_buffer_bytes: int
    # Descriptors for the file-wide buffers
    global_buffers: List[LanceBufferDescriptor]
    # Per-column metadata
    columns: List[LanceColumnMetadata]
70 changes: 70 additions & 0 deletions python/python/tests/test_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright (c) 2023. Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pyarrow as pa
from lance.file import LanceFileReader, LanceFileWriter


def test_file_writer(tmp_path):
    # Writing a single batch through the context manager should
    # produce a non-empty file on disk.
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(pa.table({"a": [1, 2, 3]}))
    assert len(dest.read_bytes()) > 0


def test_round_trip(tmp_path):
    # Data written by LanceFileWriter should read back unchanged.
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    expected = pa.table({"a": [1, 2, 3]})

    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(expected)

    reader = LanceFileReader(str(dest), schema)
    assert reader.read_all().to_table() == expected

    # TODO: Currently fails, need to fix reader
    # result = reader.read_range(1, 1).to_table()
    # assert result == pa.table({"a": [2]})

    # TODO: Test reading invalid ranges
    # TODO: Test invalid batch sizes


def test_metadata(tmp_path):
    # Verify the file/column/page metadata reported after writing a
    # small int64 table (3 rows * 8 bytes = 24 data bytes).
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    table = pa.table({"a": [1, 2, 3]})

    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(table)

    metadata = LanceFileReader(str(dest), schema).metadata()

    # File-level checks
    assert metadata.schema == schema
    assert metadata.num_rows == 3
    assert metadata.num_global_buffer_bytes > 0
    assert metadata.num_column_metadata_bytes > 0
    assert metadata.num_data_bytes == 24
    assert len(metadata.columns) == 1

    # Single column, no column-level buffers, one page
    (column,) = metadata.columns
    assert len(column.column_buffers) == 0
    assert len(column.pages) == 1

    # That page holds the full 24 bytes in one buffer at offset 0
    (page,) = column.pages
    assert len(page.buffers) == 1
    assert page.buffers[0].position == 0
    assert page.buffers[0].size == 24

    assert len(page.encoding) > 0
Loading

0 comments on commit 137c004

Please sign in to comment.