Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add python bindings for the v2 reader/writer #2158

Merged
merged 9 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ half = { version = "2.3", default-features = false, features = [
"num-traits",
"std",
] }
lance = { path = "../rust/lance", features = ["tensorflow", "dynamodb", "substrait"] }
lance = { path = "../rust/lance", features = [
"tensorflow",
"dynamodb",
"substrait",
] }
lance-arrow = { path = "../rust/lance-arrow" }
lance-core = { path = "../rust/lance-core" }
lance-datagen = { path = "../rust/lance-datagen", optional = true }
Expand All @@ -41,11 +45,14 @@ pyo3 = { version = "0.20", features = ["extension-module", "abi3-py38"] }
tokio = { version = "1.23", features = ["rt-multi-thread"] }
uuid = "1.3.0"
serde_json = "1"
serde = "1.0.197"
serde_yaml = "0.9.34"
num_cpus = "1"
snafu = "0.7.4"
tracing-chrome = "0.7.1"
tracing-subscriber = "0.3.17"
tracing = "0.1.37"
url = "2.5.0"

# Prevent dynamic linking of lzma, which comes from datafusion
lzma-sys = { version = "*", features = ["static"] }
Expand Down
6 changes: 3 additions & 3 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ keywords = [
"data-science",
"machine-learning",
"arrow",
"data-analytics"
"data-analytics",
]
categories = [
"database-implementations",
"data-structures",
"development-tools",
"science"
"science",
]
classifiers = [
"Development Status :: 3 - Alpha",
Expand Down Expand Up @@ -66,7 +66,7 @@ torch = ["torch"]
lint.select = ["F", "E", "W", "I", "G", "TCH", "PERF", "CPY001", "B019"]

[tool.ruff.lint.per-file-ignores]
"*.pyi" = ["E302"]
"*.pyi" = ["E301", "E302"]

[tool.mypy]
python_version = "3.11"
Expand Down
188 changes: 188 additions & 0 deletions python/python/lance/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Copyright (c) 2024. Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union

import pyarrow as pa

from .lance import (
LanceBufferDescriptor,
LanceColumnMetadata,
LanceFileMetadata,
LancePageMetadata,
)
from .lance import (
LanceFileReader as _LanceFileReader,
)
from .lance import (
LanceFileWriter as _LanceFileWriter,
)


class ReaderResults:
    """
    Adapter over Lance's internal read results.

    Wraps the RecordBatchReader produced by the native reader and exposes
    it in the output formats callers usually want (stream of batches, or
    a fully materialized pyarrow Table).
    """

    def __init__(self, reader: pa.RecordBatchReader):
        """
        Wrap *reader*; not intended to be constructed by users directly.
        """
        self.reader = reader

    def to_table(self) -> pa.Table:
        """
        Materialize every batch into a single pyarrow Table
        """
        return self.reader.read_all()

    def to_batches(self) -> pa.RecordBatchReader:
        """
        Expose the underlying pyarrow RecordBatchReader
        """
        return self.reader


class LanceFileReader:
    """
    Low level reader for individual Lance data files.

    Lance data files are a structure optimized for storing multi-modal
    tabular data.  Most users should work with whole Lance datasets via
    the LanceDataset class instead; this reader is mainly for
    benchmarking and advanced use cases.
    """

    # TODO: make schema optional
    def __init__(self, path: str, schema: pa.Schema):
        """
        Open the given file for reading

        Parameters
        ----------

        path: str
            The path to read, can be a pathname for local storage
            or a URI to read from cloud storage.
        schema: pa.Schema
            The desired projection schema
        """
        self._reader = _LanceFileReader(path, schema)

    def metadata(self) -> LanceFileMetadata:
        """
        Return metadata describing the file contents
        """
        return self._reader.metadata()

    def read_all(self, *, batch_size: int = 1024) -> ReaderResults:
        """
        Read every row in the file

        Parameters
        ----------
        batch_size: int, default 1024
            Rows are delivered in batches of this many rows (the final
            batch may be smaller).  Smaller batches use less memory but
            add a little per-batch overhead.
        """
        return ReaderResults(self._reader.read_all(batch_size))

    def read_range(
        self, start: int, num_rows: int, *, batch_size: int = 1024
    ) -> ReaderResults:
        """
        Read a contiguous slice of rows from the file

        Parameters
        ----------
        start: int
            The offset of the first row to start reading
        num_rows: int
            The number of rows to read from the file
        batch_size: int, default 1024
            Rows are delivered in batches of this many rows (the final
            batch may be smaller).  Smaller batches use less memory but
            add a little per-batch overhead.
        """
        return ReaderResults(self._reader.read_range(start, num_rows, batch_size))


class LanceFileWriter:
    """
    Low level writer for producing Lance data files.

    Lance data files are a structure optimized for storing multi-modal
    tabular data.  Most users should write whole Lance datasets via the
    LanceDataset class instead.
    """

    def __init__(self, path: str, schema: pa.Schema, **kwargs):
        """
        Create a new LanceFileWriter to write to the given path

        Parameters
        ----------
        path: str
            The path to write to. Can be a pathname for local storage
            or a URI for remote storage.
        schema: pa.Schema
            The schema of data that will be written
        """
        self._writer = _LanceFileWriter(path, schema, **kwargs)
        self.closed = False

    def write_batch(self, batch: Union[pa.RecordBatch, pa.Table]) -> None:
        """
        Write a batch of data to the file

        Parameters
        ----------
        batch: Union[pa.RecordBatch, pa.Table]
            The data to write to the file
        """
        if isinstance(batch, pa.Table):
            # A table may hold several record batches; write each in turn.
            for record_batch in batch.to_batches():
                self._writer.write_batch(record_batch)
        else:
            self._writer.write_batch(batch)

    def close(self) -> None:
        """
        Write the file metadata and close the file (safe to call twice)
        """
        if not self.closed:
            # Mark closed before finishing so a repeated close is a no-op.
            self.closed = True
            self._writer.finish()

    def __enter__(self) -> "LanceFileWriter":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()


# Public API of this module (also controls `from lance.file import *`).
__all__ = [
    "LanceFileReader",
    "LanceFileWriter",
    "LanceFileMetadata",
    "LanceColumnMetadata",
    "LancePageMetadata",
    "LanceBufferDescriptor",
]
33 changes: 33 additions & 0 deletions python/python/lance/lance/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,36 @@ class CompactionMetrics:
fragments_added: int
files_removed: int
files_added: int

class LanceFileWriter:
    # Native (Rust) writer backing lance.file.LanceFileWriter.
    # NOTE: lance/file.py forwards arbitrary **kwargs to this constructor
    # (`_LanceFileWriter(path, schema, **kwargs)`), so the stub must accept
    # them too or type checkers will reject valid calls.
    def __init__(self, path: str, schema: pa.Schema, **kwargs): ...
    def write_batch(self, batch: pa.RecordBatch) -> None: ...
    # Flushes file metadata and finalizes the file.
    def finish(self) -> None: ...

class LanceFileReader:
    # Native (Rust) reader backing lance.file.LanceFileReader.
    def __init__(self, path: str, schema: pa.Schema): ...
    # batch_size: rows per emitted batch (the final batch may be smaller).
    def read_all(self, batch_size: int) -> pa.RecordBatchReader: ...
    # Reads num_rows rows starting at offset `start`.
    def read_range(
        self, start: int, num_rows: int, batch_size: int
    ) -> pa.RecordBatchReader: ...

class LanceBufferDescriptor:
    # Byte offset of the buffer within the file.
    position: int
    # Size of the buffer in bytes.
    size: int

class LancePageMetadata:
    # Buffers holding this page's data.
    buffers: List[LanceBufferDescriptor]
    # Description of the encoding used for this page.
    encoding: str

class LanceColumnMetadata:
    # Column-level buffers (as opposed to per-page buffers).
    column_buffers: List[LanceBufferDescriptor]
    # Metadata for each page of the column.
    pages: List[LancePageMetadata]

class LanceFileMetadata:
    # Schema of the data stored in the file.
    schema: pa.Schema
    # Total number of rows in the file.
    num_rows: int
    # Total bytes of row data.
    num_data_bytes: int
    # Total bytes of per-column metadata.
    num_column_metadata_bytes: int
    # Total bytes of file-level (global) buffers.
    num_global_buffer_bytes: int
    # File-level buffers.
    global_buffers: List[LanceBufferDescriptor]
    # Per-column metadata.
    columns: List[LanceColumnMetadata]
90 changes: 90 additions & 0 deletions python/python/tests/test_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Copyright (c) 2024. Lance Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pyarrow as pa
from lance.file import LanceFileReader, LanceFileWriter


def test_file_writer(tmp_path):
    """Writing one batch then reading metadata reports the row count."""
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(pa.table({"a": [1, 2, 3]}))
    metadata = LanceFileReader(str(dest), schema).metadata()
    assert metadata.num_rows == 3


def test_aborted_write(tmp_path):
    """Dropping a writer without closing it must not leave a file behind."""
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    writer = LanceFileWriter(str(dest), schema)
    writer.write_batch(pa.table({"a": [1, 2, 3]}))
    del writer
    assert not dest.exists()


def test_multiple_close(tmp_path):
    """Calling close() twice is a no-op, not an error."""
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    writer = LanceFileWriter(str(dest), schema)
    writer.write_batch(pa.table({"a": [1, 2, 3]}))
    writer.close()
    writer.close()


def test_round_trip(tmp_path):
    """Data written to a file is read back unchanged."""
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    table = pa.table({"a": [1, 2, 3]})
    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(table)
    reader = LanceFileReader(str(dest), schema)
    assert reader.read_all().to_table() == table

    # TODO: Currently fails, need to fix reader
    # result = reader.read_range(1, 1).to_table()
    # assert result == pa.table({"a": [2]})

    # TODO: Test reading invalid ranges
    # TODO: Test invalid batch sizes


def test_metadata(tmp_path):
    """File metadata exposes schema, sizes, and per-column/page layout."""
    dest = tmp_path / "foo.lance"
    schema = pa.schema([pa.field("a", pa.int64())])
    table = pa.table({"a": [1, 2, 3]})
    with LanceFileWriter(str(dest), schema) as writer:
        writer.write_batch(table)
    metadata = LanceFileReader(str(dest), schema).metadata()

    assert metadata.schema == schema
    assert metadata.num_rows == 3
    assert metadata.num_global_buffer_bytes > 0
    assert metadata.num_column_metadata_bytes > 0
    # 3 rows of int64 = 24 bytes of data
    assert metadata.num_data_bytes == 24
    assert len(metadata.columns) == 1

    only_column = metadata.columns[0]
    assert len(only_column.column_buffers) == 0
    assert len(only_column.pages) == 1

    only_page = only_column.pages[0]
    assert len(only_page.buffers) == 1
    assert only_page.buffers[0].position == 0
    assert only_page.buffers[0].size == 24

    assert len(only_page.encoding) > 0
2 changes: 1 addition & 1 deletion python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::str;
use std::sync::Arc;

use arrow::ffi_stream::ArrowArrayStreamReader;
use arrow::pyarrow::{ToPyArrow, *};
use arrow::pyarrow::*;
use arrow_array::{Float32Array, RecordBatch, RecordBatchReader};
use arrow_data::ArrayData;
use arrow_schema::{DataType, Schema as ArrowSchema};
Expand Down
Loading
Loading