diff --git a/docs/_static/blob.png b/docs/_static/blob.png
new file mode 100644
index 0000000000..74d31b964a
Binary files /dev/null and b/docs/_static/blob.png differ
diff --git a/docs/api/api.rst b/docs/api/api.rst
index 4a6667f10a..d2e0c33bc2 100644
--- a/docs/api/api.rst
+++ b/docs/api/api.rst
@@ -4,4 +4,4 @@ APIs
 .. toctree::
 
    Rust
-   Python <./python/modules>
+   Python <./python>
diff --git a/docs/api/python.rst b/docs/api/python.rst
new file mode 100644
index 0000000000..f450a7b4dc
--- /dev/null
+++ b/docs/api/python.rst
@@ -0,0 +1,66 @@
+Python APIs
+===========
+
+``Lance`` is a columnar format designed specifically for efficient
+multi-modal data processing.
+
+Lance Dataset
+-------------
+
+The core of Lance is the ``LanceDataset`` class. Users can open a dataset with
+:py:meth:`lance.dataset`.
+
+.. autofunction:: lance.dataset
+   :noindex:
+
+Basic IOs
+~~~~~~~~~
+
+The following functions are used to read and write data in Lance format.
+
+.. automethod:: lance.dataset.LanceDataset.insert
+   :noindex:
+.. automethod:: lance.dataset.LanceDataset.scanner
+   :noindex:
+.. automethod:: lance.dataset.LanceDataset.to_batches
+   :noindex:
+.. automethod:: lance.dataset.LanceDataset.to_table
+   :noindex:
+
+Random Access
+~~~~~~~~~~~~~
+
+Unlike most other columnar formats, Lance supports fast random access to
+individual rows.
+
+.. automethod:: lance.dataset.LanceDataset.take
+   :noindex:
+.. automethod:: lance.dataset.LanceDataset.take_blobs
+   :noindex:
+
+Schema Evolution
+~~~~~~~~~~~~~~~~
+
+Lance supports schema evolution, which means that you can add new columns to the
+dataset cheaply.
+
+.. automethod:: lance.dataset.LanceDataset.add_columns
+   :noindex:
+.. automethod:: lance.dataset.LanceDataset.drop_columns
+   :noindex:
+
+Indexing and Searching
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: lance.dataset.LanceDataset.create_index
+   :noindex:
+.. automethod:: lance.dataset.LanceDataset.scanner
+   :noindex:
+
+API Reference
+~~~~~~~~~~~~~
+
+More information can be found in the `Lance Python API documentation`_.
+
+.. _Lance Python API documentation: ./python/modules
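To make the sections above concrete, here is a minimal sketch of how these APIs fit together (the ``./example.lance`` path, column names, and values are illustrative only):

.. code-block:: python

   import lance
   import pyarrow as pa

   # Create a small dataset, then reopen it by URI.
   table = pa.table({"id": list(range(100)), "text": [f"row-{i}" for i in range(100)]})
   lance.write_dataset(table, "./example.lance")
   ds = lance.dataset("./example.lance")

   # Scan-based reads: column projection and filter pushdown.
   subset = ds.to_table(columns=["id"], filter="id > 90")

   # Random access: fetch specific rows without a full scan.
   rows = ds.take([0, 42, 99], columns=["text"])

   # Schema evolution: derive a new column from a SQL expression.
   ds.add_columns({"id_plus_one": "id + 1"})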
diff --git a/docs/blob.rst b/docs/blob.rst
new file mode 100644
index 0000000000..13c5dbd02c
--- /dev/null
+++ b/docs/blob.rst
@@ -0,0 +1,46 @@
+Blob As Files
+=============
+
+Unlike in other data formats, large multimodal data is a first-class citizen in the
+Lance columnar format. Lance provides a high-level API to store and retrieve large
+binary objects (blobs) in Lance datasets.
+
+.. image:: _static/blob.png
+   :scale: 50%
+
+Lance serves large binary data using :py:class:`lance.BlobFile`, a file-like object
+that reads large binary objects lazily.
+
+.. autoclass:: lance.BlobFile
+   :members:
+   :show-inheritance:
+   :noindex:
+
+To fetch blobs from a Lance dataset, use :py:meth:`lance.dataset.LanceDataset.take_blobs`.
+
+For example, it's easy to use ``BlobFile`` to extract frames from a video file without
+loading the entire video into memory.
+
+.. code-block:: python
+
+   # pip install av pylance
+   import av
+   import lance
+   from IPython.display import clear_output, display
+
+   ds = lance.dataset("./youtube.lance")
+   start_time, end_time = 500, 1000  # seconds
+   blobs = ds.take_blobs([5], "video")
+   with av.open(blobs[0]) as container:
+       stream = container.streams.video[0]
+       # Decode only key frames to skim the clip quickly.
+       stream.codec_context.skip_frame = "NONKEY"
+
+       # container.seek() expects an offset in stream.time_base units.
+       container.seek(int(start_time / stream.time_base), stream=stream)
+
+       for frame in container.decode(stream):
+           # frame.time is in seconds.
+           if frame.time > end_time:
+               break
+           display(frame.to_image())
+           clear_output(wait=True)
diff --git a/docs/index.rst b/docs/index.rst
index 28c96053ce..6d281be84f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -45,6 +45,7 @@ Preview releases receive the same level of testing as regular releases.
    ./read_and_write
    Lance Formats <./format>
    Arrays <./arrays>
+   Blob API <./blob>
    Integrations <./integrations/integrations>
    Performance Guide <./performance>
    API References <./api/api>
diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py
index b917e43b55..e7764d1815 100644
--- a/python/python/lance/__init__.py
+++ b/python/python/lance/__init__.py
@@ -68,7 +68,8 @@ def dataset(
     Parameters
     ----------
     uri : str
-        Address to the Lance dataset.
+        Address of the Lance dataset. It can be a local file path `/tmp/data.lance`,
+        or a cloud object store URI, e.g., `s3://bucket/data.lance`.
     version : optional, int | str
         If specified, load a specific version of the Lance dataset. Else, loads the
         latest version. A version number (`int`) or a tag (`str`) can be provided.
@@ -77,7 +78,7 @@ def dataset(
         argument value. If a version is already specified, this arg is ignored.
     block_size : optional, int
         Block size in bytes. Provide a hint for the size of the minimal I/O request.
-    commit_handler : optional, CommitLock
+    commit_handler : optional, lance.commit.CommitLock
         If specified, use the provided commit handler to lock the table while
         committing a new version. Not necessary on object stores other than S3
         or when there are no concurrent writers.
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index 8f0f6daf8a..0316847aa7 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -152,7 +152,7 @@ def when_not_matched_by_source_delete(self, expr: Optional[str] = None):
 
 
 class LanceDataset(pa.dataset.Dataset):
-    """A dataset in Lance format where the data is stored at the given uri."""
+    """A Lance Dataset where the data is stored at the given uri."""
 
     def __init__(
         self,
@@ -325,6 +325,7 @@ def scanner(
                     "nprobes": 1,
                     "refine_factor": 1
                 }
+
         batch_size: int, default None
             The max size of batches returned.
         io_buffer_size: int, default None
@@ -366,7 +367,7 @@ def scanner(
            If True, then all columns are late materialized.
            If False, then all columns are early materialized.
            If a list of strings, then only the columns in the list are
-           late materialized.
+           late materialized.
            The default uses a heuristic that assumes filters will select about 0.1%
            of the rows.  If your filter is more selective (e.g. find by id) you may
@@ -376,6 +377,7 @@ def scanner(
            query string to search for, the results will be ranked by BM25.
            e.g. "hello world", would match documents containing "hello" or "world".
            or a dictionary with the following keys:
+
            - columns: list[str]
                The columns to search,
                currently only supports a single column in the columns list.
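As a usage sketch for the ``full_text_query`` parameter documented above (the dataset path and ``text`` column are assumed, and the ``"INVERTED"`` scalar index is used on the assumption that your pylance version supports it):

.. code-block:: python

   import lance

   ds = lance.dataset("./docs.lance")  # assumed dataset with a "text" column

   # BM25-ranked full-text search requires an inverted index on the column.
   ds.create_scalar_index("text", "INVERTED")

   # String form: matches documents containing "hello" or "world".
   hits = ds.to_table(full_text_query="hello world")

   # Dictionary form: restrict the search to specific columns.
   hits = ds.to_table(full_text_query={"columns": ["text"], "query": "hello world"})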
@@ -389,6 +391,7 @@ def scanner(
         -----
         For now, if BOTH filter and nearest is specified, then:
+
         1. nearest is executed first.
         2. The results are filtered afterwards.
@@ -506,7 +509,7 @@ def to_table(
         late_materialization: Optional[bool | List[str]] = None,
         use_scalar_index: Optional[bool] = None,
     ) -> pa.Table:
-        """Read the data into memory as a pyarrow Table.
+        """Read the data into memory as a :py:class:`pyarrow.Table`.
 
         Parameters
         ----------
@@ -567,6 +570,7 @@ def to_table(
            query string to search for, the results will be ranked by BM25.
            e.g. "hello world", would match documents contains "hello" or "world".
            or a dictionary with the following keys:
+
            - columns: list[str]
                The columns to search,
                currently only supports a single column in the columns list.
@@ -576,6 +580,7 @@ def to_table(
         Notes
         -----
         If BOTH filter and nearest is specified, then:
+
         1. nearest is executed first.
         2. The results are filtered afterward, unless pre-filter sets to True.
         """
@@ -734,11 +739,11 @@ def take(
             Or a dictionary of column names to SQL expressions.
             All columns are fetched if None or unspecified.
         **kwargs : dict, optional
-            See scanner() method for full parameter description.
+            See the :py:meth:`scanner` method for a full parameter description.
 
         Returns
         -------
-        table : Table
+        table : pyarrow.Table
         """
         columns_with_transform = None
         if isinstance(columns, dict):
@@ -787,7 +792,11 @@ def take_blobs(
         blob_column: str,
     ) -> List[BlobFile]:
         """
-        Select blobs by row_ids.
+        Select blobs by row IDs.
+
+        Instead of loading large binary blob data into memory before processing it,
+        this API allows you to open binary blob data as a regular Python file-like
+        object. For more details, see :py:class:`lance.BlobFile`.
 
         Parameters
         ----------
@@ -1612,15 +1621,19 @@ def create_index(
             Replace the existing index if it exists.
         num_partitions : int, optional
             The number of partitions of IVF (Inverted File Index).
-        ivf_centroids : ``np.ndarray``, ``pyarrow.FixedSizeListArray``
-            or ``pyarrow.FixedShapeTensorArray``. Optional.
-            A ``num_partitions x dimension`` array of K-mean centroids for IVF
-            clustering. If not provided, a new Kmean model will be trained.
-        pq_codebook : ``np.ndarray``, ``pyarrow.FixedSizeListArray``
-            or ``pyarrow.FixedShapeTensorArray``. Optional.
+        ivf_centroids : optional
+            It can be either :py:class:`np.ndarray`,
+            :py:class:`pyarrow.FixedSizeListArray`, or
+            :py:class:`pyarrow.FixedShapeTensorArray`.
+            A ``num_partitions x dimension`` array of existing K-means centroids
+            for IVF clustering. If not provided, a new KMeans model will be trained.
+        pq_codebook : optional
+            It can be :py:class:`np.ndarray`, :py:class:`pyarrow.FixedSizeListArray`,
+            or :py:class:`pyarrow.FixedShapeTensorArray`.
             A ``num_sub_vectors x (2 ^ nbits * dimensions // num_sub_vectors)``
             array of K-mean centroids for PQ codebook.
+
+            Note: ``nbits`` is always 8 for now.
+
             If not provided, a new PQ model will be trained.
         num_sub_vectors : int, optional
             The number of sub-vectors for PQ (Product Quantization).
@@ -1654,7 +1667,9 @@ def create_index(
         kwargs :
             Parameters passed to the index building process.
+
+
         The SQ (Scalar Quantization) is available for only ``IVF_HNSW_SQ`` index type,
         this quantization method is used to reduce the memory usage of the index,
         it maps the float vectors to integer vectors, each integer is of ``num_bits``,
         now only 8 bits are supported.
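A sketch of building the ``IVF_HNSW_SQ`` index described above, passing the HNSW parameters through ``kwargs`` (the dataset path, column name, and parameter values are illustrative):

.. code-block:: python

   import lance

   ds = lance.dataset("./vectors.lance")  # assumed dataset with a "vector" column

   # IVF partitioning plus per-partition HNSW graphs, with 8-bit scalar
   # quantization to reduce the index's memory footprint.
   ds.create_index(
       "vector",
       index_type="IVF_HNSW_SQ",
       metric="cosine",
       num_partitions=256,    # IVF partitions
       max_level=7,           # maximum number of levels in the HNSW graph
       m=20,                  # edges per node
       ef_construction=150,   # nodes examined during construction
   )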
@@ -1665,20 +1680,21 @@ def create_index(
         If ``index_type`` is with "PQ", then the following parameters are required:
             num_sub_vectors
 
-        Optional parameters for "IVF_PQ":
-            ivf_centroids :
-                K-mean centroids for IVF clustering.
-            num_bits : int, optional
+        Optional parameters for ``IVF_PQ``:
+
+            - ivf_centroids
+                Existing K-means centroids for IVF clustering.
+            - num_bits
                 The number of bits for PQ (Product Quantization). Default is 8.
                 Only 4, 8 are supported.
 
-        Optional parameters for "IVF_HNSW_*":
-            max_level : int
-                the maximum number of levels in the graph.
-            m : int
-                the number of edges per node in the graph.
-            ef_construction : int
-                the number of nodes to examine during the construction.
+        Optional parameters for ``IVF_HNSW_*``:
+
+            - max_level
+                The maximum number of levels in the graph (int).
+            - m
+                The number of edges per node in the graph (int).
+            - ef_construction
+                The number of nodes to examine during construction (int).
 
         Examples
         --------
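For completeness, a sketch of the ``IVF_PQ`` path and a nearest-neighbor query against the resulting index (paths, dimensions, and parameter values are assumptions):

.. code-block:: python

   import lance
   import numpy as np

   ds = lance.dataset("./vectors.lance")  # assumed 128-dimensional "vector" column

   # IVF_PQ: coarse IVF partitioning plus product quantization.
   # num_sub_vectors must evenly divide the vector dimension.
   ds.create_index(
       "vector",
       index_type="IVF_PQ",
       num_partitions=256,
       num_sub_vectors=16,
   )

   # Approximate nearest-neighbor search via the `nearest` parameter.
   q = np.random.rand(128).astype(np.float32)
   results = ds.to_table(
       nearest={"column": "vector", "q": q, "k": 10, "nprobes": 20, "refine_factor": 5}
   )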