docs: blob api documents (#3247)
Closes #3160
eddyxu authored Dec 14, 2024
1 parent 6203435 commit 83b8efd
Showing 7 changed files with 158 additions and 28 deletions.
Binary file added docs/_static/blob.png
2 changes: 1 addition & 1 deletion docs/api/api.rst
@@ -4,4 +4,4 @@ APIs
.. toctree::

Rust <https://docs.rs/crate/lance/latest>
Python <./python/modules>
Python <./python.rst>
66 changes: 66 additions & 0 deletions docs/api/python.rst
@@ -0,0 +1,66 @@
Python APIs
===========

``Lance`` is a columnar format that is specifically designed for efficient
multi-modal data processing.

Lance Dataset
-------------

The core of Lance is the ``LanceDataset`` class. Users can open a dataset with
:py:meth:`lance.dataset`.

.. autofunction:: lance.dataset
   :noindex:
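
For instance, a minimal sketch of opening a dataset (the path ``/tmp/data.lance``
is a hypothetical example):

.. code-block:: python

    import lance

    # Open the latest version of the dataset.
    ds = lance.dataset("/tmp/data.lance")

    # Or pin a specific version (or tag) of the dataset.
    ds_v1 = lance.dataset("/tmp/data.lance", version=1)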

Basic IOs
~~~~~~~~~

The following functions are used to read and write data in Lance format.

.. automethod:: lance.dataset.LanceDataset.insert
   :noindex:
.. automethod:: lance.dataset.LanceDataset.scanner
   :noindex:
.. automethod:: lance.dataset.LanceDataset.to_batches
   :noindex:
.. automethod:: lance.dataset.LanceDataset.to_table
   :noindex:
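
A small end-to-end sketch of these IOs; the path, schema, and filter below are
hypothetical examples:

.. code-block:: python

    import lance
    import pyarrow as pa

    # Create a dataset from a pyarrow Table.
    table = pa.table({"id": [1, 2, 3], "name": ["alice", "bob", "carol"]})
    ds = lance.write_dataset(table, "/tmp/names.lance")

    # Append more rows.
    ds.insert(pa.table({"id": [4], "name": ["dave"]}))

    # Read everything back into memory as a pyarrow Table ...
    full = ds.to_table()

    # ... or stream it batch by batch through a filtered scanner.
    for batch in ds.scanner(filter="id > 1", columns=["name"]).to_batches():
        print(batch.num_rows)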

Random Access
~~~~~~~~~~~~~

Unlike most other columnar formats, Lance supports fast random access to individual rows.

.. automethod:: lance.dataset.LanceDataset.take
   :noindex:
.. automethod:: lance.dataset.LanceDataset.take_blobs
   :noindex:
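
Continuing the sketch above, fetching a handful of rows by offset returns a
pyarrow Table (the column name is a hypothetical example):

.. code-block:: python

    # Fetch rows 0 and 2, projecting a single column.
    rows = ds.take([0, 2], columns=["name"])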


Schema Evolution
~~~~~~~~~~~~~~~~

Lance supports schema evolution, which means that you can add new columns to the dataset
cheaply.

.. automethod:: lance.dataset.LanceDataset.add_columns
   :noindex:
.. automethod:: lance.dataset.LanceDataset.drop_columns
   :noindex:
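
Continuing the same sketch, a derived column can be added from a SQL expression
and dropped again (the column name is a hypothetical example):

.. code-block:: python

    # Add a new column computed from an existing one; existing
    # data files are left untouched.
    ds.add_columns({"name_upper": "UPPER(name)"})

    # Dropping a column is a cheap metadata-only operation.
    ds.drop_columns(["name_upper"])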


Indexing and Searching
~~~~~~~~~~~~~~~~~~~~~~

.. automethod:: lance.dataset.LanceDataset.create_index
   :noindex:
.. automethod:: lance.dataset.LanceDataset.scanner
   :noindex:
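
A sketch of building a vector index and searching it; it assumes the dataset
has a ``vector`` column of 128-dimensional embeddings, and all sizes below are
hypothetical:

.. code-block:: python

    import numpy as np

    # Build an IVF_PQ vector index on the "vector" column.
    ds.create_index(
        "vector",
        index_type="IVF_PQ",
        num_partitions=256,
        num_sub_vectors=16,
    )

    # Approximate nearest-neighbor search through a scanner.
    query = np.random.rand(128)
    results = ds.scanner(
        nearest={"column": "vector", "q": query, "k": 10},
    ).to_table()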

API Reference
~~~~~~~~~~~~~

More information can be found in the :doc:`API reference <python/modules>`.

.. _Lance Python API documentation: ./python/modules
46 changes: 46 additions & 0 deletions docs/blob.rst
@@ -0,0 +1,46 @@
Blobs as Files
==============

Unlike other data formats, the Lance columnar format treats large multimodal data as a first-class citizen.
Lance provides a high-level API to store and retrieve large binary objects (blobs) in Lance datasets.

.. image:: _static/blob.png
   :scale: 50%

Lance serves large binary data using :py:class:`lance.BlobFile`, which
is a file-like object that lazily reads large binary objects.

.. autoclass:: lance.BlobFile
   :members:
   :show-inheritance:
   :noindex:

To fetch blobs from a Lance dataset, you can use :py:meth:`lance.dataset.LanceDataset.take_blobs`.
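
Because :py:class:`lance.BlobFile` behaves like a regular file object, blob data
can be read incrementally; a minimal sketch, reusing the hypothetical dataset
from the video example below:

.. code-block:: python

    import lance

    ds = lance.dataset("./youtube.lance")
    blobs = ds.take_blobs([5], "video")

    # Only the bytes actually read are fetched from storage.
    header = blobs[0].read(1024)
    blobs[0].seek(0)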

For example, it's easy to use ``BlobFile`` to extract frames from a video file
without loading the entire video into memory.

.. code-block:: python

    # pip install av pylance
    import av
    import lance
    from IPython.display import clear_output, display

    ds = lance.dataset("./youtube.lance")
    start_sec, end_sec = 500, 1000
    blobs = ds.take_blobs([5], "video")
    with av.open(blobs[0]) as container:
        stream = container.streams.video[0]
        # Decode keyframes only, to skim through the video quickly.
        stream.codec_context.skip_frame = "NONKEY"
        # container.seek() expects an offset in stream.time_base units.
        container.seek(int(start_sec / stream.time_base), stream=stream)
        for frame in container.decode(stream):
            # frame.time is in seconds.
            if frame.time > end_sec:
                break
            display(frame.to_image())
            clear_output(wait=True)
1 change: 1 addition & 0 deletions docs/index.rst
@@ -45,6 +45,7 @@ Preview releases receive the same level of testing as regular releases.
./read_and_write
Lance Formats <./format>
Arrays <./arrays>
Blob API <./blob>
Integrations <./integrations/integrations>
Performance Guide <./performance>
API References <./api/api>
5 changes: 3 additions & 2 deletions python/python/lance/__init__.py
@@ -68,7 +68,8 @@ def dataset(
Parameters
----------
uri : str
Address to the Lance dataset.
Address to the Lance dataset. It can be a local file path ``/tmp/data.lance``,
or a cloud object store URI, e.g., ``s3://bucket/data.lance``.
version : optional, int | str
If specified, load a specific version of the Lance dataset. Else, loads the
latest version. A version number (`int`) or a tag (`str`) can be provided.
@@ -77,7 +78,7 @@
argument value. If a version is already specified, this arg is ignored.
block_size : optional, int
Block size in bytes. Provide a hint for the size of the minimal I/O request.
commit_handler : optional, CommitLock
commit_handler : optional, lance.commit.CommitLock
If specified, use the provided commit handler to lock the table while
committing a new version. Not necessary on object stores other than S3
or when there are no concurrent writers.
66 changes: 41 additions & 25 deletions python/python/lance/dataset.py
@@ -152,7 +152,7 @@ def when_not_matched_by_source_delete(self, expr: Optional[str] = None):


class LanceDataset(pa.dataset.Dataset):
"""A dataset in Lance format where the data is stored at the given uri."""
"""A Lance Dataset in Lance format where the data is stored at the given uri."""

def __init__(
self,
@@ -325,6 +325,7 @@ def scanner(
"nprobes": 1,
"refine_factor": 1
}
batch_size: int, default None
The max size of batches returned.
io_buffer_size: int, default None
@@ -366,7 +367,7 @@ def scanner(
If True, then all columns are late materialized.
If False, then all columns are early materialized.
If a list of strings, then only the columns in the list are
late materialized.
late materialized.
The default uses a heuristic that assumes filters will select about 0.1%
of the rows. If your filter is more selective (e.g. find by id) you may
@@ -376,6 +377,7 @@
query string to search for, the results will be ranked by BM25.
e.g. "hello world", would match documents containing "hello" or "world".
or a dictionary with the following keys:
- columns: list[str]
The columns to search,
currently only supports a single column in the columns list.
@@ -389,6 +391,7 @@
-----
For now, if BOTH filter and nearest are specified, then:
1. nearest is executed first.
2. The results are filtered afterwards.
@@ -506,7 +509,7 @@ def to_table(
late_materialization: Optional[bool | List[str]] = None,
use_scalar_index: Optional[bool] = None,
) -> pa.Table:
"""Read the data into memory as a pyarrow Table.
"""Read the data into memory as a :py:class:`pyarrow.Table`
Parameters
----------
@@ -567,6 +570,7 @@ def to_table(
query string to search for, the results will be ranked by BM25.
e.g. "hello world", would match documents contains "hello" or "world".
or a dictionary with the following keys:
- columns: list[str]
The columns to search,
currently only supports a single column in the columns list.
@@ -576,6 +580,7 @@
Notes
-----
If BOTH filter and nearest are specified, then:
1. nearest is executed first.
2. The results are filtered afterward, unless pre-filter is set to True.
"""
@@ -734,11 +739,11 @@ def take(
Or a dictionary of column names to SQL expressions.
All columns are fetched if None or unspecified.
**kwargs : dict, optional
See scanner() method for full parameter description.
See the :py:meth:`scanner` method for full parameter description.
Returns
-------
table : Table
table : pyarrow.Table
"""
columns_with_transform = None
if isinstance(columns, dict):
@@ -787,7 +792,11 @@ def take_blobs(
blob_column: str,
) -> List[BlobFile]:
"""
Select blobs by row_ids.
Select blobs by row IDs.
Instead of loading large binary blob data into memory before processing it,
this API allows you to open binary blob data as a regular Python file-like
object. For more details, see :py:class:`lance.BlobFile`.
Parameters
----------
@@ -1612,15 +1621,19 @@ def create_index(
Replace the existing index if it exists.
num_partitions : int, optional
The number of partitions of IVF (Inverted File Index).
ivf_centroids : ``np.ndarray``, ``pyarrow.FixedSizeListArray``
or ``pyarrow.FixedShapeTensorArray``. Optional.
A ``num_partitions x dimension`` array of K-mean centroids for IVF
clustering. If not provided, a new Kmean model will be trained.
pq_codebook : ``np.ndarray``, ``pyarrow.FixedSizeListArray``
or ``pyarrow.FixedShapeTensorArray``. Optional.
ivf_centroids : optional
It can be either :py:class:`np.ndarray`,
:py:class:`pyarrow.FixedSizeListArray` or
:py:class:`pyarrow.FixedShapeTensorArray`.
A ``num_partitions x dimension`` array of existing K-means centroids
for IVF clustering. If not provided, a new KMeans model will be trained.
pq_codebook : optional
It can be :py:class:`np.ndarray`, :py:class:`pyarrow.FixedSizeListArray`,
or :py:class:`pyarrow.FixedShapeTensorArray`.
A ``num_sub_vectors x (2 ^ nbits * dimensions // num_sub_vectors)``
array of K-means centroids for PQ codebook.
Note: nbits is always 8 for now.
Note: ``nbits`` is always 8 for now.
If not provided, a new PQ model will be trained.
num_sub_vectors : int, optional
The number of sub-vectors for PQ (Product Quantization).
@@ -1654,7 +1667,9 @@
kwargs :
Parameters passed to the index building process.
The SQ (Scalar Quantization) is available for only "IVF_HNSW_SQ" index type,
The SQ (Scalar Quantization) is available only for the ``IVF_HNSW_SQ`` index type.
This quantization method reduces the memory usage of the index by mapping float
vectors to integer vectors, where each integer is of ``num_bits``; only 8 bits
are supported for now.
@@ -1665,20 +1680,21 @@
If ``index_type`` is with "PQ", then the following parameters are required:
num_sub_vectors
Optional parameters for "IVF_PQ":
ivf_centroids :
K-mean centroids for IVF clustering.
num_bits : int, optional
Optional parameters for `IVF_PQ`:
- ivf_centroids
Existing K-means centroids for IVF clustering.
- num_bits
The number of bits for PQ (Product Quantization). Default is 8.
Only 4, 8 are supported.
Optional parameters for "IVF_HNSW_*":
max_level : int
the maximum number of levels in the graph.
m : int
the number of edges per node in the graph.
ef_construction : int
the number of nodes to examine during the construction.
Optional parameters for `IVF_HNSW_*`:
max_level
Int, the maximum number of levels in the graph.
m
Int, the number of edges per node in the graph.
ef_construction
Int, the number of nodes to examine during the construction.
Examples
--------
