Skip to content

Commit

Permalink
feat!: support to customize tokenizer (#2992)
Browse files Browse the repository at this point in the history
users can customize the tokenizer:
- language
- remove long words
- lower case
- stem
- remove stop words
- ascii folding
Solves #2996

This introduces a breaking change: previously we used `en_stem` as the
default tokenizer, which stems words; this PR switches the default
tokenizer to one without stemming.

---------

Signed-off-by: BubbleCal <[email protected]>
Co-authored-by: Weston Pace <[email protected]>
  • Loading branch information
BubbleCal and westonpace authored Oct 21, 2024
1 parent f803ca3 commit c152d36
Show file tree
Hide file tree
Showing 9 changed files with 364 additions and 49 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ serde = { version = "^1" }
serde_json = { version = "1" }
shellexpand = "3.0"
snafu = "0.7.5"
tantivy = "0.22.0"
tantivy = { version = "0.22.0", features = ["stopwords"] }
tempfile = "3"
test-log = { version = "0.2.15" }
tokio = { version = "1.23", features = [
Expand Down
25 changes: 25 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1349,6 +1349,31 @@ def create_scalar_index(
query. This will significantly increase the index size.
It won't impact the performance of non-phrase queries even if it is set to
True.
base_tokenizer: str, default "simple"
This is for the ``INVERTED`` index. The base tokenizer to use. The value
can be:
* "simple": splits tokens on whitespace and punctuation.
* "whitespace": splits tokens on whitespace.
* "raw": no tokenization.
language: str, default "English"
    This is for the ``INVERTED`` index. The language for stemming
    and stop words. This is only used when `stem` or `remove_stop_words` is true.
max_token_length: Optional[int], default 40
This is for the ``INVERTED`` index. The maximum token length.
Any token longer than this will be removed.
lower_case: bool, default True
This is for the ``INVERTED`` index. If True, the index will convert all
text to lowercase.
stem: bool, default False
This is for the ``INVERTED`` index. If True, the index will stem the
tokens.
remove_stop_words: bool, default False
This is for the ``INVERTED`` index. If True, the index will remove
stop words.
ascii_folding: bool, default False
This is for the ``INVERTED`` index. If True, the index will convert
non-ascii characters to ascii characters if possible.
This would remove accents like "é" -> "e".
Examples
--------
Expand Down
37 changes: 37 additions & 0 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1275,6 +1275,43 @@ impl Dataset {
if let Some(with_position) = kwargs.get_item("with_position")? {
params.with_position = with_position.extract()?;
}
if let Some(base_tokenizer) = kwargs.get_item("base_tokenizer")? {
params.tokenizer_config = params
.tokenizer_config
.base_tokenizer(base_tokenizer.extract()?);
}
if let Some(language) = kwargs.get_item("language")? {
let language = language.extract()?;
params.tokenizer_config =
params.tokenizer_config.language(language).map_err(|e| {
PyValueError::new_err(format!(
"can't set tokenizer language to {}: {:?}",
language, e
))
})?;
}
if let Some(max_token_length) = kwargs.get_item("max_token_length")? {
params.tokenizer_config = params
.tokenizer_config
.max_token_length(max_token_length.extract()?);
}
if let Some(lower_case) = kwargs.get_item("lower_case")? {
params.tokenizer_config =
params.tokenizer_config.lower_case(lower_case.extract()?);
}
if let Some(stem) = kwargs.get_item("stem")? {
params.tokenizer_config = params.tokenizer_config.stem(stem.extract()?);
}
if let Some(remove_stop_words) = kwargs.get_item("remove_stop_words")? {
params.tokenizer_config = params
.tokenizer_config
.remove_stop_words(remove_stop_words.extract()?);
}
if let Some(ascii_folding) = kwargs.get_item("ascii_folding")? {
params.tokenizer_config = params
.tokenizer_config
.ascii_folding(ascii_folding.extract()?);
}
}
Box::new(params)
}
Expand Down
21 changes: 20 additions & 1 deletion rust/lance-index/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
//! Scalar indices for metadata search & filtering
use std::collections::HashMap;
use std::fmt::Debug;
use std::{any::Any, ops::Bound, sync::Arc};

use arrow::buffer::{OffsetBuffer, ScalarBuffer};
Expand All @@ -17,6 +18,7 @@ use datafusion_common::{scalar::ScalarValue, Column};
use datafusion_expr::expr::ScalarFunction;
use datafusion_expr::Expr;
use deepsize::DeepSizeOf;
use inverted::TokenizerConfig;
use lance_core::utils::mask::RowIdTreeMap;
use lance_core::{Error, Result};
use snafu::{location, Location};
Expand Down Expand Up @@ -91,19 +93,36 @@ impl IndexParams for ScalarIndexParams {
}
}

#[derive(Debug, Clone, DeepSizeOf)]
#[derive(Clone)]
pub struct InvertedIndexParams {
    /// If true, store the position of the term in the document
    /// This can significantly increase the size of the index
    /// If false, only store the frequency of the term in the document
    /// Default is true
    pub with_position: bool,

    /// Configuration of the tokenizer used to split text into terms
    /// (base tokenizer, language, stemming, stop-word removal, etc.).
    /// NOTE(review): `Debug` and `DeepSizeOf` are implemented manually for
    /// this struct rather than derived — presumably because
    /// `TokenizerConfig` does not implement them; confirm upstream.
    pub tokenizer_config: TokenizerConfig,
}

impl Debug for InvertedIndexParams {
    /// Manual `Debug` impl: `tokenizer_config` is intentionally omitted
    /// (the struct only derives `Clone`, presumably because
    /// `TokenizerConfig` does not implement `Debug` — confirm upstream).
    ///
    /// Uses `finish_non_exhaustive()` instead of `finish()` so the output
    /// ends with `..`, signalling to readers of the debug string that some
    /// fields were skipped rather than implying this is the whole struct.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InvertedIndexParams")
            .field("with_position", &self.with_position)
            .finish_non_exhaustive()
    }
}

impl DeepSizeOf for InvertedIndexParams {
    /// Manual impl reporting zero heap-owned bytes for all children.
    ///
    /// NOTE(review): this undercounts any heap memory owned by
    /// `tokenizer_config`; the manual impl presumably exists because
    /// `TokenizerConfig` does not implement `DeepSizeOf` — confirm and, if
    /// possible, account for its allocations here.
    fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize {
        0
    }
}

impl Default for InvertedIndexParams {
fn default() -> Self {
Self {
with_position: true,
tokenizer_config: TokenizerConfig::default(),
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions rust/lance-index/src/scalar/inverted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

mod builder;
mod index;
mod tokenizer;
mod wand;

pub use builder::InvertedIndexBuilder;
pub use index::*;
use lance_core::Result;
pub use tokenizer::*;

use super::btree::TrainingSource;
use super::{IndexStore, InvertedIndexParams};
Expand Down
Loading

0 comments on commit c152d36

Please sign in to comment.