Skip to content

Commit

Permalink
feat!: support to customize tokenizer (#2992)
Browse files Browse the repository at this point in the history
users can customize the tokenizer:
- language
- remove long words
- lower case
- stem
- remove stop words
- ascii folding
Solves #2996

This introduces a breaking change: previously we used `en_stem` as the
default tokenizer, which stems words; this PR switches the default
tokenizer to one without stemming.

---------

Signed-off-by: BubbleCal <[email protected]>
Co-authored-by: Weston Pace <[email protected]>
  • Loading branch information
BubbleCal and westonpace authored Oct 21, 2024
1 parent f803ca3 commit c152d36
Show file tree
Hide file tree
Showing 9 changed files with 364 additions and 49 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ serde = { version = "^1" }
serde_json = { version = "1" }
shellexpand = "3.0"
snafu = "0.7.5"
tantivy = "0.22.0"
tantivy = { version = "0.22.0", features = ["stopwords"] }
tempfile = "3"
test-log = { version = "0.2.15" }
tokio = { version = "1.23", features = [
Expand Down
25 changes: 25 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1349,6 +1349,31 @@ def create_scalar_index(
query. This will significantly increase the index size.
It won't impact the performance of non-phrase queries even if it is set to
True.
base_tokenizer: str, default "simple"
This is for the ``INVERTED`` index. The base tokenizer to use. The value
can be:
* "simple": splits tokens on whitespace and punctuation.
* "whitespace": splits tokens on whitespace.
* "raw": no tokenization.
language: str, default "English"
    This is for the ``INVERTED`` index. The language for stemming
    and stop words. This is only used when `stem` or `remove_stop_words` is true.
max_token_length: Optional[int], default 40
This is for the ``INVERTED`` index. The maximum token length.
Any token longer than this will be removed.
lower_case: bool, default True
This is for the ``INVERTED`` index. If True, the index will convert all
text to lowercase.
stem: bool, default False
This is for the ``INVERTED`` index. If True, the index will stem the
tokens.
remove_stop_words: bool, default False
This is for the ``INVERTED`` index. If True, the index will remove
stop words.
ascii_folding: bool, default False
This is for the ``INVERTED`` index. If True, the index will convert
non-ascii characters to ascii characters if possible.
This would remove accents like "é" -> "e".
Examples
--------
Expand Down
37 changes: 37 additions & 0 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1275,6 +1275,43 @@ impl Dataset {
if let Some(with_position) = kwargs.get_item("with_position")? {
params.with_position = with_position.extract()?;
}
if let Some(base_tokenizer) = kwargs.get_item("base_tokenizer")? {
params.tokenizer_config = params
.tokenizer_config
.base_tokenizer(base_tokenizer.extract()?);
}
if let Some(language) = kwargs.get_item("language")? {
let language = language.extract()?;
params.tokenizer_config =
params.tokenizer_config.language(language).map_err(|e| {
PyValueError::new_err(format!(
"can't set tokenizer language to {}: {:?}",
language, e
))
})?;
}
if let Some(max_token_length) = kwargs.get_item("max_token_length")? {
params.tokenizer_config = params
.tokenizer_config
.max_token_length(max_token_length.extract()?);
}
if let Some(lower_case) = kwargs.get_item("lower_case")? {
params.tokenizer_config =
params.tokenizer_config.lower_case(lower_case.extract()?);
}
if let Some(stem) = kwargs.get_item("stem")? {
params.tokenizer_config = params.tokenizer_config.stem(stem.extract()?);
}
if let Some(remove_stop_words) = kwargs.get_item("remove_stop_words")? {
params.tokenizer_config = params
.tokenizer_config
.remove_stop_words(remove_stop_words.extract()?);
}
if let Some(ascii_folding) = kwargs.get_item("ascii_folding")? {
params.tokenizer_config = params
.tokenizer_config
.ascii_folding(ascii_folding.extract()?);
}
}
Box::new(params)
}
Expand Down
21 changes: 20 additions & 1 deletion rust/lance-index/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
//! Scalar indices for metadata search & filtering
use std::collections::HashMap;
use std::fmt::Debug;
use std::{any::Any, ops::Bound, sync::Arc};

use arrow::buffer::{OffsetBuffer, ScalarBuffer};
Expand All @@ -17,6 +18,7 @@ use datafusion_common::{scalar::ScalarValue, Column};
use datafusion_expr::expr::ScalarFunction;
use datafusion_expr::Expr;
use deepsize::DeepSizeOf;
use inverted::TokenizerConfig;
use lance_core::utils::mask::RowIdTreeMap;
use lance_core::{Error, Result};
use snafu::{location, Location};
Expand Down Expand Up @@ -91,19 +93,36 @@ impl IndexParams for ScalarIndexParams {
}
}

#[derive(Debug, Clone, DeepSizeOf)]
#[derive(Clone)]
pub struct InvertedIndexParams {
    /// If true, store the position of the term in the document
    /// This can significantly increase the size of the index
    /// If false, only store the frequency of the term in the document
    /// Default is true
    pub with_position: bool,

    /// Configuration of the tokenizer used to split text into terms
    /// (base tokenizer, language, stemming, stop-word removal, etc.).
    /// NOTE(review): `Debug` and `DeepSizeOf` are implemented manually for
    /// this struct rather than derived — presumably because
    /// `TokenizerConfig` does not implement them; confirm upstream.
    pub tokenizer_config: TokenizerConfig,
}

impl Debug for InvertedIndexParams {
    /// Manual `Debug` impl: `tokenizer_config` is intentionally omitted
    /// (the struct only derives `Clone`, presumably because
    /// `TokenizerConfig` does not implement `Debug` — confirm upstream).
    ///
    /// Uses `finish_non_exhaustive()` instead of `finish()` so the output
    /// ends with `..`, signalling to readers of the debug string that some
    /// fields were skipped rather than implying this is the whole struct.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InvertedIndexParams")
            .field("with_position", &self.with_position)
            .finish_non_exhaustive()
    }
}

impl DeepSizeOf for InvertedIndexParams {
    /// Manual impl reporting zero heap-owned bytes for all children.
    ///
    /// NOTE(review): this undercounts any heap memory owned by
    /// `tokenizer_config`; the manual impl presumably exists because
    /// `TokenizerConfig` does not implement `DeepSizeOf` — confirm and, if
    /// possible, account for its allocations here.
    fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize {
        0
    }
}

impl Default for InvertedIndexParams {
fn default() -> Self {
Self {
with_position: true,
tokenizer_config: TokenizerConfig::default(),
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions rust/lance-index/src/scalar/inverted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

mod builder;
mod index;
mod tokenizer;
mod wand;

pub use builder::InvertedIndexBuilder;
pub use index::*;
use lance_core::Result;
pub use tokenizer::*;

use super::btree::TrainingSource;
use super::{IndexStore, InvertedIndexParams};
Expand Down
Loading

0 comments on commit c152d36

Please sign in to comment.