[PERF]: remove mutex around tokenizer (#2735)
codetheweb authored Aug 29, 2024
1 parent 6337df5 · commit 60d9342
Showing 3 changed files with 80 additions and 109 deletions.
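The change to rust/index/src/fulltext/tokenizer.rs stores tantivy's NgramTokenizer by value and clones it on each encode call instead of locking a shared Arc<Mutex<Box<NgramTokenizer>>>; the ChromaTokenizer trait also picks up a Sync bound so a tokenizer can be shared across threads without the lock. Below is a minimal standalone sketch of that pattern, not code from the repository: the SharedTokenizer type and its encode helper are hypothetical names, and the clone-per-call works because tantivy's Tokenizer::token_stream takes &mut self while NgramTokenizer is cheap to clone.

```rust
// Sketch of the lock-free pattern this commit moves to (hypothetical names,
// not the repository's types). Assumes the tantivy crate is a dependency.
use std::sync::Arc;
use std::thread;

use tantivy::tokenizer::{NgramTokenizer, Token, TokenStream, Tokenizer};

struct SharedTokenizer {
    // Held by value: no Arc<Mutex<...>> needed for `&self` access.
    tokenizer: NgramTokenizer,
}

impl SharedTokenizer {
    fn encode(&self, text: &str) -> Vec<Token> {
        // `Tokenizer::token_stream` needs `&mut self`, so tokenize on a
        // per-call clone instead of locking a shared instance.
        let mut tokenizer = self.tokenizer.clone();
        let mut stream = tokenizer.token_stream(text);
        let mut tokens = Vec::new();
        stream.process(&mut |token| tokens.push(token.clone()));
        tokens
    }
}

fn main() {
    let shared = Arc::new(SharedTokenizer {
        tokenizer: NgramTokenizer::new(1, 1, false).expect("valid ngram bounds"),
    });

    // With no Mutex in the struct, the tokenizer is Sync and can be used
    // from several threads concurrently.
    let handles: Vec<_> = (0..4)
        .map(|_| {
            let shared = Arc::clone(&shared);
            thread::spawn(move || shared.encode("hello world").len())
        })
        .collect();

    for handle in handles {
        // "hello world" yields 11 unigram tokens, matching the repo's tests.
        assert_eq!(handle.join().unwrap(), 11);
    }
}
```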
rust/index/src/fulltext/tokenizer.rs (37 changes: 6 additions & 31 deletions)
@@ -1,9 +1,6 @@
-use parking_lot::Mutex;
-use std::sync::Arc;
 use tantivy::tokenizer::{NgramTokenizer, Token, TokenStream, Tokenizer};

 pub trait ChromaTokenStream: Send {
     fn process(&mut self, sink: &mut dyn FnMut(&Token));
     fn get_tokens(&self) -> &Vec<Token>;
 }

@@ -18,36 +15,28 @@ impl TantivyChromaTokenStream {
 }

 impl ChromaTokenStream for TantivyChromaTokenStream {
     fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
         for token in &self.tokens {
             sink(token);
         }
     }

     fn get_tokens(&self) -> &Vec<Token> {
         &self.tokens
     }
 }

-pub trait ChromaTokenizer: Send {
+pub trait ChromaTokenizer: Send + Sync {
     fn encode(&self, text: &str) -> Box<dyn ChromaTokenStream>;
 }

 pub struct TantivyChromaTokenizer {
-    tokenizer: Arc<Mutex<Box<NgramTokenizer>>>,
+    tokenizer: NgramTokenizer,
 }

 impl TantivyChromaTokenizer {
-    pub fn new(tokenizer: Box<NgramTokenizer>) -> Self {
-        TantivyChromaTokenizer {
-            tokenizer: Arc::new(Mutex::new(tokenizer)),
-        }
+    pub fn new(tokenizer: NgramTokenizer) -> Self {
+        TantivyChromaTokenizer { tokenizer }
     }
 }

 impl ChromaTokenizer for TantivyChromaTokenizer {
     fn encode(&self, text: &str) -> Box<dyn ChromaTokenStream> {
-        let mut tokenizer = self.tokenizer.lock();
+        let mut tokenizer = self.tokenizer.clone();
         let mut token_stream = tokenizer.token_stream(text);
         let mut tokens = Vec::new();
         token_stream.process(&mut |token| {
@@ -60,23 +49,9 @@ impl ChromaTokenizer for TantivyChromaTokenizer {
 mod test {
     use super::*;

-    #[test]
-    fn test_chroma_tokenizer() {
-        let tokenizer: Box<NgramTokenizer> = Box::new(NgramTokenizer::new(1, 1, false).unwrap());
-        let chroma_tokenizer = TantivyChromaTokenizer::new(tokenizer);
-        let mut token_stream = chroma_tokenizer.encode("hello world");
-        let mut tokens = Vec::new();
-        token_stream.process(&mut |token| {
-            tokens.push(token.clone());
-        });
-        assert_eq!(tokens.len(), 11);
-        assert_eq!(tokens[0].text, "h");
-        assert_eq!(tokens[1].text, "e");
-    }
-
     #[test]
     fn test_get_tokens() {
-        let tokenizer: Box<NgramTokenizer> = Box::new(NgramTokenizer::new(1, 1, false).unwrap());
+        let tokenizer = NgramTokenizer::new(1, 1, false).unwrap();
         let chroma_tokenizer = TantivyChromaTokenizer::new(tokenizer);
         let token_stream = chroma_tokenizer.encode("hello world");
         let tokens = token_stream.get_tokens();
(Diff truncated: the remainder of tokenizer.rs and the other two changed files are not shown.)
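The commit drops test_chroma_tokenizer outright rather than porting it to the new constructor signature; only test_get_tokens is updated. For reference, a ported version would only need to stop boxing the tokenizer. The sketch below is illustrative and not part of the commit: the test name is hypothetical, and it assumes the same `mod test` context (`use super::*`) as the existing tests.

```rust
// Hypothetical port of the removed test to the new `new(NgramTokenizer)`
// signature; not included in the commit, shown for illustration only.
#[test]
fn test_chroma_tokenizer_unboxed() {
    let tokenizer = NgramTokenizer::new(1, 1, false).unwrap();
    let chroma_tokenizer = TantivyChromaTokenizer::new(tokenizer);
    let mut token_stream = chroma_tokenizer.encode("hello world");
    let mut tokens = Vec::new();
    token_stream.process(&mut |token| {
        tokens.push(token.clone());
    });
    assert_eq!(tokens.len(), 11);
    assert_eq!(tokens[0].text, "h");
    assert_eq!(tokens[1].text, "e");
}
```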
