fix: token position error (#33)
zxch3n authored Aug 19, 2023
1 parent b639973 commit c5e8431
Showing 4 changed files with 64 additions and 8 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -15,4 +15,5 @@ jieba-rs = { version = "0.6.7", default-features = false }
 log = "0.4.19"
 
 [dev-dependencies]
+jieba-rs = { version = "0.6.7", default-features = true }
 flexi_logger = "0.25.5"
14 changes: 7 additions & 7 deletions src/stream.rs
@@ -2,19 +2,19 @@ use tantivy::tokenizer::Token;
 
 #[derive(Debug)]
 pub struct CangjieTokenStream<'a> {
+    src: &'a str,
     result: Vec<&'a str>,
     // Begin with 1
     index: usize,
-    offset_from: usize,
     token: Token,
 }
 
 impl<'a> CangjieTokenStream<'a> {
-    pub fn new(result: Vec<&'a str>) -> Self {
+    pub fn new(src: &'a str, result: Vec<&'a str>) -> Self {
         CangjieTokenStream {
+            src,
             result,
             index: 0,
-            offset_from: 0,
             token: Token::default(),
         }
     }
@@ -24,18 +24,18 @@ impl<'a> ::tantivy::tokenizer::TokenStream for CangjieTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if self.index < self.result.len() {
             let current_word = self.result[self.index];
-            let offset_to = self.offset_from + current_word.len();
+            let offset_from = current_word.as_ptr() as usize - self.src.as_ptr() as usize;
+            let offset_to = offset_from + current_word.len();
 
             self.token = Token {
-                offset_from: self.offset_from,
+                offset_from,
                 offset_to,
                 position: self.index,
                 text: current_word.to_string(),
-                position_length: self.result.len(),
+                position_length: 1,
             };
 
             self.index += 1;
-            self.offset_from = offset_to;
             true
         } else {
            false
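
Why the fix works: the old code derived `offset_from` by accumulating token lengths, which is only correct when tokens tile the input end to end. Under `TokenizerOption::All` (cut-all style segmentation) tokens overlap, so the accumulator drifts and `offset_to` can even run past the end of the source string. Since jieba-rs returns `&str` slices borrowing from the input, subtracting the source pointer from the slice pointer recovers each token's true byte offset. A minimal standalone sketch of both behaviors (the string and segmentation below are illustrative, not from the commit):

    fn main() {
        let src = "南京长江大桥"; // 18 bytes: each CJK char is 3 bytes in UTF-8
        // An overlapping segmentation, as cut-all style tokenization might produce.
        // The slices borrow from `src`, just like jieba-rs's Vec<&str> output.
        let words = vec![&src[0..6], &src[0..12], &src[6..12], &src[12..18]];

        let mut acc = 0; // the old accumulator-style offset_from
        for w in &words {
            // The fixed computation: the slice's address minus the source's
            // address is the token's true byte offset within `src`.
            let from = w.as_ptr() as usize - src.as_ptr() as usize;
            let to = from + w.len();
            assert_eq!(&src[from..to], *w);
            println!("{w}: accumulated {acc}, actual {from}..{to}");
            acc += w.len(); // drifts: 0, 6, 18, 24 .. past the 18-byte string
        }
    }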
2 changes: 1 addition & 1 deletion src/tokenizer.rs
@@ -41,6 +41,6 @@ impl ::tantivy::tokenizer::Tokenizer for CangJieTokenizer {
             }
         };
         trace!("{:?}->{:?}", text, result);
-        CangjieTokenStream::new(result)
+        CangjieTokenStream::new(text, result)
     }
 }
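
The new signature carries an implicit contract: `CangjieTokenStream::new(text, result)` is only correct when every slice in `result` borrows from `text`, as jieba-rs's cut output does. Slices taken from any other buffer would yield meaningless pointer differences. A hypothetical guard (not part of this commit) that would make the invariant explicit:

    /// Hypothetical debug guard: true iff `word` is a sub-slice of `src`,
    /// i.e. it points into the same allocation, so pointer math is valid.
    fn is_subslice_of(src: &str, word: &str) -> bool {
        let (s, w) = (src.as_ptr() as usize, word.as_ptr() as usize);
        w >= s && w + word.len() <= s + src.len()
    }

    // Usage inside advance(), before computing offsets:
    // debug_assert!(is_subslice_of(self.src, current_word));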
55 changes: 55 additions & 0 deletions tests/position.rs
@@ -0,0 +1,55 @@
+use std::sync::Arc;
+
+use cang_jie::{CangJieTokenizer, TokenizerOption, CANG_JIE};
+use jieba_rs::Jieba;
+use tantivy::{
+    collector::TopDocs,
+    doc,
+    query::QueryParser,
+    schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions},
+    Index, SnippetGenerator,
+};
+
+#[test]
+fn test_tokenizer_position() -> tantivy::Result<()> {
+    let mut schema_builder = SchemaBuilder::default();
+
+    let text_indexing = TextFieldIndexing::default()
+        .set_tokenizer(CANG_JIE) // Set custom tokenizer
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let text_options = TextOptions::default()
+        .set_indexing_options(text_indexing)
+        .set_stored();
+
+    let title = schema_builder.add_text_field("title", text_options);
+    let schema = schema_builder.build();
+
+    let index = Index::create_in_ram(schema);
+    index.tokenizers().register(CANG_JIE, tokenizer()); // Build cang-jie Tokenizer
+
+    let mut index_writer = index.writer(50 * 1024 * 1024)?;
+    index_writer.add_document(doc! { title => "南京大桥" })?;
+    index_writer.add_document(doc! { title => "这个是长江" })?;
+    index_writer.add_document(doc! { title => "这个是南京长" })?;
+    index_writer.commit()?;
+
+    let reader = index.reader()?;
+    let searcher = reader.searcher();
+
+    let query = QueryParser::for_index(&index, vec![title]).parse_query("南京")?;
+    let top_docs = searcher.search(query.as_ref(), &TopDocs::with_limit(10000))?;
+
+    let snippet = SnippetGenerator::create(&searcher, &query, title).unwrap();
+    for doc in top_docs.iter() {
+        let s = snippet.snippet_from_doc(&searcher.doc(doc.1).unwrap());
+        dbg!(s.to_html());
+    }
+    Ok(())
+}
+
+fn tokenizer() -> CangJieTokenizer {
+    CangJieTokenizer {
+        worker: Arc::new(Jieba::new()),
+        option: TokenizerOption::All,
+    }
+}
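
The test above exercises positions end to end through an index and `SnippetGenerator`. To inspect the fixed byte offsets directly, one can also drive the stream by hand; a sketch, assuming `CangjieTokenStream` is reachable from the crate root (adjust the import to the actual module path if it is not):

    use cang_jie::CangjieTokenStream;
    use jieba_rs::Jieba;
    use tantivy::tokenizer::TokenStream;

    fn main() {
        // Jieba::new() needs jieba-rs's default features,
        // hence the dev-dependency added in Cargo.toml above.
        let jieba = Jieba::new();
        let text = "南京长江大桥";
        // cut_all yields overlapping tokens that borrow from `text`.
        let words = jieba.cut_all(text);
        let mut stream = CangjieTokenStream::new(text, words);
        while stream.advance() {
            let t = stream.token();
            // After the fix, each token's byte range maps back onto the source.
            assert_eq!(&text[t.offset_from..t.offset_to], t.text);
            println!("{:?} @ {}..{}", t.text, t.offset_from, t.offset_to);
        }
    }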
