From a58d63f44fa5872b021c6285527919515508859d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Kav=C3=ADk?= Date: Thu, 19 Mar 2020 21:30:51 +0100 Subject: [PATCH] levenshtein: LevenshteinState, usize input distance --- src/automaton/levenshtein.rs | 69 ++++++++++++++++++++++++++++-------- src/automaton/mod.rs | 2 +- src/map.rs | 16 ++++++--- src/set.rs | 16 ++++++--- 4 files changed, 77 insertions(+), 26 deletions(-) diff --git a/src/automaton/levenshtein.rs b/src/automaton/levenshtein.rs index 9db9748..dc0549f 100644 --- a/src/automaton/levenshtein.rs +++ b/src/automaton/levenshtein.rs @@ -111,12 +111,10 @@ impl Levenshtein { #[inline] pub fn new( query: &str, - distance: u32, + distance: usize, ) -> Result { - let lev = DynamicLevenshtein { - query: query.to_owned(), - dist: distance as usize, - }; + let lev = + DynamicLevenshtein { query: query.to_owned(), dist: distance }; let dfa = DfaBuilder::new(lev.clone()).build()?; Ok(Levenshtein { prog: lev, dfa }) } @@ -165,27 +163,58 @@ impl DynamicLevenshtein { } } +/// Levenshtein automaton state. +/// +/// It is useful for obtaining edit distance while searching. +/// See examples in documentation for `Map::search_with_state` +/// or `Set::search_with_state`. +/// +/// This is only defined when the `levenshtein` crate feature is enabled. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct LevenshteinState { + /// Internal state index. + pub state_idx: usize, + /// Levenshtein edit distance. + pub distance: Option, +} + impl Automaton for Levenshtein { - type State = Option; + type State = Option; #[inline] - fn start(&self) -> Option { - Some(0) + fn start(&self) -> Option { + Some(LevenshteinState { + state_idx: 0, + distance: self.dfa.states[0].distance, + }) } #[inline] - fn is_match(&self, state: &Option) -> bool { - state.map(|state| self.dfa.states[state].is_match).unwrap_or(false) + fn is_match(&self, state: &Option) -> bool { + state + .map(|state| self.dfa.states[state.state_idx].is_match) + .unwrap_or(false) } #[inline] - fn can_match(&self, state: &Option) -> bool { + fn can_match(&self, state: &Option) -> bool { state.is_some() } #[inline] - fn accept(&self, state: &Option, byte: u8) -> Option { - state.and_then(|state| self.dfa.states[state].next[byte as usize]) + fn accept( + &self, + state: &Option, + byte: u8, + ) -> Option { + state.and_then(|state| { + self.dfa.states[state.state_idx].next[byte as usize].map( + |next_state_idx| LevenshteinState { + state_idx: next_state_idx, + distance: self.dfa.states[next_state_idx].distance, + }, + ) + }) } } @@ -197,12 +226,14 @@ struct Dfa { struct State { next: [Option; 256], is_match: bool, + distance: Option, } impl fmt::Debug for State { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "State {{")?; writeln!(f, " is_match: {:?}", self.is_match)?; + writeln!(f, " distance: {:?}", self.distance)?; for i in 0..256 { if let Some(si) = self.next[i] { writeln!(f, " {:?}: {:?}", i, si)?; @@ -273,7 +304,11 @@ impl DfaBuilder { Entry::Occupied(v) => (*v.get(), true), Entry::Vacant(v) => { let is_match = self.lev.is_match(lev_state); - self.dfa.states.push(State { next: [None; 256], is_match }); + self.dfa.states.push(State { + next: [None; 256], + is_match, + distance: lev_state.last().copied(), + }); (*v.insert(self.dfa.states.len() - 1), false) } }) @@ -334,7 +369,11 @@ impl DfaBuilder { } fn new_state(&mut self, is_match: bool) -> usize { - self.dfa.states.push(State { next: [None; 256], is_match }); + self.dfa.states.push(State { + next: [None; 256], + is_match, + distance: None, + }); self.dfa.states.len() - 1 } } diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs index a36392a..53c402b 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -1,5 +1,5 @@ #[cfg(feature = "levenshtein")] -pub use self::levenshtein::{Levenshtein, LevenshteinError}; +pub use self::levenshtein::{Levenshtein, LevenshteinError, LevenshteinState}; #[cfg(feature = "levenshtein")] mod levenshtein; diff --git a/src/map.rs b/src/map.rs index 68bf8e4..2b542af 100644 --- a/src/map.rs +++ b/src/map.rs @@ -322,7 +322,7 @@ An implementation of fuzzy search using Levenshtein automata can be used to search maps: ```rust -use fst::automaton::Levenshtein; +use fst::automaton::{Levenshtein, LevenshteinState}; use fst::{IntoStreamer, Streamer, Map}; # fn main() { example().unwrap(); } @@ -341,11 +341,17 @@ fn example() -> Result<(), Box> { while let Some((k, v, s)) = stream.next() { kvs.push((String::from_utf8(k.to_vec())?, v, s)); } - // Currently, there isn't much interesting that you can do with the states. + assert_eq!(kvs, vec![ - ("foo".to_string(), 1, Some(183)), - ("foob".to_string(), 2, Some(123)), - ("fozb".to_string(), 4, Some(83)), + ("foo".to_string(), 1, Some(LevenshteinState { + state_idx: 183, distance: Some(0) + })), + ("foob".to_string(), 2, Some(LevenshteinState { + state_idx: 123, distance: Some(1) + })), + ("fozb".to_string(), 4, Some(LevenshteinState { + state_idx: 83, distance: Some(2) + })), ]); Ok(()) diff --git a/src/set.rs b/src/set.rs index a3419c6..4112e64 100644 --- a/src/set.rs +++ b/src/set.rs @@ -210,7 +210,7 @@ An implementation of fuzzy search using Levenshtein automata can be used to search sets: ```rust -use fst::automaton::Levenshtein; +use fst::automaton::{Levenshtein, LevenshteinState}; use fst::{IntoStreamer, Streamer, Set}; # fn main() { example().unwrap(); } @@ -229,11 +229,17 @@ fn example() -> Result<(), Box> { while let Some((v, s)) = stream.next() { vs.push((String::from_utf8(v.to_vec())?, s)); } - // Currently, there isn't much interesting that you can do with the states. + assert_eq!(vs, vec![ - ("foo".to_string(), Some(183)), - ("foob".to_string(), Some(123)), - ("fozb".to_string(), Some(83)), + ("foo".to_string(), Some(LevenshteinState { + state_idx: 183, distance: Some(0) + })), + ("foob".to_string(), Some(LevenshteinState { + state_idx: 123, distance: Some(1) + })), + ("fozb".to_string(), Some(LevenshteinState { + state_idx: 83, distance: Some(2) + })), ]); Ok(())