From d611cfaff4fa18031cb002a99ea8351290115ad0 Mon Sep 17 00:00:00 2001 From: Phil Weir Date: Sat, 16 Mar 2024 18:36:55 +0100 Subject: [PATCH 1/2] fix: caps the levenshtein distance --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/lib.rs | 3 +++ src/locations_db.rs | 10 +++++++++- src/search.rs | 15 ++++++++++----- tests/data/test-code-list.csv | 1 + tests/data/test-codes.json | 24 ++++++++++++++++++++++++ tests/test_code_list.rs | 27 ++++++++++++++++++++++++++- 8 files changed, 75 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 86b8855..dc02021 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,7 +30,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "berlin-core" -version = "0.2.2" +version = "0.2.3" dependencies = [ "ahash", "csv", diff --git a/Cargo.toml b/Cargo.toml index f6a95ee..3f1bb39 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "berlin-core" -version = "0.2.3" +version = "0.2.4" edition = "2021" license = "MIT" description = "Identify locations and tag them with UN-LOCODEs and ISO-3166-2 subdivisions." diff --git a/src/lib.rs b/src/lib.rs index 3a9474b..15d3d90 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,9 @@ pub mod search; const SCORE_SOFT_MAX: i64 = 1000; const STATE_CODE_BOOST: i64 = 32; const SUBDIV_CODE_BOOST: i64 = 16; +const LEV_3_LENGTH_MAX: usize = 10; +const LEV_2_LENGTH_MAX: usize = 20; +const LEV_LENGTH_MAX: usize = 30; const SINGLE_WORD_MATCH_PENALTY: i64 = 100; diff --git a/src/locations_db.rs b/src/locations_db.rs index f9964d9..a1e1c20 100644 --- a/src/locations_db.rs +++ b/src/locations_db.rs @@ -5,6 +5,7 @@ use std::io::BufReader; use std::path::PathBuf; use std::sync::RwLock; use std::time::Instant; +use std::cmp::min; use csv::ReaderBuilder; use fst::{Automaton, Streamer}; @@ -21,6 +22,8 @@ use crate::graph::ResultsGraph; use crate::location::{AnyLocation, CsvLocode, LocData, Location}; use crate::search::{Score, SearchTerm}; use crate::SEARCH_INCLUSION_THRESHOLD; +use crate::LEV_3_LENGTH_MAX; +use crate::LEV_2_LENGTH_MAX; #[derive(Default)] pub struct LocationsDb { @@ -115,7 +118,12 @@ impl LocationsDb { let search_action = |op: fst::map::OpBuilder<'c>, term: &'c str| match term.len() > 3 { true => { let prefix_matcher = fst::automaton::Str::new(term).starts_with(); - let autom = fst::automaton::Levenshtein::new(term, st.lev_dist) + let lev_dist = match term.chars().count() { + count if count < LEV_3_LENGTH_MAX => st.lev_dist, + count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2), + _ => min(st.lev_dist, 1) + }; + let autom = fst::automaton::Levenshtein::new(term, lev_dist) .expect("build automaton") .union(prefix_matcher); op.add(fst.search(autom)) diff --git a/src/search.rs b/src/search.rs index bdc39fb..9edea6e 100644 --- a/src/search.rs +++ b/src/search.rs @@ -7,9 +7,10 @@ use unicode_segmentation::UnicodeSegmentation; use ustr::{Ustr, UstrSet}; use crate::SCORE_SOFT_MAX; +use crate::LEV_LENGTH_MAX; -const STOP_WORDS: [&str; 11] = [ - "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the", +const STOP_WORDS: [&str; 15] = [ + "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the", ]; #[derive(Debug)] @@ -160,9 +161,13 @@ impl SearchableStringSet { op = self .not_exact .iter() - .map(|ne| ne.term.as_str()) + .map(|ne| { + ne.term.as_str() + }) .chain(ungrabbed) - .fold(op, |op, t| search_action(op, t)); + .fold(op, |op, t| { + search_action(op, t) + }); (op, pre_filtered) } @@ -174,7 +179,7 @@ impl SearchableStringSet { _ if self.stop_words.contains(&u) => {} // ignore stop words _ => self.add_exact(u, normalized), }, - None if allow_inexact => self.add_not_exact(normalized.clone(), normalized), + None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => self.add_not_exact(matchable.to_string(), normalized), None => {} } } diff --git a/tests/data/test-code-list.csv b/tests/data/test-code-list.csv index 38eb679..5978b7f 100644 --- a/tests/data/test-code-list.csv +++ b/tests/data/test-code-list.csv @@ -5,3 +5,4 @@ Change,Country,Location,Name,NameWoDiacritics,Subdivision,Status,Function,Date,I ,BG,DA3,Garmen,Garmen,01,RL,--3-----,1601,,4136N 02349E, ,GB,ABC,Abercarn,Abercarn,CAY,RL,-23-----,0701,,5139N 00308W, ,GB,SVN,Stonehaven,Stonehaven,ABD,AA,123-----,0701,,5658N 00213W, +,GB,BSI,Bognor Regis,Bognor Regis,WSX,AA,123-----,0701,,5047N 00041W, diff --git a/tests/data/test-codes.json b/tests/data/test-codes.json index e051ace..be9f1b5 100644 --- a/tests/data/test-codes.json +++ b/tests/data/test-codes.json @@ -49,6 +49,17 @@ "level": "council area" } }, + "GB:WSX": { + "": "ISO-3166-2", + "s": "", + "i": "GB:WSX", + "d": { + "name": "West Sussex", + "supercode": "GB", + "subcode": "WSX", + "level": "ceremonial county" + } + }, "BG:01": { "": "ISO-3166-2", "s": "", @@ -123,6 +134,19 @@ "function_code": "--3-----" } }, + "GB:BSI": { + "": "UN-LOCODE", + "s": "", + "i": "GB:BSI", + "d": { + "name": "Bognor Regis", + "supercode": "GB", + "subcode": "BSI", + "subdivision_name": "West Sussex", + "subdivision_code": "WSX", + "function_code": "123-----" + } + }, "GB:ABC": { "": "UN-LOCODE", "s": "", diff --git a/tests/test_code_list.rs b/tests/test_code_list.rs index 6c6a87e..d6965b9 100644 --- a/tests/test_code_list.rs +++ b/tests/test_code_list.rs @@ -65,7 +65,7 @@ pub fn search_abercorn() -> SearchTerm { #[rstest] fn should_load_codes(fake_data: &LocationsDb) { - assert!(fake_data.all.len() == 11) + assert!(fake_data.all.len() == 13) } #[rstest] @@ -95,3 +95,28 @@ fn should_search_abercorn(fake_data: &LocationsDb, search_abercorn: SearchTerm) assert![abercarn_loc.get_state() == "gb"]; assert![abercarn_loc.get_subdiv().unwrap() == "cay"]; } + +#[rstest] +fn should_search_long_sentence(fake_data: &LocationsDb) { + pub struct LongSearch { + pub q: &'static str, + pub r: usize, + } + [ + LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0}, + LongSearch {q: "Where are all the dentists in Abercorn I would like to find some somewhere", r: 1}, + LongSearch {q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere", r: 0}, + LongSearch {q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere", r: 1}, + LongSearch {q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere", r: 1}, + LongSearch {q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere", r: 1}, + LongSearch {q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere", r: 0}, + LongSearch {q: "Whereareallthedentists some somewhere", r: 0}, + LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0}, + ].iter().for_each(|search| { + let long_sentence = SearchTerm::from_raw_query( + search.q.to_string(), None, 5, 3 + ); + let results = fake_data.search(&long_sentence); + assert![results.len() == search.r, "Query: {}, results: {}, expected: {}", search.q, results.len(), search.r]; + }); +} From 0f8706ba27894f0ee561f5fcfc13a4e3befffdac Mon Sep 17 00:00:00 2001 From: Phil Weir Date: Sat, 16 Mar 2024 18:41:47 +0100 Subject: [PATCH 2/2] chore: rustfmt --- Cargo.lock | 2 +- src/locations_db.rs | 8 +++--- src/search.rs | 17 ++++++------ tests/test_code_list.rs | 61 +++++++++++++++++++++++++++++++---------- 4 files changed, 60 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dc02021..a5b0012 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,7 +30,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "berlin-core" -version = "0.2.3" +version = "0.2.4" dependencies = [ "ahash", "csv", diff --git a/src/locations_db.rs b/src/locations_db.rs index a1e1c20..b3fabc5 100644 --- a/src/locations_db.rs +++ b/src/locations_db.rs @@ -1,11 +1,11 @@ use std::boxed::Box; +use std::cmp::min; use std::error::Error; use std::fs::File; use std::io::BufReader; use std::path::PathBuf; use std::sync::RwLock; use std::time::Instant; -use std::cmp::min; use csv::ReaderBuilder; use fst::{Automaton, Streamer}; @@ -21,9 +21,9 @@ use ustr::{Ustr, UstrMap, UstrSet}; use crate::graph::ResultsGraph; use crate::location::{AnyLocation, CsvLocode, LocData, Location}; use crate::search::{Score, SearchTerm}; -use crate::SEARCH_INCLUSION_THRESHOLD; -use crate::LEV_3_LENGTH_MAX; use crate::LEV_2_LENGTH_MAX; +use crate::LEV_3_LENGTH_MAX; +use crate::SEARCH_INCLUSION_THRESHOLD; #[derive(Default)] pub struct LocationsDb { @@ -121,7 +121,7 @@ impl LocationsDb { let lev_dist = match term.chars().count() { count if count < LEV_3_LENGTH_MAX => st.lev_dist, count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2), - _ => min(st.lev_dist, 1) + _ => min(st.lev_dist, 1), }; let autom = fst::automaton::Levenshtein::new(term, lev_dist) .expect("build automaton") diff --git a/src/search.rs b/src/search.rs index 9edea6e..4b37986 100644 --- a/src/search.rs +++ b/src/search.rs @@ -6,11 +6,12 @@ use strsim::normalized_levenshtein as similarity_algo; use unicode_segmentation::UnicodeSegmentation; use ustr::{Ustr, UstrSet}; -use crate::SCORE_SOFT_MAX; use crate::LEV_LENGTH_MAX; +use crate::SCORE_SOFT_MAX; const STOP_WORDS: [&str; 15] = [ - "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the", + "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", + "the", ]; #[derive(Debug)] @@ -161,13 +162,9 @@ impl SearchableStringSet { op = self .not_exact .iter() - .map(|ne| { - ne.term.as_str() - }) + .map(|ne| ne.term.as_str()) .chain(ungrabbed) - .fold(op, |op, t| { - search_action(op, t) - }); + .fold(op, |op, t| search_action(op, t)); (op, pre_filtered) } @@ -179,7 +176,9 @@ impl SearchableStringSet { _ if self.stop_words.contains(&u) => {} // ignore stop words _ => self.add_exact(u, normalized), }, - None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => self.add_not_exact(matchable.to_string(), normalized), + None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => { + self.add_not_exact(matchable.to_string(), normalized) + } None => {} } } diff --git a/tests/test_code_list.rs b/tests/test_code_list.rs index d6965b9..5510590 100644 --- a/tests/test_code_list.rs +++ b/tests/test_code_list.rs @@ -103,20 +103,53 @@ fn should_search_long_sentence(fake_data: &LocationsDb) { pub r: usize, } [ - LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0}, - LongSearch {q: "Where are all the dentists in Abercorn I would like to find some somewhere", r: 1}, - LongSearch {q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere", r: 0}, - LongSearch {q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere", r: 1}, - LongSearch {q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere", r: 1}, - LongSearch {q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere", r: 1}, - LongSearch {q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere", r: 0}, - LongSearch {q: "Whereareallthedentists some somewhere", r: 0}, - LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0}, - ].iter().for_each(|search| { - let long_sentence = SearchTerm::from_raw_query( - search.q.to_string(), None, 5, 3 - ); + LongSearch { + q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", + r: 0, + }, + LongSearch { + q: "Where are all the dentists in Abercorn I would like to find some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere", + r: 0, + }, + LongSearch { + q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere", + r: 0, + }, + LongSearch { + q: "Whereareallthedentists some somewhere", + r: 0, + }, + LongSearch { + q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", + r: 0, + }, + ] + .iter() + .for_each(|search| { + let long_sentence = SearchTerm::from_raw_query(search.q.to_string(), None, 5, 3); let results = fake_data.search(&long_sentence); - assert![results.len() == search.r, "Query: {}, results: {}, expected: {}", search.q, results.len(), search.r]; + assert![ + results.len() == search.r, + "Query: {}, results: {}, expected: {}", + search.q, + results.len(), + search.r + ]; }); }