diff --git a/Cargo.lock b/Cargo.lock index 86b8855..a5b0012 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,7 +30,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "berlin-core" -version = "0.2.2" +version = "0.2.4" dependencies = [ "ahash", "csv", diff --git a/Cargo.toml b/Cargo.toml index f6a95ee..3f1bb39 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "berlin-core" -version = "0.2.3" +version = "0.2.4" edition = "2021" license = "MIT" description = "Identify locations and tag them with UN-LOCODEs and ISO-3166-2 subdivisions." diff --git a/src/lib.rs b/src/lib.rs index 3a9474b..15d3d90 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,9 @@ pub mod search; const SCORE_SOFT_MAX: i64 = 1000; const STATE_CODE_BOOST: i64 = 32; const SUBDIV_CODE_BOOST: i64 = 16; +const LEV_3_LENGTH_MAX: usize = 10; +const LEV_2_LENGTH_MAX: usize = 20; +const LEV_LENGTH_MAX: usize = 30; const SINGLE_WORD_MATCH_PENALTY: i64 = 100; diff --git a/src/locations_db.rs b/src/locations_db.rs index f9964d9..b3fabc5 100644 --- a/src/locations_db.rs +++ b/src/locations_db.rs @@ -1,4 +1,5 @@ use std::boxed::Box; +use std::cmp::min; use std::error::Error; use std::fs::File; use std::io::BufReader; @@ -20,6 +21,8 @@ use ustr::{Ustr, UstrMap, UstrSet}; use crate::graph::ResultsGraph; use crate::location::{AnyLocation, CsvLocode, LocData, Location}; use crate::search::{Score, SearchTerm}; +use crate::LEV_2_LENGTH_MAX; +use crate::LEV_3_LENGTH_MAX; use crate::SEARCH_INCLUSION_THRESHOLD; #[derive(Default)] @@ -115,7 +118,12 @@ impl LocationsDb { let search_action = |op: fst::map::OpBuilder<'c>, term: &'c str| match term.len() > 3 { true => { let prefix_matcher = fst::automaton::Str::new(term).starts_with(); - let autom = fst::automaton::Levenshtein::new(term, st.lev_dist) + let lev_dist = match term.chars().count() { + count if count < LEV_3_LENGTH_MAX => st.lev_dist, + count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2), + _ => min(st.lev_dist, 1), + }; + let autom = fst::automaton::Levenshtein::new(term, lev_dist) .expect("build automaton") .union(prefix_matcher); op.add(fst.search(autom)) diff --git a/src/search.rs b/src/search.rs index bdc39fb..4b37986 100644 --- a/src/search.rs +++ b/src/search.rs @@ -6,10 +6,12 @@ use strsim::normalized_levenshtein as similarity_algo; use unicode_segmentation::UnicodeSegmentation; use ustr::{Ustr, UstrSet}; +use crate::LEV_LENGTH_MAX; use crate::SCORE_SOFT_MAX; -const STOP_WORDS: [&str; 11] = [ - "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the", +const STOP_WORDS: [&str; 15] = [ + "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", + "the", ]; #[derive(Debug)] @@ -174,7 +176,9 @@ impl SearchableStringSet { _ if self.stop_words.contains(&u) => {} // ignore stop words _ => self.add_exact(u, normalized), }, - None if allow_inexact => self.add_not_exact(normalized.clone(), normalized), + None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => { + self.add_not_exact(matchable.to_string(), normalized) + } None => {} } } diff --git a/tests/data/test-code-list.csv b/tests/data/test-code-list.csv index 38eb679..5978b7f 100644 --- a/tests/data/test-code-list.csv +++ b/tests/data/test-code-list.csv @@ -5,3 +5,4 @@ Change,Country,Location,Name,NameWoDiacritics,Subdivision,Status,Function,Date,I ,BG,DA3,Garmen,Garmen,01,RL,--3-----,1601,,4136N 02349E, ,GB,ABC,Abercarn,Abercarn,CAY,RL,-23-----,0701,,5139N 00308W, ,GB,SVN,Stonehaven,Stonehaven,ABD,AA,123-----,0701,,5658N 00213W, +,GB,BSI,Bognor Regis,Bognor Regis,WSX,AA,123-----,0701,,5047N 00041W, diff --git a/tests/data/test-codes.json b/tests/data/test-codes.json index e051ace..be9f1b5 100644 --- a/tests/data/test-codes.json +++ b/tests/data/test-codes.json @@ -49,6 +49,17 @@ "level": "council area" } }, + "GB:WSX": { + "": "ISO-3166-2", + "s": "", + "i": "GB:WSX", + "d": { + "name": "West Sussex", + "supercode": "GB", + "subcode": "WSX", + "level": "ceremonial county" + } + }, "BG:01": { "": "ISO-3166-2", "s": "", @@ -123,6 +134,19 @@ "function_code": "--3-----" } }, + "GB:BSI": { + "": "UN-LOCODE", + "s": "", + "i": "GB:BSI", + "d": { + "name": "Bognor Regis", + "supercode": "GB", + "subcode": "BSI", + "subdivision_name": "West Sussex", + "subdivision_code": "WSX", + "function_code": "123-----" + } + }, "GB:ABC": { "": "UN-LOCODE", "s": "", diff --git a/tests/test_code_list.rs b/tests/test_code_list.rs index 6c6a87e..5510590 100644 --- a/tests/test_code_list.rs +++ b/tests/test_code_list.rs @@ -65,7 +65,7 @@ pub fn search_abercorn() -> SearchTerm { #[rstest] fn should_load_codes(fake_data: &LocationsDb) { - assert!(fake_data.all.len() == 11) + assert!(fake_data.all.len() == 13) } #[rstest] @@ -95,3 +95,61 @@ fn should_search_abercorn(fake_data: &LocationsDb, search_abercorn: SearchTerm) assert![abercarn_loc.get_state() == "gb"]; assert![abercarn_loc.get_subdiv().unwrap() == "cay"]; } + +#[rstest] +fn should_search_long_sentence(fake_data: &LocationsDb) { + pub struct LongSearch { + pub q: &'static str, + pub r: usize, + } + [ + LongSearch { + q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", + r: 0, + }, + LongSearch { + q: "Where are all the dentists in Abercorn I would like to find some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere", + r: 0, + }, + LongSearch { + q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere", + r: 1, + }, + LongSearch { + q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere", + r: 0, + }, + LongSearch { + q: "Whereareallthedentists some somewhere", + r: 0, + }, + LongSearch { + q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", + r: 0, + }, + ] + .iter() + .for_each(|search| { + let long_sentence = SearchTerm::from_raw_query(search.q.to_string(), None, 5, 3); + let results = fake_data.search(&long_sentence); + assert![ + results.len() == search.r, + "Query: {}, results: {}, expected: {}", + search.q, + results.len(), + search.r + ]; + }); +}