flaxandteal · philtweir · Mar 16, 2024 · Mar 16, 2024 · Mar 16, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "berlin-core"
-version = "0.2.3"
+version = "0.2.4"
 edition = "2021"
 license = "MIT"
 description = "Identify locations and tag them with UN-LOCODEs and ISO-3166-2 subdivisions."

diff --git a/src/lib.rs b/src/lib.rs
@@ -17,6 +17,9 @@ pub mod search;
 const SCORE_SOFT_MAX: i64 = 1000;
 const STATE_CODE_BOOST: i64 = 32;
 const SUBDIV_CODE_BOOST: i64 = 16;
+const LEV_3_LENGTH_MAX: usize = 10; // terms with fewer chars than this search with the caller's full lev_dist
+const LEV_2_LENGTH_MAX: usize = 20; // terms below this (but >= LEV_3_LENGTH_MAX) cap lev_dist at 2; longer terms cap at 1
+const LEV_LENGTH_MAX: usize = 30; // matchables with this many chars or more are never added as inexact terms
 
 const SINGLE_WORD_MATCH_PENALTY: i64 = 100;
 

diff --git a/src/locations_db.rs b/src/locations_db.rs
@@ -1,4 +1,5 @@
 use std::boxed::Box;
+use std::cmp::min;
 use std::error::Error;
 use std::fs::File;
 use std::io::BufReader;
@@ -20,6 +21,8 @@ use ustr::{Ustr, UstrMap, UstrSet};
 use crate::graph::ResultsGraph;
 use crate::location::{AnyLocation, CsvLocode, LocData, Location};
 use crate::search::{Score, SearchTerm};
+use crate::LEV_2_LENGTH_MAX;
+use crate::LEV_3_LENGTH_MAX;
 use crate::SEARCH_INCLUSION_THRESHOLD;
 
 #[derive(Default)]
@@ -115,7 +118,12 @@ impl LocationsDb {
         let search_action = |op: fst::map::OpBuilder<'c>, term: &'c str| match term.len() > 3 {
             true => {
                 let prefix_matcher = fst::automaton::Str::new(term).starts_with();
-                let autom = fst::automaton::Levenshtein::new(term, st.lev_dist)
+                let lev_dist = match term.chars().count() {
+                    count if count < LEV_3_LENGTH_MAX => st.lev_dist,
+                    count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2),
+                    _ => min(st.lev_dist, 1),
+                };
+                let autom = fst::automaton::Levenshtein::new(term, lev_dist)
                     .expect("build automaton")
                     .union(prefix_matcher);
                 op.add(fst.search(autom))

diff --git a/src/search.rs b/src/search.rs
@@ -6,10 +6,12 @@ use strsim::normalized_levenshtein as similarity_algo;
 use unicode_segmentation::UnicodeSegmentation;
 use ustr::{Ustr, UstrSet};
 
+use crate::LEV_LENGTH_MAX;
 use crate::SCORE_SOFT_MAX;
 
-const STOP_WORDS: [&str; 11] = [
-    "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the",
+const STOP_WORDS: [&str; 15] = [ // NOTE(review): presumably seeds the `stop_words` set skipped during parsing — confirm
+    "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did",
+    "the",
 ];
 
 #[derive(Debug)]
@@ -174,7 +176,9 @@ impl SearchableStringSet {
                 _ if self.stop_words.contains(&u) => {} // ignore stop words
                 _ => self.add_exact(u, normalized),
             },
-            None if allow_inexact => self.add_not_exact(normalized.clone(), normalized),
+            None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => {
+                self.add_not_exact(matchable.to_string(), normalized)
+            }
             None => {}
         }
     }

diff --git a/tests/data/test-code-list.csv b/tests/data/test-code-list.csv
@@ -5,3 +5,4 @@ Change,Country,Location,Name,NameWoDiacritics,Subdivision,Status,Function,Date,I
 ,BG,DA3,Garmen,Garmen,01,RL,--3-----,1601,,4136N 02349E,
 ,GB,ABC,Abercarn,Abercarn,CAY,RL,-23-----,0701,,5139N 00308W,
 ,GB,SVN,Stonehaven,Stonehaven,ABD,AA,123-----,0701,,5658N 00213W,
+,GB,BSI,Bognor Regis,Bognor Regis,WSX,AA,123-----,0701,,5047N 00041W,
diff --git a/tests/data/test-codes.json b/tests/data/test-codes.json
@@ -49,6 +49,17 @@
       "level": "council area"
     }
   },
+  "GB:WSX": {
+    "<c>": "ISO-3166-2",
+    "s": "<bln|ISO-3166-2#GB:WSX|\"West Sussex\">",
+    "i": "GB:WSX",
+    "d": {
+      "name": "West Sussex",
+      "supercode": "GB",
+      "subcode": "WSX",
+      "level": "ceremonial county"
+    }
+  },
   "BG:01": {
     "<c>": "ISO-3166-2",
     "s": "<bln|ISO-3166-2#BG:01|\"Blagoevgrad\">",
@@ -123,6 +134,19 @@
       "function_code": "--3-----"
     }
   },
+  "GB:BSI": {
+    "<c>": "UN-LOCODE",
+    "s": "<bln|UN-LOCODE#GB:BSI|\"Bognor Regis\">",
+    "i": "GB:BSI",
+    "d": {
+      "name": "Bognor Regis",
+      "supercode": "GB",
+      "subcode": "BSI",
+      "subdivision_name": "West Sussex",
+      "subdivision_code": "WSX",
+      "function_code": "123-----"
+    }
+  },
   "GB:ABC": {
     "<c>": "UN-LOCODE",
     "s": "<bln|UN-LOCODE#GB:ABC|\"Abercarn\">",

diff --git a/tests/test_code_list.rs b/tests/test_code_list.rs
@@ -65,7 +65,7 @@ pub fn search_abercorn() -> SearchTerm {
 
 #[rstest]
 fn should_load_codes(fake_data: &LocationsDb) {
-    assert!(fake_data.all.len() == 11)
+    assert!(fake_data.all.len() == 13) // was 11; presumably +2 for the GB:WSX and GB:BSI fixture entries — confirm
 }
 
 #[rstest]
@@ -95,3 +95,61 @@ fn should_search_abercorn(fake_data: &LocationsDb, search_abercorn: SearchTerm)
     assert![abercarn_loc.get_state() == "gb"];
     assert![abercarn_loc.get_subdiv().unwrap() == "cay"];
 }
+
+#[rstest]
+fn should_search_long_sentence(fake_data: &LocationsDb) { // table-driven: each query must return exactly `r` results
+    pub struct LongSearch {
+        pub q: &'static str, // raw query text handed to the search
+        pub r: usize, // expected number of results for `q`
+    }
+    [
+        LongSearch {
+            q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere",
+            r: 0, // one unbroken 53-char token with no spaces
+        },
+        LongSearch {
+            q: "Where are all the dentists in Abercorn I would like to find some somewhere",
+            r: 1, // fully spaced sentence containing "Abercorn"
+        },
+        LongSearch {
+            q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere",
+            r: 0, // "Abercorn" buried inside a longer run-together token
+        },
+        LongSearch {
+            q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere",
+            r: 1, // place name spelt exactly
+        },
+        LongSearch {
+            q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere",
+            r: 1, // first word misspelt ("Bognore")
+        },
+        LongSearch {
+            q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere",
+            r: 1, // misspelt and joined into a single word
+        },
+        LongSearch {
+            q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere",
+            r: 0, // joined word with extra trailing letters ("Registrar")
+        },
+        LongSearch {
+            q: "Whereareallthedentists some somewhere",
+            r: 0, // no place name at all
+        },
+        LongSearch {
+            q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere",
+            r: 0, // NOTE(review): identical to the first case — likely redundant
+        },
+    ]
+    .iter()
+    .for_each(|search| {
+        let long_sentence = SearchTerm::from_raw_query(search.q.to_string(), None, 5, 3); // NOTE(review): 5 and 3 are presumably result limit and lev_dist — confirm
+        let results = fake_data.search(&long_sentence);
+        assert![
+            results.len() == search.r,
+            "Query: {}, results: {}, expected: {}", // failure message names the offending query
+            search.q,
+            results.len(),
+            search.r
+        ];
+    });
+}