From d611cfaff4fa18031cb002a99ea8351290115ad0 Mon Sep 17 00:00:00 2001
From: Phil Weir <phil.weir@flaxandteal.co.uk>
Date: Sat, 16 Mar 2024 18:36:55 +0100
Subject: [PATCH 1/2] fix: cap the Levenshtein distance

---
 Cargo.lock                    |  2 +-
 Cargo.toml                    |  2 +-
 src/lib.rs                    |  3 +++
 src/locations_db.rs           | 10 +++++++++-
 src/search.rs                 | 15 ++++++++++-----
 tests/data/test-code-list.csv |  1 +
 tests/data/test-codes.json    | 24 ++++++++++++++++++++++++
 tests/test_code_list.rs       | 27 ++++++++++++++++++++++++++-
 8 files changed, 75 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 86b8855..dc02021 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -30,7 +30,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "berlin-core"
-version = "0.2.2"
+version = "0.2.3"
 dependencies = [
  "ahash",
  "csv",
diff --git a/Cargo.toml b/Cargo.toml
index f6a95ee..3f1bb39 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "berlin-core"
-version = "0.2.3"
+version = "0.2.4"
 edition = "2021"
 license = "MIT"
 description = "Identify locations and tag them with UN-LOCODEs and ISO-3166-2 subdivisions."
diff --git a/src/lib.rs b/src/lib.rs
index 3a9474b..15d3d90 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,6 +17,9 @@ pub mod search;
 const SCORE_SOFT_MAX: i64 = 1000;
 const STATE_CODE_BOOST: i64 = 32;
 const SUBDIV_CODE_BOOST: i64 = 16;
+const LEV_3_LENGTH_MAX: usize = 10;
+const LEV_2_LENGTH_MAX: usize = 20;
+const LEV_LENGTH_MAX: usize = 30;
 
 const SINGLE_WORD_MATCH_PENALTY: i64 = 100;
 
diff --git a/src/locations_db.rs b/src/locations_db.rs
index f9964d9..a1e1c20 100644
--- a/src/locations_db.rs
+++ b/src/locations_db.rs
@@ -5,6 +5,7 @@ use std::io::BufReader;
 use std::path::PathBuf;
 use std::sync::RwLock;
 use std::time::Instant;
+use std::cmp::min;
 
 use csv::ReaderBuilder;
 use fst::{Automaton, Streamer};
@@ -21,6 +22,8 @@ use crate::graph::ResultsGraph;
 use crate::location::{AnyLocation, CsvLocode, LocData, Location};
 use crate::search::{Score, SearchTerm};
 use crate::SEARCH_INCLUSION_THRESHOLD;
+use crate::LEV_3_LENGTH_MAX;
+use crate::LEV_2_LENGTH_MAX;
 
 #[derive(Default)]
 pub struct LocationsDb {
@@ -115,7 +118,12 @@ impl LocationsDb {
         let search_action = |op: fst::map::OpBuilder<'c>, term: &'c str| match term.len() > 3 {
             true => {
                 let prefix_matcher = fst::automaton::Str::new(term).starts_with();
-                let autom = fst::automaton::Levenshtein::new(term, st.lev_dist)
+                let lev_dist = match term.chars().count() {
+                    count if count < LEV_3_LENGTH_MAX => st.lev_dist,
+                    count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2),
+                    _ => min(st.lev_dist, 1)
+                };
+                let autom = fst::automaton::Levenshtein::new(term, lev_dist)
                     .expect("build automaton")
                     .union(prefix_matcher);
                 op.add(fst.search(autom))
diff --git a/src/search.rs b/src/search.rs
index bdc39fb..9edea6e 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -7,9 +7,10 @@ use unicode_segmentation::UnicodeSegmentation;
 use ustr::{Ustr, UstrSet};
 
 use crate::SCORE_SOFT_MAX;
+use crate::LEV_LENGTH_MAX;
 
-const STOP_WORDS: [&str; 11] = [
-    "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the",
+const STOP_WORDS: [&str; 15] = [
+    "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the",
 ];
 
 #[derive(Debug)]
@@ -160,9 +161,13 @@ impl SearchableStringSet {
         op = self
             .not_exact
             .iter()
-            .map(|ne| ne.term.as_str())
+            .map(|ne| {
+                ne.term.as_str()
+            })
             .chain(ungrabbed)
-            .fold(op, |op, t| search_action(op, t));
+            .fold(op, |op, t| {
+                search_action(op, t)
+            });
         (op, pre_filtered)
     }
 
@@ -174,7 +179,7 @@ impl SearchableStringSet {
                 _ if self.stop_words.contains(&u) => {} // ignore stop words
                 _ => self.add_exact(u, normalized),
             },
-            None if allow_inexact => self.add_not_exact(normalized.clone(), normalized),
+            None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => self.add_not_exact(matchable.to_string(), normalized),
             None => {}
         }
     }
diff --git a/tests/data/test-code-list.csv b/tests/data/test-code-list.csv
index 38eb679..5978b7f 100644
--- a/tests/data/test-code-list.csv
+++ b/tests/data/test-code-list.csv
@@ -5,3 +5,4 @@ Change,Country,Location,Name,NameWoDiacritics,Subdivision,Status,Function,Date,I
 ,BG,DA3,Garmen,Garmen,01,RL,--3-----,1601,,4136N 02349E,
 ,GB,ABC,Abercarn,Abercarn,CAY,RL,-23-----,0701,,5139N 00308W,
 ,GB,SVN,Stonehaven,Stonehaven,ABD,AA,123-----,0701,,5658N 00213W,
+,GB,BSI,Bognor Regis,Bognor Regis,WSX,AA,123-----,0701,,5047N 00041W,
diff --git a/tests/data/test-codes.json b/tests/data/test-codes.json
index e051ace..be9f1b5 100644
--- a/tests/data/test-codes.json
+++ b/tests/data/test-codes.json
@@ -49,6 +49,17 @@
       "level": "council area"
     }
   },
+  "GB:WSX": {
+    "<c>": "ISO-3166-2",
+    "s": "<bln|ISO-3166-2#GB:WSX|\"West Sussex\">",
+    "i": "GB:WSX",
+    "d": {
+      "name": "West Sussex",
+      "supercode": "GB",
+      "subcode": "WSX",
+      "level": "ceremonial county"
+    }
+  },
   "BG:01": {
     "<c>": "ISO-3166-2",
     "s": "<bln|ISO-3166-2#BG:01|\"Blagoevgrad\">",
@@ -123,6 +134,19 @@
       "function_code": "--3-----"
     }
   },
+  "GB:BSI": {
+    "<c>": "UN-LOCODE",
+    "s": "<bln|UN-LOCODE#GB:BSI|\"Bognor Regis\">",
+    "i": "GB:BSI",
+    "d": {
+      "name": "Bognor Regis",
+      "supercode": "GB",
+      "subcode": "BSI",
+      "subdivision_name": "West Sussex",
+      "subdivision_code": "WSX",
+      "function_code": "123-----"
+    }
+  },
   "GB:ABC": {
     "<c>": "UN-LOCODE",
     "s": "<bln|UN-LOCODE#GB:ABC|\"Abercarn\">",
diff --git a/tests/test_code_list.rs b/tests/test_code_list.rs
index 6c6a87e..d6965b9 100644
--- a/tests/test_code_list.rs
+++ b/tests/test_code_list.rs
@@ -65,7 +65,7 @@ pub fn search_abercorn() -> SearchTerm {
 
 #[rstest]
 fn should_load_codes(fake_data: &LocationsDb) {
-    assert!(fake_data.all.len() == 11)
+    assert!(fake_data.all.len() == 13)
 }
 
 #[rstest]
@@ -95,3 +95,28 @@ fn should_search_abercorn(fake_data: &LocationsDb, search_abercorn: SearchTerm)
     assert![abercarn_loc.get_state() == "gb"];
     assert![abercarn_loc.get_subdiv().unwrap() == "cay"];
 }
+
+#[rstest]
+fn should_search_long_sentence(fake_data: &LocationsDb) {
+    pub struct LongSearch {
+        pub q: &'static str,
+        pub r: usize,
+    }
+    [
+        LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0},
+        LongSearch {q: "Where are all the dentists in Abercorn I would like to find some somewhere", r: 1},
+        LongSearch {q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere", r: 0},
+        LongSearch {q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere", r: 1},
+        LongSearch {q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere", r: 1},
+        LongSearch {q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere", r: 1},
+        LongSearch {q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere", r: 0},
+        LongSearch {q: "Whereareallthedentists some somewhere", r: 0},
+        LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0},
+    ].iter().for_each(|search| {
+        let long_sentence = SearchTerm::from_raw_query(
+            search.q.to_string(), None, 5, 3
+        );
+        let results = fake_data.search(&long_sentence);
+        assert![results.len() == search.r, "Query: {}, results: {}, expected: {}", search.q, results.len(), search.r];
+    });
+}

From 0f8706ba27894f0ee561f5fcfc13a4e3befffdac Mon Sep 17 00:00:00 2001
From: Phil Weir <phil.weir@flaxandteal.co.uk>
Date: Sat, 16 Mar 2024 18:41:47 +0100
Subject: [PATCH 2/2] chore: rustfmt

---
 Cargo.lock              |  2 +-
 src/locations_db.rs     |  8 +++---
 src/search.rs           | 17 ++++++------
 tests/test_code_list.rs | 61 +++++++++++++++++++++++++++++++----------
 4 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index dc02021..a5b0012 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -30,7 +30,7 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "berlin-core"
-version = "0.2.3"
+version = "0.2.4"
 dependencies = [
  "ahash",
  "csv",
diff --git a/src/locations_db.rs b/src/locations_db.rs
index a1e1c20..b3fabc5 100644
--- a/src/locations_db.rs
+++ b/src/locations_db.rs
@@ -1,11 +1,11 @@
 use std::boxed::Box;
+use std::cmp::min;
 use std::error::Error;
 use std::fs::File;
 use std::io::BufReader;
 use std::path::PathBuf;
 use std::sync::RwLock;
 use std::time::Instant;
-use std::cmp::min;
 
 use csv::ReaderBuilder;
 use fst::{Automaton, Streamer};
@@ -21,9 +21,9 @@ use ustr::{Ustr, UstrMap, UstrSet};
 use crate::graph::ResultsGraph;
 use crate::location::{AnyLocation, CsvLocode, LocData, Location};
 use crate::search::{Score, SearchTerm};
-use crate::SEARCH_INCLUSION_THRESHOLD;
-use crate::LEV_3_LENGTH_MAX;
 use crate::LEV_2_LENGTH_MAX;
+use crate::LEV_3_LENGTH_MAX;
+use crate::SEARCH_INCLUSION_THRESHOLD;
 
 #[derive(Default)]
 pub struct LocationsDb {
@@ -121,7 +121,7 @@ impl LocationsDb {
                 let lev_dist = match term.chars().count() {
                     count if count < LEV_3_LENGTH_MAX => st.lev_dist,
                     count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2),
-                    _ => min(st.lev_dist, 1)
+                    _ => min(st.lev_dist, 1),
                 };
                 let autom = fst::automaton::Levenshtein::new(term, lev_dist)
                     .expect("build automaton")
diff --git a/src/search.rs b/src/search.rs
index 9edea6e..4b37986 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -6,11 +6,12 @@ use strsim::normalized_levenshtein as similarity_algo;
 use unicode_segmentation::UnicodeSegmentation;
 use ustr::{Ustr, UstrSet};
 
-use crate::SCORE_SOFT_MAX;
 use crate::LEV_LENGTH_MAX;
+use crate::SCORE_SOFT_MAX;
 
 const STOP_WORDS: [&str; 15] = [
-    "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the",
+    "any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did",
+    "the",
 ];
 
 #[derive(Debug)]
@@ -161,13 +162,9 @@ impl SearchableStringSet {
         op = self
             .not_exact
             .iter()
-            .map(|ne| {
-                ne.term.as_str()
-            })
+            .map(|ne| ne.term.as_str())
             .chain(ungrabbed)
-            .fold(op, |op, t| {
-                search_action(op, t)
-            });
+            .fold(op, |op, t| search_action(op, t));
         (op, pre_filtered)
     }
 
@@ -179,7 +176,9 @@ impl SearchableStringSet {
                 _ if self.stop_words.contains(&u) => {} // ignore stop words
                 _ => self.add_exact(u, normalized),
             },
-            None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => self.add_not_exact(matchable.to_string(), normalized),
+            None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => {
+                self.add_not_exact(matchable.to_string(), normalized)
+            }
             None => {}
         }
     }
diff --git a/tests/test_code_list.rs b/tests/test_code_list.rs
index d6965b9..5510590 100644
--- a/tests/test_code_list.rs
+++ b/tests/test_code_list.rs
@@ -103,20 +103,53 @@ fn should_search_long_sentence(fake_data: &LocationsDb) {
         pub r: usize,
     }
     [
-        LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0},
-        LongSearch {q: "Where are all the dentists in Abercorn I would like to find some somewhere", r: 1},
-        LongSearch {q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere", r: 0},
-        LongSearch {q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere", r: 1},
-        LongSearch {q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere", r: 1},
-        LongSearch {q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere", r: 1},
-        LongSearch {q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere", r: 0},
-        LongSearch {q: "Whereareallthedentists some somewhere", r: 0},
-        LongSearch {q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere", r: 0},
-    ].iter().for_each(|search| {
-        let long_sentence = SearchTerm::from_raw_query(
-            search.q.to_string(), None, 5, 3
-        );
+        LongSearch {
+            q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere",
+            r: 0,
+        },
+        LongSearch {
+            q: "Where are all the dentists in Abercorn I would like to find some somewhere",
+            r: 1,
+        },
+        LongSearch {
+            q: "Whereareallthedentists inAbercornIwouldliketofind some somewhere",
+            r: 0,
+        },
+        LongSearch {
+            q: "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere",
+            r: 1,
+        },
+        LongSearch {
+            q: "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere",
+            r: 1,
+        },
+        LongSearch {
+            q: "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere",
+            r: 1,
+        },
+        LongSearch {
+            q: "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere",
+            r: 0,
+        },
+        LongSearch {
+            q: "Whereareallthedentists some somewhere",
+            r: 0,
+        },
+        LongSearch {
+            q: "WhereareallthedentistsinAbercornIwouldlisomesomewhere",
+            r: 0,
+        },
+    ]
+    .iter()
+    .for_each(|search| {
+        let long_sentence = SearchTerm::from_raw_query(search.q.to_string(), None, 5, 3);
         let results = fake_data.search(&long_sentence);
-        assert![results.len() == search.r, "Query: {}, results: {}, expected: {}", search.q, results.len(), search.r];
+        assert![
+            results.len() == search.r,
+            "Query: {}, results: {}, expected: {}",
+            search.q,
+            results.len(),
+            search.r
+        ];
     });
 }