Skip to content

Commit

Permalink
Merge pull request #2 from flaxandteal/fix/cap-levenshtein-distance
Browse files (browse the repository at this point in the history)
Caps the Levenshtein distance
  • Loading branch information
philtweir authored Mar 16, 2024
2 parents 6e46e04 + 0f8706b commit 902cb13
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "berlin-core"
version = "0.2.3"
version = "0.2.4"
edition = "2021"
license = "MIT"
description = "Identify locations and tag them with UN-LOCODEs and ISO-3166-2 subdivisions."
Expand Down
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ pub mod search;
// Scoring constants.
// NOTE(review): their use sites are not visible in this hunk; the names suggest a soft
// ceiling on scores and additive boosts for state / subdivision code matches — confirm.
const SCORE_SOFT_MAX: i64 = 1000;
const STATE_CODE_BOOST: i64 = 32;
const SUBDIV_CODE_BOOST: i64 = 16;
// Length thresholds (in `char`s, compared with `<`) that cap the Levenshtein distance
// used when building the fuzzy-search automaton in `locations_db.rs`:
//   * terms shorter than LEV_3_LENGTH_MAX use the requested `lev_dist` unchanged;
//   * terms shorter than LEV_2_LENGTH_MAX use `min(lev_dist, 2)`;
//   * anything longer uses `min(lev_dist, 1)`.
const LEV_3_LENGTH_MAX: usize = 10;
const LEV_2_LENGTH_MAX: usize = 20;
// Terms with this many `char`s or more are never registered for inexact matching
// (see the `allow_inexact` arm in `search.rs`).
const LEV_LENGTH_MAX: usize = 30;

// NOTE(review): use site not visible here; presumably subtracted from the score of
// single-word matches — confirm.
const SINGLE_WORD_MATCH_PENALTY: i64 = 100;

Expand Down
10 changes: 9 additions & 1 deletion src/locations_db.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::boxed::Box;
use std::cmp::min;
use std::error::Error;
use std::fs::File;
use std::io::BufReader;
Expand All @@ -20,6 +21,8 @@ use ustr::{Ustr, UstrMap, UstrSet};
use crate::graph::ResultsGraph;
use crate::location::{AnyLocation, CsvLocode, LocData, Location};
use crate::search::{Score, SearchTerm};
use crate::LEV_2_LENGTH_MAX;
use crate::LEV_3_LENGTH_MAX;
use crate::SEARCH_INCLUSION_THRESHOLD;

#[derive(Default)]
Expand Down Expand Up @@ -115,7 +118,12 @@ impl LocationsDb {
let search_action = |op: fst::map::OpBuilder<'c>, term: &'c str| match term.len() > 3 {
true => {
let prefix_matcher = fst::automaton::Str::new(term).starts_with();
let autom = fst::automaton::Levenshtein::new(term, st.lev_dist)
let lev_dist = match term.chars().count() {
count if count < LEV_3_LENGTH_MAX => st.lev_dist,
count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2),
_ => min(st.lev_dist, 1),
};
let autom = fst::automaton::Levenshtein::new(term, lev_dist)
.expect("build automaton")
.union(prefix_matcher);
op.add(fst.search(autom))
Expand Down
10 changes: 7 additions & 3 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ use strsim::normalized_levenshtein as similarity_algo;
use unicode_segmentation::UnicodeSegmentation;
use ustr::{Ustr, UstrSet};

use crate::LEV_LENGTH_MAX;
use crate::SCORE_SOFT_MAX;

const STOP_WORDS: [&str; 11] = [
"at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the",
const STOP_WORDS: [&str; 15] = [
"any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did",
"the",
];

#[derive(Debug)]
Expand Down Expand Up @@ -174,7 +176,9 @@ impl SearchableStringSet {
_ if self.stop_words.contains(&u) => {} // ignore stop words
_ => self.add_exact(u, normalized),
},
None if allow_inexact => self.add_not_exact(normalized.clone(), normalized),
None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => {
self.add_not_exact(matchable.to_string(), normalized)
}
None => {}
}
}
Expand Down
1 change: 1 addition & 0 deletions tests/data/test-code-list.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ Change,Country,Location,Name,NameWoDiacritics,Subdivision,Status,Function,Date,I
,BG,DA3,Garmen,Garmen,01,RL,--3-----,1601,,4136N 02349E,
,GB,ABC,Abercarn,Abercarn,CAY,RL,-23-----,0701,,5139N 00308W,
,GB,SVN,Stonehaven,Stonehaven,ABD,AA,123-----,0701,,5658N 00213W,
,GB,BSI,Bognor Regis,Bognor Regis,WSX,AA,123-----,0701,,5047N 00041W,
24 changes: 24 additions & 0 deletions tests/data/test-codes.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,17 @@
"level": "council area"
}
},
"GB:WSX": {
"<c>": "ISO-3166-2",
"s": "<bln|ISO-3166-2#GB:WSX|\"West Sussex\">",
"i": "GB:WSX",
"d": {
"name": "West Sussex",
"supercode": "GB",
"subcode": "WSX",
"level": "ceremonial county"
}
},
"BG:01": {
"<c>": "ISO-3166-2",
"s": "<bln|ISO-3166-2#BG:01|\"Blagoevgrad\">",
Expand Down Expand Up @@ -123,6 +134,19 @@
"function_code": "--3-----"
}
},
"GB:BSI": {
"<c>": "UN-LOCODE",
"s": "<bln|UN-LOCODE#GB:BSI|\"Bognor Regis\">",
"i": "GB:BSI",
"d": {
"name": "Bognor Regis",
"supercode": "GB",
"subcode": "BSI",
"subdivision_name": "West Sussex",
"subdivision_code": "WSX",
"function_code": "123-----"
}
},
"GB:ABC": {
"<c>": "UN-LOCODE",
"s": "<bln|UN-LOCODE#GB:ABC|\"Abercarn\">",
Expand Down
60 changes: 59 additions & 1 deletion tests/test_code_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ pub fn search_abercorn() -> SearchTerm {

#[rstest]
fn should_load_codes(fake_data: &LocationsDb) {
assert!(fake_data.all.len() == 11)
assert!(fake_data.all.len() == 13)
}

#[rstest]
Expand Down Expand Up @@ -95,3 +95,61 @@ fn should_search_abercorn(fake_data: &LocationsDb, search_abercorn: SearchTerm)
assert![abercarn_loc.get_state() == "gb"];
assert![abercarn_loc.get_subdiv().unwrap() == "cay"];
}

#[rstest]
fn should_search_long_sentence(fake_data: &LocationsDb) {
    // Table of `(raw query, expected number of search results)`.
    //
    // The queries mix ordinary sentences with very long run-together tokens to
    // check how many locations the search reports for each shape of input.
    let cases: [(&'static str, usize); 9] = [
        // Whole sentence squashed into a single token.
        ("WhereareallthedentistsinAbercornIwouldlisomesomewhere", 0),
        // Normally spaced sentence containing the misspelling "Abercorn".
        (
            "Where are all the dentists in Abercorn I would like to find some somewhere",
            1,
        ),
        // Place name buried inside a long run-together token.
        (
            "Whereareallthedentists inAbercornIwouldliketofind some somewhere",
            0,
        ),
        // Exact two-word place name surrounded by long tokens.
        (
            "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere",
            1,
        ),
        // One extra letter in the first word of the place name.
        (
            "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere",
            1,
        ),
        // Place name misspelt and joined into one word.
        (
            "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere",
            1,
        ),
        // Joined word that strays further from the place name.
        (
            "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere",
            0,
        ),
        // No place name at all.
        ("Whereareallthedentists some somewhere", 0),
        // Same query as the first case, run again.
        // NOTE(review): possibly an accidental duplicate — confirm whether the repeat is intended.
        ("WhereareallthedentistsinAbercornIwouldlisomesomewhere", 0),
    ];

    for &(query, expected) in cases.iter() {
        // NOTE(review): the meaning of the trailing `None, 5, 3` arguments is defined by
        // `SearchTerm::from_raw_query`, which is not visible here — confirm before changing.
        let term = SearchTerm::from_raw_query(query.to_string(), None, 5, 3);
        let found = fake_data.search(&term).len();
        assert![
            found == expected,
            "Query: {}, results: {}, expected: {}",
            query,
            found,
            expected
        ];
    }
}

0 comments on commit 902cb13

Please sign in to comment.