Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Caps the Levenshtein distance #2

Merged
merged 2 commits into from
Mar 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "berlin-core"
version = "0.2.3"
version = "0.2.4"
edition = "2021"
license = "MIT"
description = "Identify locations and tag them with UN-LOCODEs and ISO-3166-2 subdivisions."
Expand Down
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ pub mod search;
const SCORE_SOFT_MAX: i64 = 1000;
const STATE_CODE_BOOST: i64 = 32;
const SUBDIV_CODE_BOOST: i64 = 16;
// Length thresholds (counted in `char`s) that cap fuzzy matching for long search terms.

/// Terms with fewer than this many characters use the search term's requested
/// Levenshtein distance without any cap.
const LEV_3_LENGTH_MAX: usize = 10;
/// Terms at or above `LEV_3_LENGTH_MAX` but with fewer than this many characters have
/// their Levenshtein distance capped at 2; terms at or above this length are capped at 1.
const LEV_2_LENGTH_MAX: usize = 20;
/// Strings with this many characters or more are not added as inexact
/// (Levenshtein-matched) search terms.
const LEV_LENGTH_MAX: usize = 30;

const SINGLE_WORD_MATCH_PENALTY: i64 = 100;

Expand Down
10 changes: 9 additions & 1 deletion src/locations_db.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::boxed::Box;
use std::cmp::min;
use std::error::Error;
use std::fs::File;
use std::io::BufReader;
Expand All @@ -20,6 +21,8 @@ use ustr::{Ustr, UstrMap, UstrSet};
use crate::graph::ResultsGraph;
use crate::location::{AnyLocation, CsvLocode, LocData, Location};
use crate::search::{Score, SearchTerm};
use crate::LEV_2_LENGTH_MAX;
use crate::LEV_3_LENGTH_MAX;
use crate::SEARCH_INCLUSION_THRESHOLD;

#[derive(Default)]
Expand Down Expand Up @@ -115,7 +118,12 @@ impl LocationsDb {
let search_action = |op: fst::map::OpBuilder<'c>, term: &'c str| match term.len() > 3 {
true => {
let prefix_matcher = fst::automaton::Str::new(term).starts_with();
let autom = fst::automaton::Levenshtein::new(term, st.lev_dist)
let lev_dist = match term.chars().count() {
count if count < LEV_3_LENGTH_MAX => st.lev_dist,
count if count < LEV_2_LENGTH_MAX => min(st.lev_dist, 2),
_ => min(st.lev_dist, 1),
};
let autom = fst::automaton::Levenshtein::new(term, lev_dist)
.expect("build automaton")
.union(prefix_matcher);
op.add(fst.search(autom))
Expand Down
10 changes: 7 additions & 3 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ use strsim::normalized_levenshtein as similarity_algo;
use unicode_segmentation::UnicodeSegmentation;
use ustr::{Ustr, UstrSet};

use crate::LEV_LENGTH_MAX;
use crate::SCORE_SOFT_MAX;

const STOP_WORDS: [&str; 11] = [
"at", "to", "in", "on", "of", "for", "by", "and", "was", "did", "the",
const STOP_WORDS: [&str; 15] = [
"any", "all", "are", "is", "at", "to", "in", "on", "of", "for", "by", "and", "was", "did",
"the",
];

#[derive(Debug)]
Expand Down Expand Up @@ -174,7 +176,9 @@ impl SearchableStringSet {
_ if self.stop_words.contains(&u) => {} // ignore stop words
_ => self.add_exact(u, normalized),
},
None if allow_inexact => self.add_not_exact(normalized.clone(), normalized),
None if allow_inexact && matchable.chars().count() < LEV_LENGTH_MAX => {
self.add_not_exact(matchable.to_string(), normalized)
}
None => {}
}
}
Expand Down
1 change: 1 addition & 0 deletions tests/data/test-code-list.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ Change,Country,Location,Name,NameWoDiacritics,Subdivision,Status,Function,Date,I
,BG,DA3,Garmen,Garmen,01,RL,--3-----,1601,,4136N 02349E,
,GB,ABC,Abercarn,Abercarn,CAY,RL,-23-----,0701,,5139N 00308W,
,GB,SVN,Stonehaven,Stonehaven,ABD,AA,123-----,0701,,5658N 00213W,
,GB,BSI,Bognor Regis,Bognor Regis,WSX,AA,123-----,0701,,5047N 00041W,
24 changes: 24 additions & 0 deletions tests/data/test-codes.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,17 @@
"level": "council area"
}
},
"GB:WSX": {
"<c>": "ISO-3166-2",
"s": "<bln|ISO-3166-2#GB:WSX|\"West Sussex\">",
"i": "GB:WSX",
"d": {
"name": "West Sussex",
"supercode": "GB",
"subcode": "WSX",
"level": "ceremonial county"
}
},
"BG:01": {
"<c>": "ISO-3166-2",
"s": "<bln|ISO-3166-2#BG:01|\"Blagoevgrad\">",
Expand Down Expand Up @@ -123,6 +134,19 @@
"function_code": "--3-----"
}
},
"GB:BSI": {
"<c>": "UN-LOCODE",
"s": "<bln|UN-LOCODE#GB:BSI|\"Bognor Regis\">",
"i": "GB:BSI",
"d": {
"name": "Bognor Regis",
"supercode": "GB",
"subcode": "BSI",
"subdivision_name": "West Sussex",
"subdivision_code": "WSX",
"function_code": "123-----"
}
},
"GB:ABC": {
"<c>": "UN-LOCODE",
"s": "<bln|UN-LOCODE#GB:ABC|\"Abercarn\">",
Expand Down
60 changes: 59 additions & 1 deletion tests/test_code_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ pub fn search_abercorn() -> SearchTerm {

#[rstest]
fn should_load_codes(fake_data: &LocationsDb) {
assert!(fake_data.all.len() == 11)
assert!(fake_data.all.len() == 13)
}

#[rstest]
Expand Down Expand Up @@ -95,3 +95,61 @@ fn should_search_abercorn(fake_data: &LocationsDb, search_abercorn: SearchTerm)
assert![abercarn_loc.get_state() == "gb"];
assert![abercarn_loc.get_subdiv().unwrap() == "cay"];
}

/// Feeds a table of long free-text queries to the fake database and checks that each one
/// produces the expected number of results.
#[rstest]
fn should_search_long_sentence(fake_data: &LocationsDb) {
    // Each entry is (raw query, expected number of results).
    // The first and last entries are the same query.
    let cases: [(&'static str, usize); 9] = [
        ("WhereareallthedentistsinAbercornIwouldlisomesomewhere", 0),
        (
            "Where are all the dentists in Abercorn I would like to find some somewhere",
            1,
        ),
        (
            "Whereareallthedentists inAbercornIwouldliketofind some somewhere",
            0,
        ),
        (
            "Whereareallthedentists in Bognor Regis Iwouldlike some somewhere",
            1,
        ),
        (
            "Whereareallthedentists in Bognore Regis Iwouldlike some somewhere",
            1,
        ),
        (
            "Whereareallthedentists in Bognoreregis Iwouldlike some somewhere",
            1,
        ),
        (
            "Whereareallthedentists in BognoreRegistrar Iwouldlike some somewhere",
            0,
        ),
        ("Whereareallthedentists some somewhere", 0),
        ("WhereareallthedentistsinAbercornIwouldlisomesomewhere", 0),
    ];

    for &(query, expected) in cases.iter() {
        let term = SearchTerm::from_raw_query(query.to_string(), None, 5, 3);
        let results = fake_data.search(&term);
        let found = results.len();
        // Stop at the first mismatching query and report which one it was.
        assert!(
            found == expected,
            "Query: {}, results: {}, expected: {}",
            query,
            found,
            expected
        );
    }
}
Loading