Skip to content

Commit

Permalink
Merge pull request #96 from HSLdevcom/DT-5427
Browse files Browse the repository at this point in the history
DT-5427 improve parsing and scoring
  • Loading branch information
MikkoPuustinen authored Jun 16, 2022
2 parents 3db01ae + cd91e53 commit 78e42d1
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 19 deletions.
35 changes: 20 additions & 15 deletions middleware/confidenceScoreDT.js
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,8 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) {
var parent = doc.parent || {};
var textWC = text.split(' ').length;

var checkNewBest = function(_text, name) {
var score = fuzzy.match(_text, name);
var checkNewBest = function(_text, name, coeff) {
var score = fuzzy.match(_text, name) * coeff;
logger.debug('#', _text, '|', name, score);
if (score >= bestScore ) {
bestScore = score;
Expand All @@ -283,16 +283,21 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) {
return score;
};

var checkAdminName = function(_text, admin, name) {
// A strict length limit is not necessary when matching against the user-typed search string,
// which can be unfinished: 'porin raut' vs 'rautatieasema, pori'
var checkAdminName = function(_text, admin, name, limitLength) {
admin = normalize(admin);
if(admin && name.indexOf(admin) === -1) {
checkNewBest(_text, admin + ' ' + name);
const extendedName = admin + ' ' + name;
if(!limitLength || extendedName.length <= _text.length) {
checkNewBest(_text, admin + ' ' + name, 0.99);
}
}
};

var checkAdminNames = function(_text, admins, name) {
var checkAdminNames = function(_text, admins, name, limitLength) {
admins.forEach(function(admin) {
checkAdminName(_text, admin, name);
checkAdminName(_text, admin, name, limitLength);
});
};

Expand All @@ -302,25 +307,25 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) {
if(stripNumbers) {
name = removeNumbers(name);
}
var nameLen = name.length;
var nameWC = name.split(' ').length;
var score = checkNewBest(text, name);

var score = checkNewBest(text, name, 1.0);
if (score > genitiveThreshold && tryGenitive) { // don't prefix unless base match is OK
var nameWC = name.split(' ').length;
var nameLen = name.length;

// prefix with parent admins to catch cases like 'kontulan r-kioski' = 'r-kioski, kontula'
for(var key in adminWeights) {
var admins = parent[key];
var check = Array.isArray(admins) ? checkAdminNames : checkAdminName;
var adminCheck = Array.isArray(admins) ? checkAdminNames : checkAdminName;
if(textLen > 2 + nameLen && textWC > nameWC) { // Shortest admin prefix is 'ii '
check(text, admins, name);
adminCheck(text, admins, name, false);
if (doc.street) { // try also street: 'helsinginkadun r-kioski'
checkAdminName(text, doc.street, name);
checkAdminName(text, doc.street, name, false);
}
}
if (nameLen > 2 + textLen && nameWC > textWC) {
check(name, admins, text);
adminCheck(name, admins, text, true);
if (doc.street) {
checkAdminName(name, doc.street, text);
checkAdminName(name, doc.street, text, true);
}
}
}
Expand Down
28 changes: 24 additions & 4 deletions sanitizer/_text_addressit.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ var cleanRegions;
var postalCodeValidator = function(code) { return true; }; // default = accept everything
var streetNumberValidator = function(code) { return true; };

const unusedParse = ['state', 'county', 'country', 'borough'];

if (api && api.localization) {
filteredRegions = api.localization.filteredRegions;
cleanRegions = api.localization.cleanRegions;
Expand All @@ -63,7 +65,6 @@ function addAdmin(parsedText, admin) {
}

function assignValidLibpostalParsing(parsedText, fromLibpostal, text) {

// validate street number
if(check.assigned(fromLibpostal.number) && streetNumberValidator(fromLibpostal.number) && fromLibpostal.street) {
parsedText.number = fromLibpostal.number;
Expand Down Expand Up @@ -141,6 +142,25 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) {
if(check.assigned(fromLibpostal.postalcode) && postalCodeValidator(fromLibpostal.postalcode)) {
parsedText.postalcode = fromLibpostal.postalcode;
}

// The parser often misinterprets partial text ('la', 'ny', etc.) as a US state, county or country,
// so that some part of the search text can get totally ignored.
// We should reprogram the parsing database with Finnish addresses only!
// Libpostal does not document how to do that.
unusedParse.forEach(key => {
const mistake = fromLibpostal[key];
// check if parser has erased some search components which would be ignored in search
if (mistake) {
if ((!parsedText.name || parsedText.name.indexOf(mistake) === -1) &&
(!parsedText.street || parsedText.street.indexOf(mistake) === -1) &&
(!parsedText.regions || parsedText.regions.indexOf(mistake) === -1)
) {
// parser is confused, search for full text
parsedText.name = text;
delete parsedText.regions;
}
}
});
}


Expand Down Expand Up @@ -183,9 +203,9 @@ function _sanitize( raw, clean ){
}
if (parsedText.regions) {
for (var i=0; i<parsedText.regions.length; i++) {
if(parsedText.regions[i].includes(' ')) {
parsedText.regions[i] = parsedText.regions[i].split(' ').slice(0, MAX_WORDS).join(' ');
}
if(parsedText.regions[i].includes(' ')) {
parsedText.regions[i] = parsedText.regions[i].split(' ').slice(0, MAX_WORDS).join(' ');
}
}
}
if (parsedText.regions) {
Expand Down

0 comments on commit 78e42d1

Please sign in to comment.