From 49d87724445e1e1b100bcbf586a02a116731c3fd Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Mon, 23 May 2022 15:42:52 +0300 Subject: [PATCH 1/7] More accurate possessive prefixing Modified search achieved by adding place/street in front of search string must not be longer than matched name. --- middleware/confidenceScoreDT.js | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/middleware/confidenceScoreDT.js b/middleware/confidenceScoreDT.js index 27ec4b35..8825f892 100644 --- a/middleware/confidenceScoreDT.js +++ b/middleware/confidenceScoreDT.js @@ -283,16 +283,19 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { return score; }; - var checkAdminName = function(_text, admin, name) { + var checkAdminName = function(_text, admin, name, limitLength) { admin = normalize(admin); if(admin && name.indexOf(admin) === -1) { - checkNewBest(_text, admin + ' ' + name); + const extendedName = admin + ' ' + name; + if(!limitLength || extendedName.length <= _text.length) { + checkNewBest(_text, admin + ' ' + name); + } } }; - var checkAdminNames = function(_text, admins, name) { + var checkAdminNames = function(_text, admins, name, limitLength) { admins.forEach(function(admin) { - checkAdminName(_text, admin, name); + checkAdminName(_text, admin, name, limitLength); }); }; @@ -312,15 +315,15 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { var admins = parent[key]; var check = Array.isArray(admins) ? 
checkAdminNames : checkAdminName; if(textLen > 2 + nameLen && textWC > nameWC) { // Shortest admin prefix is 'ii ' - check(text, admins, name); + check(text, admins, name, false); if (doc.street) { // try also street: 'helsinginkadun r-kioski' - checkAdminName(text, doc.street, name); + checkAdminName(text, doc.street, name, false); } } if (nameLen > 2 + textLen && nameWC > textWC) { - check(name, admins, text); + check(name, admins, text, true); if (doc.street) { - checkAdminName(name, doc.street, text); + checkAdminName(name, doc.street, text, true); } } } From ce8ab698ef44dc1a1ec2a9b96444c0cfe0a284c2 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Thu, 9 Jun 2022 15:00:05 +0300 Subject: [PATCH 2/7] Do not remove partial search terms which match US states (la, ny, ba etc) --- sanitizer/_text_addressit.js | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sanitizer/_text_addressit.js b/sanitizer/_text_addressit.js index 0b1d7315..ea0eb182 100644 --- a/sanitizer/_text_addressit.js +++ b/sanitizer/_text_addressit.js @@ -63,7 +63,6 @@ function addAdmin(parsedText, admin) { } function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { - // validate street number if(check.assigned(fromLibpostal.number) && streetNumberValidator(fromLibpostal.number) && fromLibpostal.street) { parsedText.number = fromLibpostal.number; @@ -109,6 +108,16 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { } } + // parser often misinterprets partial text (la, ny, etc) as US state + // we should reprogram parsing database with finnish addresses only! 
+ const state = fromLibpostal.state; + if (state) { + if (parsedText.name && parsedText.name.indexOf(state) === -1) { + parsedText.name = text; // parser is confused, search for full text + return; + } + } + const nbrh = fromLibpostal.neighbourhood; if(nbrh) { parsedText.neighbourhood = nbrh; From 0b7225c1bcf23893910afb38844d12247d838e0e Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Thu, 9 Jun 2022 16:36:22 +0300 Subject: [PATCH 3/7] Compute when needed --- middleware/confidenceScoreDT.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/middleware/confidenceScoreDT.js b/middleware/confidenceScoreDT.js index 8825f892..9a5f230e 100644 --- a/middleware/confidenceScoreDT.js +++ b/middleware/confidenceScoreDT.js @@ -306,10 +306,10 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { name = removeNumbers(name); } var nameLen = name.length; - var nameWC = name.split(' ').length; - var score = checkNewBest(text, name); if (score > genitiveThreshold && tryGenitive) { // don't prefix unless base match is OK + var nameWC = name.split(' ').length; + var score = checkNewBest(text, name); // prefix with parent admins to catch cases like 'kontulan r-kioski = r-kioski, kontula' for(var key in adminWeights) { var admins = parent[key]; From 8a0ae509ffd5ea5469a41eb27bda6afeffb2c866 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Thu, 9 Jun 2022 17:03:53 +0300 Subject: [PATCH 4/7] Lower the score of variated names slightly --- middleware/confidenceScoreDT.js | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/middleware/confidenceScoreDT.js b/middleware/confidenceScoreDT.js index 9a5f230e..bb2aa21d 100644 --- a/middleware/confidenceScoreDT.js +++ b/middleware/confidenceScoreDT.js @@ -273,8 +273,8 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { var parent = doc.parent || {}; var textWC = text.split(' ').length; - var checkNewBest = function(_text, name) { - var score = 
fuzzy.match(_text, name); + var checkNewBest = function(_text, name, coeff) { + var score = fuzzy.match(_text, name) * score; logger.debug('#', _text, '|', name, score); if (score >= bestScore ) { bestScore = score; @@ -283,12 +283,14 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { return score; }; + // strict length limit is not necessary against user typed search string + // which can be unfinished: porin raut vs rautatieasema, pori var checkAdminName = function(_text, admin, name, limitLength) { admin = normalize(admin); if(admin && name.indexOf(admin) === -1) { const extendedName = admin + ' ' + name; if(!limitLength || extendedName.length <= _text.length) { - checkNewBest(_text, admin + ' ' + name); + checkNewBest(_text, admin + ' ' + name, 0.99); } } }; @@ -309,19 +311,19 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { if (score > genitiveThreshold && tryGenitive) { // don't prefix unless base match is OK var nameWC = name.split(' ').length; - var score = checkNewBest(text, name); + var score = checkNewBest(text, name, 1.0); // prefix with parent admins to catch cases like 'kontulan r-kioski = r-kioski, kontula' for(var key in adminWeights) { var admins = parent[key]; - var check = Array.isArray(admins) ? checkAdminNames : checkAdminName; + var adminCheck = Array.isArray(admins) ? 
checkAdminNames : checkAdminName; if(textLen > 2 + nameLen && textWC > nameWC) { // Shortest admin prefix is 'ii ' - check(text, admins, name, false); + adminCheck(text, admins, name, false); if (doc.street) { // try also street: 'helsinginkadun r-kioski' checkAdminName(text, doc.street, name, false); } } if (nameLen > 2 + textLen && nameWC > textWC) { - check(name, admins, text, true); + adminCheck(name, admins, text, true); if (doc.street) { checkAdminName(name, doc.street, text, true); } From 79046f5d5dfd7ae24d4307f1a5817d5229267a78 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Fri, 10 Jun 2022 07:26:08 +0300 Subject: [PATCH 5/7] Fix bugs --- middleware/confidenceScoreDT.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/middleware/confidenceScoreDT.js b/middleware/confidenceScoreDT.js index bb2aa21d..6d17b1a7 100644 --- a/middleware/confidenceScoreDT.js +++ b/middleware/confidenceScoreDT.js @@ -274,7 +274,7 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { var textWC = text.split(' ').length; var checkNewBest = function(_text, name, coeff) { - var score = fuzzy.match(_text, name) * score; + var score = fuzzy.match(_text, name) * coeff; logger.debug('#', _text, '|', name, score); if (score >= bestScore ) { bestScore = score; @@ -307,11 +307,11 @@ function checkLanguageNames(text, doc, stripNumbers, tryGenitive) { if(stripNumbers) { name = removeNumbers(name); } - var nameLen = name.length; - + var score = checkNewBest(text, name, 1.0); if (score > genitiveThreshold && tryGenitive) { // don't prefix unless base match is OK var nameWC = name.split(' ').length; - var score = checkNewBest(text, name, 1.0); + var nameLen = name.length; + // prefix with parent admins to catch cases like 'kontulan r-kioski = r-kioski, kontula' for(var key in adminWeights) { var admins = parent[key]; From 68c34c292201ea2480b80e54f2e27c156b96e470 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Fri, 10 Jun 2022 07:29:07 +0300 Subject: 
[PATCH 6/7] Detect also country parsing mistakes --- sanitizer/_text_addressit.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sanitizer/_text_addressit.js b/sanitizer/_text_addressit.js index ea0eb182..8e43875b 100644 --- a/sanitizer/_text_addressit.js +++ b/sanitizer/_text_addressit.js @@ -108,11 +108,11 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { } } - // parser often misinterprets partial text (la, ny, etc) as US state + // parser often misinterprets partial text (la, ny, etc) as US state or country // we should reprogram parsing database with finnish addresses only! - const state = fromLibpostal.state; - if (state) { - if (parsedText.name && parsedText.name.indexOf(state) === -1) { + const mistake = fromLibpostal.state || fromLibpostal.country; + if (mistake) { + if (parsedText.name && parsedText.name.indexOf(mistake) === -1) { parsedText.name = text; // parser is confused, search for full text return; } From cd91e53600ce1de4666b9212096f91dae6c5c625 Mon Sep 17 00:00:00 2001 From: Vesa Meskanen Date: Fri, 10 Jun 2022 16:04:45 +0300 Subject: [PATCH 7/7] More general handling of wrong parsing --- sanitizer/_text_addressit.js | 37 +++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/sanitizer/_text_addressit.js b/sanitizer/_text_addressit.js index 8e43875b..88417314 100644 --- a/sanitizer/_text_addressit.js +++ b/sanitizer/_text_addressit.js @@ -37,6 +37,8 @@ var cleanRegions; var postalCodeValidator = function(code) { return true; }; // default = accept everything var streetNumberValidator = function(code) { return true; }; +const unusedParse = ['state', 'county', 'country', 'borough']; + if (api && api.localization) { filteredRegions = api.localization.filteredRegions; cleanRegions = api.localization.cleanRegions; @@ -108,16 +110,6 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { } } - // parser often misinterprets partial text 
(la, ny, etc) as US state or country - // we should reprogram parsing database with finnish addresses only! - const mistake = fromLibpostal.state || fromLibpostal.country; - if (mistake) { - if (parsedText.name && parsedText.name.indexOf(mistake) === -1) { - parsedText.name = text; // parser is confused, search for full text - return; - } - } - const nbrh = fromLibpostal.neighbourhood; if(nbrh) { parsedText.neighbourhood = nbrh; @@ -150,6 +142,25 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { if(check.assigned(fromLibpostal.postalcode) && postalCodeValidator(fromLibpostal.postalcode)) { parsedText.postalcode = fromLibpostal.postalcode; } + + // parser often misinterprets partial text (la, ny, etc) as US state, county or country + // so that some part of search text can get totally ignored + // we should reprogram parsing database with Finnish addresses only! + // Libpostal does not document how to do that. + unusedParse.forEach(key => { + const mistake = fromLibpostal[key]; + // check if parser has erased some search components which would be ignored in search + if (mistake) { + if ((!parsedText.name || parsedText.name.indexOf(mistake) === -1) && + (!parsedText.street || parsedText.street.indexOf(mistake) === -1) && + (!parsedText.regions || parsedText.regions.indexOf(mistake) === -1) + ) { + // parser is confused, search for full text + parsedText.name = text; + delete parsedText.regions; + } + } + }); } @@ -192,9 +203,9 @@ function _sanitize( raw, clean ){ } if (parsedText.regions) { for (var i=0; i