Skip to content

Commit

Permalink
Merge pull request #91 from HSLdevcom/DT-5262
Browse files Browse the repository at this point in the history
DT-5262 and DT-5261
  • Loading branch information
optionsome authored Jan 28, 2022
2 parents 9d10380 + bca8642 commit 1e60877
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 87 deletions.
15 changes: 15 additions & 0 deletions controller/predicates/is_plain_multiword_name.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
const _ = require('lodash');

module.exports = (req, res) => {
if (req.clean.hasOwnProperty('parsed_text')) {
// do not match name with admins if name is a street addr or explict admin value was parsed from text
if(req.clean.parsed_text.hasOwnProperty('number') && req.clean.parsed_text.hasOwnProperty('street') ||
req.clean.parsed_text.regions
) {
return false;
}
// no street, no admins - maybe the name itself includes an admin: 'Vilppulan asema'
return req.clean.parsed_text.name && req.clean.parsed_text.name.includes(' ');
}
return false;
};
4 changes: 2 additions & 2 deletions controller/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function isRequestTimeout(err) {
return _.get(err, 'status') === 408;
}

function setup( apiConfig, esclient, query, should_execute ){
function setup( apiConfig, esclient, query, should_execute, options){
function controller( req, res, next ){
if (!should_execute(req, res)) {
return next();
Expand All @@ -23,7 +23,7 @@ function setup( apiConfig, esclient, query, should_execute ){
cleanOutput = logging.removeFields(cleanOutput);
}

const renderedQuery = query(req.clean);
const renderedQuery = query(req.clean, options);

// if there's no query to call ES with, skip the service
if (_.isUndefined(renderedQuery)) {
Expand Down
18 changes: 9 additions & 9 deletions middleware/confidenceScoreDT.js
Original file line number Diff line number Diff line change
Expand Up @@ -161,18 +161,18 @@ function compareResults(a, b) {
var plat1 = _.get(decodeAddendum(a.addendum), 'GTFS.platform');
var plat2 = _.get(decodeAddendum(b.addendum), 'GTFS.platform');
if (plat1 && plat2) {
var n1 = parseInt(plat1);
var n2 = parseInt(plat2);
var l1 = n1 + '';
var l2 = n2 + '';
if (!isNaN(n1) && !isNaN(n2) && l1.length==plat1.length && l2.length==plat2.length) {
// use numeric comparison
plat1 = n1;
plat2 = n2;
var p1 = parseInt(plat1);
var p2 = parseInt(plat2);
var l1 = p1 + '';
var l2 = p2 + '';
if (!isNaN(p1) && !isNaN(p2) && l1.length===plat1.length && l2.length===plat2.length) {
// use numeric comparison
plat1 = p1;
plat2 = p2;
}
diff = compareProperty(plat1, plat2);
if (diff) {
return diff;
return diff;
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions query/search_original.js
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ query.filter( peliasQuery.view.boundary_gid );
map request variables to query variables for all inputs
provided by this HTTP request.
**/
function generateQuery( clean ){
function generateQuery( clean, options ){

var vs = new peliasQuery.Vars( defaults );

Expand Down Expand Up @@ -137,7 +137,7 @@ function generateQuery( clean ){

// run the address parser
if( clean.parsed_text ){
textParser( clean, vs );
textParser( clean, vs, options );
}

return {
Expand Down
19 changes: 3 additions & 16 deletions query/text_parser_pelias.js
Original file line number Diff line number Diff line change
@@ -1,21 +1,8 @@
var logger = require('pelias-logger').get('api');
var placeTypes = require('../helper/placeTypes');

/*
This list should only contain admin fields we are comfortable matching in the case
when we can't identify parts of an address. This shouldn't contain fields like country_a
or postalcode because we should only try to match those when we're sure that's what they are.
*/
var adminFields = placeTypes.concat([
'region_a'
]);

/**
@todo: refactor me
**/

// all the address parsing logic
function addParsedVariablesToQueryVariables( clean, vs ){
function addParsedVariablesToQueryVariables( clean, vs, options ){

// is it a street address?
var isStreetAddress = clean.parsed_text.hasOwnProperty('number') && clean.parsed_text.hasOwnProperty('street');
Expand Down Expand Up @@ -79,7 +66,7 @@ function addParsedVariablesToQueryVariables( clean, vs ){
}
else if( clean.parsed_text.hasOwnProperty('regions') ){
leftoversString = clean.parsed_text.regions.join(' ');
} else if(!isStreetAddress && clean.parsed_text.name && clean.parsed_text.name.split(' ').length >1) {
} else if(options && options.matchNameToAdmin) {
leftoversString = clean.parsed_text.name; // apply unparsed text to boost region hits too: 'porin tori'
}

Expand All @@ -89,7 +76,7 @@ function addParsedVariablesToQueryVariables( clean, vs ){

// cycle through fields and set fields which
// are still currently unset
adminFields.forEach( function( key ){
placeTypes.forEach( function( key ){
if( !vs.isset( 'input:' + key ) ){
vs.var( 'input:' + key, leftoversString );
}
Expand Down
15 changes: 8 additions & 7 deletions routes/v1.js
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ const hasRequestErrors = require('../controller/predicates/has_request_errors');
const isCoarseReverse = require('../controller/predicates/is_coarse_reverse');
const isAdminOnlyAnalysis = require('../controller/predicates/is_admin_only_analysis');
const hasResultsAtLayers = require('../controller/predicates/has_results_at_layers');
const isAddressItParse = require('../controller/predicates/is_addressit_parse');
const isPlainMultiWordName = require('../controller/predicates/is_plain_multiword_name');
const hasRequestCategories = require('../controller/predicates/has_request_parameter')('categories');
const isOnlyNonAdminLayers = require('../controller/predicates/is_only_non_admin_layers');
// this can probably be more generalized
Expand Down Expand Up @@ -238,16 +238,16 @@ function addRoutes(app, peliasConfig) {
not(placeholderShouldHaveExecuted)
);

// call very old prod query if addressit was the parser
const oldProdQueryShouldExecute = all(
// try genitive matching if search seems to need it
const genitiveQueryShouldExecute = all(
not(hasRequestErrors),
isAddressItParse
not(hasGoodMatch),
isPlainMultiWordName
);

// call very old prod query if addressit was the parser
// try fuzzy matching if perfect match was not found
const fuzzyQueryShouldExecute = all(
not(hasRequestErrors),
isAddressItParse,
not(hasGoodMatch)
);

Expand Down Expand Up @@ -305,7 +305,8 @@ function addRoutes(app, peliasConfig) {
controllers.placeholder(placeholderService, geometricFiltersApply, placeholderIdsLookupShouldExecute),
controllers.placeholder(placeholderService, geometricFiltersApply, placeholderIdsLookupShouldExecute),
sanitizers.defer_to_addressit(shouldDeferToAddressIt),
controllers.search(peliasConfig.api, esclient, queries.very_old_prod, oldProdQueryShouldExecute),
controllers.search(peliasConfig.api, esclient, queries.very_old_prod, not(hasRequestErrors)),
controllers.search(peliasConfig.api, esclient, queries.very_old_prod, genitiveQueryShouldExecute, {matchNameToAdmin: true}),
controllers.search(peliasConfig.api, esclient, queries.fuzzy, fuzzyQueryShouldExecute),
postProc.trimByGranularity(),
postProc.distances('focus.point.'),
Expand Down
6 changes: 4 additions & 2 deletions sanitizer/_text.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ function _sanitize( raw, clean ){
// invalid input 'text'
const text = _.trim( _.trim( raw.text ), QUOTES );

if( !_.isString(text) || _.isEmpty(text) ){
messages.errors.push('invalid param \'text\': text length, must be >0');
if( !_.isString(text) || _.isEmpty(text) ||
(!text.match(/\d/) && !text.match(/[a-z]/i))
){
messages.errors.push('invalid param \'text\': text must have alphanumeric content');
} else {
clean.text = text;
}
Expand Down
106 changes: 57 additions & 49 deletions sanitizer/_text_addressit.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ var api = require('pelias-config').generate().api;
const QUOTES = `"'«»‘’‚‛“”„‟‹›⹂「」『』〝〞〟﹁﹂﹃﹄"'「」`;
const DELIM = ',';
const ADDRESSIT_MIN_CHAR_LENGTH = 4;
const MAX_REGIONS = 3;
const MAX_WORDS = 5;
const MAX_RAW_LEN = 120;

// List of values which should not be included in parsed regions array.
// Usually this includes country name(s) in a national setup.
Expand Down Expand Up @@ -51,14 +54,12 @@ if (api && api.localization) {
}
}


function addAdmin(parsedText, admin) {
if (parsedText.regions && parsedText.regions.indexOf(admin) > -1) {
return; // nop
}
parsedText.regions = parsedText.regions || [];
parsedText.regions.push(admin);
parsedText.admin_parts = (parsedText.admin_parts ? parsedText.admin_parts+', '+admin : admin);
}

function assignValidLibpostalParsing(parsedText, fromLibpostal, text) {
Expand Down Expand Up @@ -96,10 +97,6 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) {
parsedText.regions.splice(addrIndex, 1);
if (parsedText.regions.length === 0) {
delete parsedText.regions;
delete parsedText.admin_parts;
}
else {
parsedText.admin_parts = parsedText.regions.join(DELIM + ' ');
}
}
}
Expand Down Expand Up @@ -137,11 +134,6 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) {
if(check.assigned(fromLibpostal.postalcode) && postalCodeValidator(fromLibpostal.postalcode)) {
parsedText.postalcode = fromLibpostal.postalcode;
}

// remove postalcode from city name
if(check.assigned(parsedText.postalcode) && check.assigned(parsedText.admin_parts) ) {
parsedText.admin_parts = parsedText.admin_parts.replace(parsedText.postalcode, '');
}
}


Expand All @@ -161,55 +153,62 @@ function _sanitize( raw, clean ){
else {
// valid text
clean.text = normalize(raw.text);
clean.parser = 'addressit';
if (clean.text && clean.text.length > MAX_RAW_LEN) {
messages.warnings.push('Too long search string truncated');
clean.text = clean.text.substring(0, MAX_RAW_LEN);
}

// remove anything that may have been parsed before
var fromLibpostal = clean.parsed_text;
delete clean.parsed_text;

// parse text with query parser
var parsed_text = parse(clean);
var parsedText = parse(clean);

// use the libpostal parsed address components if available
if(check.assigned(fromLibpostal)) {
parsed_text = parsed_text || {};
assignValidLibpostalParsing(parsed_text, fromLibpostal, clean.text);
assignValidLibpostalParsing(parsedText, fromLibpostal, clean.text);
}

if (check.assigned(parsed_text) && Object.keys(parsed_text).length > 0) {
clean.parsed_text = parsed_text;
// validate search term complexity
if (parsedText.name && parsedText.name.includes(' ')) {
parsedText.name = parsedText.name.split(' ').slice(0, MAX_WORDS).join(' ');
}
if (parsedText.regions) {
for (var i=0; i<parsedText.regions.length; i++) {
if(parsedText.regions[i].includes(' ')) {
parsedText.regions[i] = parsedText.regions[i].split(' ').slice(0, MAX_WORDS).join(' ');
}
}
}
if (parsedText.regions) {
parsedText.admin_parts = parsedText.regions.join(DELIM + ' ');
}
}

return messages;
}

// naive approach - for admin matching during query time
// split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny'
var naive = function(tokens) {
var parsed_text = {};

if( tokens.length > 1 ){
parsed_text.name = tokens[0];
// remove postalcode from city name
if(check.assigned(parsedText.postalcode) && check.assigned(parsedText.admin_parts) ) {
parsedText.admin_parts = parsedText.admin_parts.replace(parsedText.postalcode, '');
}

// 1. slice away all parts after the first one
// 2. trim spaces from each part just in case
// 3. join the parts back together with appropriate delimiter and spacing
parsed_text.admin_parts = tokens.slice(1).join(`${DELIM} `);
if (Object.keys(parsedText).length > 0) {
clean.parsed_text = parsedText;
}
}

return parsed_text;
};
return messages;
}

function parse(clean) {
var parsedText = {};

// split query on delimiter, trim tokens and remove empty elements
var tokens = clean.text.split(DELIM)
.map( part => part.trim() )
.filter( part => part.length > 0 );

// call the naive parser to try and split tokens
var parsed_text = naive(tokens);
if( tokens.length > 1 ){
parsedText.name = tokens[0];
}

// join tokens back togther with normalized delimiters
var joined = tokens.join(`${DELIM} `);
Expand All @@ -222,16 +221,16 @@ function parse(clean) {
// copy fields from addressit response to parsed_text
for( var attr in parsed ){
if( 'text' === attr ){ continue; } // ignore 'text'
if( !_.isEmpty( parsed[ attr ] ) && _.isUndefined( parsed_text[ attr ] ) ){
parsed_text[ attr ] = parsed[ attr ];
if( !_.isEmpty( parsed[ attr ] ) && _.isUndefined( parsedText[ attr ] ) ){
parsedText[ attr ] = parsed[ attr ];
}
}
}

// if all we found was regions, ignore it as it is not enough information to make smarter decisions
if( Object.keys(parsed_text).length === 1 && !_.isUndefined(parsed_text.regions) ){
if( Object.keys(parsedText).length === 1 && !_.isUndefined(parsedText.regions) ){
logger.debug('AddressIt parsed regions only', {
parsed: parsed_text,
parsed: parsedText,
params: clean
});

Expand All @@ -241,25 +240,34 @@ function parse(clean) {

// addressit puts 1st parsed part (venue or street name) to regions[0].
// That is never desirable so drop the first item
if(cleanRegions && parsed_text.regions) {
if(parsed_text.regions.length>1) {
parsed_text.regions = parsed_text.regions.slice(1);
if(cleanRegions && parsedText.regions) {
if(parsedText.regions.length>1) {
parsedText.regions = parsedText.regions.slice(1);
} else {
delete parsed_text.regions;
delete parsedText.regions;
}
}

// remove undesired region values
if(parsed_text.regions && filteredRegions) {
parsed_text.regions = parsed_text.regions.filter(function(value) {
if(parsedText.regions && filteredRegions) {
parsedText.regions = parsedText.regions.filter(function(value) {
return(filteredRegions.indexOf(value)===-1);
});
if(parsed_text.regions.length===0) {
delete parsed_text.regions;
if(parsedText.regions.length===0) {
delete parsedText.regions;
}
}
if(parsedText.regions) {
// filter region duplicates and validate term count
parsedText.regions = parsedText.regions.filter(function(item, pos) {
return parsedText.regions.indexOf(item) === pos;
});
if (parsedText.regions.length >= MAX_REGIONS) {
parsedText.regions = parsedText.regions.slice(0, MAX_REGIONS);
}
}

return parsed_text;
return parsedText;
}

function _expected(){
Expand Down

0 comments on commit 1e60877

Please sign in to comment.