diff --git a/controller/predicates/is_plain_multiword_name.js b/controller/predicates/is_plain_multiword_name.js new file mode 100644 index 00000000..3b56cfaa --- /dev/null +++ b/controller/predicates/is_plain_multiword_name.js @@ -0,0 +1,15 @@ +const _ = require('lodash'); + +module.exports = (req, res) => { + if (req.clean.hasOwnProperty('parsed_text')) { + // do not match name with admins if name is a street addr or explict admin value was parsed from text + if(req.clean.parsed_text.hasOwnProperty('number') && req.clean.parsed_text.hasOwnProperty('street') || + req.clean.parsed_text.regions + ) { + return false; + } + // no street, no admins - maybe the name itself includes an admin: 'Vilppulan asema' + return req.clean.parsed_text.name && req.clean.parsed_text.name.includes(' '); + } + return false; +}; diff --git a/controller/search.js b/controller/search.js index 35bc8ce6..76d42196 100644 --- a/controller/search.js +++ b/controller/search.js @@ -10,7 +10,7 @@ function isRequestTimeout(err) { return _.get(err, 'status') === 408; } -function setup( apiConfig, esclient, query, should_execute ){ +function setup( apiConfig, esclient, query, should_execute, options){ function controller( req, res, next ){ if (!should_execute(req, res)) { return next(); @@ -23,7 +23,7 @@ function setup( apiConfig, esclient, query, should_execute ){ cleanOutput = logging.removeFields(cleanOutput); } - const renderedQuery = query(req.clean); + const renderedQuery = query(req.clean, options); // if there's no query to call ES with, skip the service if (_.isUndefined(renderedQuery)) { diff --git a/middleware/confidenceScoreDT.js b/middleware/confidenceScoreDT.js index 8776d43e..6f45a655 100644 --- a/middleware/confidenceScoreDT.js +++ b/middleware/confidenceScoreDT.js @@ -161,18 +161,18 @@ function compareResults(a, b) { var plat1 = _.get(decodeAddendum(a.addendum), 'GTFS.platform'); var plat2 = _.get(decodeAddendum(b.addendum), 'GTFS.platform'); if (plat1 && plat2) { - var n1 = parseInt(plat1); - var n2 = parseInt(plat2); - var l1 = n1 + ''; - var l2 = n2 + ''; - if (!isNaN(n1) && !isNaN(n2) && l1.length==plat1.length && l2.length==plat2.length) { - // use numeric comparison - plat1 = n1; - plat2 = n2; + var p1 = parseInt(plat1); + var p2 = parseInt(plat2); + var l1 = p1 + ''; + var l2 = p2 + ''; + if (!isNaN(p1) && !isNaN(p2) && l1.length===plat1.length && l2.length===plat2.length) { + // use numeric comparison + plat1 = p1; + plat2 = p2; } diff = compareProperty(plat1, plat2); if (diff) { - return diff; + return diff; } } } diff --git a/query/search_original.js b/query/search_original.js index 49a89e2a..8af5f346 100644 --- a/query/search_original.js +++ b/query/search_original.js @@ -50,7 +50,7 @@ query.filter( peliasQuery.view.boundary_gid ); map request variables to query variables for all inputs provided by this HTTP request. **/ -function generateQuery( clean ){ +function generateQuery( clean, options ){ var vs = new peliasQuery.Vars( defaults ); @@ -137,7 +137,7 @@ function generateQuery( clean ){ // run the address parser if( clean.parsed_text ){ - textParser( clean, vs ); + textParser( clean, vs, options ); } return { diff --git a/query/text_parser_pelias.js b/query/text_parser_pelias.js index 88ce9b50..c6150181 100644 --- a/query/text_parser_pelias.js +++ b/query/text_parser_pelias.js @@ -1,21 +1,8 @@ var logger = require('pelias-logger').get('api'); var placeTypes = require('../helper/placeTypes'); -/* -This list should only contain admin fields we are comfortable matching in the case -when we can't identify parts of an address. This shouldn't contain fields like country_a -or postalcode because we should only try to match those when we're sure that's what they are. - */ -var adminFields = placeTypes.concat([ - 'region_a' -]); - -/** - @todo: refactor me -**/ - // all the address parsing logic -function addParsedVariablesToQueryVariables( clean, vs ){ +function addParsedVariablesToQueryVariables( clean, vs, options ){ // is it a street address? var isStreetAddress = clean.parsed_text.hasOwnProperty('number') && clean.parsed_text.hasOwnProperty('street'); @@ -79,7 +66,7 @@ function addParsedVariablesToQueryVariables( clean, vs ){ } else if( clean.parsed_text.hasOwnProperty('regions') ){ leftoversString = clean.parsed_text.regions.join(' '); - } else if(!isStreetAddress && clean.parsed_text.name && clean.parsed_text.name.split(' ').length >1) { + } else if(options && options.matchNameToAdmin) { leftoversString = clean.parsed_text.name; // apply unparsed text to boost region hits too: 'porin tori' } @@ -89,7 +76,7 @@ function addParsedVariablesToQueryVariables( clean, vs ){ // cycle through fields and set fields which // are still currently unset - adminFields.forEach( function( key ){ + placeTypes.forEach( function( key ){ if( !vs.isset( 'input:' + key ) ){ vs.var( 'input:' + key, leftoversString ); } diff --git a/routes/v1.js b/routes/v1.js index 3531cf27..c982deef 100644 --- a/routes/v1.js +++ b/routes/v1.js @@ -80,7 +80,7 @@ const hasRequestErrors = require('../controller/predicates/has_request_errors'); const isCoarseReverse = require('../controller/predicates/is_coarse_reverse'); const isAdminOnlyAnalysis = require('../controller/predicates/is_admin_only_analysis'); const hasResultsAtLayers = require('../controller/predicates/has_results_at_layers'); -const isAddressItParse = require('../controller/predicates/is_addressit_parse'); +const isPlainMultiWordName = require('../controller/predicates/is_plain_multiword_name'); const hasRequestCategories = require('../controller/predicates/has_request_parameter')('categories'); const isOnlyNonAdminLayers = require('../controller/predicates/is_only_non_admin_layers'); // this can probably be more generalized @@ -238,16 +238,16 @@ function addRoutes(app, peliasConfig) { not(placeholderShouldHaveExecuted) ); - // call very old prod query if addressit was the parser - const oldProdQueryShouldExecute = all( + // try genitive matching if search seems to need it + const genitiveQueryShouldExecute = all( not(hasRequestErrors), - isAddressItParse + not(hasGoodMatch), + isPlainMultiWordName ); - // call very old prod query if addressit was the parser + // try fuzzy matching if perfect match was not found const fuzzyQueryShouldExecute = all( not(hasRequestErrors), - isAddressItParse, not(hasGoodMatch) ); @@ -305,7 +305,8 @@ function addRoutes(app, peliasConfig) { controllers.placeholder(placeholderService, geometricFiltersApply, placeholderIdsLookupShouldExecute), controllers.placeholder(placeholderService, geometricFiltersApply, placeholderIdsLookupShouldExecute), sanitizers.defer_to_addressit(shouldDeferToAddressIt), - controllers.search(peliasConfig.api, esclient, queries.very_old_prod, oldProdQueryShouldExecute), + controllers.search(peliasConfig.api, esclient, queries.very_old_prod, not(hasRequestErrors)), + controllers.search(peliasConfig.api, esclient, queries.very_old_prod, genitiveQueryShouldExecute, {matchNameToAdmin: true}), controllers.search(peliasConfig.api, esclient, queries.fuzzy, fuzzyQueryShouldExecute), postProc.trimByGranularity(), postProc.distances('focus.point.'), diff --git a/sanitizer/_text.js b/sanitizer/_text.js index 0989faa3..c11c5481 100644 --- a/sanitizer/_text.js +++ b/sanitizer/_text.js @@ -12,8 +12,10 @@ function _sanitize( raw, clean ){ // invalid input 'text' const text = _.trim( _.trim( raw.text ), QUOTES ); - if( !_.isString(text) || _.isEmpty(text) ){ - messages.errors.push('invalid param \'text\': text length, must be >0'); + if( !_.isString(text) || _.isEmpty(text) || + (!text.match(/\d/) && !text.match(/[a-z]/i)) + ){ + messages.errors.push('invalid param \'text\': text must have alphanumeric content'); } else { clean.text = text; } diff --git a/sanitizer/_text_addressit.js b/sanitizer/_text_addressit.js index eac986fc..1940f1d3 100644 --- a/sanitizer/_text_addressit.js +++ b/sanitizer/_text_addressit.js @@ -24,6 +24,9 @@ var api = require('pelias-config').generate().api; const QUOTES = `"'«»‘’‚‛“”„‟‹›⹂「」『』〝〞〟﹁﹂﹃﹄"'「」`; const DELIM = ','; const ADDRESSIT_MIN_CHAR_LENGTH = 4; +const MAX_REGIONS = 3; +const MAX_WORDS = 5; +const MAX_RAW_LEN = 120; // List of values which should not be included in parsed regions array. // Usually this includes country name(s) in a national setup. @@ -51,14 +54,12 @@ if (api && api.localization) { } } - function addAdmin(parsedText, admin) { if (parsedText.regions && parsedText.regions.indexOf(admin) > -1) { return; // nop } parsedText.regions = parsedText.regions || []; parsedText.regions.push(admin); - parsedText.admin_parts = (parsedText.admin_parts ? parsedText.admin_parts+', '+admin : admin); } function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { @@ -96,10 +97,6 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { parsedText.regions.splice(addrIndex, 1); if (parsedText.regions.length === 0) { delete parsedText.regions; - delete parsedText.admin_parts; - } - else { - parsedText.admin_parts = parsedText.regions.join(DELIM + ' '); } } } @@ -137,11 +134,6 @@ function assignValidLibpostalParsing(parsedText, fromLibpostal, text) { if(check.assigned(fromLibpostal.postalcode) && postalCodeValidator(fromLibpostal.postalcode)) { parsedText.postalcode = fromLibpostal.postalcode; } - - // remove postalcode from city name - if(check.assigned(parsedText.postalcode) && check.assigned(parsedText.admin_parts) ) { - parsedText.admin_parts = parsedText.admin_parts.replace(parsedText.postalcode, ''); - } } @@ -161,55 +153,62 @@ function _sanitize( raw, clean ){ else { // valid text clean.text = normalize(raw.text); - clean.parser = 'addressit'; + if (clean.text && clean.text.length > MAX_RAW_LEN) { + messages.warnings.push('Too long search string truncated'); + clean.text = clean.text.substring(0, MAX_RAW_LEN); + } // remove anything that may have been parsed before var fromLibpostal = clean.parsed_text; delete clean.parsed_text; // parse text with query parser - var parsed_text = parse(clean); + var parsedText = parse(clean); // use the libpostal parsed address components if available if(check.assigned(fromLibpostal)) { - parsed_text = parsed_text || {}; - assignValidLibpostalParsing(parsed_text, fromLibpostal, clean.text); + assignValidLibpostalParsing(parsedText, fromLibpostal, clean.text); } - if (check.assigned(parsed_text) && Object.keys(parsed_text).length > 0) { - clean.parsed_text = parsed_text; + // validate search term complexity + if (parsedText.name && parsedText.name.includes(' ')) { + parsedText.name = parsedText.name.split(' ').slice(0, MAX_WORDS).join(' '); + } + if (parsedText.regions) { + for (var i=0; i 1 ){ - parsed_text.name = tokens[0]; + // remove postalcode from city name + if(check.assigned(parsedText.postalcode) && check.assigned(parsedText.admin_parts) ) { + parsedText.admin_parts = parsedText.admin_parts.replace(parsedText.postalcode, ''); + } - // 1. slice away all parts after the first one - // 2. trim spaces from each part just in case - // 3. join the parts back together with appropriate delimiter and spacing - parsed_text.admin_parts = tokens.slice(1).join(`${DELIM} `); + if (Object.keys(parsedText).length > 0) { + clean.parsed_text = parsedText; + } } - return parsed_text; -}; + return messages; +} function parse(clean) { + var parsedText = {}; // split query on delimiter, trim tokens and remove empty elements var tokens = clean.text.split(DELIM) .map( part => part.trim() ) .filter( part => part.length > 0 ); - // call the naive parser to try and split tokens - var parsed_text = naive(tokens); + if( tokens.length > 1 ){ + parsedText.name = tokens[0]; + } // join tokens back togther with normalized delimiters var joined = tokens.join(`${DELIM} `); @@ -222,16 +221,16 @@ function parse(clean) { // copy fields from addressit response to parsed_text for( var attr in parsed ){ if( 'text' === attr ){ continue; } // ignore 'text' - if( !_.isEmpty( parsed[ attr ] ) && _.isUndefined( parsed_text[ attr ] ) ){ - parsed_text[ attr ] = parsed[ attr ]; + if( !_.isEmpty( parsed[ attr ] ) && _.isUndefined( parsedText[ attr ] ) ){ + parsedText[ attr ] = parsed[ attr ]; } } } // if all we found was regions, ignore it as it is not enough information to make smarter decisions - if( Object.keys(parsed_text).length === 1 && !_.isUndefined(parsed_text.regions) ){ + if( Object.keys(parsedText).length === 1 && !_.isUndefined(parsedText.regions) ){ logger.debug('AddressIt parsed regions only', { - parsed: parsed_text, + parsed: parsedText, params: clean }); @@ -241,25 +240,34 @@ function parse(clean) { // addressit puts 1st parsed part (venue or street name) to regions[0]. // That is never desirable so drop the first item - if(cleanRegions && parsed_text.regions) { - if(parsed_text.regions.length>1) { - parsed_text.regions = parsed_text.regions.slice(1); + if(cleanRegions && parsedText.regions) { + if(parsedText.regions.length>1) { + parsedText.regions = parsedText.regions.slice(1); } else { - delete parsed_text.regions; + delete parsedText.regions; } } // remove undesired region values - if(parsed_text.regions && filteredRegions) { - parsed_text.regions = parsed_text.regions.filter(function(value) { + if(parsedText.regions && filteredRegions) { + parsedText.regions = parsedText.regions.filter(function(value) { return(filteredRegions.indexOf(value)===-1); }); - if(parsed_text.regions.length===0) { - delete parsed_text.regions; + if(parsedText.regions.length===0) { + delete parsedText.regions; + } + } + if(parsedText.regions) { + // filter region duplicates and validate term count + parsedText.regions = parsedText.regions.filter(function(item, pos) { + return parsedText.regions.indexOf(item) === pos; + }); + if (parsedText.regions.length >= MAX_REGIONS) { + parsedText.regions = parsedText.regions.slice(0, MAX_REGIONS); } } - return parsed_text; + return parsedText; } function _expected(){