Skip to content

Commit

Permalink
Merge pull request #1219 from pelias/max_character_count_layer_filter
Browse files Browse the repository at this point in the history
feat(autocomplete): filter out address records for character counts below threshold
  • Loading branch information
orangejulius authored Nov 3, 2018
2 parents aae1d7d + 3c37e94 commit 32c295d
Show file tree
Hide file tree
Showing 11 changed files with 505 additions and 17 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ The API recognizes the following properties under the top-level `api` key in you
|`defaultParameters.focus.point.lon` <br> `defaultParameters.focus.point.lat`|no | |default coordinates for focus point
|`targets.layers_by_source` <br> `targets.source_aliases` <br> `targets.layer_aliases`|no | |custom values for which `sources` and `layers` the API accepts ([more info](https://github.com/pelias/api/pull/1131)).
|`customBoosts` | no | `{}` | Allows configuring boosts for specific sources and layers, in order to influence result order. See [Configurable Boosts](#custom-boosts) below for details |
|`autocomplete.exclude_address_length` | no | 0 | As a performance optimization, this optional parameter allows excluding address results for queries whose length is at or below the configured number of characters. Addresses are usually the bulk of the records in Elasticsearch, and searching across all of them for very short text inputs can be slow, with little benefit. Consider setting this to 1 or 2 if you have several million addresses in Pelias. |
|`indexName`|*no*|*pelias*|name of the Elasticsearch index to be used when building queries|
|`attributionURL`|no| (autodetected)|The full URL to use for the attribution link returned in all Pelias responses. Pelias will attempt to autodetect this host, but it will often be incorrect if, for example, there is a proxy between Pelias and its users. This parameter allows setting a specific URL to avoid any such issues|
|`accessLog`|*no*||name of the format to use for access logs; may be any one of the [predefined values](https://github.com/expressjs/morgan#predefined-formats) in the `morgan` package. Defaults to `"common"`; if set to `false`, or an otherwise falsy value, disables access-logging entirely.|
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"markdown": "^0.5.0",
"morgan": "^1.8.2",
"pelias-categories": "^1.2.0",
"pelias-config": "^3.7.0",
"pelias-config": "^3.8.0",
"pelias-labels": "^1.8.0",
"pelias-logger": "^1.2.0",
"pelias-microservice-wrapper": "^1.7.0",
Expand Down
8 changes: 5 additions & 3 deletions query/autocomplete.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const defaults = require('./autocomplete_defaults');
const textParser = require('./text_parser_addressit');
const check = require('check-types');
const logger = require('pelias-logger').get('api');
const config = require('pelias-config').generate().api;
const config = require('pelias-config').generate();

// additional views (these may be merged in to pelias/query at a later date)
var views = {
Expand All @@ -12,7 +12,8 @@ var views = {
ngrams_last_token_only: require('./view/ngrams_last_token_only'),
phrase_first_tokens_only: require('./view/phrase_first_tokens_only'),
pop_subquery: require('./view/pop_subquery'),
boost_exact_matches: require('./view/boost_exact_matches')
boost_exact_matches: require('./view/boost_exact_matches'),
max_character_count_layer_filter: require('./view/max_character_count_layer_filter')
};

//------------------------------
Expand Down Expand Up @@ -45,9 +46,10 @@ query.score( views.boost_exact_matches );
query.score( peliasQuery.view.focus( views.ngrams_strict ) );
query.score( peliasQuery.view.popularity( views.pop_subquery ) );
query.score( peliasQuery.view.population( views.pop_subquery ) );
query.score( views.custom_boosts( config.customBoosts ) );
query.score( views.custom_boosts( config.get('api.customBoosts') ) );

// non-scoring hard filters
query.filter( views.max_character_count_layer_filter(['address'], config.get('api.autocomplete.exclude_address_length' ) ) );
query.filter( peliasQuery.view.sources );
query.filter( peliasQuery.view.layers );
query.filter( peliasQuery.view.boundary_rect );
Expand Down
58 changes: 58 additions & 0 deletions query/view/max_character_count_layer_filter.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
const _ = require('lodash');
const peliasQuery = require('pelias-query');
const allLayers = require('../../helper/type_mapping').layers;

/**
Layer terms filter view which counts the length of 'input:name' and only
applies the filter condition if the text is shorter than or equal to $maxCharCount.
You must provide a list of $excludedLayers, all layers listed in the type mapping
will be targeted, minus any listed in $excludedLayers.
eg. to filter by 'layer=address' for all one & two digit inputs:
view = filter(['address'],2)
**/

// lowest and highest valid character count (enforced)
const MIN_CHAR_COUNT = 1;
const MAX_CHAR_COUNT = 99;

module.exports = function( excludedLayers, maxCharCount ) {

// validate args, return no-op view if invalid
if( !_.isArray(excludedLayers) || _.isEmpty(excludedLayers) ||
!_.isNumber(maxCharCount) ){
return () => null;
}

// create an array containing all layers minus excluded layers
let includedLayers = _.difference(allLayers, excludedLayers);

// included layers is equal to all layers, return no-op view
if( includedLayers.length === allLayers.length ){
return () => null;
}

// create a new VariableStore with only the layers property
var vsWithOnlyIncludedLayers = new peliasQuery.Vars({ 'layers': includedLayers });

// ensure char count is within a reasonable range
maxCharCount = _.clamp(maxCharCount, MIN_CHAR_COUNT, MAX_CHAR_COUNT);

return function( vs ){

// validate required params
if( !vs.isset('input:name') ){
return null;
}

// enforce maximum character length
let charCount = vs.var('input:name').toString().length;
if( !_.inRange(charCount, 1, maxCharCount+1) ){
return null;
}

// use existing 'layers' query
return peliasQuery.view.layers(vsWithOnlyIncludedLayers);
};
};
86 changes: 86 additions & 0 deletions test/unit/fixture/autocomplete_linguistic_one_char_token.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
module.exports = {
'query': {
'bool': {
'must': [{
'constant_score': {
'query': {
'match': {
'name.default': {
'analyzer': 'peliasQueryPartialToken',
'boost': 100,
'query': 't',
'type': 'phrase',
'operator': 'and',
'slop': 3
}
}
}
}
}],
'should':[{
'function_score': {
'query': {
'match_all': {}
},
'max_boost': 20,
'score_mode': 'first',
'boost_mode': 'replace',
'functions': [{
'field_value_factor': {
'modifier': 'log1p',
'field': 'popularity',
'missing': 1
},
'weight': 1
}]
}
},{
'function_score': {
'query': {
'match_all': {}
},
'max_boost': 20,
'score_mode': 'first',
'boost_mode': 'replace',
'functions': [{
'field_value_factor': {
'modifier': 'log1p',
'field': 'population',
'missing': 1
},
'weight': 3
}]
}
}],
'filter': [{
'terms': {
'layer': [
'venue',
'street',
'country',
'macroregion',
'region',
'county',
'localadmin',
'locality',
'borough',
'neighbourhood',
'continent',
'empire',
'dependency',
'macrocounty',
'macrohood',
'microhood',
'disputed',
'postalcode',
'ocean',
'marinearea'
]
}
}]
}
},
'sort': [ '_score' ],
'size': 20,
'track_scores': true
};
60 changes: 60 additions & 0 deletions test/unit/fixture/autocomplete_linguistic_three_char_token.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
module.exports = {
'query': {
'bool': {
'must': [{
'constant_score': {
'query': {
'match': {
'name.default': {
'analyzer': 'peliasQueryPartialToken',
'boost': 100,
'query': 'tes',
'type': 'phrase',
'operator': 'and',
'slop': 3
}
}
}
}
}],
'should':[{
'function_score': {
'query': {
'match_all': {}
},
'max_boost': 20,
'score_mode': 'first',
'boost_mode': 'replace',
'functions': [{
'field_value_factor': {
'modifier': 'log1p',
'field': 'popularity',
'missing': 1
},
'weight': 1
}]
}
},{
'function_score': {
'query': {
'match_all': {}
},
'max_boost': 20,
'score_mode': 'first',
'boost_mode': 'replace',
'functions': [{
'field_value_factor': {
'modifier': 'log1p',
'field': 'population',
'missing': 1
},
'weight': 3
}]
}
}]
}
},
'sort': [ '_score' ],
'size': 20,
'track_scores': true
};
86 changes: 86 additions & 0 deletions test/unit/fixture/autocomplete_linguistic_two_char_token.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
module.exports = {
'query': {
'bool': {
'must': [{
'constant_score': {
'query': {
'match': {
'name.default': {
'analyzer': 'peliasQueryPartialToken',
'boost': 100,
'query': 'te',
'type': 'phrase',
'operator': 'and',
'slop': 3
}
}
}
}
}],
'should':[{
'function_score': {
'query': {
'match_all': {}
},
'max_boost': 20,
'score_mode': 'first',
'boost_mode': 'replace',
'functions': [{
'field_value_factor': {
'modifier': 'log1p',
'field': 'popularity',
'missing': 1
},
'weight': 1
}]
}
},{
'function_score': {
'query': {
'match_all': {}
},
'max_boost': 20,
'score_mode': 'first',
'boost_mode': 'replace',
'functions': [{
'field_value_factor': {
'modifier': 'log1p',
'field': 'population',
'missing': 1
},
'weight': 3
}]
}
}],
'filter': [{
'terms': {
'layer': [
'venue',
'street',
'country',
'macroregion',
'region',
'county',
'localadmin',
'locality',
'borough',
'neighbourhood',
'continent',
'empire',
'dependency',
'macrocounty',
'macrohood',
'microhood',
'disputed',
'postalcode',
'ocean',
'marinearea'
]
}
}]
}
},
'sort': [ '_score' ],
'size': 20,
'track_scores': true
};
Loading

0 comments on commit 32c295d

Please sign in to comment.