feat(analysis): improvements to peliasStreet and peliasPhrase analysis
missinglink authored and orangejulius committed Jan 18, 2019
1 parent 76fe8ca commit b59ae7e
Showing 9 changed files with 389 additions and 1,147 deletions.
22 changes: 17 additions & 5 deletions integration/address_matching.js
@@ -50,6 +50,18 @@ module.exports.tests.functional = function(test, common){
}, done );
});

suite.action( function( done ){
suite.client.index({
index: suite.props.index, type: 'test',
id: '4', body: { address_parts: {
name: 'Mystery Location',
number: 300,
street: 'east 26th street',
zip: '100 10'
}}
}, done );
});

// search by street number
suite.assert( function( done ){
suite.client.search({
@@ -71,7 +83,7 @@ module.exports.tests.functional = function(test, common){
index: suite.props.index,
type: 'test',
body: { query: { bool: { must: [
{ match: { 'address_parts.street': 'west 26th street' } }
{ match_phrase: { 'address_parts.street': 'west 26th street' } }
]}}}
}, function( err, res ){
t.equal( err, undefined );
@@ -86,7 +98,7 @@ module.exports.tests.functional = function(test, common){
index: suite.props.index,
type: 'test',
body: { query: { bool: { must: [
{ match: { 'address_parts.street': 'W 26th ST' } }
{ match_phrase: { 'address_parts.street': 'W 26th ST' } }
]}}}
}, function( err, res ){
t.equal( err, undefined );
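
The match → match_phrase switch above is what the new 'east 26th street' fixture (doc '4') exercises: a plain match query is term-wise OR, so it would also hit doc '4' through the shared '26'/'street' tokens, while match_phrase additionally requires the terms to line up positionally, and the same-position synonyms (west/w, street/st) keep the abbreviated forms matching. A minimal sketch of the overmatch being avoided, reusing the suite fixtures above (a sketch only: the total is asserted loosely because the other fixture docs are outside this diff):

suite.assert( function( done ){
  suite.client.search({
    index: suite.props.index,
    type: 'test',
    body: { query: { bool: { must: [
      { match: { 'address_parts.street': 'west 26th street' } }
    ]}}}
  }, function( err, res ){
    t.equal( err, undefined );
    // term-wise OR: doc '4' (east 26th street) also matches here via its
    // '26'/'street' tokens; under match_phrase it is excluded, because
    // 'east' and 'west' never share a token position
    t.ok( res.hits.total > 1, 'plain match overmatches east/west' );
    done();
  });
});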
@@ -105,7 +117,7 @@ module.exports.tests.functional = function(test, common){
]}}}
}, function( err, res ){
t.equal( err, undefined );
t.equal( res.hits.total, 2, 'match zip - numeric' );
t.equal( res.hits.total, 3, 'match zip - numeric' );
done();
});
});
@@ -135,7 +147,7 @@ module.exports.tests.functional = function(test, common){
]}}}
}, function( err, res ){
t.equal( err, undefined );
t.equal( res.hits.total, 2, 'match zip - numeric - punct' );
t.equal( res.hits.total, 3, 'match zip - numeric - punct' );
done();
});
});
@@ -150,7 +162,7 @@ module.exports.tests.functional = function(test, common){
]}}}
}, function( err, res ){
t.equal( err, undefined );
t.equal( res.hits.total, 2, 'match zip - numeric - whitespace' );
t.equal( res.hits.total, 3, 'match zip - numeric - whitespace' );
done();
});
});
80 changes: 42 additions & 38 deletions integration/analyzer_peliasPhrase.js
@@ -21,7 +21,7 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis( 'asciifolding', 'ł', ['l']);
assertAnalysis( 'asciifolding', 'ɰ', ['m']);
assertAnalysis( 'trim', ' f ', ['f'] );
assertAnalysis( 'stop_words (disabled)', 'a st b ave c', ['a','st','b','ave','c'] );
assertAnalysis( 'stop_words (disabled)', 'a st b ave c', ['0:a', '1:st', '1:street', '2:b', '3:ave', '3:avenue', '4:c'], true );
assertAnalysis( 'ampersand', 'a and b', ['a','&','b'] );
assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] );
assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] );
@@ -38,11 +38,11 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis( 'unique', '1 1 1', ['1'] );
assertAnalysis( 'notnull', ' ^ ', [] );

assertAnalysis( 'stem street suffixes', 'streets avenue', ['streets','ave'] );
assertAnalysis( 'stem street suffixes', 'boulevard roads', ['blvd','roads'] );
assertAnalysis( 'stem street suffixes', 'streets avenue', ['0:streets', '1:avenue', '1:ave'], true );
assertAnalysis( 'stem street suffixes', 'boulevard roads', ['0:boulevard', '0:blvd', '1:roads'], true );

assertAnalysis( 'stem direction synonyms', 'south by southwest', ['s','by','sw'] );
assertAnalysis( 'stem direction synonyms', '20 bear road northeast', ['20','bear','rd','ne'] );
assertAnalysis( 'stem direction synonyms', 'south by southwest', ['0:south', '0:s', '1:by', '2:southwest', '2:sw'], true );
assertAnalysis( 'stem direction synonyms', '20 bear road northeast', ['0:20', '1:bear', '2:road', '2:rd', '3:northeast', '3:ne'], true );

// remove punctuation (handled by the char_filter)
assertAnalysis( 'punctuation', punctuation.all.join(''), [ '-&' ] );
@@ -66,29 +66,33 @@ module.exports.tests.functional = function(test, common){
'toys', 'r', 'us'
]);

assertAnalysis( 'address', '101 mapzen pl', [
'101', 'mapzen', 'pl'
]);
assertAnalysis( 'address', '101 geocode pl', [
'0:101', '1:geocode', '2:pl', '2:place'
], true);

// both terms should map to same tokens
var expected1 = [ '325', 'n', '12th', 'st' ];
assertAnalysis( 'address', '325 N 12th St', expected1 );
assertAnalysis( 'address', '325 North 12th Street', expected1 );
var expected1 = [ '0:325', '1:n', '1:north', '2:12', '3:st', '3:street' ];
var expected2 = [ '0:325', '1:north', '1:n', '2:12', '3:street', '3:st' ];
assertAnalysis( 'address', '325 N 12th St', expected1, true );
assertAnalysis( 'address', '325 North 12th Street', expected2, true );

// both terms should map to same tokens
var expected2 = [ '13509', 'colfax', 'ave', 's' ];
assertAnalysis( 'address', '13509 Colfax Ave S', expected2 );
assertAnalysis( 'address', '13509 Colfax Avenue South', expected2 );
var expected3 = [ '0:13509', '1:colfax', '2:ave', '2:avenue', '3:s', '3:south' ];
var expected4 = [ '0:13509', '1:colfax', '2:avenue', '2:ave', '3:south', '3:s' ];
assertAnalysis( 'address', '13509 Colfax Ave S', expected3, true );
assertAnalysis( 'address', '13509 Colfax Avenue South', expected4, true );

// both terms should map to same tokens
var expected3 = [ '100', 's', 'lake', 'dr' ];
assertAnalysis( 'address', '100 S Lake Dr', expected3 );
assertAnalysis( 'address', '100 South Lake Drive', expected3 );
var expected5 = [ '0:100', '1:s', '1:south', '2:lake', '3:dr', '3:drive' ];
var expected6 = [ '0:100', '1:south', '1:s', '2:lake', '3:drive', '3:dr' ];
assertAnalysis( 'address', '100 S Lake Dr', expected5, true );
assertAnalysis( 'address', '100 South Lake Drive', expected6, true );

// both terms should map to same tokens
var expected4 = [ '100', 'nw', 'hwy' ];
assertAnalysis( 'address', '100 northwest highway', expected4 );
assertAnalysis( 'address', '100 nw hwy', expected4 );
var expected7 = [ '0:100', '1:northwest', '1:nw', '2:highway', '2:hwy' ];
var expected8 = [ '0:100', '1:nw', '1:northwest', '2:hwy', '2:highway' ];
assertAnalysis( 'address', '100 northwest highway', expected7, true );
assertAnalysis( 'address', '100 nw hwy', expected8, true );

suite.run( t.end );
});
@@ -101,19 +105,19 @@ module.exports.tests.tokenizer = function(test, common){
var assertAnalysis = analyze.bind( null, suite, t, 'peliasPhrase' );
suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up

// specify 2 parts with a delimiter
assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'space', 'Bedell Street,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'space', 'Bedell Street ,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
assertAnalysis( 'space', 'Bedell Street, 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]);
var expected1 = [ '0:bedell', '1:street', '1:st', '2:133', '3:avenue', '3:ave' ];
var expected2 = [ '0:bedell', '1:street', '1:st', '102:133', '103:avenue', '103:ave' ];

// specify 2 streets with a delimiter
assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', expected1, true );
assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', expected1, true );
assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', expected1, true );
assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', expected1, true );
assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', expected1, true );
assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', expected1, true );
assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', expected2, true );
assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', expected2, true );
assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', expected2, true );
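
The two expectation sets above encode a tokenizer detail worth noting: slash-delimited parts continue the position sequence ('2:133'), while comma-delimited parts jump by 100 ('102:133'). The gap is inferred from the expected values (the tokenizer itself is untouched by this diff); its practical effect, sketched:

// slash: 'Bedell Street/133rd Avenue' -> street(1) 133(2) stay adjacent,
// so a phrase query may span the delimiter
// comma: 'Bedell Street, 133rd Avenue' -> street(1) ... 133(102); the
// position gap keeps phrase queries from matching across what are
// presumably two distinct street names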

suite.run( t.end );
});
@@ -323,22 +327,22 @@ module.exports.all = function (tape, common) {
}
};

function analyze( suite, t, analyzer, comment, text, expected ){
function analyze( suite, t, analyzer, comment, text, expected, includePosition ){
suite.assert( function( done ){
suite.client.indices.analyze({
index: suite.props.index,
analyzer: analyzer,
text: text
}, function( err, res ){
if( err ) console.error( err );
t.deepEqual( simpleTokens( res.tokens ), expected, comment );
if( err ){ console.error( err ); }
t.deepEqual( simpleTokens( res.tokens, includePosition ), expected, comment );
done();
});
});
}

function simpleTokens( tokens ){
function simpleTokens( tokens, includePosition ){
return tokens.map( function( t ){
return t.token;
return (!!includePosition ? t.position + ':' : '') + t.token;
});
}
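
The helper changes at the bottom of this file drive all of the new expectations: analyze() gains an includePosition flag, and simpleTokens() prefixes each token with its position when the flag is set, making same-position synonyms visible in the assertions. A worked example of the format, using a token array shaped like an Elasticsearch _analyze response:

var tokens = [
  { token: 'north',  position: 0 },
  { token: 'n',      position: 0 },
  { token: 'street', position: 1 },
  { token: 'st',     position: 1 }
];

simpleTokens( tokens, true );  // [ '0:north', '0:n', '1:street', '1:st' ]
simpleTokens( tokens, false ); // [ 'north', 'n', 'street', 'st' ]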
63 changes: 34 additions & 29 deletions integration/analyzer_peliasStreet.js
@@ -17,12 +17,12 @@ module.exports.tests.analyze = function(test, common){
assertAnalysis( 'lowercase', 'F', ['f']);
assertAnalysis( 'asciifolding', 'Max-Beer-Straße', ['max-beer-strasse']);
assertAnalysis( 'trim', ' f ', ['f'] );
assertAnalysis( 'keyword_street_suffix', 'foo Street', ['foo', 'street'] );
assertAnalysis( 'keyword_street_suffix', 'foo Road', ['foo', 'road'] );
assertAnalysis( 'keyword_street_suffix', 'foo Crescent', ['foo', 'crescent'] );
assertAnalysis( 'keyword_compass', 'north foo', ['n', 'foo'] );
assertAnalysis( 'keyword_compass', 'SouthWest foo', ['sw', 'foo'] );
assertAnalysis( 'keyword_compass', 'foo SouthWest', ['foo', 'sw'] );
assertAnalysis( 'keyword_street_suffix', 'foo Street', ['0:foo', '1:street', '1:st'], true );
assertAnalysis( 'keyword_street_suffix', 'foo Road', ['0:foo', '1:road', '1:rd'], true );
assertAnalysis( 'keyword_street_suffix', 'foo Crescent', ['0:foo', '1:crescent', '1:cres'], true );
assertAnalysis( 'keyword_compass', 'north foo', ['0:north', '0:n', '1:foo'], true );
assertAnalysis( 'keyword_compass', 'SouthWest foo', ['0:southwest', '0:sw', '1:foo'], true );
assertAnalysis( 'keyword_compass', 'foo SouthWest', ['0:foo', '1:southwest', '1:sw'], true );
assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

@@ -37,11 +37,11 @@ module.exports.tests.functional = function(test, common){
var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' );
suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up

assertAnalysis( 'USA address', 'west 26th street', [ 'w', '26', 'street' ]);
assertAnalysis( 'USA address', 'West 26th Street', [ 'w', '26', 'street' ]);
assertAnalysis( 'USA address', 'w 26th st', [ 'w', '26', 'st' ]);
assertAnalysis( 'USA address', 'WEST 26th STREET', [ 'w', '26', 'street' ]);
assertAnalysis( 'USA address', 'WEST 26th ST', [ 'w', '26', 'st' ]);
assertAnalysis( 'USA address', 'west 26th street', [ '0:west', '0:w', '1:26', '2:street', '2:st' ], true);
assertAnalysis( 'USA address', 'West 26th Street', [ '0:west', '0:w', '1:26', '2:street', '2:st' ], true);
assertAnalysis( 'USA address', 'w 26th st', [ '0:w', '0:west', '1:26', '2:st', '2:street' ], true);
assertAnalysis( 'USA address', 'WEST 26th STREET', [ '0:west', '0:w', '1:26', '2:street', '2:st' ], true);
assertAnalysis( 'USA address', 'WEST 26th ST', [ '0:west', '0:w', '1:26', '2:st', '2:street' ], true);

suite.run( t.end );
});
@@ -54,10 +54,12 @@ module.exports.tests.normalize_punctuation = function(test, common){
var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' );
suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up

assertAnalysis( 'single space', 'Chapala Street', [ 'chapala', 'street' ]);
assertAnalysis( 'double space', 'Chapala Street', [ 'chapala', 'street' ]);
assertAnalysis( 'triple space', 'Chapala Street', [ 'chapala', 'street' ]);
assertAnalysis( 'quad space', 'Chapala Street', [ 'chapala', 'street' ]);
var expected = [ '0:chapala', '1:street', '1:st' ];

assertAnalysis( 'single space', 'Chapala Street', expected, true );
assertAnalysis( 'double space', 'Chapala Street', expected, true );
assertAnalysis( 'triple space', 'Chapala Street', expected, true );
assertAnalysis( 'quad space', 'Chapala Street', expected, true );

suite.run( t.end );
});
@@ -144,16 +146,19 @@ module.exports.tests.tokenizer = function(test, common){
var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' );
suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up

var expected1 = [ '0:bedell', '1:street', '1:st', '2:133', '3:avenue', '3:ave' ];
var expected2 = [ '0:bedell', '1:street', '1:st', '102:133', '103:avenue', '103:ave' ];

// specify 2 streets with a delimiter
assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]);
assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', expected1, true );
assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', expected1, true );
assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', expected1, true );
assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', expected1, true );
assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', expected1, true );
assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', expected1, true );
assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', expected2, true );
assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', expected2, true );
assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', expected2, true );

suite.run( t.end );
});
@@ -202,22 +207,22 @@ module.exports.all = function (tape, common) {
}
};

function analyze( suite, t, analyzer, comment, text, expected ){
function analyze( suite, t, analyzer, comment, text, expected, includePosition ){
suite.assert( function( done ){
suite.client.indices.analyze({
index: suite.props.index,
analyzer: analyzer,
text: text
}, function( err, res ){
if( err ) console.error( err );
t.deepEqual( simpleTokens( res.tokens ), expected, comment );
if( err ){ console.error( err ); }
t.deepEqual( simpleTokens( res.tokens, includePosition ), expected, comment );
done();
});
});
}

function simpleTokens( tokens ){
function simpleTokens( tokens, includePosition ){
return tokens.map( function( t ){
return t.token;
return (!!includePosition ? t.position + ':' : '') + t.token;
});
}
48 changes: 10 additions & 38 deletions settings.js
@@ -112,12 +112,14 @@ function generate(){
"char_filter" : ["punctuation", "nfkc_normalizer"],
"filter": [
"lowercase",
"icu_folding",
"trim",
"custom_name",
"remove_duplicate_spaces",
"ampersand",
"street_suffix_contractions",
"custom_name",
"street_suffix",
"directionals",
"icu_folding",
"remove_ordinals",
"unique",
"notnull"
]
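
Two things change in this peliasPhrase filter chain: street_suffix and directionals replace the one-way street_suffix_contractions, and icu_folding moves from second place to after the synonym filters, presumably so synonyms match the unfolded tokens and folding then normalizes every emitted form. A quick way to eyeball the resulting chain against an index built from these settings (host and index name are assumptions):

var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({ host: 'localhost:9200' });

client.indices.analyze({
  index: 'pelias',
  analyzer: 'peliasPhrase',
  text: '325 North 12th Street'
}, function( err, res ){
  // expect synonym pairs sharing a position, as in the updated tests:
  // 325(0) north(1) n(1) 12(2) street(3) st(3)
  console.log( res.tokens );
});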
@@ -151,17 +153,15 @@ function generate(){
"char_filter" : ["punctuation", "nfkc_normalizer"],
"filter": [
"lowercase",
"icu_folding",
"trim",
"remove_duplicate_spaces",
"custom_street",
].concat( synonyms.street_suffix_contractions.map( function( synonym ){
return "keyword_street_suffix_" + synonym.split(' ')[0];
})).concat( synonyms.directionals.map( function( synonym ){
return "keyword_compass_" + synonym.split(' ')[0];
})).concat([
"street_suffix",
"directionals",
"icu_folding",
"remove_ordinals",
"trim"
])
]
}
},
"filter" : {
@@ -272,34 +272,6 @@ function generate(){
};
}

// dynamically create filters which can replace text *inside* a token.
// we are not able to re-use the synonym functionality in elasticsearch
// because it only matches whole tokens, not strings *within* tokens.
// eg. synonyms are capable of ['street'] => ['st'] but not
// ['sesame street'] => ['sesame st']

// street suffix filters (replace text inside tokens)
// based off synonym list
synonyms.street_suffix_contractions.forEach( function( synonym ){
var split = synonym.split(' ');
settings.analysis.filter[ "keyword_street_suffix_" + split[0] ] = {
"type": "pattern_replace",
"pattern": " " + split[0],
"replacement": " " + split[2]
};
});

// compass prefix filters (replace text inside tokens)
// based off directionals list
synonyms.directionals.forEach( function( synonym ){
var split = synonym.split(' ');
settings.analysis.filter[ "keyword_compass_" + split[0] ] = {
"type": "pattern_replace",
"pattern": split[0],
"replacement": split[2]
};
});
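
This removed block was the crux of the old approach: each street-suffix and directional synonym was compiled into its own pattern_replace filter so that text could be rewritten inside a token (as the removed comment notes, plain synonyms can do ['street'] => ['st'] but not ['sesame street'] => ['sesame st']). For an entry shaped like 'street => st', the first loop would have generated the following filter; the entry format is inferred from the split[0]/split[2] indexing, so this is a worked instance rather than code from the repo:

settings.analysis.filter.keyword_street_suffix_street = {
  "type": "pattern_replace",
  "pattern": " street",
  "replacement": " st"
};

The catch is that pattern_replace is destructive: a field could index 'sesame st' or 'sesame street', never both. The street_suffix and directionals filters now referenced by the analyzer chains appear to be ordinary synonym token filters instead, emitting both forms at a single position, which is exactly what the updated tests assert. Their definitions sit outside this hunk, so the shape below is an assumption with abbreviated synonym lists:

settings.analysis.filter.street_suffix = {
  "type": "synonym",
  "synonyms": [ "street,st", "avenue,ave", "boulevard,blvd", "road,rd" ]
};

settings.analysis.filter.directionals = {
  "type": "synonym",
  "synonyms": [ "north,n", "south,s", "northwest,nw", "southwest,sw" ]
};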

// Merge settings from pelias/config
if( 'object' === typeof config &&
'object' === typeof config.elasticsearch &&