From 304144804b0ce1ae381df1ae77b6831e64cc1898 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Sun, 15 Apr 2018 21:01:57 -0700 Subject: [PATCH 1/3] Split on whitespace for peliasStreet analyzer --- settings.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.js b/settings.js index f0d2d1dd..831a076f 100644 --- a/settings.js +++ b/settings.js @@ -33,7 +33,7 @@ function generate(){ }, "peliasStreetTokenizer": { "type": "pattern", - "pattern": "[,/\\\\]+" + "pattern": "[\\s,/\\\\]+" } }, "analyzer": { From 76fe8ca24e193a70504d4d6043fda0cd0e5fd286 Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 16 Apr 2018 19:05:15 +0200 Subject: [PATCH 2/3] fix(tests): peliasStreet --- integration/analyzer_peliasStreet.js | 52 ++++++++++++++-------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index f739140e..820f5efb 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -17,14 +17,14 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'Max-Beer-Straße', ['max-beer-strasse']); assertAnalysis( 'trim', ' f ', ['f'] ); - assertAnalysis( 'keyword_street_suffix', 'foo Street', ['foo st'] ); - assertAnalysis( 'keyword_street_suffix', 'foo Road', ['foo rd'] ); - assertAnalysis( 'keyword_street_suffix', 'foo Crescent', ['foo cres'] ); - assertAnalysis( 'keyword_compass', 'north foo', ['n foo'] ); - assertAnalysis( 'keyword_compass', 'SouthWest foo', ['sw foo'] ); - assertAnalysis( 'keyword_compass', 'foo SouthWest', ['foo sw'] ); - assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1 2 3 4 5'] ); - assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast th 101'] ); + assertAnalysis( 'keyword_street_suffix', 'foo Street', ['foo', 'street'] ); + assertAnalysis( 'keyword_street_suffix', 'foo Road', ['foo', 'road'] ); + assertAnalysis( 'keyword_street_suffix', 'foo Crescent', ['foo', 'crescent'] ); + assertAnalysis( 'keyword_compass', 'north foo', ['n', 'foo'] ); + assertAnalysis( 'keyword_compass', 'SouthWest foo', ['sw', 'foo'] ); + assertAnalysis( 'keyword_compass', 'foo SouthWest', ['foo', 'sw'] ); + assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] ); + assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); suite.run( t.end ); }); @@ -37,11 +37,11 @@ module.exports.tests.functional = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - assertAnalysis( 'USA address', 'west 26th street', [ 'w 26 st' ]); - assertAnalysis( 'USA address', 'West 26th Street', [ 'w 26 st' ]); - assertAnalysis( 'USA address', 'w 26th st', [ 'w 26 st' ]); - assertAnalysis( 'USA address', 'WEST 26th STREET', [ 'w 26 st' ]); - assertAnalysis( 'USA address', 'WEST 26th ST', [ 'w 26 st' ]); + assertAnalysis( 'USA address', 'west 26th street', [ 'w', '26', 'street' ]); + assertAnalysis( 'USA address', 'West 26th Street', [ 'w', '26', 'street' ]); + assertAnalysis( 'USA address', 'w 26th st', [ 'w', '26', 'st' ]); + assertAnalysis( 'USA address', 'WEST 26th STREET', [ 'w', '26', 'street' ]); + assertAnalysis( 'USA address', 'WEST 26th ST', [ 'w', '26', 'st' ]); suite.run( t.end ); }); @@ -54,10 +54,10 @@ module.exports.tests.normalize_punctuation = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - assertAnalysis( 'single space', 'Chapala Street', [ 'chapala st' ]); - assertAnalysis( 'double space', 'Chapala Street', [ 'chapala st' ]); - assertAnalysis( 'triple space', 'Chapala Street', [ 'chapala st' ]); - assertAnalysis( 'quad space', 'Chapala Street', [ 'chapala st' ]); + assertAnalysis( 'single space', 'Chapala Street', [ 'chapala', 'street' ]); + assertAnalysis( 'double space', 'Chapala Street', [ 'chapala', 'street' ]); + assertAnalysis( 'triple space', 'Chapala Street', [ 'chapala', 'street' ]); + assertAnalysis( 'quad space', 'Chapala Street', [ 'chapala', 'street' ]); suite.run( t.end ); }); @@ -145,15 +145,15 @@ module.exports.tests.tokenizer = function(test, common){ suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up // specify 2 streets with a delimeter - assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell st', '133 ave' ]); - assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell st', '133 ave' ]); + assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); suite.run( t.end ); }); From b59ae7e9ab5f42f4c65f021d2efadfbdb405c2ed Mon Sep 17 00:00:00 2001 From: missinglink Date: Mon, 16 Apr 2018 20:05:16 +0200 Subject: [PATCH 3/3] feat(analysis): improvements to peliasStreet and peliasPhrase analysis --- integration/address_matching.js | 22 +- integration/analyzer_peliasPhrase.js | 80 +- integration/analyzer_peliasStreet.js | 63 +- settings.js | 48 +- synonyms/directionals.txt | 16 +- synonyms/street_suffix.txt | 120 +++ synonyms/street_suffix_contractions.txt | 120 --- test/fixtures/expected.json | 1041 +++-------------------- test/settings.js | 26 +- 9 files changed, 389 insertions(+), 1147 deletions(-) create mode 100644 synonyms/street_suffix.txt delete mode 100644 synonyms/street_suffix_contractions.txt diff --git a/integration/address_matching.js b/integration/address_matching.js index 3b6c633b..68879c2c 100644 --- a/integration/address_matching.js +++ b/integration/address_matching.js @@ -50,6 +50,18 @@ module.exports.tests.functional = function(test, common){ }, done ); }); + suite.action( function( done ){ + suite.client.index({ + index: suite.props.index, type: 'test', + id: '4', body: { address_parts: { + name: 'Mystery Location', + number: 300, + street: 'east 26th street', + zip: '100 10' + }} + }, done ); + }); + // search by street number suite.assert( function( done ){ suite.client.search({ @@ -71,7 +83,7 @@ module.exports.tests.functional = function(test, common){ index: suite.props.index, type: 'test', body: { query: { bool: { must: [ - { match: { 'address_parts.street': 'west 26th street' } } + { match_phrase: { 'address_parts.street': 'west 26th street' } } ]}}} }, function( err, res ){ t.equal( err, undefined ); @@ -86,7 +98,7 @@ module.exports.tests.functional = function(test, common){ index: suite.props.index, type: 'test', body: { query: { bool: { must: [ - { match: { 'address_parts.street': 'W 26th ST' } } + { match_phrase: { 'address_parts.street': 'W 26th ST' } } ]}}} }, function( err, res ){ t.equal( err, undefined ); @@ -105,7 +117,7 @@ module.exports.tests.functional = function(test, common){ ]}}} }, function( err, res ){ t.equal( err, undefined ); - t.equal( res.hits.total, 2, 'match zip - numeric' ); + t.equal( res.hits.total, 3, 'match zip - numeric' ); done(); }); }); @@ -135,7 +147,7 @@ module.exports.tests.functional = function(test, common){ ]}}} }, function( err, res ){ t.equal( err, undefined ); - t.equal( res.hits.total, 2, 'match zip - numeric - punct' ); + t.equal( res.hits.total, 3, 'match zip - numeric - punct' ); done(); }); }); @@ -150,7 +162,7 @@ module.exports.tests.functional = function(test, common){ ]}}} }, function( err, res ){ t.equal( err, undefined ); - t.equal( res.hits.total, 2, 'match zip - numeric - whitespace' ); + t.equal( res.hits.total, 3, 'match zip - numeric - whitespace' ); done(); }); }); diff --git a/integration/analyzer_peliasPhrase.js b/integration/analyzer_peliasPhrase.js index 5ceddbbc..63cd8bde 100644 --- a/integration/analyzer_peliasPhrase.js +++ b/integration/analyzer_peliasPhrase.js @@ -21,7 +21,7 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'asciifolding', 'ł', ['l']); assertAnalysis( 'asciifolding', 'ɰ', ['m']); assertAnalysis( 'trim', ' f ', ['f'] ); - assertAnalysis( 'stop_words (disabled)', 'a st b ave c', ['a','st','b','ave','c'] ); + assertAnalysis( 'stop_words (disabled)', 'a st b ave c', ['0:a', '1:st', '1:street', '2:b', '3:ave', '3:avenue', '4:c'], true ); assertAnalysis( 'ampersand', 'a and b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a & b', ['a','&','b'] ); assertAnalysis( 'ampersand', 'a and & and b', ['a','&','b'] ); @@ -38,11 +38,11 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'unique', '1 1 1', ['1'] ); assertAnalysis( 'notnull', ' ^ ', [] ); - assertAnalysis( 'stem street suffixes', 'streets avenue', ['streets','ave'] ); - assertAnalysis( 'stem street suffixes', 'boulevard roads', ['blvd','roads'] ); + assertAnalysis( 'stem street suffixes', 'streets avenue', ['0:streets', '1:avenue', '1:ave'], true ); + assertAnalysis( 'stem street suffixes', 'boulevard roads', ['0:boulevard', '0:blvd', '1:roads'], true ); - assertAnalysis( 'stem direction synonyms', 'south by southwest', ['s','by','sw'] ); - assertAnalysis( 'stem direction synonyms', '20 bear road northeast', ['20','bear','rd','ne'] ); + assertAnalysis( 'stem direction synonyms', 'south by southwest', ['0:south', '0:s', '1:by', '2:southwest', '2:sw'], true ); + assertAnalysis( 'stem direction synonyms', '20 bear road northeast', ['0:20', '1:bear', '2:road', '2:rd', '3:northeast', '3:ne'], true ); // remove punctuation (handled by the char_filter) assertAnalysis( 'punctuation', punctuation.all.join(''), [ '-&' ] ); @@ -66,29 +66,33 @@ module.exports.tests.functional = function(test, common){ 'toys', 'r', 'us' ]); - assertAnalysis( 'address', '101 mapzen pl', [ - '101', 'mapzen', 'pl' - ]); + assertAnalysis( 'address', '101 geocode pl', [ + '0:101', '1:geocode', '2:pl', '2:place' + ], true); // both terms should map to same tokens - var expected1 = [ '325', 'n', '12th', 'st' ]; - assertAnalysis( 'address', '325 N 12th St', expected1 ); - assertAnalysis( 'address', '325 North 12th Street', expected1 ); + var expected1 = [ '0:325', '1:n', '1:north', '2:12', '3:st', '3:street' ]; + var expected2 = [ '0:325', '1:north', '1:n', '2:12', '3:street', '3:st' ]; + assertAnalysis( 'address', '325 N 12th St', expected1, true ); + assertAnalysis( 'address', '325 North 12th Street', expected2, true ); // both terms should map to same tokens - var expected2 = [ '13509', 'colfax', 'ave', 's' ]; - assertAnalysis( 'address', '13509 Colfax Ave S', expected2 ); - assertAnalysis( 'address', '13509 Colfax Avenue South', expected2 ); + var expected3 = [ '0:13509', '1:colfax', '2:ave', '2:avenue', '3:s', '3:south' ]; + var expected4 = [ '0:13509', '1:colfax', '2:avenue', '2:ave', '3:south', '3:s' ]; + assertAnalysis( 'address', '13509 Colfax Ave S', expected3, true ); + assertAnalysis( 'address', '13509 Colfax Avenue South', expected4, true ); // both terms should map to same tokens - var expected3 = [ '100', 's', 'lake', 'dr' ]; - assertAnalysis( 'address', '100 S Lake Dr', expected3 ); - assertAnalysis( 'address', '100 South Lake Drive', expected3 ); + var expected5 = [ '0:100', '1:s', '1:south', '2:lake', '3:dr', '3:drive' ]; + var expected6 = [ '0:100', '1:south', '1:s', '2:lake', '3:drive', '3:dr' ]; + assertAnalysis( 'address', '100 S Lake Dr', expected5, true ); + assertAnalysis( 'address', '100 South Lake Drive', expected6, true ); // both terms should map to same tokens - var expected4 = [ '100', 'nw', 'hwy' ]; - assertAnalysis( 'address', '100 northwest highway', expected4 ); - assertAnalysis( 'address', '100 nw hwy', expected4 ); + var expected7 = [ '0:100', '1:northwest', '1:nw', '2:highway', '2:hwy' ]; + var expected8 = [ '0:100', '1:nw', '1:northwest', '2:hwy', '2:highway' ]; + assertAnalysis( 'address', '100 northwest highway', expected7, true ); + assertAnalysis( 'address', '100 nw hwy', expected8, true ); suite.run( t.end ); }); @@ -101,19 +105,19 @@ module.exports.tests.tokenizer = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasPhrase' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - // specify 2 parts with a delimeter - assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'space', 'Bedell Street,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'space', 'Bedell Street ,133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); - assertAnalysis( 'space', 'Bedell Street, 133rd Avenue', [ 'bedell', 'st', '133rd', 'ave' ]); + var expected1 = [ '0:bedell', '1:street', '1:st', '2:133', '3:avenue', '3:ave' ]; + var expected2 = [ '0:bedell', '1:street', '1:st', '102:133', '103:avenue', '103:ave' ]; + + // specify 2 streets with a delimeter + assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', expected1, true ); + assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', expected1, true ); + assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', expected1, true ); + assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', expected1, true ); + assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', expected1, true ); + assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', expected1, true ); + assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', expected2, true ); + assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', expected2, true ); + assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', expected2, true ); suite.run( t.end ); }); @@ -323,22 +327,22 @@ module.exports.all = function (tape, common) { } }; -function analyze( suite, t, analyzer, comment, text, expected ){ +function analyze( suite, t, analyzer, comment, text, expected, includePosition ){ suite.assert( function( done ){ suite.client.indices.analyze({ index: suite.props.index, analyzer: analyzer, text: text }, function( err, res ){ - if( err ) console.error( err ); - t.deepEqual( simpleTokens( res.tokens ), expected, comment ); + if( err ){ console.error( err ); } + t.deepEqual( simpleTokens( res.tokens, includePosition ), expected, comment ); done(); }); }); } -function simpleTokens( tokens ){ +function simpleTokens( tokens, includePosition ){ return tokens.map( function( t ){ - return t.token; + return (!!includePosition ? t.position + ':' : '') + t.token; }); } diff --git a/integration/analyzer_peliasStreet.js b/integration/analyzer_peliasStreet.js index 820f5efb..b21643b5 100644 --- a/integration/analyzer_peliasStreet.js +++ b/integration/analyzer_peliasStreet.js @@ -17,12 +17,12 @@ module.exports.tests.analyze = function(test, common){ assertAnalysis( 'lowercase', 'F', ['f']); assertAnalysis( 'asciifolding', 'Max-Beer-Straße', ['max-beer-strasse']); assertAnalysis( 'trim', ' f ', ['f'] ); - assertAnalysis( 'keyword_street_suffix', 'foo Street', ['foo', 'street'] ); - assertAnalysis( 'keyword_street_suffix', 'foo Road', ['foo', 'road'] ); - assertAnalysis( 'keyword_street_suffix', 'foo Crescent', ['foo', 'crescent'] ); - assertAnalysis( 'keyword_compass', 'north foo', ['n', 'foo'] ); - assertAnalysis( 'keyword_compass', 'SouthWest foo', ['sw', 'foo'] ); - assertAnalysis( 'keyword_compass', 'foo SouthWest', ['foo', 'sw'] ); + assertAnalysis( 'keyword_street_suffix', 'foo Street', ['0:foo', '1:street', '1:st'], true ); + assertAnalysis( 'keyword_street_suffix', 'foo Road', ['0:foo', '1:road', '1:rd'], true ); + assertAnalysis( 'keyword_street_suffix', 'foo Crescent', ['0:foo', '1:crescent', '1:cres'], true ); + assertAnalysis( 'keyword_compass', 'north foo', ['0:north', '0:n', '1:foo'], true ); + assertAnalysis( 'keyword_compass', 'SouthWest foo', ['0:southwest', '0:sw', '1:foo'], true ); + assertAnalysis( 'keyword_compass', 'foo SouthWest', ['0:foo', '1:southwest', '1:sw'], true ); assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] ); assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] ); @@ -37,11 +37,11 @@ module.exports.tests.functional = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - assertAnalysis( 'USA address', 'west 26th street', [ 'w', '26', 'street' ]); - assertAnalysis( 'USA address', 'West 26th Street', [ 'w', '26', 'street' ]); - assertAnalysis( 'USA address', 'w 26th st', [ 'w', '26', 'st' ]); - assertAnalysis( 'USA address', 'WEST 26th STREET', [ 'w', '26', 'street' ]); - assertAnalysis( 'USA address', 'WEST 26th ST', [ 'w', '26', 'st' ]); + assertAnalysis( 'USA address', 'west 26th street', [ '0:west', '0:w', '1:26', '2:street', '2:st' ], true); + assertAnalysis( 'USA address', 'West 26th Street', [ '0:west', '0:w', '1:26', '2:street', '2:st' ], true); + assertAnalysis( 'USA address', 'w 26th st', [ '0:w', '0:west', '1:26', '2:st', '2:street' ], true); + assertAnalysis( 'USA address', 'WEST 26th STREET', [ '0:west', '0:w', '1:26', '2:street', '2:st' ], true); + assertAnalysis( 'USA address', 'WEST 26th ST', [ '0:west', '0:w', '1:26', '2:st', '2:street' ], true); suite.run( t.end ); }); @@ -54,10 +54,12 @@ module.exports.tests.normalize_punctuation = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up - assertAnalysis( 'single space', 'Chapala Street', [ 'chapala', 'street' ]); - assertAnalysis( 'double space', 'Chapala Street', [ 'chapala', 'street' ]); - assertAnalysis( 'triple space', 'Chapala Street', [ 'chapala', 'street' ]); - assertAnalysis( 'quad space', 'Chapala Street', [ 'chapala', 'street' ]); + var expected = [ '0:chapala', '1:street', '1:st' ]; + + assertAnalysis( 'single space', 'Chapala Street', expected, true ); + assertAnalysis( 'double space', 'Chapala Street', expected, true ); + assertAnalysis( 'triple space', 'Chapala Street', expected, true ); + assertAnalysis( 'quad space', 'Chapala Street', expected, true ); suite.run( t.end ); }); @@ -144,16 +146,19 @@ module.exports.tests.tokenizer = function(test, common){ var assertAnalysis = analyze.bind( null, suite, t, 'peliasStreet' ); suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up + var expected1 = [ '0:bedell', '1:street', '1:st', '2:133', '3:avenue', '3:ave' ]; + var expected2 = [ '0:bedell', '1:street', '1:st', '102:133', '103:avenue', '103:ave' ]; + // specify 2 streets with a delimeter - assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); - assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', [ 'bedell', 'street', '133', 'avenue' ]); + assertAnalysis( 'forward slash', 'Bedell Street/133rd Avenue', expected1, true ); + assertAnalysis( 'forward slash', 'Bedell Street /133rd Avenue', expected1, true ); + assertAnalysis( 'forward slash', 'Bedell Street/ 133rd Avenue', expected1, true ); + assertAnalysis( 'back slash', 'Bedell Street\\133rd Avenue', expected1, true ); + assertAnalysis( 'back slash', 'Bedell Street \\133rd Avenue', expected1, true ); + assertAnalysis( 'back slash', 'Bedell Street\\ 133rd Avenue', expected1, true ); + assertAnalysis( 'comma', 'Bedell Street,133rd Avenue', expected2, true ); + assertAnalysis( 'comma', 'Bedell Street ,133rd Avenue', expected2, true ); + assertAnalysis( 'comma', 'Bedell Street, 133rd Avenue', expected2, true ); suite.run( t.end ); }); @@ -202,22 +207,22 @@ module.exports.all = function (tape, common) { } }; -function analyze( suite, t, analyzer, comment, text, expected ){ +function analyze( suite, t, analyzer, comment, text, expected, includePosition ){ suite.assert( function( done ){ suite.client.indices.analyze({ index: suite.props.index, analyzer: analyzer, text: text }, function( err, res ){ - if( err ) console.error( err ); - t.deepEqual( simpleTokens( res.tokens ), expected, comment ); + if( err ){ console.error( err ); } + t.deepEqual( simpleTokens( res.tokens, includePosition ), expected, comment ); done(); }); }); } -function simpleTokens( tokens ){ +function simpleTokens( tokens, includePosition ){ return tokens.map( function( t ){ - return t.token; + return (!!includePosition ? t.position + ':' : '') + t.token; }); } diff --git a/settings.js b/settings.js index 831a076f..a31096c3 100644 --- a/settings.js +++ b/settings.js @@ -112,12 +112,14 @@ function generate(){ "char_filter" : ["punctuation", "nfkc_normalizer"], "filter": [ "lowercase", - "icu_folding", "trim", - "custom_name", + "remove_duplicate_spaces", "ampersand", - "street_suffix_contractions", + "custom_name", + "street_suffix", "directionals", + "icu_folding", + "remove_ordinals", "unique", "notnull" ] @@ -151,17 +153,15 @@ function generate(){ "char_filter" : ["punctuation", "nfkc_normalizer"], "filter": [ "lowercase", - "icu_folding", + "trim", "remove_duplicate_spaces", "custom_street", - ].concat( synonyms.street_suffix_contractions.map( function( synonym ){ - return "keyword_street_suffix_" + synonym.split(' ')[0]; - })).concat( synonyms.directionals.map( function( synonym ){ - return "keyword_compass_" + synonym.split(' ')[0]; - })).concat([ + "street_suffix", + "directionals", + "icu_folding", "remove_ordinals", "trim" - ]) + ] } }, "filter" : { @@ -272,34 +272,6 @@ function generate(){ }; } - // dynamically create filters which can replace text *inside* a token. - // we are not able to re-use the synonym functionality in elasticsearch - // because it only matches whole tokens, not strings *within* tokens. - // eg. synonyms are capable of ['street'] => ['st'] but not - // ['sesame street'] => ['sesame st'] - - // street suffix filters (replace text inside tokens) - // based off synonym list - synonyms.street_suffix_contractions.forEach( function( synonym ){ - var split = synonym.split(' '); - settings.analysis.filter[ "keyword_street_suffix_" + split[0] ] = { - "type": "pattern_replace", - "pattern": " " + split[0], - "replacement": " " + split[2] - }; - }); - - // compass prefix filters (replace text inside tokens) - // based off directionals list - synonyms.directionals.forEach( function( synonym ){ - var split = synonym.split(' '); - settings.analysis.filter[ "keyword_compass_" + split[0] ] = { - "type": "pattern_replace", - "pattern": split[0], - "replacement": split[2] - }; - }); - // Merge settings from pelias/config if( 'object' === typeof config && 'object' === typeof config.elasticsearch && diff --git a/synonyms/directionals.txt b/synonyms/directionals.txt index bdb00bbc..a427ff2e 100644 --- a/synonyms/directionals.txt +++ b/synonyms/directionals.txt @@ -1,11 +1,11 @@ # note: more descriptive tokens must come before less descriptive ones # eg: 'southwest' must come before 'west' else 'southwest foo' -> 'southw foo' -southwest => sw -southeast => se -northwest => nw -northeast => ne -north => n -south => s -east => e -west => w +southwest, sw +southeast, se +northwest, nw +northeast, ne +north, n +south, s +east, e +west, w diff --git a/synonyms/street_suffix.txt b/synonyms/street_suffix.txt new file mode 100644 index 00000000..f8431648 --- /dev/null +++ b/synonyms/street_suffix.txt @@ -0,0 +1,120 @@ +alley, aly +annex, anx +avenue, ave +bayou, byu +beach, bch +bend, bnd +bluff, blf +bluffs, blfs +bottom, btm +boulevard, blvd +branch, br +bridge, brg +brook, brk +bypass, byp +canyon, cyn +cape, cp +causeway, cswy +center, ctr +channel, chnnl +circle, cir +cliff, clf +close, cl +club, clb +common, cmn +commons, cmns +connector, con +corridor, cor +course, crse +court, ct +cove, cv +creek, crk +crescent, cres +crest, crst +crossing, xing +crossroad, xrd +crossroads, xrds +curve, curv +dale, dl +dam, dm +drive, dr +esplanade, esp +expressway, expy +extended, ext +falls, fls +ferry, fry +field, fld +fields, flds +flat, flt +flats, flts +ford, frd +forest, frst +forge, frg +fork, frk +forks, frks +freeway, fwy +garden, gdn +gardens, gdns +gateway, gtwy +glen, gln +glenn, gln +green, grn +grove, grv +harbor, hbr +haven, hvn +heights, hts +highway, hwy +hill, hl +hills, hls +hollow, holw +isle, is +junction, jct +key, ky +keys, kys +knoll, knl +knolls, knls +landing, lndg +lane, ln +light, lgt +lights, lgts +lock, lck +locks, lcks +manor, mnr +meadow, mdw +meadows, mdws +mill, ml +mills, mls +mountain, mnt +motorway, mtwy +neck, nck +orchard, orch +parkway, pkwy +pasage, psge +pier, pr +pine, pne +pines, pnes +place, pl +plaza, plz +ranch, rnch +ridge, rdg +ridges, rdgs +river, riv +road, rd +route, rte +shore, shr +shores, shrs +skyway, skwy +spring, spg +springs, spgs +square, sq +street, st +suite, ste +terrace, terr +trail, tr +trafficway, trfy +tunnel, tunl +turnpike, tpke +valley, vly +vista, vis +village, vlg +way, wy diff --git a/synonyms/street_suffix_contractions.txt b/synonyms/street_suffix_contractions.txt deleted file mode 100644 index 75a5e3f2..00000000 --- a/synonyms/street_suffix_contractions.txt +++ /dev/null @@ -1,120 +0,0 @@ -alley => aly -annex => anx -avenue => ave -bayou => byu -beach => bch -bend => bnd -bluff => blf -bluffs => blfs -bottom => btm -boulevard => blvd -branch => br -bridge => brg -brook => brk -bypass => byp -canyon => cyn -cape => cp -causeway => cswy -center => ctr -channel => chnnl -circle => cir -cliff => clf -close => cl -club => clb -common => cmn -commons => cmns -connector => con -corridor => cor -course => crse -court => ct -cove => cv -creek => crk -crescent => cres -crest => crst -crossing => xing -crossroad => xrd -crossroads => xrds -curve => curv -dale => dl -dam => dm -drive => dr -esplanade => esp -expressway => expy -extended => ext -falls => fls -ferry => fry -field => fld -fields => flds -flat => flt -flats => flts -ford => frd -forest => frst -forge => frg -fork => frk -forks => frks -freeway => fwy -garden => gdn -gardens => gdns -gateway => gtwy -glen => gln -glenn => gln -green => grn -grove => grv -harbor => hbr -haven => hvn -heights => hts -highway => hwy -hill => hl -hills => hls -hollow => holw -isle => is -junction => jct -key => ky -keys => kys -knoll => knl -knolls => knls -landing => lndg -lane => ln -light => lgt -lights => lgts -lock => lck -locks => lcks -manor => mnr -meadow => mdw -meadows => mdws -mill => ml -mills => mls -mountain => mnt -motorway => mtwy -neck => nck -orchard => orch -parkway => pkwy -pasage => psge -pier => pr -pine => pne -pines => pnes -place => pl -plaza => plz -ranch => rnch -ridge => rdg -ridges => rdgs -river => riv -road => rd -route => rte -shore => shr -shores => shrs -skyway => skwy -spring => spg -springs => spgs -square => sq -street => st -suite => ste -terrace => terr -trail => tr -trafficway => trfy -tunnel => tunl -turnpike => tpke -valley => vly -vista => vis -village => vlg -way => wy diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 4a473866..2442b7be 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -8,7 +8,7 @@ }, "peliasStreetTokenizer": { "type": "pattern", - "pattern": "[,/\\\\]+" + "pattern": "[\\s,/\\\\]+" } }, "analyzer": { @@ -102,12 +102,14 @@ ], "filter": [ "lowercase", - "icu_folding", "trim", - "custom_name", + "remove_duplicate_spaces", "ampersand", - "street_suffix_contractions", + "custom_name", + "street_suffix", "directionals", + "icu_folding", + "remove_ordinals", "unique", "notnull" ] @@ -150,137 +152,12 @@ ], "filter": [ "lowercase", - "icu_folding", + "trim", "remove_duplicate_spaces", "custom_street", - "keyword_street_suffix_alley", - "keyword_street_suffix_annex", - "keyword_street_suffix_avenue", - "keyword_street_suffix_bayou", - "keyword_street_suffix_beach", - "keyword_street_suffix_bend", - "keyword_street_suffix_bluff", - "keyword_street_suffix_bluffs", - "keyword_street_suffix_bottom", - "keyword_street_suffix_boulevard", - "keyword_street_suffix_branch", - "keyword_street_suffix_bridge", - "keyword_street_suffix_brook", - "keyword_street_suffix_bypass", - "keyword_street_suffix_canyon", - "keyword_street_suffix_cape", - "keyword_street_suffix_causeway", - "keyword_street_suffix_center", - "keyword_street_suffix_channel", - "keyword_street_suffix_circle", - "keyword_street_suffix_cliff", - "keyword_street_suffix_close", - "keyword_street_suffix_club", - "keyword_street_suffix_common", - "keyword_street_suffix_commons", - "keyword_street_suffix_connector", - "keyword_street_suffix_corridor", - "keyword_street_suffix_course", - "keyword_street_suffix_court", - "keyword_street_suffix_cove", - "keyword_street_suffix_creek", - "keyword_street_suffix_crescent", - "keyword_street_suffix_crest", - "keyword_street_suffix_crossing", - "keyword_street_suffix_crossroad", - "keyword_street_suffix_crossroads", - "keyword_street_suffix_curve", - "keyword_street_suffix_dale", - "keyword_street_suffix_dam", - "keyword_street_suffix_drive", - "keyword_street_suffix_esplanade", - "keyword_street_suffix_expressway", - "keyword_street_suffix_extended", - "keyword_street_suffix_falls", - "keyword_street_suffix_ferry", - "keyword_street_suffix_field", - "keyword_street_suffix_fields", - "keyword_street_suffix_flat", - "keyword_street_suffix_flats", - "keyword_street_suffix_ford", - "keyword_street_suffix_forest", - "keyword_street_suffix_forge", - "keyword_street_suffix_fork", - "keyword_street_suffix_forks", - "keyword_street_suffix_freeway", - "keyword_street_suffix_garden", - "keyword_street_suffix_gardens", - "keyword_street_suffix_gateway", - "keyword_street_suffix_glen", - "keyword_street_suffix_glenn", - "keyword_street_suffix_green", - "keyword_street_suffix_grove", - "keyword_street_suffix_harbor", - "keyword_street_suffix_haven", - "keyword_street_suffix_heights", - "keyword_street_suffix_highway", - "keyword_street_suffix_hill", - "keyword_street_suffix_hills", - "keyword_street_suffix_hollow", - "keyword_street_suffix_isle", - "keyword_street_suffix_junction", - "keyword_street_suffix_key", - "keyword_street_suffix_keys", - "keyword_street_suffix_knoll", - "keyword_street_suffix_knolls", - "keyword_street_suffix_landing", - "keyword_street_suffix_lane", - "keyword_street_suffix_light", - "keyword_street_suffix_lights", - "keyword_street_suffix_lock", - "keyword_street_suffix_locks", - "keyword_street_suffix_manor", - "keyword_street_suffix_meadow", - "keyword_street_suffix_meadows", - "keyword_street_suffix_mill", - "keyword_street_suffix_mills", - "keyword_street_suffix_mountain", - "keyword_street_suffix_motorway", - "keyword_street_suffix_neck", - "keyword_street_suffix_orchard", - "keyword_street_suffix_parkway", - "keyword_street_suffix_pasage", - "keyword_street_suffix_pier", - "keyword_street_suffix_pine", - "keyword_street_suffix_pines", - "keyword_street_suffix_place", - "keyword_street_suffix_plaza", - "keyword_street_suffix_ranch", - "keyword_street_suffix_ridge", - "keyword_street_suffix_ridges", - "keyword_street_suffix_river", - "keyword_street_suffix_road", - "keyword_street_suffix_route", - "keyword_street_suffix_shore", - "keyword_street_suffix_shores", - "keyword_street_suffix_skyway", - "keyword_street_suffix_spring", - "keyword_street_suffix_springs", - "keyword_street_suffix_square", - "keyword_street_suffix_street", - "keyword_street_suffix_suite", - "keyword_street_suffix_terrace", - "keyword_street_suffix_trail", - "keyword_street_suffix_trafficway", - "keyword_street_suffix_tunnel", - "keyword_street_suffix_turnpike", - "keyword_street_suffix_valley", - "keyword_street_suffix_vista", - "keyword_street_suffix_village", - "keyword_street_suffix_way", - "keyword_compass_southwest", - "keyword_compass_southeast", - "keyword_compass_northwest", - "keyword_compass_northeast", - "keyword_compass_north", - "keyword_compass_south", - "keyword_compass_east", - "keyword_compass_west", + "street_suffix", + "directionals", + "icu_folding", "remove_ordinals", "trim" ] @@ -330,129 +207,129 @@ "pattern": "^(0*)", "replacement": "" }, - "street_suffix_contractions": { + "street_suffix": { "type": "synonym", "synonyms": [ - "alley => aly", - "annex => anx", - "avenue => ave", - "bayou => byu", - "beach => bch", - "bend => bnd", - "bluff => blf", - "bluffs => blfs", - "bottom => btm", - "boulevard => blvd", - "branch => br", - "bridge => brg", - "brook => brk", - "bypass => byp", - "canyon => cyn", - "cape => cp", - "causeway => cswy", - "center => ctr", - "channel => chnnl", - "circle => cir", - "cliff => clf", - "close => cl", - "club => clb", - "common => cmn", - "commons => cmns", - "connector => con", - "corridor => cor", - "course => crse", - "court => ct", - "cove => cv", - "creek => crk", - "crescent => cres", - "crest => crst", - "crossing => xing", - "crossroad => xrd", - "crossroads => xrds", - "curve => curv", - "dale => dl", - "dam => dm", - "drive => dr", - "esplanade => esp", - "expressway => expy", - "extended => ext", - "falls => fls", - "ferry => fry", - "field => fld", - "fields => flds", - "flat => flt", - "flats => flts", - "ford => frd", - "forest => frst", - "forge => frg", - "fork => frk", - "forks => frks", - "freeway => fwy", - "garden => gdn", - "gardens => gdns", - "gateway => gtwy", - "glen => gln", - "glenn => gln", - "green => grn", - "grove => grv", - "harbor => hbr", - "haven => hvn", - "heights => hts", - "highway => hwy", - "hill => hl", - "hills => hls", - "hollow => holw", - "isle => is", - "junction => jct", - "key => ky", - "keys => kys", - "knoll => knl", - "knolls => knls", - "landing => lndg", - "lane => ln", - "light => lgt", - "lights => lgts", - "lock => lck", - "locks => lcks", - "manor => mnr", - "meadow => mdw", - "meadows => mdws", - "mill => ml", - "mills => mls", - "mountain => mnt", - "motorway => mtwy", - "neck => nck", - "orchard => orch", - "parkway => pkwy", - "pasage => psge", - "pier => pr", - "pine => pne", - "pines => pnes", - "place => pl", - "plaza => plz", - "ranch => rnch", - "ridge => rdg", - "ridges => rdgs", - "river => riv", - "road => rd", - "route => rte", - "shore => shr", - "shores => shrs", - "skyway => skwy", - "spring => spg", - "springs => spgs", - "square => sq", - "street => st", - "suite => ste", - "terrace => terr", - "trail => tr", - "trafficway => trfy", - "tunnel => tunl", - "turnpike => tpke", - "valley => vly", - "vista => vis", - "village => vlg", - "way => wy" + "alley,aly", + "annex,anx", + "avenue,ave", + "bayou,byu", + "beach,bch", + "bend,bnd", + "bluff,blf", + "bluffs,blfs", + "bottom,btm", + "boulevard,blvd", + "branch,br", + "bridge,brg", + "brook,brk", + "bypass,byp", + "canyon,cyn", + "cape,cp", + "causeway,cswy", + "center,ctr", + "channel,chnnl", + "circle,cir", + "cliff,clf", + "close,cl", + "club,clb", + "common,cmn", + "commons,cmns", + "connector,con", + "corridor,cor", + "course,crse", + "court,ct", + "cove,cv", + "creek,crk", + "crescent,cres", + "crest,crst", + "crossing,xing", + "crossroad,xrd", + "crossroads,xrds", + "curve,curv", + "dale,dl", + "dam,dm", + "drive,dr", + "esplanade,esp", + "expressway,expy", + "extended,ext", + "falls,fls", + "ferry,fry", + "field,fld", + "fields,flds", + "flat,flt", + "flats,flts", + "ford,frd", + "forest,frst", + "forge,frg", + "fork,frk", + "forks,frks", + "freeway,fwy", + "garden,gdn", + "gardens,gdns", + "gateway,gtwy", + "glen,gln", + "glenn,gln", + "green,grn", + "grove,grv", + "harbor,hbr", + "haven,hvn", + "heights,hts", + "highway,hwy", + "hill,hl", + "hills,hls", + "hollow,holw", + "isle,is", + "junction,jct", + "key,ky", + "keys,kys", + "knoll,knl", + "knolls,knls", + "landing,lndg", + "lane,ln", + "light,lgt", + "lights,lgts", + "lock,lck", + "locks,lcks", + "manor,mnr", + "meadow,mdw", + "meadows,mdws", + "mill,ml", + "mills,mls", + "mountain,mnt", + "motorway,mtwy", + "neck,nck", + "orchard,orch", + "parkway,pkwy", + "pasage,psge", + "pier,pr", + "pine,pne", + "pines,pnes", + "place,pl", + "plaza,plz", + "ranch,rnch", + "ridge,rdg", + "ridges,rdgs", + "river,riv", + "road,rd", + "route,rte", + "shore,shr", + "shores,shrs", + "skyway,skwy", + "spring,spg", + "springs,spgs", + "square,sq", + "street,st", + "suite,ste", + "terrace,terr", + "trail,tr", + "trafficway,trfy", + "tunnel,tunl", + "turnpike,tpke", + "valley,vly", + "vista,vis", + "village,vlg", + "way,wy" ] }, "partial_token_address_suffix_expansion": { @@ -664,14 +541,14 @@ "directionals": { "type": "synonym", "synonyms": [ - "southwest => sw", - "southeast => se", - "northwest => nw", - "northeast => ne", - "north => n", - "south => s", - "east => e", - "west => w" + "southwest,sw", + "southeast,se", + "northwest,nw", + "northeast,ne", + "north,n", + "south,s", + "east,e", + "west,w" ] }, "remove_ordinals": { @@ -718,646 +595,6 @@ "type": "pattern_replace", "pattern": "^\u0002(.*)\u0003$", "replacement": "$1" - }, - "keyword_street_suffix_alley": { - "type": "pattern_replace", - "pattern": " alley", - "replacement": " aly" - }, - "keyword_street_suffix_annex": { - "type": "pattern_replace", - "pattern": " annex", - "replacement": " anx" - }, - "keyword_street_suffix_avenue": { - "type": "pattern_replace", - "pattern": " avenue", - "replacement": " ave" - }, - "keyword_street_suffix_bayou": { - "type": "pattern_replace", - "pattern": " bayou", - "replacement": " byu" - }, - "keyword_street_suffix_beach": { - "type": "pattern_replace", - "pattern": " beach", - "replacement": " bch" - }, - "keyword_street_suffix_bend": { - "type": "pattern_replace", - "pattern": " bend", - "replacement": " bnd" - }, - "keyword_street_suffix_bluff": { - "type": "pattern_replace", - "pattern": " bluff", - "replacement": " blf" - }, - "keyword_street_suffix_bluffs": { - "type": "pattern_replace", - "pattern": " bluffs", - "replacement": " blfs" - }, - "keyword_street_suffix_bottom": { - "type": "pattern_replace", - "pattern": " bottom", - "replacement": " btm" - }, - "keyword_street_suffix_boulevard": { - "type": "pattern_replace", - "pattern": " boulevard", - "replacement": " blvd" - }, - "keyword_street_suffix_branch": { - "type": "pattern_replace", - "pattern": " branch", - "replacement": " br" - }, - "keyword_street_suffix_bridge": { - "type": "pattern_replace", - "pattern": " bridge", - "replacement": " brg" - }, - "keyword_street_suffix_brook": { - "type": "pattern_replace", - "pattern": " brook", - "replacement": " brk" - }, - "keyword_street_suffix_bypass": { - "type": "pattern_replace", - "pattern": " bypass", - "replacement": " byp" - }, - "keyword_street_suffix_canyon": { - "type": "pattern_replace", - "pattern": " canyon", - "replacement": " cyn" - }, - "keyword_street_suffix_cape": { - "type": "pattern_replace", - "pattern": " cape", - "replacement": " cp" - }, - "keyword_street_suffix_causeway": { - "type": "pattern_replace", - "pattern": " causeway", - "replacement": " cswy" - }, - "keyword_street_suffix_center": { - "type": "pattern_replace", - "pattern": " center", - "replacement": " ctr" - }, - "keyword_street_suffix_channel": { - "type": "pattern_replace", - "pattern": " channel", - "replacement": " chnnl" - }, - "keyword_street_suffix_circle": { - "type": "pattern_replace", - "pattern": " circle", - "replacement": " cir" - }, - "keyword_street_suffix_cliff": { - "type": "pattern_replace", - "pattern": " cliff", - "replacement": " clf" - }, - "keyword_street_suffix_close": { - "type": "pattern_replace", - "pattern": " close", - "replacement": " cl" - }, - "keyword_street_suffix_club": { - "type": "pattern_replace", - "pattern": " club", - "replacement": " clb" - }, - "keyword_street_suffix_common": { - "type": "pattern_replace", - "pattern": " common", - "replacement": " cmn" - }, - "keyword_street_suffix_commons": { - "type": "pattern_replace", - "pattern": " commons", - "replacement": " cmns" - }, - "keyword_street_suffix_connector": { - "type": "pattern_replace", - "pattern": " connector", - "replacement": " con" - }, - "keyword_street_suffix_corridor": { - "type": "pattern_replace", - "pattern": " corridor", - "replacement": " cor" - }, - "keyword_street_suffix_course": { - "type": "pattern_replace", - "pattern": " course", - "replacement": " crse" - }, - "keyword_street_suffix_court": { - "type": "pattern_replace", - "pattern": " court", - "replacement": " ct" - }, - "keyword_street_suffix_cove": { - "type": "pattern_replace", - "pattern": " cove", - "replacement": " cv" - }, - "keyword_street_suffix_creek": { - "type": "pattern_replace", - "pattern": " creek", - "replacement": " crk" - }, - "keyword_street_suffix_crescent": { - "type": "pattern_replace", - "pattern": " crescent", - "replacement": " cres" - }, - "keyword_street_suffix_crest": { - "type": "pattern_replace", - "pattern": " crest", - "replacement": " crst" - }, - "keyword_street_suffix_crossing": { - "type": "pattern_replace", - "pattern": " crossing", - "replacement": " xing" - }, - "keyword_street_suffix_crossroad": { - "type": "pattern_replace", - "pattern": " crossroad", - "replacement": " xrd" - }, - "keyword_street_suffix_crossroads": { - "type": "pattern_replace", - "pattern": " crossroads", - "replacement": " xrds" - }, - "keyword_street_suffix_curve": { - "type": "pattern_replace", - "pattern": " curve", - "replacement": " curv" - }, - "keyword_street_suffix_dale": { - "type": "pattern_replace", - "pattern": " dale", - "replacement": " dl" - }, - "keyword_street_suffix_dam": { - "type": "pattern_replace", - "pattern": " dam", - "replacement": " dm" - }, - "keyword_street_suffix_drive": { - "type": "pattern_replace", - "pattern": " drive", - "replacement": " dr" - }, - "keyword_street_suffix_esplanade": { - "type": "pattern_replace", - "pattern": " esplanade", - "replacement": " esp" - }, - "keyword_street_suffix_expressway": { - "type": "pattern_replace", - "pattern": " expressway", - "replacement": " expy" - }, - "keyword_street_suffix_extended": { - "type": "pattern_replace", - "pattern": " extended", - "replacement": " ext" - }, - "keyword_street_suffix_falls": { - "type": "pattern_replace", - "pattern": " falls", - "replacement": " fls" - }, - "keyword_street_suffix_ferry": { - "type": "pattern_replace", - "pattern": " ferry", - "replacement": " fry" - }, - "keyword_street_suffix_field": { - "type": "pattern_replace", - "pattern": " field", - "replacement": " fld" - }, - "keyword_street_suffix_fields": { - "type": "pattern_replace", - "pattern": " fields", - "replacement": " flds" - }, - "keyword_street_suffix_flat": { - "type": "pattern_replace", - "pattern": " flat", - "replacement": " flt" - }, - "keyword_street_suffix_flats": { - "type": "pattern_replace", - "pattern": " flats", - "replacement": " flts" - }, - "keyword_street_suffix_ford": { - "type": "pattern_replace", - "pattern": " ford", - "replacement": " frd" - }, - "keyword_street_suffix_forest": { - "type": "pattern_replace", - "pattern": " forest", - "replacement": " frst" - }, - "keyword_street_suffix_forge": { - "type": "pattern_replace", - "pattern": " forge", - "replacement": " frg" - }, - "keyword_street_suffix_fork": { - "type": "pattern_replace", - "pattern": " fork", - "replacement": " frk" - }, - "keyword_street_suffix_forks": { - "type": "pattern_replace", - "pattern": " forks", - "replacement": " frks" - }, - "keyword_street_suffix_freeway": { - "type": "pattern_replace", - "pattern": " freeway", - "replacement": " fwy" - }, - "keyword_street_suffix_garden": { - "type": "pattern_replace", - "pattern": " garden", - "replacement": " gdn" - }, - "keyword_street_suffix_gardens": { - "type": "pattern_replace", - "pattern": " gardens", - "replacement": " gdns" - }, - "keyword_street_suffix_gateway": { - "type": "pattern_replace", - "pattern": " gateway", - "replacement": " gtwy" - }, - "keyword_street_suffix_glen": { - "type": "pattern_replace", - "pattern": " glen", - "replacement": " gln" - }, - "keyword_street_suffix_glenn": { - "type": "pattern_replace", - "pattern": " glenn", - "replacement": " gln" - }, - "keyword_street_suffix_green": { - "type": "pattern_replace", - "pattern": " green", - "replacement": " grn" - }, - "keyword_street_suffix_grove": { - "type": "pattern_replace", - "pattern": " grove", - "replacement": " grv" - }, - "keyword_street_suffix_harbor": { - "type": "pattern_replace", - "pattern": " harbor", - "replacement": " hbr" - }, - "keyword_street_suffix_haven": { - "type": "pattern_replace", - "pattern": " haven", - "replacement": " hvn" - }, - "keyword_street_suffix_heights": { - "type": "pattern_replace", - "pattern": " heights", - "replacement": " hts" - }, - "keyword_street_suffix_highway": { - "type": "pattern_replace", - "pattern": " highway", - "replacement": " hwy" - }, - "keyword_street_suffix_hill": { - "type": "pattern_replace", - "pattern": " hill", - "replacement": " hl" - }, - "keyword_street_suffix_hills": { - "type": "pattern_replace", - "pattern": " hills", - "replacement": " hls" - }, - "keyword_street_suffix_hollow": { - "type": "pattern_replace", - "pattern": " hollow", - "replacement": " holw" - }, - "keyword_street_suffix_isle": { - "type": "pattern_replace", - "pattern": " isle", - "replacement": " is" - }, - "keyword_street_suffix_junction": { - "type": "pattern_replace", - "pattern": " junction", - "replacement": " jct" - }, - "keyword_street_suffix_key": { - "type": "pattern_replace", - "pattern": " key", - "replacement": " ky" - }, - "keyword_street_suffix_keys": { - "type": "pattern_replace", - "pattern": " keys", - "replacement": " kys" - }, - "keyword_street_suffix_knoll": { - "type": "pattern_replace", - "pattern": " knoll", - "replacement": " knl" - }, - "keyword_street_suffix_knolls": { - "type": "pattern_replace", - "pattern": " knolls", - "replacement": " knls" - }, - "keyword_street_suffix_landing": { - "type": "pattern_replace", - "pattern": " landing", - "replacement": " lndg" - }, - "keyword_street_suffix_lane": { - "type": "pattern_replace", - "pattern": " lane", - "replacement": " ln" - }, - "keyword_street_suffix_light": { - "type": "pattern_replace", - "pattern": " light", - "replacement": " lgt" - }, - "keyword_street_suffix_lights": { - "type": "pattern_replace", - "pattern": " lights", - "replacement": " lgts" - }, - "keyword_street_suffix_lock": { - "type": "pattern_replace", - "pattern": " lock", - "replacement": " lck" - }, - "keyword_street_suffix_locks": { - "type": "pattern_replace", - "pattern": " locks", - "replacement": " lcks" - }, - "keyword_street_suffix_manor": { - "type": "pattern_replace", - "pattern": " manor", - "replacement": " mnr" - }, - "keyword_street_suffix_meadow": { - "type": "pattern_replace", - "pattern": " meadow", - "replacement": " mdw" - }, - "keyword_street_suffix_meadows": { - "type": "pattern_replace", - "pattern": " meadows", - "replacement": " mdws" - }, - "keyword_street_suffix_mill": { - "type": "pattern_replace", - "pattern": " mill", - "replacement": " ml" - }, - "keyword_street_suffix_mills": { - "type": "pattern_replace", - "pattern": " mills", - "replacement": " mls" - }, - "keyword_street_suffix_mountain": { - "type": "pattern_replace", - "pattern": " mountain", - "replacement": " mnt" - }, - "keyword_street_suffix_motorway": { - "type": "pattern_replace", - "pattern": " motorway", - "replacement": " mtwy" - }, - "keyword_street_suffix_neck": { - "type": "pattern_replace", - "pattern": " neck", - "replacement": " nck" - }, - "keyword_street_suffix_orchard": { - "type": "pattern_replace", - "pattern": " orchard", - "replacement": " orch" - }, - "keyword_street_suffix_parkway": { - "type": "pattern_replace", - "pattern": " parkway", - "replacement": " pkwy" - }, - "keyword_street_suffix_pasage": { - "type": "pattern_replace", - "pattern": " pasage", - "replacement": " psge" - }, - "keyword_street_suffix_pier": { - "type": "pattern_replace", - "pattern": " pier", - "replacement": " pr" - }, - "keyword_street_suffix_pine": { - "type": "pattern_replace", - "pattern": " pine", - "replacement": " pne" - }, - "keyword_street_suffix_pines": { - "type": "pattern_replace", - "pattern": " pines", - "replacement": " pnes" - }, - "keyword_street_suffix_place": { - "type": "pattern_replace", - "pattern": " place", - "replacement": " pl" - }, - "keyword_street_suffix_plaza": { - "type": "pattern_replace", - "pattern": " plaza", - "replacement": " plz" - }, - "keyword_street_suffix_ranch": { - "type": "pattern_replace", - "pattern": " ranch", - "replacement": " rnch" - }, - "keyword_street_suffix_ridge": { - "type": "pattern_replace", - "pattern": " ridge", - "replacement": " rdg" - }, - "keyword_street_suffix_ridges": { - "type": "pattern_replace", - "pattern": " ridges", - "replacement": " rdgs" - }, - "keyword_street_suffix_river": { - "type": "pattern_replace", - "pattern": " river", - "replacement": " riv" - }, - "keyword_street_suffix_road": { - "type": "pattern_replace", - "pattern": " road", - "replacement": " rd" - }, - "keyword_street_suffix_route": { - "type": "pattern_replace", - "pattern": " route", - "replacement": " rte" - }, - "keyword_street_suffix_shore": { - "type": "pattern_replace", - "pattern": " shore", - "replacement": " shr" - }, - "keyword_street_suffix_shores": { - "type": "pattern_replace", - "pattern": " shores", - "replacement": " shrs" - }, - "keyword_street_suffix_skyway": { - "type": "pattern_replace", - "pattern": " skyway", - "replacement": " skwy" - }, - "keyword_street_suffix_spring": { - "type": "pattern_replace", - "pattern": " spring", - "replacement": " spg" - }, - "keyword_street_suffix_springs": { - "type": "pattern_replace", - "pattern": " springs", - "replacement": " spgs" - }, - "keyword_street_suffix_square": { - "type": "pattern_replace", - "pattern": " square", - "replacement": " sq" - }, - "keyword_street_suffix_street": { - "type": "pattern_replace", - "pattern": " street", - "replacement": " st" - }, - "keyword_street_suffix_suite": { - "type": "pattern_replace", - "pattern": " suite", - "replacement": " ste" - }, - "keyword_street_suffix_terrace": { - "type": "pattern_replace", - "pattern": " terrace", - "replacement": " terr" - }, - "keyword_street_suffix_trail": { - "type": "pattern_replace", - "pattern": " trail", - "replacement": " tr" - }, - "keyword_street_suffix_trafficway": { - "type": "pattern_replace", - "pattern": " trafficway", - "replacement": " trfy" - }, - "keyword_street_suffix_tunnel": { - "type": "pattern_replace", - "pattern": " tunnel", - "replacement": " tunl" - }, - "keyword_street_suffix_turnpike": { - "type": "pattern_replace", - "pattern": " turnpike", - "replacement": " tpke" - }, - "keyword_street_suffix_valley": { - "type": "pattern_replace", - "pattern": " valley", - "replacement": " vly" - }, - "keyword_street_suffix_vista": { - "type": "pattern_replace", - "pattern": " vista", - "replacement": " vis" - }, - "keyword_street_suffix_village": { - "type": "pattern_replace", - "pattern": " village", - "replacement": " vlg" - }, - "keyword_street_suffix_way": { - "type": "pattern_replace", - "pattern": " way", - "replacement": " wy" - }, - "keyword_compass_southwest": { - "type": "pattern_replace", - "pattern": "southwest", - "replacement": "sw" - }, - "keyword_compass_southeast": { - "type": "pattern_replace", - "pattern": "southeast", - "replacement": "se" - }, - "keyword_compass_northwest": { - "type": "pattern_replace", - "pattern": "northwest", - "replacement": "nw" - }, - "keyword_compass_northeast": { - "type": "pattern_replace", - "pattern": "northeast", - "replacement": "ne" - }, - "keyword_compass_north": { - "type": "pattern_replace", - "pattern": "north", - "replacement": "n" - }, - "keyword_compass_south": { - "type": "pattern_replace", - "pattern": "south", - "replacement": "s" - }, - "keyword_compass_east": { - "type": "pattern_replace", - "pattern": "east", - "replacement": "e" - }, - "keyword_compass_west": { - "type": "pattern_replace", - "pattern": "west", - "replacement": "w" } }, "char_filter": { diff --git a/test/settings.js b/test/settings.js index 89432160..c223a77a 100644 --- a/test/settings.js +++ b/test/settings.js @@ -115,12 +115,14 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) { var analyzer = settings().analysis.analyzer.peliasPhrase; t.deepEqual( analyzer.filter, [ "lowercase", - "icu_folding", "trim", - "custom_name", + "remove_duplicate_spaces", "ampersand", - "street_suffix_contractions", + "custom_name", + "street_suffix", "directionals", + "icu_folding", + "remove_ordinals", "unique", "notnull" ]); @@ -196,7 +198,17 @@ module.exports.tests.peliasStreetAnalyzer = function(test, common) { }); test('peliasStreet token filters', function(t) { var analyzer = settings().analysis.analyzer.peliasStreet; - t.equal( analyzer.filter.length, 134, 'lots of filters' ); + t.deepEqual( analyzer.filter, [ + "lowercase", + "trim", + "remove_duplicate_spaces", + "custom_street", + "street_suffix", + "directionals", + "icu_folding", + "remove_ordinals", + "trim" + ]); t.end(); }); }; @@ -302,10 +314,10 @@ module.exports.tests.removeAllZeroNumericPrefixFilter = function(test, common) { // this filter stems common street suffixes // eg. road=>rd and street=>st module.exports.tests.streetSynonymFilter = function(test, common) { - test('has street_suffix_contractions filter', function(t) { + test('has street_suffix filter', function(t) { var s = settings(); - t.equal(typeof s.analysis.filter.street_suffix_contractions, 'object', 'there is an street_suffix_contractions filter'); - var filter = s.analysis.filter.street_suffix_contractions; + t.equal(typeof s.analysis.filter.street_suffix, 'object', 'there is an street_suffix filter'); + var filter = s.analysis.filter.street_suffix; t.equal(filter.type, 'synonym'); t.true(Array.isArray(filter.synonyms)); t.equal(filter.synonyms.length, 120);