Keep intra-word hyphens in the French and Dutch aggressive tokenizers.

Review notes:
* Tokens are matched as runs of word characters joined by single hyphens, so
  "sous-bois" stays one token while free-standing or repeated hyphens
  ("Paris - Lyon", "- Bonjour", "--") still act as separators and are never
  emitted as tokens. `|| []` covers String.prototype.match returning null
  when the text contains no word characters.
* NOTE(review): indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy of this patch; confirm they match the target
  files before applying.
* NOTE(review): the post-image blob ids on the `index` lines predate the
  edits made to the `+` lines during review.

diff --git a/lib/natural/tokenizers/aggressive_tokenizer_fr.js b/lib/natural/tokenizers/aggressive_tokenizer_fr.js
index 3e7297f50..ca55adafd 100644
--- a/lib/natural/tokenizers/aggressive_tokenizer_fr.js
+++ b/lib/natural/tokenizers/aggressive_tokenizer_fr.js
@@ -34,5 +34,5 @@ module.exports = AggressiveTokenizer
 
 AggressiveTokenizer.prototype.tokenize = function (text) {
   // break a string up into an array of tokens by anything non-word
-  return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç]+/i))
+  return this.trim(text.match(/[a-z0-9äâàéèëêïîöôùüûœç]+(?:-[a-z0-9äâàéèëêïîöôùüûœç]+)*/gi) || [])
 }
diff --git a/lib/natural/tokenizers/aggressive_tokenizer_nl.js b/lib/natural/tokenizers/aggressive_tokenizer_nl.js
index 6a9ca999f..cc80938c1 100644
--- a/lib/natural/tokenizers/aggressive_tokenizer_nl.js
+++ b/lib/natural/tokenizers/aggressive_tokenizer_nl.js
@@ -34,5 +34,5 @@ module.exports = AggressiveTokenizer
 
 AggressiveTokenizer.prototype.tokenize = function (text) {
   // break a string up into an array of tokens by anything non-word
-  return this.trim(text.split(/[^a-zA-Z0-9_']+/))
+  return this.trim(text.match(/[a-zA-Z0-9_']+(?:-[a-zA-Z0-9_']+)*/g) || [])
 }
diff --git a/spec/aggressive_tokenizer_fr_spec.js b/spec/aggressive_tokenizer_fr_spec.js
index aa55c0380..4368f5f91 100644
--- a/spec/aggressive_tokenizer_fr_spec.js
+++ b/spec/aggressive_tokenizer_fr_spec.js
@@ -95,6 +95,17 @@ describe('aggressive_tokenizer_fr', function () {
     expect(tokenizer.tokenize(text)).toEqual(tokenized)
   })
 
+  it('should handle hyphens in words correctly', function () {
+    const sentence = 'Des sous-pages dans le sous-bois de la ville de Paris'
+    const res = tokenizer.tokenize(sentence)
+    const expectedRes = ['Des', 'sous-pages', 'dans', 'le', 'sous-bois', 'de', 'la', 'ville', 'de', 'Paris']
+    expect(res).toEqual(expectedRes)
+  })
+
+  it('should not emit free-standing hyphens as tokens', function () {
+    expect(tokenizer.tokenize('- Paris - Lyon --')).toEqual(['Paris', 'Lyon'])
+  })
+
   /*
   it('should tokenize strings via attached string method', function() {
     tokenizer.attach();
diff --git a/spec/aggressive_tokenizer_nl_spec.js b/spec/aggressive_tokenizer_nl_spec.js
index a241cc597..edc21cc57 100644
--- a/spec/aggressive_tokenizer_nl_spec.js
+++ b/spec/aggressive_tokenizer_nl_spec.js
@@ -30,6 +30,17 @@ describe('aggressive_tokenizer_nl', function () {
     expect(tokenizer.tokenize('\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s')).toEqual(['\'s', 'Morgens', 'is', 'het', 'nog', 'erg', 'koud', 'vertelde', 'de', 'weerman', 'over', 'een', 'van', 'de', 'radio\'s'])
   })
 
+  it('should handle hyphens in words correctly', function () {
+    const sentence = 'clearing-systeem front-office-automatisering christelijk-historisch mond-op-mond, kant-en-klaar, kruidje-roer-me-niet, doe-het-zelver'
+    const res = tokenizer.tokenize(sentence)
+    const expectedRes = ['clearing-systeem', 'front-office-automatisering', 'christelijk-historisch', 'mond-op-mond', 'kant-en-klaar', 'kruidje-roer-me-niet', 'doe-het-zelver']
+    expect(res).toEqual(expectedRes)
+  })
+
+  it('should not emit free-standing hyphens as tokens', function () {
+    expect(tokenizer.tokenize('- kant-en-klaar - doe-het-zelver --')).toEqual(['kant-en-klaar', 'doe-het-zelver'])
+  })
+
   /*
   it('should tokenize strings via attached string method', function() {
     tokenizer.attach();