Added support for words with hyphens to aggressive tokenizer for French (#610) and Dutch
NaturalNode · Aug 30, 2021 · f84d1bc · f84d1bc
1 parent 0dd86f9
commit f84d1bc
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 2 deletions.
diff --git a/lib/natural/tokenizers/aggressive_tokenizer_fr.js b/lib/natural/tokenizers/aggressive_tokenizer_fr.js
@@ -34,5 +34,5 @@ module.exports = AggressiveTokenizer
 
 AggressiveTokenizer.prototype.tokenize = function (text) {
   // break a string up into an array of tokens by anything non-word
-  return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç]+/i))
+  return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç-]+/i))
 }
diff --git a/lib/natural/tokenizers/aggressive_tokenizer_nl.js b/lib/natural/tokenizers/aggressive_tokenizer_nl.js
@@ -34,5 +34,5 @@ module.exports = AggressiveTokenizer
 
 AggressiveTokenizer.prototype.tokenize = function (text) {
   // break a string up into an array of tokens by anything non-word
-  return this.trim(text.split(/[^a-zA-Z0-9_']+/))
+  return this.trim(text.split(/[^a-zA-Z0-9_'-]+/))
 }
diff --git a/spec/aggressive_tokenizer_fr_spec.js b/spec/aggressive_tokenizer_fr_spec.js
@@ -95,6 +95,13 @@ describe('aggressive_tokenizer_fr', function () {
     expect(tokenizer.tokenize(text)).toEqual(tokenized)
   })
 
+  it('should handle hyphens in words correctly', function () {
+    const sentence = 'Des sous-pages dans le sous-bois de la ville de Paris'
+    const res = tokenizer.tokenize(sentence)
+    const expectedRes = ['Des', 'sous-pages', 'dans', 'le', 'sous-bois', 'de', 'la', 'ville', 'de', 'Paris']
+    expect(res).toEqual(expectedRes)
+  })
+
   /*
   it('should tokenize strings via attached string method', function() {
     tokenizer.attach();

diff --git a/spec/aggressive_tokenizer_nl_spec.js b/spec/aggressive_tokenizer_nl_spec.js
@@ -30,6 +30,13 @@ describe('aggressive_tokenizer_nl', function () {
     expect(tokenizer.tokenize('\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s')).toEqual(['\'s', 'Morgens', 'is', 'het', 'nog', 'erg', 'koud', 'vertelde', 'de', 'weerman', 'over', 'een', 'van', 'de', 'radio\'s'])
   })
 
+  it('should handle hyphens in words correctly', function () {
+    const sentence = 'clearing-systeem front-office-automatisering christelijk-historisch mond-op-mond, kant-en-klaar, kruidje-roer-me-niet, doe-het-zelver'
+    const res = tokenizer.tokenize(sentence)
+    const expectedRes = ['clearing-systeem', 'front-office-automatisering', 'christelijk-historisch', 'mond-op-mond', 'kant-en-klaar', 'kruidje-roer-me-niet', 'doe-het-zelver']
+    expect(res).toEqual(expectedRes)
+  })
+
   /*
   it('should tokenize strings via attached string method', function() {
     tokenizer.attach();