Skip to content

Commit

Permalink
Added support for words with hyphens to aggressive tokenizer for French (#610)
Browse files Browse the repository at this point in the history

and Dutch
  • Loading branch information
Hugo-ter-Doest authored Aug 30, 2021
1 parent 0dd86f9 commit f84d1bc
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 2 deletions.
2 changes: 1 addition & 1 deletion lib/natural/tokenizers/aggressive_tokenizer_fr.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ module.exports = AggressiveTokenizer

/**
 * Tokenizes French text by splitting it on every run of characters that is
 * not part of the allowed "word" character class, then hands the pieces to
 * `this.trim` (defined outside this hunk).
 *
 * NOTE: this is a diff hunk rendered without +/- markers ("1 addition &
 * 1 deletion"). The two `return` lines are the old and the new version of
 * the same statement, not two live statements.
 *
 * @param {string} text - input to tokenize (`.split` is called on it).
 * @returns whatever `this.trim` returns for the split pieces; the spec in
 *   this commit expects an array of token strings.
 */
AggressiveTokenizer.prototype.tokenize = function (text) {
// break a string up into an array of tokens by anything non-word
// Old line (deleted by this commit): `-` is not in the class, so a hyphen
// acts as a separator and "sous-bois" is split into "sous" and "bois".
return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç]+/i))
// New line (added by this commit): a trailing `-` inside the brackets is a
// literal hyphen, so hyphens no longer separate and "sous-bois" stays whole.
// The `i` flag makes the listed letters match in either case.
// NOTE(review): a free-standing hyphen (e.g. "a - b") is no longer a
// separator either, so split yields "-" as its own piece — confirm whether
// `this.trim` or callers are expected to filter that out.
return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç-]+/i))
}
2 changes: 1 addition & 1 deletion lib/natural/tokenizers/aggressive_tokenizer_nl.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ module.exports = AggressiveTokenizer

/**
 * Tokenizes Dutch text by splitting it on every run of characters that is
 * not an ASCII letter, digit, underscore, apostrophe or (after this commit)
 * hyphen, then hands the pieces to `this.trim` (defined outside this hunk).
 *
 * NOTE: this is a diff hunk rendered without +/- markers ("1 addition &
 * 1 deletion"). The two `return` lines are the old and the new version of
 * the same statement, not two live statements.
 *
 * @param {string} text - input to tokenize (`.split` is called on it).
 * @returns whatever `this.trim` returns for the split pieces; the spec in
 *   this commit expects an array of token strings.
 */
AggressiveTokenizer.prototype.tokenize = function (text) {
// break a string up into an array of tokens by anything non-word
// Old line (deleted by this commit): hyphens separate tokens, so
// "kant-en-klaar" is split into "kant", "en", "klaar".
return this.trim(text.split(/[^a-zA-Z0-9_']+/))
// New line (added by this commit): the trailing `-` inside the brackets is a
// literal hyphen, so hyphenated compounds stay in one piece. The apostrophe
// is kept as well, which is what lets "'s" and "radio's" survive intact.
// NOTE(review): unlike the French pattern, this class lists no accented
// letters, so a character such as "ë" or "é" acts as a separator — confirm
// whether that is intended for Dutch input.
return this.trim(text.split(/[^a-zA-Z0-9_'-]+/))
}
7 changes: 7 additions & 0 deletions spec/aggressive_tokenizer_fr_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@ describe('aggressive_tokenizer_fr', function () {
expect(tokenizer.tokenize(text)).toEqual(tokenized)
})

// Spec added with the hyphen change: French compounds written with a hyphen
// ("sous-pages", "sous-bois") must come back as single tokens, while the
// surrounding words are still split on spaces. `tokenizer` is created outside
// this hunk.
it('should handle hyphens in words correctly', function () {
const sentence = 'Des sous-pages dans le sous-bois de la ville de Paris'
const res = tokenizer.tokenize(sentence)
// Original casing is expected to be preserved ("Des", "Paris").
const expectedRes = ['Des', 'sous-pages', 'dans', 'le', 'sous-bois', 'de', 'la', 'ville', 'de', 'Paris']
expect(res).toEqual(expectedRes)
})

/*
it('should tokenize strings via attached string method', function() {
tokenizer.attach();
Expand Down
7 changes: 7 additions & 0 deletions spec/aggressive_tokenizer_nl_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ describe('aggressive_tokenizer_nl', function () {
expect(tokenizer.tokenize('\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s')).toEqual(['\'s', 'Morgens', 'is', 'het', 'nog', 'erg', 'koud', 'vertelde', 'de', 'weerman', 'over', 'een', 'van', 'de', 'radio\'s'])
})

// Spec added with the hyphen change: Dutch compounds with one or several
// internal hyphens must each come back as a single token. The input mixes
// space-separated and comma-separated items, so it also checks that commas
// are still treated as separators and do not end up in the tokens.
// `tokenizer` is created outside this hunk.
it('should handle hyphens in words correctly', function () {
const sentence = 'clearing-systeem front-office-automatisering christelijk-historisch mond-op-mond, kant-en-klaar, kruidje-roer-me-niet, doe-het-zelver'
const res = tokenizer.tokenize(sentence)
const expectedRes = ['clearing-systeem', 'front-office-automatisering', 'christelijk-historisch', 'mond-op-mond', 'kant-en-klaar', 'kruidje-roer-me-niet', 'doe-het-zelver']
expect(res).toEqual(expectedRes)
})

/*
it('should tokenize strings via attached string method', function() {
tokenizer.attach();
Expand Down

0 comments on commit f84d1bc

Please sign in to comment.