diff --git a/lib/natural/tokenizers/sentence_tokenizer.js b/lib/natural/tokenizers/sentence_tokenizer.js index f9454ce2b..abd4f6794 100644 --- a/lib/natural/tokenizers/sentence_tokenizer.js +++ b/lib/natural/tokenizers/sentence_tokenizer.js @@ -34,7 +34,7 @@ util.inherits(SentenceTokenizer, Tokenizer) SentenceTokenizer.prototype.tokenize = function (text) { // break string up in to sentences based on punctation and quotation marks - let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨](.*?[.?!])(\s[.?!])*["'’”'"\])}⟩](?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g) + let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!])(\s[.?!])*["'’”'"\])}⟩]?(?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g) DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens) diff --git a/spec/sentence_tokenizer_spec.js b/spec/sentence_tokenizer_spec.js index d3423b9f9..2d7c26550 100644 --- a/spec/sentence_tokenizer_spec.js +++ b/spec/sentence_tokenizer_spec.js @@ -131,4 +131,31 @@ describe('sentence_tokenizer', function () { 'Same Bat-Channel!' ]) }) + + it('should handle braces and quotes (issue #591)', function () { + expect( + tokenizer.tokenize('Teste. Test test. Test test: “Test.”') + ).toEqual([ + 'Teste.', + 'Test test.', + 'Test test: “Test.”' + ]) + expect( + tokenizer.tokenize('Test Test. Test test, test test (test test) test: “Test.”') + ).toEqual([ + 'Test Test.', + 'Test test, test test (test test) test: “Test.”' + ]) + expect( + tokenizer.tokenize('Test Test. Test test, test (test) test (test test) test: “Test.”') + ).toEqual([ + 'Test Test.', + 'Test test, test (test) test (test test) test: “Test.”' + ]) + expect( + tokenizer.tokenize('Test: Test (test) test “Test.”') + ).toEqual([ + 'Test: Test (test) test “Test.”' + ]) + }) })