Skip to content

Commit

Permalink
Merge pull request #3 from benniekrijger/fix-tokenizer
Browse files Browse the repository at this point in the history
Pattern fix in whitespace and punctuation tokenizer
  • Loading branch information
fieg committed Apr 15, 2014
2 parents d2aaa9b + f9e6af8 commit 9492fba
Showing 1 changed file with 3 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

class WhitespaceAndPunctuationTokenizer implements TokenizerInterface
{
/**
 * Regular expression handed to preg_split() in tokenize() to decide where the
 * lower-cased input is cut into tokens.
 *
 * The character class lists: space, comma, period, "?", the range "!-:", ";",
 * newline, carriage return, tab, the horizontal ellipsis and "_". The doubled
 * backslashes leave "\n", "\r" and "\t" for the regex engine to interpret.
 *
 * NOTE(review): the unescaped "-" between "!" and ":" forms a range
 * (0x21 to 0x3A), so the digits 0-9 as well as quotes, apostrophes,
 * parentheses and other ASCII punctuation in that range also act as
 * separators. Confirm this is intended; if only a literal hyphen was meant,
 * it has to be escaped or moved to the edge of the class.
 *
 * NOTE(review): the pattern carries no "u" modifier, so the multi-byte
 * ellipsis is presumably matched byte by byte (assumes this file is saved as
 * UTF-8 - TODO confirm), which could split other multi-byte characters that
 * share one of those bytes.
 *
 * @var string
 */
protected $pattern = "/[ ,.?!-:;\\n\\r\\t…_]/";

public function tokenize($string)
{
$retval = preg_split("/[ ,.?!-:\n\t]/i", mb_strtolower($string));
$retval = preg_split($this->pattern, mb_strtolower($string));
$retval = array_filter($retval, 'trim');
$retval = array_values($retval);

Expand Down

0 comments on commit 9492fba

Please sign in to comment.