cdimascio · freecraver · Feb 21, 2022 · Feb 21, 2022
diff --git a/README.md b/README.md
@@ -240,6 +240,17 @@ print(lw.score)
 print(lw.grade_level)
 ```
 
+## What makes a word
+
+Bear in mind that there is no consensus on what counts as a word. The default word tokenizer keeps clitics as one word
+(e.g. `we've`) and splits abbreviations into separate tokens (e.g. `U.S.` becomes `['U', '.', 'S', '.']`). To change
+this behavior, pass your own tokenizer via the `f_tokenize_words` argument.  
+**_example:_**
+```python
+from nltk import word_tokenize
+r = Readability(text, f_tokenize_words=word_tokenize)
+```
+
 ## [Contributing](CONTRIBUTING.md)
 
 Contributions are welcome!

diff --git a/readability/readability.py b/readability/readability.py
@@ -4,8 +4,15 @@
 
 
 class Readability:
-    def __init__(self, text):
-        self._analyzer = Analyzer()
+    def __init__(self, text, f_tokenize_words=None):
+        """
+        :param text: str
+            Input text. Consider cleaning your text before calculating readability metrics.
+        :param f_tokenize_words: callable, default=None
+            Overrides the default word tokenizer; called with the text to tokenize.
+            Example: nltk.word_tokenize
+        """
+        self._analyzer = Analyzer(f_tokenize_words)
         self._statistics = self._analyzer.analyze(text)
 
     def ari(self):

diff --git a/readability/text/analyzer.py b/readability/text/analyzer.py
@@ -56,7 +56,13 @@ def __str__(self):
 
 
 class Analyzer:
-    def __init__(self):
+    def __init__(self, f_tokenize_words=None):
+        """
+        :param f_tokenize_words: callable, default=None
+            Overrides the default word tokenizer; called with the text to tokenize.
+            Example: nltk.word_tokenize
+        """
+        self.f_tokenize_words = f_tokenize_words
         pass
 
     def analyze(self, text):
@@ -125,6 +131,9 @@ def _tokenize_sentences(self, text):
         return sent_tokenize(text)
 
     def _tokenize(self, text):
+        if self.f_tokenize_words:
+            return self.f_tokenize_words(text)
+
         tokenizer = TweetTokenizer()
         return tokenizer.tokenize(text)
 

diff --git a/test/test_tokenize.py b/test/test_tokenize.py
@@ -0,0 +1,37 @@
+import unittest
+
+from nltk import word_tokenize
+
+from readability import Readability
+
+
+class WordTokenizeTest(unittest.TestCase):
+    """
+    Tests word tokenization with both the default and a user-supplied tokenizer
+    """
+
+    def setUp(self):
+        # taken from https://en.wikipedia.org/wiki/Word#Summary
+        self.text = """
+        What makes a word? 
+        In linguistics, a word of a spoken language can be defined as the smallest sequence of phonemes that can be 
+        uttered in isolation with objective or practical meaning.
+        There have been many proposed criteria for identifying words. 
+        However, no definition has been found to apply to all languages. 
+        Dictionaries categorize a language's lexicon (i.e., its vocabulary) into lemmas.
+        These can be taken as an indication of what constitutes a "word" in the opinion of the writers of that language.
+        The most appropriate means of measuring the length of a word is by counting its syllables or morphemes. 
+        When a word has multiple definitions or multiple senses, it may result in confusion in a debate or discussion.
+        """
+
+    def test_nltk_treebank_tokenizer(self):
+        r = Readability(self.text, f_tokenize_words=word_tokenize).ari()
+        self.assertEqual(8.730483870967742, r.score)
+        self.assertEqual(['9'], r.grade_levels)
+        self.assertEqual([14, 15], r.ages)
+
+    def test_default_tokenizer(self):
+        r = Readability(self.text).ari()
+        self.assertEqual(8.578548387096774, r.score)
+        self.assertEqual(['9'], r.grade_levels)
+        self.assertEqual([14, 15], r.ages)