From 0d51e27c4f28c986e8ee39a9ea0355f0baa6f7c3 Mon Sep 17 00:00:00 2001
From: Martin Freisehner
Date: Mon, 21 Feb 2022 10:42:30 +0100
Subject: [PATCH 1/2] allow word tokenization override

---
 README.md                    | 11 ++++++++++
 readability/readability.py   | 11 ++++++++--
 readability/text/analyzer.py | 11 +++++++++-
 test/test_tokenize.py        | 39 ++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 3 deletions(-)
 create mode 100644 test/test_tokenize.py

diff --git a/README.md b/README.md
index a2ca0a7..cf5ae44 100644
--- a/README.md
+++ b/README.md
@@ -240,6 +240,17 @@ print(lw.score)
 print(lw.grade_level)
 ```
 
+## What makes a word
+
+Bear in mind that there is no universal consensus on what is (and is not) a word. The default word tokenizer keeps
+clitics as one word (e.g. `we've`) but splits abbreviations (e.g. `['U', '.', 'S', '.']`). To change this behavior,
+pass your own tokenizer.
+**_Example:_**
+```python
+from nltk import word_tokenize
+r = Readability(text, f_tokenize_words=word_tokenize)
+```
+
 ## [Contributing](CONTRIBUTING.md)
 
 Contributions are welcome!

diff --git a/readability/readability.py b/readability/readability.py
index 91341b7..1f67604 100644
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -4,8 +4,15 @@
 
 
 class Readability:
-    def __init__(self, text):
-        self._analyzer = Analyzer()
+    def __init__(self, text, f_tokenize_words=None):
+        """
+        :param text: str
+            Input text. Consider cleaning it before computing readability metrics.
+        :param f_tokenize_words: callable, default=None
+            Override the default word tokenization;
+            Example: lambda txt: nltk.word_tokenize(txt)
+        """
+        self._analyzer = Analyzer(f_tokenize_words)
         self._statistics = self._analyzer.analyze(text)
 
     def ari(self):

diff --git a/readability/text/analyzer.py b/readability/text/analyzer.py
index dce409e..de40f21 100644
--- a/readability/text/analyzer.py
+++ b/readability/text/analyzer.py
@@ -56,7 +56,13 @@ def __str__(self):
 
 
 class Analyzer:
-    def __init__(self):
+    def __init__(self, f_tokenize_words=None):
+        """
+        :param f_tokenize_words: callable, default=None
+            Override the default word tokenization;
+            Example: lambda txt: nltk.word_tokenize(txt)
+        """
+        self.f_tokenize_words = f_tokenize_words
         pass
 
     def analyze(self, text):
@@ -125,6 +131,9 @@ def _tokenize_sentences(self, text):
         return sent_tokenize(text)
 
     def _tokenize(self, text):
+        if self.f_tokenize_words:
+            return self.f_tokenize_words(text)
+
         tokenizer = TweetTokenizer()
         return tokenizer.tokenize(text)
 

diff --git a/test/test_tokenize.py b/test/test_tokenize.py
new file mode 100644
index 0000000..3c52922
--- /dev/null
+++ b/test/test_tokenize.py
@@ -0,0 +1,39 @@
+import unittest
+
+from nltk import word_tokenize
+
+from readability import Readability
+
+
+class WordTokenizeTest(unittest.TestCase):
+    """
+    Tests overriding the default word tokenizer
+    """
+
+    def setUp(self):
+        # taken from https://en.wikipedia.org/wiki/Word#Summary
+        self.text = """
+        What makes a word?
+        In linguistics, a word of a spoken language can be defined as the smallest sequence of phonemes that can be
+        uttered in isolation with objective or practical meaning.
+        There have been many proposed criteria for identifying words.
+        However, no definition has been found to apply to all languages.
+        Dictionaries categorize a language's lexicon (i.e., its vocabulary) into lemmas.
+        These can be taken as an indication of what constitutes a "word" in the opinion of the writers of that language.
+        The most appropriate means of measuring the length of a word is by counting its syllables or morphemes.
+        When a word has multiple definitions or multiple senses, it may result in confusion in a debate or discussion.
+        """
+
+    def test_nltk_treebank_tokenizer(self):
+        r = Readability(self.text, f_tokenize_words=word_tokenize).ari()
+        print(r)
+        self.assertEqual(8.730483870967742, r.score)
+        self.assertEqual(['9'], r.grade_levels)
+        self.assertEqual([14, 15], r.ages)
+
+    def test_default_tokenizer(self):
+        r = Readability(self.text).ari()
+        print(r)
+        self.assertEqual(8.578548387096774, r.score)
+        self.assertEqual(['9'], r.grade_levels)
+        self.assertEqual([14, 15], r.ages)

From 0debeb158006e4a86344283c6f07cdb362ac6044 Mon Sep 17 00:00:00 2001
From: Martin Freisehner
Date: Mon, 21 Feb 2022 10:59:24 +0100
Subject: [PATCH 2/2] removed prints

---
 test/test_tokenize.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/test_tokenize.py b/test/test_tokenize.py
index 3c52922..44247f1 100644
--- a/test/test_tokenize.py
+++ b/test/test_tokenize.py
@@ -26,14 +26,12 @@ def setUp(self):
 
     def test_nltk_treebank_tokenizer(self):
         r = Readability(self.text, f_tokenize_words=word_tokenize).ari()
-        print(r)
         self.assertEqual(8.730483870967742, r.score)
         self.assertEqual(['9'], r.grade_levels)
         self.assertEqual([14, 15], r.ages)
 
     def test_default_tokenizer(self):
         r = Readability(self.text).ari()
-        print(r)
         self.assertEqual(8.578548387096774, r.score)
         self.assertEqual(['9'], r.grade_levels)
         self.assertEqual([14, 15], r.ages)
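
Note: below is a minimal end-to-end sketch of the `f_tokenize_words` hook this patch adds, using a hand-rolled tokenizer instead of NLTK's. The `simple_word_tokenize` function is illustrative only (it is not part of this library or of NLTK); any callable that takes a string and returns a list of word tokens will do.

```python
import re

from readability import Readability


def simple_word_tokenize(text):
    # Keep alphanumeric runs joined by internal apostrophes or periods, so
    # clitics ("we've") and abbreviations ("U.S") come out as single tokens.
    # This function is hypothetical, defined here only to demonstrate the hook.
    return re.findall(r"[A-Za-z0-9]+(?:['.][A-Za-z0-9]+)*", text)


# The library requires a minimum of 100 words, so repeat a sentence to get a
# long-enough sample. Sentence splitting still uses NLTK internally, so the
# punkt model must be installed (python -m nltk.downloader punkt).
text = "The quick brown fox jumps over the lazy dog. " * 20

r = Readability(text, f_tokenize_words=simple_word_tokenize)
print(r.ari().score)
```

Because the hook accepts any plain callable mapping a string to a token list, it stays agnostic to where the tokens come from: an NLTK tokenizer, a regex, or a bound method of a tokenizer object all work unchanged.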