Implement Automated Readability Index, Closes jbesomi#20
New pull request from jbesomi#46 as we had some Git problems.

Input checking is done with pd.api.types.is_string_dtype (a minimal sketch of the
check is shown below). This is not a permanent solution and will be improved by
jbesomi#60 etc.
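A minimal illustrative sketch of this check, assuming only that the Series dtype is inspected before any computation; the helper name _check_string_series is hypothetical:

import pandas as pd

def _check_string_series(s: pd.Series) -> None:
    # Same check as in automated_readability_index below: reject Series
    # that do not hold string values.
    if not pd.api.types.is_string_dtype(s):
        raise TypeError("Non-string values in given Series.")

_check_string_series(pd.Series(["New York is a beautiful city."]))  # ok
# _check_string_series(pd.Series([1.0, 2.0]))  # would raise TypeError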

Co-authored-by: Maximilian Krahn <[email protected]>
henrifroese and mk2510 committed Jul 12, 2020
1 parent 452c93c commit ec2dfa8
Showing 2 changed files with 66 additions and 2 deletions.
22 changes: 22 additions & 0 deletions tests/test_visualization.py
@@ -59,3 +59,25 @@ def test_top_words_digits_punctuation(self):
def test_wordcloud(self):
s = pd.Series("one two three")
self.assertEqual(visualization.wordcloud(s), None)

"""
Test automated readability index
"""

def test_automated_readability_index(self):
s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
s_true = pd.Series([3.0, 6.0, 0.0])
pd.testing.assert_series_equal(visualization.automated_readability_index(s), s_true)

def test_automated_readability_index_index(self):
s = pd.Series(
["New York is a beautiful city.", "Look: New York!", "Wow"],
index=[5, 6, 7],
)
self.assertTrue(
visualization.automated_readability_index(s).index.equals(s.index)
)

def test_automated_readability_index_numeric(self):
s = pd.Series([1.0, 2.0])
self.assertRaises(TypeError, visualization.automated_readability_index, s)
46 changes: 44 additions & 2 deletions texthero/visualization.py
@@ -7,7 +7,7 @@

from wordcloud import WordCloud

from texthero import preprocessing
from texthero import preprocessing, nlp
import string

from matplotlib.colors import LinearSegmentedColormap as lsg
@@ -158,7 +158,7 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
Return a Pandas Series with the top words as index and their counts as values.
Tokenization: split by space and remove all punctuation that is not between characters.
Parameters
----------
normalize :
@@ -185,3 +185,45 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
.explode() # one word for each line
.value_counts(normalize=normalize)
)


def automated_readability_index(s: pd.Series) -> pd.Series:
"""
Calculate the automated readability index (ARI).
Calculate the ARI for each document in the given Pandas Series and return a Pandas Series with the scores.
The score is set to 0 if it cannot be computed, e.g. when the number of words or sentences is 0.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
>>> hero.automated_readability_index(s)
0 3.0
1 6.0
2 0.0
dtype: float64
Reference
---------
`Automated Readability Index <https://en.wikipedia.org/wiki/Automated_readability_index>`_
"""
if not pd.api.types.is_string_dtype(s):
raise TypeError("Non-string values in given Series.")

words_s = s.str.split().str.len() - 1  # Whitespace-separated tokens per document, minus one.
characters_s = s.str.count(r"[a-zA-Z0-9]") # Regex for alphanumeric.
sentences_s = nlp.count_sentences(s)

score_s = (
4.71 * (characters_s / words_s) + 0.5 * (words_s / sentences_s) - 21.43
)
score_s = np.ceil(score_s)

# Pandas does not raise an error when dividing by zero; it yields
# inf/NaN instead, so we replace those non-finite scores with 0 ourselves.
score_s.loc[~np.isfinite(score_s)] = 0

return score_s
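As a minimal sketch of how the first doctest value above falls out of this implementation (assuming its counting rules: alphanumeric characters, whitespace-split tokens minus one, and one sentence), the arithmetic can be checked in plain Python:

import math

text = "New York is a beautiful city."
characters = sum(c.isalnum() for c in text)  # 23 alphanumeric characters
words = len(text.split()) - 1                # 5: whitespace tokens minus one, as above
sentences = 1                                # one sentence

score = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
assert math.ceil(score) == 3  # matches the doctest output of 3.0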
