diff --git a/tests/test_visualization.py b/tests/test_visualization.py
index 4ffa4566..2a16d295 100644
--- a/tests/test_visualization.py
+++ b/tests/test_visualization.py
@@ -59,3 +59,25 @@ def test_top_words_digits_punctuation(self):
     def test_wordcloud(self):
         s = pd.Series("one two three")
         self.assertEqual(visualization.wordcloud(s), None)
+
+    """
+    Test automated readability index
+    """
+
+    def test_automated_readability_index(self):
+        s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
+        s_true = pd.Series([3.0, 6.0, 0.0])
+        self.assertEqual(visualization.automated_readability_index(s), s_true)
+
+    def test_automated_readability_index_index(self):
+        s = pd.Series(
+            ["New York is a beautiful city.", "Look: New York!", "Wow"],
+            index=[5, 6, 7],
+        )
+        self.assertTrue(
+            visualization.automated_readability_index(s).index.equals(s.index)
+        )
+
+    def test_automated_readability_index_numeric(self):
+        s = pd.Series([1.0, 2.0])
+        self.assertRaises(TypeError, visualization.automated_readability_index, s)
diff --git a/texthero/visualization.py b/texthero/visualization.py
index c363fc17..dbe088fe 100644
--- a/texthero/visualization.py
+++ b/texthero/visualization.py
@@ -7,7 +7,7 @@
 
 from wordcloud import WordCloud
 
-from texthero import preprocessing
+from texthero import preprocessing, nlp
 import string
 
 from matplotlib.colors import LinearSegmentedColormap as lsg
@@ -158,7 +158,7 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
     Return a pandas series with index the top words and as value the count.
 
     Tokenization: split by space and remove all punctuations that are not between characters.
-    
+
     Parameters
     ----------
     normalize :
@@ -185,3 +185,61 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
         .explode()  # one word for each line
         .value_counts(normalize=normalize)
     )
+
+
+def automated_readability_index(s: pd.Series) -> pd.Series:
+    """
+    Calculate the automated readability index (ARI).
+
+    Calculate ARI for each item in the given Pandas Series. Return a Pandas Series with the ARI scores.
+    Score is 0 if it cannot be computed (e.g. if the text has a single word or the number of sentences is 0).
+
+    Parameters
+    ----------
+    s : pd.Series
+        Series of strings to score.
+
+    Returns
+    -------
+    pd.Series
+        ARI score of each item, rounded up to the next integer.
+
+    Raises
+    ------
+    TypeError
+        If ``pd.api.types.is_string_dtype`` reports that ``s`` is not of string dtype.
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
+    >>> hero.automated_readability_index(s)
+    0    3.0
+    1    6.0
+    2    0.0
+    dtype: float64
+
+    References
+    ----------
+    `Automated Readability Index <https://en.wikipedia.org/wiki/Automated_readability_index>`_
+
+    """
+    if not pd.api.types.is_string_dtype(s):
+        raise TypeError("Non-string values in given Series.")
+
+    # Number of gaps between whitespace-separated tokens (token count minus one).
+    words_s = s.str.split().str.len() - 1
+    characters_s = s.str.count(r"[a-zA-Z0-9]")  # Regex for alphanumeric.
+    sentences_s = nlp.count_sentences(s)
+
+    score_s = (
+        4.71 * (characters_s / words_s) + 0.5 * (words_s / sentences_s) - 21.43
+    )
+    score_s = np.ceil(score_s)
+
+    # Pandas does not raise an Error when dividing by zero -> replace
+    # the resulting non-finite values (inf / NaN) with 0 ourselves.
+    score_s.loc[~np.isfinite(score_s)] = 0
+
+    return score_s