Implement Automated Readability Index, Closes jbesomi#20
New pull request from jbesomi#46 as we had some Git problems.

Input checking is done with pd.api.types.is_string_dtype (a minimal sketch of the
check is shown below). This is not a permanent solution and will be improved by
jbesomi#60 etc.
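A minimal illustrative sketch of this check, assuming only that the Series dtype is inspected before any computation; the helper name _check_string_series is hypothetical:

import pandas as pd

def _check_string_series(s: pd.Series) -> None:
    # Same check as in automated_readability_index below: reject Series
    # that do not hold string values.
    if not pd.api.types.is_string_dtype(s):
        raise TypeError("Non-string values in given Series.")

_check_string_series(pd.Series(["New York is a beautiful city."]))  # ok
# _check_string_series(pd.Series([1.0, 2.0]))  # would raise TypeError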

Co-authored-by: Maximilian Krahn <[email protected]>
henrifroese and mk2510 committed Jul 12, 2020
1 parent 452c93c commit ec2dfa8
Showing 2 changed files with 66 additions and 2 deletions.
22 changes: 22 additions & 0 deletions tests/test_visualization.py
@@ -59,3 +59,25 @@ def test_top_words_digits_punctuation(self):
def test_wordcloud(self):
s = pd.Series("one two three")
self.assertEqual(visualization.wordcloud(s), None)

"""
Test automated readability index
"""

def test_automated_readability_index(self):
s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
s_true = pd.Series([3.0, 6.0, 0.0])
pd.testing.assert_series_equal(visualization.automated_readability_index(s), s_true)

def test_automated_readability_index_index(self):
s = pd.Series(
["New York is a beautiful city.", "Look: New York!", "Wow"],
index=[5, 6, 7],
)
self.assertTrue(
visualization.automated_readability_index(s).index.equals(s.index)
)

def test_automated_readability_index_numeric(self):
s = pd.Series([1.0, 2.0])
self.assertRaises(TypeError, visualization.automated_readability_index, s)
46 changes: 44 additions & 2 deletions texthero/visualization.py
@@ -7,7 +7,7 @@

from wordcloud import WordCloud

from texthero import preprocessing
from texthero import preprocessing, nlp
import string

from matplotlib.colors import LinearSegmentedColormap as lsg
@@ -158,7 +158,7 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
Return a Pandas Series with the top words as index and their counts as values.
Tokenization: split by space and remove all punctuation that is not between characters.
Parameters
----------
normalize :
@@ -185,3 +185,45 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:
.explode() # one word for each line
.value_counts(normalize=normalize)
)


def automated_readability_index(s: pd.Series) -> pd.Series:
"""
Calculate the automated readability index (ARI).
Calculate the ARI for each document in the given Pandas Series and return a Pandas Series with the scores.
The score is set to 0 if it cannot be computed, e.g. when the number of words or sentences is 0.
Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"])
>>> hero.automated_readability_index(s)
0 3.0
1 6.0
2 0.0
dtype: float64
Reference
---------
`Automated Readability Index <https://en.wikipedia.org/wiki/Automated_readability_index>`_
"""
if not pd.api.types.is_string_dtype(s):
raise TypeError("Non-string values in given Series.")

words_s = s.str.split().str.len() - 1  # Whitespace-separated tokens per document, minus one.
characters_s = s.str.count(r"[a-zA-Z0-9]") # Regex for alphanumeric.
sentences_s = nlp.count_sentences(s)

score_s = (
4.71 * (characters_s / words_s) + 0.5 * (words_s / sentences_s) - 21.43
)
score_s = np.ceil(score_s)

# Pandas does not raise an error when dividing by zero; it yields
# inf/NaN instead, so we replace those non-finite scores with 0 ourselves.
score_s.loc[~np.isfinite(score_s)] = 0

return score_s
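As a minimal sketch of how the first doctest value above falls out of this implementation (assuming its counting rules: alphanumeric characters, whitespace-split tokens minus one, and one sentence), the arithmetic can be checked in plain Python:

import math

text = "New York is a beautiful city."
characters = sum(c.isalnum() for c in text)  # 23 alphanumeric characters
words = len(text.split()) - 1                # 5: whitespace tokens minus one, as above
sentences = 1                                # one sentence

score = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
assert math.ceil(score) == 3  # matches the doctest output of 3.0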
