From 452c93cbea39d0f17aa159a48fb12a9441be8f52 Mon Sep 17 00:00:00 2001
From: henrifroese <50276689+henrifroese@users.noreply.github.com>
Date: Sat, 11 Jul 2020 10:44:40 +0200
Subject: [PATCH] Add count_sentences function to nlp.py (#51)

* Add count_sentences function to nlp.py

Also add tests for the function to test_nlp.py

* Implement suggestions from pull request.

Add more tests, change style (docstring, tests naming).
Remove unicode-casting to avoid unexpected behaviour.

* Add link to spacy documentation.

Additionally update index tests, they're cleaner now.

Co-authored-by: Henri Froese
---
 tests/test_nlp.py | 32 ++++++++++++++++++++++++++++++++
 texthero/nlp.py   | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index bd062a9e..2df9db61 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import numpy as np
 
 from texthero import nlp
 from . import PandasTestCase
@@ -36,3 +37,34 @@ def test_noun_chunks(self):
             [[("Today", "NP", 0, 5), ("such a beautiful day", "NP", 9, 29)]]
         )
         self.assertEqual(nlp.noun_chunks(s), s_true)
+
+    """
+    Count sentences.
+    """
+
+    def test_count_sentences(self):
+        s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
+        s_true = pd.Series(3)
+        self.assertEqual(nlp.count_sentences(s), s_true)
+
+    def test_count_sentences_numeric(self):
+        s = pd.Series([13.0, 42.0])
+        self.assertRaises(TypeError, nlp.count_sentences, s)
+
+    def test_count_sentences_missing_value(self):
+        s = pd.Series(["Test.", np.nan])
+        self.assertRaises(TypeError, nlp.count_sentences, s)
+
+    def test_count_sentences_index(self):
+        s = pd.Series(["Test"], index=[5])
+        counted_sentences_s = nlp.count_sentences(s)
+        t_same_index = pd.Series([""], index=[5])
+
+        self.assertTrue(counted_sentences_s.index.equals(t_same_index.index))
+
+    def test_count_sentences_wrong_index(self):
+        s = pd.Series(["Test", "Test"], index=[5, 6])
+        counted_sentences_s = nlp.count_sentences(s)
+        t_different_index = pd.Series(["", ""], index=[5, 7])
+
+        self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))
diff --git a/texthero/nlp.py b/texthero/nlp.py
index d2da7af5..df32128e 100644
--- a/texthero/nlp.py
+++ b/texthero/nlp.py
@@ -11,11 +11,11 @@ def named_entities(s, package="spacy"):
     Return named-entities.
 
     Return a Pandas Series where each rows contains a list of tuples containing information regarding the given named entities.
-    
+
     Tuple: (`entity'name`, `entity'label`, `starting character`, `ending character`)
 
     Under the hood, `named_entities` make use of Spacy name entity recognition.
-    
+
     List of labels:
     - `PERSON`: People, including fictional.
     - `NORP`: Nationalities or religious or political groups.
@@ -76,3 +76,33 @@
     )
 
     return pd.Series(noun_chunks, index=s.index)
+
+
+def count_sentences(s: pd.Series) -> pd.Series:
+    """
+    Count the number of sentences per cell in a Pandas Series.
+
+    Return a new Pandas Series with the number of sentences per cell.
+
+    This makes use of the SpaCy `sentencizer <https://spacy.io/api/sentencizer>`.
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["Yesterday I was in NY with Bill de Blasio. Great story...", "This is the F.B.I.! What? Open up!"])
+    >>> hero.count_sentences(s)
+    0    2
+    1    3
+    dtype: int64
+    """
+    number_of_sentences = []
+
+    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))  # Pipe is only "sentencizer"
+
+    for doc in nlp.pipe(s.values, batch_size=32):
+        sentences = len(list(doc.sents))
+        number_of_sentences.append(sentences)
+
+    return pd.Series(number_of_sentences, index=s.index)