From 452c93cbea39d0f17aa159a48fb12a9441be8f52 Mon Sep 17 00:00:00 2001
From: henrifroese <50276689+henrifroese@users.noreply.github.com>
Date: Sat, 11 Jul 2020 10:44:40 +0200
Subject: [PATCH] Add count_sentences function to nlp.py (#51)

* Add count_sentences function to nlp.py

Also add tests for the function to test_nlp.py

* Implement suggestions from pull request.

Add more tests, change style (docstring, tests naming).
Remove unicode-casting to avoid unexpected behaviour.

* Add link to spacy documentation.

Additionally update index tests, they're cleaner now.

Co-authored-by: Henri Froese
---
 tests/test_nlp.py | 32 ++++++++++++++++++++++++++++++++
 texthero/nlp.py   | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index bd062a9e..2df9db61 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import numpy as np
 
 from texthero import nlp
 from . import PandasTestCase
@@ -36,3 +37,34 @@ def test_noun_chunks(self):
             [[("Today", "NP", 0, 5), ("such a beautiful day", "NP", 9, 29)]]
         )
         self.assertEqual(nlp.noun_chunks(s), s_true)
+
+    """
+    Count sentences.
+    """
+
+    def test_count_sentences(self):
+        s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
+        s_true = pd.Series(3)
+        self.assertEqual(nlp.count_sentences(s), s_true)
+
+    def test_count_sentences_numeric(self):
+        s = pd.Series([13.0, 42.0])
+        self.assertRaises(TypeError, nlp.count_sentences, s)
+
+    def test_count_sentences_missing_value(self):
+        s = pd.Series(["Test.", np.nan])
+        self.assertRaises(TypeError, nlp.count_sentences, s)
+
+    def test_count_sentences_index(self):
+        s = pd.Series(["Test"], index=[5])
+        counted_sentences_s = nlp.count_sentences(s)
+        t_same_index = pd.Series([""], index=[5])
+
+        self.assertTrue(counted_sentences_s.index.equals(t_same_index.index))
+
+    def test_count_sentences_wrong_index(self):
+        s = pd.Series(["Test", "Test"], index=[5, 6])
+        counted_sentences_s = nlp.count_sentences(s)
+        t_different_index = pd.Series(["", ""], index=[5, 7])
+
+        self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))
diff --git a/texthero/nlp.py b/texthero/nlp.py
index d2da7af5..df32128e 100644
--- a/texthero/nlp.py
+++ b/texthero/nlp.py
@@ -11,11 +11,11 @@ def named_entities(s, package="spacy"):
     Return named-entities.
 
     Return a Pandas Series where each rows contains a list of tuples containing information regarding the given named entities.
-    
+
     Tuple: (`entity'name`, `entity'label`, `starting character`, `ending character`)
 
     Under the hood, `named_entities` make use of Spacy name entity recognition.
-    
+
     List of labels:
     - `PERSON`: People, including fictional.
     - `NORP`: Nationalities or religious or political groups.
@@ -76,3 +76,33 @@
     )
 
     return pd.Series(noun_chunks, index=s.index)
+
+
+def count_sentences(s: pd.Series) -> pd.Series:
+    """
+    Count the number of sentences per cell in a Pandas Series.
+
+    Return a new Pandas Series with the number of sentences per cell.
+
+    This makes use of the SpaCy `sentencizer <https://spacy.io/api/sentencizer>`.
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series(["Yesterday I was in NY with Bill de Blasio. Great story...", "This is the F.B.I.! What? Open up!"])
+    >>> hero.count_sentences(s)
+    0    2
+    1    3
+    dtype: int64
+    """
+    number_of_sentences = []
+
+    nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))  # Pipe is only "sentencizer"
+
+    for doc in nlp.pipe(s.values, batch_size=32):
+        sentences = len(list(doc.sents))
+        number_of_sentences.append(sentences)
+
+    return pd.Series(number_of_sentences, index=s.index)