From 5147c2a929608d4d1b87d4e3e03b8b9f9f419154 Mon Sep 17 00:00:00 2001
From: Matthew Baxter
Date: Thu, 27 Oct 2022 16:38:55 -0400
Subject: [PATCH] Process only unique text

Make sure that only unique text is passed to the clustering algorithm.
This helps to avoid `ConvergenceWarning` and makes sure that the summary
does not repeat sentences.
---
 summarizer/text_processors/sentence_handler.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/summarizer/text_processors/sentence_handler.py b/summarizer/text_processors/sentence_handler.py
index a0961a1..5ad2d19 100644
--- a/summarizer/text_processors/sentence_handler.py
+++ b/summarizer/text_processors/sentence_handler.py
@@ -39,4 +39,8 @@ def process(
         :return: Returns a list of sentences.
         """
         doc = self.nlp(body)
-        return self.sentence_processor(doc, min_length, max_length)
+
+        sentences = self.sentence_processor(doc, min_length, max_length)
+        unique_sentences = list(dict.fromkeys([s.strip() for s in sentences]))
+
+        return unique_sentences