From 16de42718b81590616058290a73ec87833ab15a4 Mon Sep 17 00:00:00 2001 From: Christopher Harrison Date: Fri, 1 Mar 2024 02:54:20 +0000 Subject: [PATCH] fixed tokenize to strip leading/trailing whitespace and split on any whitespace run instead of just a single space --- llmware/util.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llmware/util.py b/llmware/util.py index ebfb245f..66d259c7 100644 --- a/llmware/util.py +++ b/llmware/util.py @@ -795,9 +795,15 @@ def __init__(self, lower_case=True, remove_punctuation=True, remove_stop_words=T self.one_letter_removal = one_letter_removal def tokenize(self, text): + + # strip whitespace from the beginning and end of the text before tokenizing + text = text.strip() + # start with basic whitespace tokenizing + # NOTE: the previous implementation split on a single literal space only: + #text2 = text.split(" ") + # split() with no argument splits on any whitespace run (spaces, tabs, newlines) + text2 = text.split() - # start with basic whitespace tokenizing - text2 = text.split(" ") if self.remove_punctuation: text2 = Utilities().clean_list(text2)