Merge pull request #471 from chair300/main

fix tokenize whitespace issue (Issue #453)
llmware-ai · Mar 1, 2024 · 33fb1c4
2 parents 59866d5 + c8c4d36
commit 33fb1c4
Showing 1 changed file with 8 additions and 2 deletions.
diff --git a/llmware/util.py b/llmware/util.py
@@ -795,9 +795,15 @@ def __init__(self, lower_case=True, remove_punctuation=True, remove_stop_words=T
         self.one_letter_removal = one_letter_removal
 
     def tokenize(self, text):
+
+        # strip the whitespace from the beginning and end of the text so we can tokenize the data
+        text = text.strip()
+        # start with basic whitespace tokenizing
+        # is there a reason the text was being split on a single space only?
+        # text2 = text.split(" ")
+        # split() with no argument splits on any run of whitespace (tabs, newlines, multiple spaces)
+        text2 = text.split()
 
-        # start with basic whitespace tokenizing
-        text2 = text.split(" ")
 
         if self.remove_punctuation:
             text2 = Utilities().clean_list(text2)