From 16de42718b81590616058290a73ec87833ab15a4 Mon Sep 17 00:00:00 2001 From: Christopher Harrison Date: Fri, 1 Mar 2024 02:54:20 +0000 Subject: [PATCH] fixed tokenize to strip leading/trailing whitespace and split on any whitespace run instead of just a single space --- llmware/util.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llmware/util.py b/llmware/util.py index ebfb245f..66d259c7 100644 --- a/llmware/util.py +++ b/llmware/util.py @@ -795,9 +795,15 @@ def __init__(self, lower_case=True, remove_punctuation=True, remove_stop_words=T self.one_letter_removal = one_letter_removal def tokenize(self, text): + + # strip whitespace from the beginning and end of the text before tokenizing + text = text.strip() + # start with basic whitespace tokenizing + # NOTE: the previous implementation split on a single literal space only: + #text2 = text.split(" ") + # split() with no argument splits on any whitespace run (spaces, tabs, newlines) + text2 = text.split() - # start with basic whitespace tokenizing - text2 = text.split(" ") if self.remove_punctuation: text2 = Utilities().clean_list(text2)