Skip to content

Commit

Permalink
Merge pull request #471 from chair300/main
Browse files (browse the repository at this point in the history)
fix tokenize whitespace issue (Issue #453)
  • Loading branch information
doberst authored Mar 1, 2024
2 parents 59866d5 + c8c4d36 commit 33fb1c4
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions llmware/util.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -795,9 +795,15 @@ def __init__(self, lower_case=True, remove_punctuation=True, remove_stop_words=T
self.one_letter_removal = one_letter_removal

def tokenize(self, text):

# strip the whitespace from the beginning and end of the text so we can tokenize the data
text = text.strip()
# Start with basic whitespace tokenizing.
# Is there a reason the text was being split on a single space only?
# text2 = text.split(" ")
# Calling split() with no separator splits on any run of whitespace, so tabs or multiple spaces between words no longer matter.
text2 = text.split()

# start with basic whitespace tokenizing
text2 = text.split(" ")

if self.remove_punctuation:
text2 = Utilities().clean_list(text2)
Expand Down

0 comments on commit 33fb1c4

Please sign in to comment.