From f77e90d79e9a3b2303d12ec4fbfd35ffa9f5e0d5 Mon Sep 17 00:00:00 2001 From: Michael Graczyk Date: Thu, 24 Oct 2024 20:29:26 -0700 Subject: [PATCH] Fix crash tokenizing with empty word_to_id (#72) --- bm25s/tokenization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bm25s/tokenization.py b/bm25s/tokenization.py index b639b78..a82e908 100644 --- a/bm25s/tokenization.py +++ b/bm25s/tokenization.py @@ -273,7 +273,7 @@ def streaming_tokenize( if len(doc_ids) == 0 and allow_empty is True: if update_vocab is True and "" not in self.word_to_id: - self.word_to_id[""] = max(self.word_to_id.values()) + 1 + self.word_to_id[""] = max(self.word_to_id.values(), default=0) + 1 # get the ID for the empty string if "" in self.word_to_id: