From f77e90d79e9a3b2303d12ec4fbfd35ffa9f5e0d5 Mon Sep 17 00:00:00 2001
From: Michael Graczyk <michael@mgraczyk.com>
Date: Thu, 24 Oct 2024 20:29:26 -0700
Subject: [PATCH] Fix crash tokenizing with empty word_to_id (#72)

---
 bm25s/tokenization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bm25s/tokenization.py b/bm25s/tokenization.py
index b639b78..a82e908 100644
--- a/bm25s/tokenization.py
+++ b/bm25s/tokenization.py
@@ -273,7 +273,7 @@ def streaming_tokenize(
 
             if len(doc_ids) == 0 and allow_empty is True:
                 if update_vocab is True and "" not in self.word_to_id:
-                    self.word_to_id[""] = max(self.word_to_id.values()) + 1
+                    self.word_to_id[""] = max(self.word_to_id.values(), default=0) + 1
                 
                 # get the ID for the empty string
                 if "" in self.word_to_id: