Skip to content

Commit

Permalink
chore(code): Porting over changes from gpt_bpe.go for Mistral
Browse files Browse the repository at this point in the history
  • Loading branch information
rtalaricw committed May 15, 2024
1 parent 94aa0f5 commit cbd802b
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 1 deletion.
64 changes: 64 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, coverage reports, and build directories
*.test
*.out
*.coverprofile
*.cover
*.cov

# Logs
*.log

# OS generated files
.DS_Store
Thumbs.db

# Temporary files
*.tmp
*.temp

# IDE and Editor specific files
.vscode/
.idea/
*.swp
*~
*.swo

# Dependency directories and files
vendor/
Gopkg.lock
Gopkg.toml
go.sum

# Build directories
bin/
obj/
pkg/

# Test directories
Test*/

# IDE specific project files and directories
*.iml
*.ipr
*.iws

# Code coverage tool output
*.prof
coverage.txt

# Go workspace and tools
go.work
go.work.sum

# Exclude Go module download cache
/.go/

# Exclude VS Code Go extension settings
.vscode/go.*.json
25 changes: 24 additions & 1 deletion gpt_bpe.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ const VOCAB_ID_CLIP = "clip-tokenizer"
const VOCAB_ID_NERDSTASH_V1 = "nerdstash_v1-tokenizer"
const VOCAB_ID_NERDSTASH_V2 = "nerdstash_v2-tokenizer"
const VOCAB_ID_LLAMA = "llama-tokenizer"
const VOCAB_ID_MISTRAL = "mistral-tokenizer"

func NewGPT2Encoder() GPTEncoder {
encoder, _ := NewEncoder(VOCAB_ID_GPT2)
Expand Down Expand Up @@ -136,6 +137,11 @@ func NewLlama2Encoder() GPTEncoder {
return *encoder
}

// NewMistralEncoder returns a GPTEncoder preloaded with the Mistral
// tokenizer vocabulary (VOCAB_ID_MISTRAL).
func NewMistralEncoder() GPTEncoder {
	// The error is deliberately discarded to mirror the other
	// NewXxxEncoder constructors in this file, which assume the
	// built-in vocabularies always resolve successfully.
	enc, _ := NewEncoder(VOCAB_ID_MISTRAL)
	return *enc
}

// NewEncoder
// Returns a GPTEncoder with the tokenizer data loaded for that vocabulary
// id.
Expand Down Expand Up @@ -174,11 +180,23 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) {
AddEosToken: false,
PadToken: "",
}
altMistralSpecialsConfig := resources.MistralSpecialsConfig{
AddBosToken: false,
AddEosToken: false,
PadToken: "",
}
if special, ok := (rsrcs)["tokenizer_config.json"]; ok {
if special.Data != nil {
err := json.Unmarshal(*special.Data, &tokenizerSpecialConfig)
if err != nil {
log.Fatal("Error unmarshalling tokenizer_config.json: ", err)
err = json.Unmarshal(*special.Data, &altMistralSpecialsConfig)
if err != nil {
log.Fatal("Error unmarshalling tokenizer_config.json")
}
//populate the tokenizerSpecialConfig from the altMistralSpecialsConfig
tokenizerSpecialConfig.AddBosToken = altMistralSpecialsConfig.AddBosToken
tokenizerSpecialConfig.AddEosToken = altMistralSpecialsConfig.AddEosToken
tokenizerSpecialConfig.PadToken = altMistralSpecialsConfig.PadToken
}
}
}
Expand Down Expand Up @@ -1018,6 +1036,11 @@ func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Toke
if encoder.encloseEosBos || encoder.encloseBos {
accumulator = append(accumulator, encoder.BosToken)
}
// Temporary hack - inject a space token at the end of the accumulator for mistral-tokenizer
if encoder.VocabId == "mistral-tokenizer" {
accumulator = append(accumulator, encoder.Encoder[" "])
}

return func(desiredTokens int) *Tokens {
for {
// If we have enough tokens, then we return them, and reset the
Expand Down

0 comments on commit cbd802b

Please sign in to comment.