Skip to content

Commit

Permalink
chore(code): Porting over changes from gpt_bpe.go for Mistral
Browse files Browse the repository at this point in the history
  • Loading branch information
rtalaricw committed May 15, 2024
1 parent 94aa0f5 commit cbd802b
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 1 deletion.
64 changes: 64 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, coverage reports, and build directories
*.test
*.out
*.coverprofile
*.cover
*.cov

# Logs
*.log

# OS generated files
.DS_Store
Thumbs.db

# Temporary files
*.tmp
*.temp

# IDE and Editor specific files
.vscode/
.idea/
*.swp
*~
*.swo

# Dependency directories and files
vendor/
Gopkg.lock
Gopkg.toml
go.sum

# Build directories
bin/
obj/
pkg/

# Test directories
Test*/

# IDE specific project files and directories
*.iml
*.ipr
*.iws

# Code coverage tool output
*.prof
coverage.txt

# Go workspace and tools
go.work
go.work.sum

# Exclude Go module download cache
/.go/

# Exclude VS Code Go extension settings
.vscode/go.*.json
25 changes: 24 additions & 1 deletion gpt_bpe.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ const VOCAB_ID_CLIP = "clip-tokenizer"
const VOCAB_ID_NERDSTASH_V1 = "nerdstash_v1-tokenizer"
const VOCAB_ID_NERDSTASH_V2 = "nerdstash_v2-tokenizer"
const VOCAB_ID_LLAMA = "llama-tokenizer"
const VOCAB_ID_MISTRAL = "mistral-tokenizer"

func NewGPT2Encoder() GPTEncoder {
encoder, _ := NewEncoder(VOCAB_ID_GPT2)
Expand Down Expand Up @@ -136,6 +137,11 @@ func NewLlama2Encoder() GPTEncoder {
return *encoder
}

// NewMistralEncoder returns a GPTEncoder preloaded with the Mistral
// tokenizer vocabulary (VOCAB_ID_MISTRAL).
func NewMistralEncoder() GPTEncoder {
	// The error is deliberately discarded to mirror the other
	// NewXxxEncoder constructors in this file, which assume the
	// built-in vocabularies always resolve successfully.
	enc, _ := NewEncoder(VOCAB_ID_MISTRAL)
	return *enc
}

// NewEncoder
// Returns a GPTEncoder with the tokenizer data loaded for that vocabulary
// id.
Expand Down Expand Up @@ -174,11 +180,23 @@ func NewEncoder(vocabId string) (*GPTEncoder, error) {
AddEosToken: false,
PadToken: "",
}
altMistralSpecialsConfig := resources.MistralSpecialsConfig{
AddBosToken: false,
AddEosToken: false,
PadToken: "",
}
if special, ok := (rsrcs)["tokenizer_config.json"]; ok {
if special.Data != nil {
err := json.Unmarshal(*special.Data, &tokenizerSpecialConfig)
if err != nil {
log.Fatal("Error unmarshalling tokenizer_config.json: ", err)
err = json.Unmarshal(*special.Data, &altMistralSpecialsConfig)
if err != nil {
log.Fatal("Error unmarshalling tokenizer_config.json")
}
//populate the tokenizerSpecialConfig from the altMistralSpecialsConfig
tokenizerSpecialConfig.AddBosToken = altMistralSpecialsConfig.AddBosToken
tokenizerSpecialConfig.AddEosToken = altMistralSpecialsConfig.AddEosToken
tokenizerSpecialConfig.PadToken = altMistralSpecialsConfig.PadToken
}
}
}
Expand Down Expand Up @@ -1018,6 +1036,11 @@ func (encoder *GPTEncoder) StreamingEncode(reader io.RuneReader) func(int) *Toke
if encoder.encloseEosBos || encoder.encloseBos {
accumulator = append(accumulator, encoder.BosToken)
}
// Temporary hack - inject a space token at the end of the accumulator for mistral-tokenizer
if encoder.VocabId == "mistral-tokenizer" {
accumulator = append(accumulator, encoder.Encoder[" "])
}

return func(desiredTokens int) *Tokens {
for {
// If we have enough tokens, then we return them, and reset the
Expand Down

0 comments on commit cbd802b

Please sign in to comment.