Skip to content

Commit

Permalink
improve EstimateTranslatingTokens
Browse files Browse the repository at this point in the history
  • Loading branch information
zensh committed Sep 17, 2023
1 parent 394abf9 commit b50339c
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 34 deletions.
93 changes: 93 additions & 0 deletions config/default.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,99 @@ access_key_id = ""
access_key_secret = ""
base_url = "https://fs.yiwen.pub/"

[tokens_rate]
English = 1.0
Chinese = 1.40
Afrikaans = 1.56
Aragonese = 1.45
Arabic = 2.60
Azerbaijani = 2.70
Assamese = 5.65
Belarusian = 3.00
Bengali = 5.38
Bislama = 1.58
Bosnian = 1.78
Breton = 1.71
Bulgarian = 2.16
Catalan = 1.67
Chamorro = 1.24
Czech = 1.98
Chechen = 1.98
"Church Slavic" = 1.98
Cornish = 1.94
Corsican = 1.85
Welsh = 2.04
Danish = 1.55
German = 1.39
Esperanto = 1.78
Estonian = 1.74
"Modern Greek" = 4.38
Faroese = 1.98
Persian = 2.91
Finnish = 1.86
French = 1.36
"Western Frisian" = 1.66
Basque = 1.85
Fulah = 1.77
Galician = 1.32
Irish = 2.15
"Scottish Gaelic" = 2.28
Gujarati = 7.30
Croatian = 1.78
Hungarian = 2.17
Hebrew = 3.40
Interlingua = 1.30
Hindi = 4.12
Indonesian = 1.44
Icelandic = 2.05
Italian = 1.49
Javanese = 1.68
Japanese = 1.95
Kazakh = 3.71
Kannada = 6.65
Kashmiri = 4.18
Korean = 1.92
Latvian = 2.14
Lithuanian = 1.98
Luxembourgish = 1.83
Macedonian = 2.32
Maltese = 2.34
Malagasy = 2.10
Malay = 1.68
Mongolian = 3.86
Dutch = 1.45
"Norwegian Bokmål" = 1.50
"Norwegian Nynorsk" = 1.52
Nepali = 4.53
Occitan = 1.69
Norwegian = 1.50
Polish = 1.67
Portuguese = 1.33
Panjabi = 6.95
Romansh = 1.56
Romanian = 1.74
Pushto = 3.43
Russian = 1.93
Slovak = 2.03
Slovenian = 1.74
Spanish = 1.27
Somali = 1.97
Albanian = 2.11
Serbian = 1.77
Sundanese = 1.70
Swahili = 1.76
Swedish = 1.49
Tagalog = 1.89
Thai = 3.39
Turkmen = 1.78
Turkish = 1.86
Ukrainian = 2.74
Urdu = 3.45
Uzbek = 2.26
Vietnamese = 2.22
Xhosa = 2.01
Yiddish = 3.42

[[recommendations]]
gid = "cil6ehjmps48vprp24f0"
cid = "cjoip8a7a762siruokm0"
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ require (
github.com/bsm/redislock v0.9.4
github.com/fxamacker/cbor/v2 v2.5.0
github.com/gabriel-vasile/mimetype v1.4.2
github.com/go-playground/validator/v10 v10.15.3
github.com/go-playground/validator/v10 v10.15.4
github.com/google/uuid v1.3.1
github.com/jaevor/go-nanoid v1.3.0
github.com/klauspost/compress v1.16.7
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/o
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
github.com/go-playground/validator/v10 v10.15.3 h1:S+sSpunYjNPDuXkWbK+x+bA7iXiW296KG4dL3X7xUZo=
github.com/go-playground/validator/v10 v10.15.3/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
github.com/go-playground/validator/v10 v10.15.4 h1:zMXza4EpOdooxPel5xDqXEdXG5r+WggpvnAKMsalBjs=
github.com/go-playground/validator/v10 v10.15.4/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4=
github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/jaevor/go-nanoid v1.3.0 h1:nD+iepesZS6pr3uOVf20vR9GdGgJW1HPaR46gtrxzkg=
Expand Down
4 changes: 2 additions & 2 deletions src/api/publication.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func (a *Publication) Estimate(ctx *gear.Context) error {
return gear.ErrInternalServerError.From(err)
}

tokens := util.EstimateTranslatingTokens(trans, input.Language, toLang)
tokens := a.blls.Jarvis.EstimateTranslatingTokens(trans, input.Language, toLang)
output := &EstimateOutput{
Balance: wallet.Balance(),
Tokens: tokens,
Expand Down Expand Up @@ -166,7 +166,7 @@ func (a *Publication) Create(ctx *gear.Context) error {
tokens, util.MAX_TOKENS)
}

tokens := uint32(float32(util.EstimateTranslatingTokens(trans, input.Language, *input.ToLanguage)) * 0.9)
tokens := a.blls.Jarvis.EstimateTranslatingTokens(trans, input.Language, *input.ToLanguage)
estimate_cost := model.CostWEN(tokens)
if b := wallet.Balance(); b < estimate_cost {
return gear.ErrPaymentRequired.WithMsgf("insufficient balance, expected %d, got %d", estimate_cost, b)
Expand Down
26 changes: 25 additions & 1 deletion src/bll/jarvis.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@ package bll
import (
"context"
"errors"
"strings"
"time"

"github.com/teambition/gear"
"github.com/yiwen-ai/yiwen-api/src/conf"
"github.com/yiwen-ai/yiwen-api/src/logging"
"github.com/yiwen-ai/yiwen-api/src/service"
"github.com/yiwen-ai/yiwen-api/src/util"
)

// Jarvis groups the calls made against the remote API host held in svc and
// keeps a per-language token rate table.
//
// tokensRate is filled by InitApp from conf.Config.TokensRate and is read by
// getTokensRate; its keys are the first element of each language entry returned
// by ListLanguages (presumably the language code — confirm against the service).
type Jarvis struct {
svc service.APIHost
svc service.APIHost
tokensRate map[string]float32
}

func (b *Jarvis) InitApp(ctx context.Context, app *gear.App) error {
Expand All @@ -22,9 +25,30 @@ func (b *Jarvis) InitApp(ctx context.Context, app *gear.App) error {
}

app.Set(util.LanguagesKey, util.Languages(output))

b.tokensRate = make(map[string]float32, len(conf.Config.TokensRate))

for _, vv := range output {
if f, ok := conf.Config.TokensRate[vv[1]]; ok {
b.tokensRate[vv[0]] = f
}
}

return nil
}

// getTokensRate returns the token rate registered for lang, or 1.0 when no
// usable rate is available.
//
// The lookup key is lang lowercased. The neutral rate 1.0 is returned when the
// language has no entry, and also when the stored rate is zero, negative or
// NaN: the rates originate from configuration and EstimateTranslatingTokens
// divides by the value returned here, so an unusable rate would otherwise
// produce an Inf/NaN estimate whose conversion to uint32 is not well defined.
func (b *Jarvis) getTokensRate(lang string) float32 {
	// "v > 0" is false for NaN as well, so NaN also falls through to the default.
	if v, ok := b.tokensRate[strings.ToLower(lang)]; ok && v > 0 {
		return v
	}
	return 1.0
}

// EstimateTranslatingTokens estimates the total tokens involved in translating
// text from srcLang to dstLang.
//
// The result is the token count of text itself plus that count scaled by the
// ratio of the destination language rate to the source language rate, with the
// scaled part truncated to an integer.
func (b *Jarvis) EstimateTranslatingTokens(text, srcLang, dstLang string) uint32 {
	inputTokens := util.Tiktokens(text)
	// Multiply first, then divide, to keep the same float32 evaluation order.
	outputTokens := float32(inputTokens) * b.getTokensRate(dstLang) / b.getTokensRate(srcLang)
	return inputTokens + uint32(outputTokens)
}

func (b *Jarvis) ListLanguages(ctx context.Context) ([][]string, error) {
output := SuccessResponse[[][]string]{}
if err := b.svc.Get(ctx, "/v1/translating/list_languages", &output); err != nil {
Expand Down
15 changes: 8 additions & 7 deletions src/conf/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,14 @@ type ConfigTpl struct {
Rand *rand.Rand
GlobalSignal context.Context
GlobalShutdown context.Context
Env string `json:"env" toml:"env"`
Logger Logger `json:"log" toml:"log"`
Server Server `json:"server" toml:"server"`
Redis Redis `json:"redis" toml:"redis"`
Base Base `json:"base" toml:"base"`
OSS OSS `json:"oss" toml:"oss"`
Recommendations []Recommendation `json:"recommendations" toml:"recommendations"`
Env string `json:"env" toml:"env"`
Logger Logger `json:"log" toml:"log"`
Server Server `json:"server" toml:"server"`
Redis Redis `json:"redis" toml:"redis"`
Base Base `json:"base" toml:"base"`
OSS OSS `json:"oss" toml:"oss"`
TokensRate map[string]float32 `json:"tokens_rate" toml:"tokens_rate"`
Recommendations []Recommendation `json:"recommendations" toml:"recommendations"`

globalJobs int64 // global async jobs counter for graceful shutdown
}
Expand Down
21 changes: 0 additions & 21 deletions src/util/token.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package util

import (
"strings"
"sync"

"github.com/pkoukk/tiktoken-go"
Expand All @@ -10,14 +9,6 @@ import (

var onceTK sync.Once
var tk *tiktoken.Tiktoken
// tokensRate maps a lowercase language key to the token rate returned by
// getTokensRate; languages that are not listed get 1.0 there.
// NOTE(review): keys look like ISO 639-3 codes and the rates appear to be
// relative to English (1.00) — confirm.
var tokensRate = map[string]float32{
"eng": 1.00,
"zho": 1.20,
"jpn": 1.65,
"fra": 1.31,
"kor": 1.57,
"ara": 2.10,
}

// MAX_CREATION_TOKENS is 64*1024 (65536).
// NOTE(review): presumably the token limit for a single creation — confirm against callers.
const MAX_CREATION_TOKENS = 64 * 1024
// MAX_TOKENS is 128*1024 (131072).
// NOTE(review): presumably the overall token limit reported by the publication API — confirm against callers.
const MAX_TOKENS = 128 * 1024
Expand All @@ -36,15 +27,3 @@ func init() {
// Tiktokens reports how many tokens the package-level encoder tk produces for
// input, as a uint32.
func Tiktokens(input string) uint32 {
	encoded := tk.Encode(input, nil, nil)
	return uint32(len(encoded))
}

// getTokensRate looks up the token rate for lang in tokensRate using the
// lowercased language key. Languages without an entry get the neutral rate 1.0.
func getTokensRate(lang string) float32 {
	rate, found := tokensRate[strings.ToLower(lang)]
	if !found {
		return 1.0
	}
	return rate
}

// EstimateTranslatingTokens estimates the total tokens involved in translating
// text from srcLang to dstLang: the token count of text plus that count scaled
// by the destination rate over the source rate, truncated to an integer.
func EstimateTranslatingTokens(text, srcLang, dstLang string) uint32 {
	inputTokens := Tiktokens(text)
	// Multiply first, then divide, to keep the same float32 evaluation order.
	outputTokens := float32(inputTokens) * getTokensRate(dstLang) / getTokensRate(srcLang)
	return inputTokens + uint32(outputTokens)
}

0 comments on commit b50339c

Please sign in to comment.