Skip to content

Commit

Permalink
feat(api): allow to pass audios to backends (#3603)
Browse files Browse the repository at this point in the history
Signed-off-by: Ettore Di Giacinto <[email protected]>
  • Loading branch information
mudler authored Sep 19, 2024
1 parent fbb9fac commit 191bc2e
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 5 deletions.
1 change: 1 addition & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ message PredictOptions {
bool UseTokenizerTemplate = 43;
repeated Message Messages = 44;
repeated string Videos = 45;
repeated string Audios = 46;
}

// The response message containing the result
Expand Down
3 changes: 2 additions & 1 deletion core/backend/llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type TokenUsage struct {
Completion int
}

func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
threads := c.Threads
if *threads == 0 && o.Threads != 0 {
Expand Down Expand Up @@ -102,6 +102,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
opts.Images = images
opts.Videos = videos
opts.Audios = audios

tokenUsage := TokenUsage{}

Expand Down
6 changes: 5 additions & 1 deletion core/http/endpoints/openai/chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -644,8 +644,12 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
for _, m := range input.Messages {
videos = append(videos, m.StringVideos...)
}
audios := []string{}
for _, m := range input.Messages {
audios = append(audios, m.StringAudios...)
}

predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, ml, *config, o, nil)
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil)
if err != nil {
log.Error().Err(err).Msg("model inference failed")
return "", err
Expand Down
6 changes: 5 additions & 1 deletion core/http/endpoints/openai/inference.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,13 @@ func ComputeChoices(
for _, m := range req.Messages {
videos = append(videos, m.StringVideos...)
}
audios := []string{}
for _, m := range req.Messages {
audios = append(audios, m.StringAudios...)
}

// get the model function to call for the result
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, loader, *config, o, tokenCallback)
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback)
if err != nil {
return result, backend.TokenUsage{}, err
}
Expand Down
14 changes: 12 additions & 2 deletions core/http/endpoints/openai/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
}

// Decode each request's message content
imgIndex, vidIndex := 0, 0
imgIndex, vidIndex, audioIndex := 0, 0, 0
for i, m := range input.Messages {
switch content := m.Content.(type) {
case string:
Expand All @@ -160,9 +160,19 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
vidIndex++
case "audio_url", "audio":
// Decode content as base64 either if it's an URL or base64 text
base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL)
if err != nil {
log.Error().Msgf("Failed encoding image: %s", err)
continue CONTENT
}
input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent
audioIndex++
case "image_url", "image":
// Decode content as base64 either if it's an URL or base64 text

base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
if err != nil {
log.Error().Msgf("Failed encoding image: %s", err)
Expand Down
2 changes: 2 additions & 0 deletions core/schema/openai.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ type Content struct {
Type string `json:"type" yaml:"type"`
Text string `json:"text" yaml:"text"`
ImageURL ContentURL `json:"image_url" yaml:"image_url"`
AudioURL ContentURL `json:"audio_url" yaml:"audio_url"`
VideoURL ContentURL `json:"video_url" yaml:"video_url"`
}

Expand All @@ -78,6 +79,7 @@ type Message struct {
StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"`
StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
StringVideos []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
StringAudios []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"`

// A result of a function call
FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
Expand Down

0 comments on commit 191bc2e

Please sign in to comment.