diff --git a/llm_benchmark_suite.py b/llm_benchmark_suite.py index eba230d..9da95e2 100644 --- a/llm_benchmark_suite.py +++ b/llm_benchmark_suite.py @@ -360,7 +360,20 @@ def _image_models(): ] -def _av_models(): +def _audio_models(): + return [ + # _Llm(GPT_4O), doesn't support audio yet + # _Llm("gemini-1.5-pro-preview-0514"), 400ing right now + # _Llm("gemini-1.5-flash-preview-0514"), 400ing right now + _Llm( + "fixie-ai/ultravox-v0.2", + base_url="https://ultravox.api.fixie.ai/v1", + api_key=os.getenv("ULTRAVOX_API_KEY"), + ), + ] + + +def _video_models(): return [ # _Llm(GPT_4O), _Llm("gemini-1.5-pro-preview-0514"), @@ -372,8 +385,8 @@ def _get_models(mode: str, filter: Optional[str] = None): mode_map = { "text": _text_models, "image": _image_models, - "audio": _av_models, - "video": _av_models, + "audio": _audio_models, + "video": _video_models, } if mode not in mode_map: raise ValueError(f"Unknown mode {mode}") @@ -397,9 +410,9 @@ def _get_prompt(mode: str) -> List[str]: ] elif mode == "audio": return [ - "Summarize the information in the audio clip.", + "Listen to the following audio and provide a response:", "--file", - "media/audio/news.wav", + "media/audio/boolq.wav", ] elif mode == "video": return [ diff --git a/llm_request.py b/llm_request.py index 8aa56de..f1b275e 100644 --- a/llm_request.py +++ b/llm_request.py @@ -179,8 +179,8 @@ def make_openai_messages(ctx: ApiContext): content: List[Dict[str, Any]] = [{"type": "text", "text": ctx.prompt}] for file in ctx.files: - if not file.mime_type.startswith("image/"): - raise ValueError(f"Unsupported file type: {file.mime_type}") + # if not file.mime_type.startswith("image/"): + # raise ValueError(f"Unsupported file type: {file.mime_type}") url = f"data:{file.mime_type};base64,{file.base64_data}" image_url = {"url": url} if ctx.detail: diff --git a/media/audio/boolq.wav b/media/audio/boolq.wav new file mode 100644 index 0000000..8c14034 Binary files /dev/null and b/media/audio/boolq.wav differ