diff --git a/docs/public/schemas/llms.json b/docs/public/schemas/llms.json
index 6de1953e38..47f33f8717 100644
--- a/docs/public/schemas/llms.json
+++ b/docs/public/schemas/llms.json
@@ -60,6 +60,10 @@
                 "type": "boolean",
                 "description": "Indicates if speech transcription is supported"
             },
+            "speech": {
+                "type": "boolean",
+                "description": "Indicates if speech synthesis is supported"
+            },
             "openaiCompatibility": {
                 "type": "string",
                 "description": "Uses OpenAI API compatibility layer documentation URL"
diff --git a/packages/core/src/chat.ts b/packages/core/src/chat.ts
index d17bff37a1..8e4e7cb915 100644
--- a/packages/core/src/chat.ts
+++ b/packages/core/src/chat.ts
@@ -152,12 +152,30 @@ export type TranscribeFunction = (
     options: TraceOptions & CancellationOptions
 ) => Promise<TranscriptionResult>
 
+export type CreateSpeechRequest = {
+    input: string
+    model: string
+    voice?: string
+}
+
+export type CreateSpeechResult = {
+    audio: Uint8Array
+    error?: SerializedError
+}
+
+export type SpeechFunction = (
+    req: CreateSpeechRequest,
+    cfg: LanguageModelConfiguration,
+    options: TraceOptions & CancellationOptions
+) => Promise<CreateSpeechResult>
+
 export interface LanguageModel {
     id: string
     completer: ChatCompletionHandler
     listModels?: ListModelsFunction
     pullModel?: PullModelFunction
     transcriber?: TranscribeFunction
+    speaker?: SpeechFunction
 }
 
 async function runToolCalls(
diff --git a/packages/core/src/constants.ts b/packages/core/src/constants.ts
index 26ca93de78..9681a4bfaf 100644
--- a/packages/core/src/constants.ts
+++ b/packages/core/src/constants.ts
@@ -61,6 +61,7 @@ export const SMALL_MODEL_ID = "small"
 export const LARGE_MODEL_ID = "large"
 export const VISION_MODEL_ID = "vision"
 export const TRANSCRIPTION_MODEL_ID = "transcription"
+export const SPEECH_MODEL_ID = "speech"
 export const DEFAULT_FENCE_FORMAT: FenceFormat = "xml"
 export const DEFAULT_TEMPERATURE = 0.8
 export const BUILTIN_PREFIX = "_builtin/"
@@ -200,6 +201,7 @@ export const MODEL_PROVIDERS = Object.freeze<
         bearerToken?: boolean
         listModels?: boolean
         transcribe?: boolean
+        speech?: boolean
         aliases?: Record<string, string>
     }[]
 >(CONFIGURATION_DATA.providers)
diff --git a/packages/core/src/llms.json b/packages/core/src/llms.json
index 65d178c9f8..8b398b45b7 100644
--- a/packages/core/src/llms.json
+++ b/packages/core/src/llms.json
@@ -6,6 +6,7 @@
             "detail": "OpenAI (or compatible)",
             "bearerToken": true,
             "transcribe": true,
+            "speech": true,
             "aliases": {
                 "large": "gpt-4o",
                 "small": "gpt-4o-mini",
@@ -13,7 +14,8 @@
                 "embeddings": "text-embedding-3-small",
                 "reasoning": "o1",
                 "reasoning_small": "o1-mini",
-                "transcription": "whisper-1"
+                "transcription": "whisper-1",
+                "speech": "tts-1"
             }
         },
         {
@@ -21,7 +23,9 @@
             "detail": "Azure OpenAI deployment",
             "listModels": false,
             "bearerToken": false,
-            "prediction": false
+            "prediction": false,
+            "transcribe": true,
+            "speech": true
         },
         {
             "id": "azure_serverless",
diff --git a/packages/core/src/lm.ts b/packages/core/src/lm.ts
index 5045a317b1..aec81336ab 100644
--- a/packages/core/src/lm.ts
+++ b/packages/core/src/lm.ts
@@ -1,4 +1,3 @@
-import { Transform } from "stream"
 import { AICIModel } from "./aici"
 import { AnthropicBedrockModel, AnthropicModel } from "./anthropic"
 import { LanguageModel } from "./chat"
@@ -39,5 +38,6 @@ export function resolveLanguageModel(provider: string): LanguageModel {
     return LocalOpenAICompatibleModel(provider, {
         listModels: features?.listModels !== false,
         transcribe: features?.transcribe,
+        speech: features?.speech,
     })
 }
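Note: the hunks above thread a new `speech` capability flag from llms.json through MODEL_PROVIDERS into resolveLanguageModel, alongside the SpeechFunction contract in chat.ts. As a minimal sketch (not part of the patch; the provider here is hypothetical), a conforming `speaker` only has to honor that contract:

    import type { SpeechFunction } from "./chat"

    // hypothetical provider: returns empty audio but satisfies SpeechFunction
    const silentSpeaker: SpeechFunction = async (req, cfg, options) => {
        // a real provider would POST req.input / req.model / req.voice to its TTS endpoint
        return { audio: new Uint8Array(0) }
    }
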
diff --git a/packages/core/src/openai.ts b/packages/core/src/openai.ts
index 7baf0cd230..1c9b9b1bc7 100644
--- a/packages/core/src/openai.ts
+++ b/packages/core/src/openai.ts
@@ -20,12 +20,13 @@ import {
 import { estimateTokens } from "./tokens"
 import {
     ChatCompletionHandler,
+    CreateSpeechRequest,
+    CreateSpeechResult,
     CreateTranscriptionRequest,
     LanguageModel,
-    PullModelFunction,
 } from "./chat"
 import { RequestError, errorMessage, serializeError } from "./error"
-import { createFetch, iterateBody, traceFetchPost } from "./fetch"
+import { createFetch, traceFetchPost } from "./fetch"
 import { parseModelIdentifier } from "./models"
 import { JSON5TryParse } from "./json5"
 import {
@@ -435,7 +436,6 @@ export async function OpenAITranscribe(
     options: TraceOptions & CancellationOptions
 ): Promise<TranscriptionResult> {
     const { trace } = options || {}
-    const fetch = await createFetch(options)
     try {
         logVerbose(`${cfg.provider}: transcribe with ${cfg.model}`)
         const route = req.translate ? "translations" : "transcriptions"
@@ -453,7 +453,7 @@ export async function OpenAITranscribe(
             method: "POST",
             headers: {
                 ...getConfigHeaders(cfg),
-                ContentType: "multipart/form-data",
+                "Content-Type": "multipart/form-data",
                 Accept: "application/json",
             },
             body: body,
@@ -461,8 +461,10 @@ export async function OpenAITranscribe(
         }
         traceFetchPost(trace, url, freq.headers, freq.body)
         // TODO: switch back to cross-fetch in the future
         const res = await global.fetch(url, freq as any)
+        trace.itemValue(`status`, `${res.status} ${res.statusText}`)
         const j = await res.json()
-        return j
+        if (!res.ok) return { text: undefined, error: j?.error }
+        else return j
     } catch (e) {
         logError(e)
         trace?.error(e)
@@ -470,9 +472,52 @@
     }
 }
 
+export async function OpenAISpeech(
+    req: CreateSpeechRequest,
+    cfg: LanguageModelConfiguration,
+    options: TraceOptions & CancellationOptions
+): Promise<CreateSpeechResult> {
+    const { model, input, voice = "alloy", ...rest } = req
+    const { trace } = options || {}
+    const fetch = await createFetch(options)
+    try {
+        logVerbose(`${cfg.provider}: speak with ${cfg.model}`)
+        const url = `${cfg.base}/audio/speech`
+        trace.itemValue(`url`, `[${url}](${url})`)
+        const body = {
+            model,
+            input,
+            voice,
+        }
+        const freq = {
+            method: "POST",
+            headers: {
+                ...getConfigHeaders(cfg),
+                "Content-Type": "application/json",
+            },
+            body: JSON.stringify(body),
+        }
+        traceFetchPost(trace, url, freq.headers, body)
+        // TODO: switch back to cross-fetch in the future
+        const res = await fetch(url, freq as any)
+        trace.itemValue(`status`, `${res.status} ${res.statusText}`)
+        if (!res.ok)
+            return { audio: undefined, error: (await res.json())?.error }
+        const j = await res.arrayBuffer()
+        return { audio: new Uint8Array(j) } satisfies CreateSpeechResult
+    } catch (e) {
+        logError(e)
+        trace?.error(e)
+        return {
+            audio: undefined,
+            error: serializeError(e),
+        } satisfies CreateSpeechResult
+    }
+}
+
 export function LocalOpenAICompatibleModel(
     providerId: string,
-    options: { listModels?: boolean; transcribe?: boolean }
+    options: { listModels?: boolean; transcribe?: boolean; speech?: boolean }
 ) {
     return Object.freeze(
         deleteUndefinedValues({
@@ -480,6 +525,7 @@
             id: providerId,
             listModels: options?.listModels ? OpenAIListModels : undefined,
             transcriber: options?.transcribe ? OpenAITranscribe : undefined,
+            speaker: options?.speech ? OpenAISpeech : undefined,
         })
     )
 }
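For reference, a sketch of invoking OpenAISpeech directly; the configuration literal uses placeholder values and the trace/cancellation plumbing is assumed to be in scope:

    const cfg = {
        provider: "openai",
        model: "tts-1",
        base: "https://api.openai.com/v1",
        token: process.env.OPENAI_API_KEY,
    } as LanguageModelConfiguration // placeholder configuration

    const res = await OpenAISpeech(
        { input: "Hello from GenAIScript", model: "tts-1", voice: "alloy" },
        cfg,
        { trace, cancellationToken } // assumed in scope
    )
    // on success res.audio is a Uint8Array of encoded audio; on failure res.error is set
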
diff --git a/packages/core/src/runpromptcontext.ts b/packages/core/src/runpromptcontext.ts
index f52cc57738..5fc61d1fb4 100644
--- a/packages/core/src/runpromptcontext.ts
+++ b/packages/core/src/runpromptcontext.ts
@@ -30,7 +30,15 @@ import { GenerationOptions } from "./generation"
 import { promptParametersSchemaToJSONSchema } from "./parameters"
 import { consoleLogFormat, stdout } from "./logging"
 import { isGlobMatch } from "./glob"
-import { arrayify, assert, logError, logVerbose, logWarn } from "./util"
+import {
+    arrayify,
+    assert,
+    deleteUndefinedValues,
+    dotGenaiscriptPath,
+    logError,
+    logVerbose,
+    logWarn,
+} from "./util"
 import { renderShellOutput } from "./chatrender"
 import { jinjaRender } from "./jinja"
 import { mustacheRender } from "./mustache"
@@ -39,6 +47,7 @@ import { delay, uniq } from "es-toolkit"
 import {
     addToolDefinitionsMessage,
     appendSystemMessage,
+    CreateSpeechRequest,
     executeChatSession,
     mergeGenerationOptions,
     toChatCompletionUserMessage,
@@ -55,6 +64,7 @@ import {
     DOCS_DEF_FILES_IS_EMPTY_URL,
     TRANSCRIPTION_MEMORY_CACHE_NAME,
     TRANSCRIPTION_MODEL_ID,
+    SPEECH_MODEL_ID,
 } from "./constants"
 import { renderAICI } from "./aici"
 import { resolveSystems, resolveTools } from "./systems"
@@ -82,6 +92,9 @@ import { BufferToBlob } from "./bufferlike"
 import { host } from "./host"
 import { srtVttRender } from "./transcription"
 import { deleteEmptyValues } from "./clone"
+import { hash } from "./crypto"
+import { fileTypeFromBuffer } from "file-type"
+import { writeFile } from "fs"
 
 export function createChatTurnGenerationContext(
     options: GenerationOptions,
@@ -660,15 +673,18 @@ export function createChatGenerationContext(
                 configuration.provider
             )
             if (!transcriber)
-                throw new Error("model driver not found for " + info.model)
+                throw new Error("audio transcriber not found for " + info.model)
             const audioFile = await videoExtractAudio(audio, {
                 trace: transcriptionTrace,
             })
             const file = await BufferToBlob(await host.readFile(audioFile))
             const update: () => Promise<TranscriptionResult> = async () => {
-                trace.itemValue(`model`, configuration.model)
-                trace.itemValue(`file size`, prettyBytes(file.size))
-                trace.itemValue(`file type`, file.type)
+                transcriptionTrace.itemValue(`model`, configuration.model)
+                transcriptionTrace.itemValue(
+                    `file size`,
+                    prettyBytes(file.size)
+                )
+                transcriptionTrace.itemValue(`file type`, file.type)
                 const res = await transcriber(
                     {
                         file,
@@ -703,12 +719,15 @@ export function createChatGenerationContext(
                     update,
                     (res) => !res.error
                 )
-                trace.itemValue(`cache ${hit.cached ? "hit" : "miss"}`, hit.key)
+                transcriptionTrace.itemValue(
+                    `cache ${hit.cached ? "hit" : "miss"}`,
+                    hit.key
+                )
                 res = hit.value
             } else res = await update()
-            trace.fence(res.text, "markdown")
-            if (res.error) trace.error(errorMessage(res.error))
-            if (res.segments) trace.fence(res.segments, "yaml")
+            transcriptionTrace.fence(res.text, "markdown")
+            if (res.error) transcriptionTrace.error(errorMessage(res.error))
+            if (res.segments) transcriptionTrace.fence(res.segments, "yaml")
             return res
         } catch (e) {
             logError(e)
@@ -722,6 +741,71 @@ export function createChatGenerationContext(
         }
     }
 
+    const speak = async (
+        input: string,
+        options?: SpeechOptions
+    ): Promise<SpeechResult> => {
+        const { cache, voice, ...rest } = options || {}
+        const speechTrace = trace.startTraceDetails("🦜 speak")
+        try {
+            const conn: ModelConnectionOptions = {
+                model: options?.model || SPEECH_MODEL_ID,
+            }
+            const { info, configuration } = await resolveModelConnectionInfo(
+                conn,
+                {
+                    trace: speechTrace,
+                    cancellationToken,
+                    token: true,
+                }
+            )
+            if (info.error) throw new Error(info.error)
+            if (!configuration)
+                throw new Error("model configuration not found")
+            checkCancelled(cancellationToken)
+            const { ok } = await runtimeHost.pullModel(configuration, {
+                trace: speechTrace,
+                cancellationToken,
+            })
+            if (!ok) throw new Error(`failed to pull model ${conn.model}`)
+            checkCancelled(cancellationToken)
+            const { speaker } = await resolveLanguageModel(
+                configuration.provider
+            )
+            if (!speaker)
+                throw new Error("speech converter not found for " + info.model)
+            speechTrace.itemValue(`model`, configuration.model)
+            const req = deleteUndefinedValues({
+                input,
+                model: configuration.model,
+                voice,
+            }) satisfies CreateSpeechRequest
+            const res = await speaker(req, configuration, {
+                trace: speechTrace,
+                cancellationToken,
+            })
+            if (res.error) {
+                speechTrace.error(errorMessage(res.error))
+                return { error: res.error } satisfies SpeechResult
+            }
+            const h = await hash(res.audio, { length: 20 })
+            const { ext } = (await fileTypeFromBuffer(res.audio)) || {}
+            const filename = dotGenaiscriptPath("speech", h + "." + ext)
+            await host.writeFile(filename, res.audio)
+            return {
+                filename,
+            } satisfies SpeechResult
+        } catch (e) {
+            logError(e)
+            speechTrace.error(e)
+            return {
+                filename: undefined,
+                error: serializeError(e),
+            } satisfies SpeechResult
+        } finally {
+            speechTrace.endDetails()
+        }
+    }
+
     const runPrompt = async (
         generator: string | PromptGenerator,
         runOptions?: PromptGeneratorOptions
@@ -968,6 +1052,7 @@ export function createChatGenerationContext(
         prompt,
         runPrompt,
         transcribe,
+        speak,
     })
 
     return ctx
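At the script level, `speak` mirrors `transcribe`: it resolves the `speech` model alias, pulls the model if needed, synthesizes audio, hashes the bytes, and writes the file under .genaiscript/speech/<hash>.<ext>. A usage sketch:

    const { filename, error } = await speak("The build is green.", { voice: "coral" })
    if (error) console.error(error.message)
    else console.log(`audio written to ${filename}`)
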
"hit" : "miss"}`, + hit.key + ) res = hit.value } else res = await update() - trace.fence(res.text, "markdown") - if (res.error) trace.error(errorMessage(res.error)) - if (res.segments) trace.fence(res.segments, "yaml") + transcriptionTrace.fence(res.text, "markdown") + if (res.error) transcriptionTrace.error(errorMessage(res.error)) + if (res.segments) transcriptionTrace.fence(res.segments, "yaml") return res } catch (e) { logError(e) @@ -722,6 +741,71 @@ export function createChatGenerationContext( } } + const speak = async ( + input: string, + options?: SpeechOptions + ): Promise => { + const { cache, voice, ...rest } = options || {} + const speechTrace = trace.startTraceDetails("🦜 speak") + try { + const conn: ModelConnectionOptions = { + model: options?.model || SPEECH_MODEL_ID, + } + const { info, configuration } = await resolveModelConnectionInfo( + conn, + { + trace: speechTrace, + cancellationToken, + token: true, + } + ) + if (info.error) throw new Error(info.error) + if (!configuration) throw new Error("model configuration not found") + checkCancelled(cancellationToken) + const { ok } = await runtimeHost.pullModel(configuration, { + trace: speechTrace, + cancellationToken, + }) + if (!ok) throw new Error(`failed to pull model ${conn}`) + checkCancelled(cancellationToken) + const { speaker } = await resolveLanguageModel( + configuration.provider + ) + if (!speaker) + throw new Error("speech converter not found for " + info.model) + speechTrace.itemValue(`model`, configuration.model) + const req = deleteUndefinedValues({ + input, + model: configuration.model, + voice, + }) satisfies CreateSpeechRequest + const res = await speaker(req, configuration, { + trace: speechTrace, + cancellationToken, + }) + if (res.error) { + speechTrace.error(errorMessage(res.error)) + return { error: res.error } satisfies SpeechResult + } + const h = await hash(res.audio, { length: 20 }) + const { ext } = (await fileTypeFromBuffer(res.audio)) || {} + const filename = dotGenaiscriptPath("speech", h + "." + ext) + await host.writeFile(filename, res.audio) + return { + filename, + } satisfies SpeechResult + } catch (e) { + logError(e) + speechTrace.error(e) + return { + filename: undefined, + error: serializeError(e), + } satisfies SpeechResult + } finally { + speechTrace.endDetails() + } + } + const runPrompt = async ( generator: string | PromptGenerator, runOptions?: PromptGeneratorOptions @@ -968,6 +1052,7 @@ export function createChatGenerationContext( prompt, runPrompt, transcribe, + speak, }) return ctx diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts index c5445edd19..98724ae767 100644 --- a/packages/core/src/types/prompt_template.d.ts +++ b/packages/core/src/types/prompt_template.d.ts @@ -2978,11 +2978,13 @@ type BufferLike = | ArrayBuffer | ReadableStream +type TranscriptionModelType = OptionsOrString<"openai:whisper-1"> + interface TranscriptionOptions { /** * Model to use for transcription. By default uses the `transcribe` alias. */ - model?: TranscribeModelType + model?: TranscriptionModelType /** * Translate to English. 
diff --git a/packages/core/src/types/prompt_type.d.ts b/packages/core/src/types/prompt_type.d.ts
index bf8ce93fc3..f4f7871464 100644
--- a/packages/core/src/types/prompt_type.d.ts
+++ b/packages/core/src/types/prompt_type.d.ts
@@ -336,3 +336,10 @@ declare function transcribe(
     audio: string | WorkspaceFile,
     options?: TranscriptionOptions
 ): Promise<TranscriptionResult>
+
+/**
+ * Converts text to speech.
+ * @param text text to synthesize
+ * @param options speech generation options
+ */
+declare function speak(text: string, options?: SpeechOptions): Promise<SpeechResult>
diff --git a/packages/sample/genaisrc/speech.genai.mjs b/packages/sample/genaisrc/speech.genai.mjs
new file mode 100644
index 0000000000..7fda27ab13
--- /dev/null
+++ b/packages/sample/genaisrc/speech.genai.mjs
@@ -0,0 +1,4 @@
+
+
+const hello = await speak('Hello, world!');
+console.log(hello)
\ No newline at end of file
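A slightly fuller variant of the sample script (a sketch; the model and voice values are illustrative), checking the result instead of logging the raw object:

    const res = await speak("Hello, world!", { model: "openai:tts-1", voice: "alloy" })
    if (res.error) console.error(res.error.message)
    else console.log(`speech saved to ${res.filename}`)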