diff --git a/docs/public/schemas/llms.json b/docs/public/schemas/llms.json
index 5a3147842..0db5c7b6f 100644
--- a/docs/public/schemas/llms.json
+++ b/docs/public/schemas/llms.json
@@ -56,6 +56,10 @@
             "type": "boolean",
             "description": "Indicates if pulling models is supported"
         },
+        "transcribe": {
+            "type": "boolean",
+            "description": "Indicates if speech transcription is supported"
+        },
         "openaiCompatibility": {
             "type": "string",
             "description": "Uses OpenAI API compatibility layer documentation URL"
diff --git a/packages/core/package.json b/packages/core/package.json
index 2e8e9bfb2..3ac8074ac 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -44,7 +44,6 @@
         "@types/semver": "^7.5.8",
         "@types/shell-quote": "^1.7.5",
         "ajv": "^8.17.1",
-        "cross-fetch": "^4.1.0",
         "csv-parse": "^5.6.0",
         "csv-stringify": "^6.5.2",
         "diff": "^7.0.0",
diff --git a/packages/core/src/bufferlike.ts b/packages/core/src/bufferlike.ts
new file mode 100644
index 000000000..f29521c96
--- /dev/null
+++ b/packages/core/src/bufferlike.ts
@@ -0,0 +1,27 @@
+import { resolveFileBytes } from "./file"
+import { TraceOptions } from "./trace"
+
+export async function resolveBufferLike(
+    bufferLike: BufferLike,
+    options?: TraceOptions
+): Promise<Buffer> {
+    // a string is a file path or URL: resolve it to raw bytes
+    if (typeof bufferLike === "string")
+        return Buffer.from(await resolveFileBytes(bufferLike, options))
+    else if (bufferLike instanceof Blob)
+        return Buffer.from(await bufferLike.arrayBuffer())
+    else if (bufferLike instanceof ReadableStream) {
+        const stream: ReadableStream = bufferLike
+        return Buffer.from(await new Response(stream).arrayBuffer())
+    } else if (bufferLike instanceof ArrayBuffer)
+        return Buffer.from(bufferLike)
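+    // fall through: a WorkspaceFile-like object carries string content,
+    // decoded here with its declared encoding (utf-8 by default)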
"fetch-retry" import { MarkdownTrace, TraceOptions } from "./trace" import { @@ -53,9 +52,8 @@ export async function createFetch( // We enrich crossFetch with the proxy. const crossFetchWithProxy: typeof fetch = agent - ? (url, options) => - crossFetch(url, { ...(options || {}), agent } as any) - : crossFetch + ? (url, options) => global.fetch(url, { ...(options || {}), agent } as any) + : global.fetch // Return the default fetch if no retry status codes are specified if (!retryOn?.length) return crossFetchWithProxy @@ -66,18 +64,21 @@ export async function createFetch( retries, retryDelay: (attempt, error, response) => { const code: string = (error as any)?.code as string - if (code === "ECONNRESET" || + if ( + code === "ECONNRESET" || code === "ENOTFOUND" || - cancellationToken?.isCancellationRequested) + cancellationToken?.isCancellationRequested + ) // Return undefined for fatal errors or cancellations to stop retries return undefined const message = errorMessage(error) const status = statusToMessage(response) - const delay = Math.min( - maxDelay, - Math.pow(FETCH_RETRY_GROWTH_FACTOR, attempt) * retryDelay - ) * + const delay = + Math.min( + maxDelay, + Math.pow(FETCH_RETRY_GROWTH_FACTOR, attempt) * retryDelay + ) * (1 + Math.random() / 20) // 5% jitter for delay randomization const msg = toStringList( `retry #${attempt + 1} in ${renderWithPrecision(Math.floor(delay) / 1000, 1)}s`, diff --git a/packages/core/src/image.ts b/packages/core/src/image.ts index 2ee288ff1..f71c90748 100644 --- a/packages/core/src/image.ts +++ b/packages/core/src/image.ts @@ -1,4 +1,5 @@ // Import necessary functions and types from other modules +import { resolveBufferLike } from "./bufferlike" import { IMAGE_DETAIL_LOW_HEIGHT, IMAGE_DETAIL_LOW_WIDTH } from "./constants" import { resolveFileBytes } from "./file" import { TraceOptions } from "./trace" @@ -29,18 +30,11 @@ export async function imageEncodeForLLM( // https://platform.openai.com/docs/guides/vision/calculating-costs#managing-images // If the URL is a string, resolve it to a data URI - if (typeof url === "string") - url = Buffer.from(await resolveFileBytes(url, options)) - else if (url instanceof Blob) url = Buffer.from(await url.arrayBuffer()) - else if (url instanceof ReadableStream) { - const stream: ReadableStream = url - url = Buffer.from(await new Response(stream).arrayBuffer()) - } else if (url instanceof ArrayBuffer) url = Buffer.from(url) - + const buffer = await resolveBufferLike(url) // Read the image using Jimp const { Jimp, HorizontalAlign, VerticalAlign, ResizeStrategy } = await import("jimp") - const img = await Jimp.read(url) + const img = await Jimp.read(buffer) const { width, height } = img if (crop) { const x = Math.max(0, Math.min(width, crop.x ?? 
+        const body = new FormData()
+        body.append("file", req.file)
+        body.append("model", req.model)
+        body.append("response_format", "verbose_json")
+        if (req.temperature)
+            body.append("temperature", req.temperature.toString())
+        if (req.language) body.append("language", req.language)
+
+        const res = await fetch(url, {
+            method: "POST",
+            headers: {
+                ...getConfigHeaders(cfg),
+                Accept: "application/json",
+            },
+            body: body,
+        })
+        const j = await res.json()
+        return j
+    } catch (e) {
+        logError(e)
+        trace?.error(e)
+        return { text: undefined, error: serializeError(e) }
+    }
+}
+
 export function LocalOpenAICompatibleModel(
     providerId: string,
-    options: { listModels?: boolean; pullModel?: boolean }
+    options: { listModels?: boolean; pullModel?: boolean; transcribe?: boolean }
 ) {
     return Object.freeze<LanguageModel>(
         deleteUndefinedValues({
@@ -498,6 +539,7 @@
             id: providerId,
             listModels: options?.listModels ? listModels : undefined,
             pullModel: options?.pullModel ? pullModel : undefined,
+            transcribe: options?.transcribe ? transcribe : undefined,
         })
     )
 }
diff --git a/packages/core/src/runpromptcontext.ts b/packages/core/src/runpromptcontext.ts
index 678815cab..e4fda54c0 100644
--- a/packages/core/src/runpromptcontext.ts
+++ b/packages/core/src/runpromptcontext.ts
@@ -79,6 +79,9 @@ import { agentAddMemory, agentQueryMemory } from "./agent"
 import { YAMLStringify } from "./yaml"
 import { Project } from "./server/messages"
 import { parametersToVars } from "./vars"
+import { resolveBufferLike } from "./bufferlike"
+import { fileTypeFromBuffer } from "file-type"
+import prettyBytes from "pretty-bytes"
 
 export function createChatTurnGenerationContext(
     options: GenerationOptions,
@@ -626,7 +629,7 @@ export function createChatGenerationContext(
     }
 
     const transcribe = async (
-        file: BufferLike,
+        audio: BufferLike,
         options?: TranscriptionOptions
     ): Promise<TranscriptionResult> => {
         const transcriptionTrace = trace.startTraceDetails("🎤 transcribe")
@@ -656,9 +659,18 @@
             )
             if (!transcribe)
                 throw new Error("model driver not found for " + info.model)
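+            // normalize the input (path, Blob, stream, buffer, ...) to a Buffer,
+            // then sniff its content type so the upload carries a proper mime type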
"translations" : "transcriptions" + const url = `${cfg.base}/audio/${route}` + trace.itemValue(`url`, `[${url}](${url})`) + const body = new FormData() + body.append("file", req.file) + body.append("model", req.model) + body.append("response_format", "verbose_json") + if (req.temperature) + body.append("temperature", req.temperature.toString()) + if (req.language) body.append("language", req.language) + + const res = await fetch(url, { + method: "POST", + headers: { + ...getConfigHeaders(cfg), + ContentType: "multipart/form-data", + Accept: "application/json", + }, + body: body, + }) + const j = await res.json() + return j + } catch (e) { + logError(e) + trace?.error(e) + return { text: undefined, error: serializeError(e) } + } +} + export function LocalOpenAICompatibleModel( providerId: string, - options: { listModels?: boolean; pullModel?: boolean } + options: { listModels?: boolean; pullModel?: boolean; transcribe?: boolean } ) { return Object.freeze( deleteUndefinedValues({ @@ -498,6 +539,7 @@ export function LocalOpenAICompatibleModel( id: providerId, listModels: options?.listModels ? listModels : undefined, pullModel: options?.pullModel ? pullModel : undefined, + transcribe: options?.transcribe ? transcribe : undefined, }) ) } diff --git a/packages/core/src/runpromptcontext.ts b/packages/core/src/runpromptcontext.ts index 678815cab..e4fda54c0 100644 --- a/packages/core/src/runpromptcontext.ts +++ b/packages/core/src/runpromptcontext.ts @@ -79,6 +79,9 @@ import { agentAddMemory, agentQueryMemory } from "./agent" import { YAMLStringify } from "./yaml" import { Project } from "./server/messages" import { parametersToVars } from "./vars" +import { resolveBufferLike } from "./bufferlike" +import { fileTypeFromBuffer } from "file-type" +import prettyBytes from "pretty-bytes" export function createChatTurnGenerationContext( options: GenerationOptions, @@ -626,7 +629,7 @@ export function createChatGenerationContext( } const transcribe = async ( - file: BufferLike, + audio: BufferLike, options?: TranscriptionOptions ): Promise => { const transcriptionTrace = trace.startTraceDetails("🎤 transcribe") @@ -656,9 +659,18 @@ export function createChatGenerationContext( ) if (!transcribe) throw new Error("model driver not found for " + info.model) + const data = await resolveBufferLike(audio, { + trace: transcriptionTrace, + }) + const mimeType = await fileTypeFromBuffer(data) + const file = new Blob([data], { type: mimeType.mime }) + trace.itemValue(`model`, configuration.model) + trace.itemValue(`file size`, prettyBytes(file.size)) + trace.itemValue(`file type`, file.type) const res = await transcribe( { file, + model: configuration.model, language: options?.language, translate: options?.translate, }, diff --git a/packages/sample/genaisrc/transcribe.genai.mjs b/packages/sample/genaisrc/transcribe.genai.mjs new file mode 100644 index 000000000..d62389ed6 --- /dev/null +++ b/packages/sample/genaisrc/transcribe.genai.mjs @@ -0,0 +1,2 @@ +const res = await transcribe("src/audio/helloworld.mp3") +console.log(res) diff --git a/packages/sample/src/audio/helloworld.m4a b/packages/sample/src/audio/helloworld.m4a new file mode 100644 index 000000000..325a08737 Binary files /dev/null and b/packages/sample/src/audio/helloworld.m4a differ diff --git a/packages/sample/src/audio/helloworld.mp3 b/packages/sample/src/audio/helloworld.mp3 new file mode 100644 index 000000000..56e8ab978 Binary files /dev/null and b/packages/sample/src/audio/helloworld.mp3 differ diff --git a/yarn.lock b/yarn.lock index 
+const res = await transcribe("src/audio/helloworld.mp3")
+console.log(res)
diff --git a/packages/sample/src/audio/helloworld.m4a b/packages/sample/src/audio/helloworld.m4a
new file mode 100644
index 000000000..325a08737
Binary files /dev/null and b/packages/sample/src/audio/helloworld.m4a differ
diff --git a/packages/sample/src/audio/helloworld.mp3 b/packages/sample/src/audio/helloworld.mp3
new file mode 100644
index 000000000..56e8ab978
Binary files /dev/null and b/packages/sample/src/audio/helloworld.mp3 differ
diff --git a/yarn.lock b/yarn.lock
index 39573276f..b745e3b91 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -4405,13 +4405,6 @@ cpu-features@~0.0.10:
     buildcheck "~0.0.6"
     nan "^2.19.0"
 
-cross-fetch@^4.1.0:
-  version "4.1.0"
-  resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-4.1.0.tgz#8f69355007ee182e47fa692ecbaa37a52e43c3d2"
-  integrity sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==
-  dependencies:
-    node-fetch "^2.7.0"
-
 cross-spawn@^6.0.5:
   version "6.0.6"
   resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.6.tgz#30d0efa0712ddb7eb5a76e1e8721bffafa6b5d57"
@@ -8072,7 +8065,7 @@ node-domexception@1.0.0:
   resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5"
  integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==
 
-node-fetch@^2.6.7, node-fetch@^2.7.0:
+node-fetch@^2.6.7:
   version "2.7.0"
   resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.7.0.tgz#d0f0fa6e3e2dc1d27efcd8ad99d550bda94d187d"
   integrity sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==