
Commit

transcription
pelikhan committed Jan 9, 2025
1 parent 1206e96 commit c98be0b
Showing 16 changed files with 116 additions and 37 deletions.
4 changes: 4 additions & 0 deletions docs/public/schemas/llms.json
@@ -56,6 +56,10 @@
"type": "boolean",
"description": "Indicates if pulling models is supported"
},
"transcribe": {
"type": "boolean",
"description": "Indicates if speech transcription is supported"
},
"openaiCompatibility": {
"type": "string",
"description": "Uses OpenAI API compatibility layer documentation URL"
1 change: 0 additions & 1 deletion packages/core/package.json
@@ -44,7 +44,6 @@
"@types/semver": "^7.5.8",
"@types/shell-quote": "^1.7.5",
"ajv": "^8.17.1",
"cross-fetch": "^4.1.0",
"csv-parse": "^5.6.0",
"csv-stringify": "^6.5.2",
"diff": "^7.0.0",
27 changes: 27 additions & 0 deletions packages/core/src/bufferlike.ts
@@ -0,0 +1,27 @@
import { resolveFileBytes } from "./file"
import { TraceOptions } from "./trace"

export async function resolveBufferLike(
bufferLike: BufferLike,
options?: TraceOptions
): Promise<Buffer> {
    // If given a string (file path or URL), resolve it to raw bytes
if (typeof bufferLike === "string")
return Buffer.from(await resolveFileBytes(bufferLike, options))
else if (bufferLike instanceof Blob)
return Buffer.from(await bufferLike.arrayBuffer())
else if (bufferLike instanceof ReadableStream) {
const stream: ReadableStream = bufferLike
return Buffer.from(await new Response(stream).arrayBuffer())
} else if (bufferLike instanceof ArrayBuffer)
        return Buffer.from(bufferLike)
else if (
typeof bufferLike === "object" &&
(bufferLike as WorkspaceFile).content
)
return Buffer.from(
(bufferLike as WorkspaceFile).content,
(bufferLike as WorkspaceFile).encoding || "utf-8"
)
throw new Error("Unsupported buffer-like object")
}
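
For illustration, a minimal usage sketch of the new helper (the call sites below are hypothetical; BufferLike and WorkspaceFile are ambient types in this codebase, and the audio path is the sample file added by this commit):

// hypothetical call sites for resolveBufferLike
const fromPath = await resolveBufferLike("src/audio/helloworld.mp3") // string path or URL
const fromBlob = await resolveBufferLike(new Blob([fromPath])) // Blob
const fromWorkspaceFile = await resolveBufferLike({
    filename: "hello.txt",
    content: "hello world",
}) // WorkspaceFile with inline content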
3 changes: 2 additions & 1 deletion packages/core/src/chat.ts
@@ -141,7 +141,8 @@ export type PullModelFunction = (
) => Promise<{ ok: boolean; error?: SerializedError }>

export type CreateTranscriptionRequest = {
file: BufferLike
file: Blob
model: string
} & TranscriptionOptions

export type TranscribeFunction = (
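
For reference, a hypothetical literal matching the updated request shape ("whisper-1" is a placeholder model id, not a value from this commit; language comes from TranscriptionOptions):

// hypothetical request: a Blob payload plus an explicit model id
const audioBytes = await resolveBufferLike("src/audio/helloworld.mp3")
const req: CreateTranscriptionRequest = {
    file: new Blob([audioBytes], { type: "audio/mpeg" }),
    model: "whisper-1",
    language: "en",
}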
1 change: 1 addition & 0 deletions packages/core/src/constants.ts
@@ -195,6 +195,7 @@ export const MODEL_PROVIDERS = Object.freeze<
bearerToken?: boolean
listModels?: boolean
pullModel?: boolean
transcribe?: boolean
aliases?: Record<string, string>
}[]
>(CONFIGURATION_DATA.providers)
1 change: 1 addition & 0 deletions packages/core/src/crypto.ts
@@ -62,6 +62,7 @@ export async function hash(value: any, options?: HashOptions) {
h.push(sep)
await append(c)
}
else if (v instanceof Uint8Array) h.push(v)
else if (v instanceof Buffer) h.push(new Uint8Array(v))
else if (v instanceof ArrayBuffer) h.push(new Uint8Array(v))
else if (v instanceof Blob)
21 changes: 11 additions & 10 deletions packages/core/src/fetch.ts
@@ -1,4 +1,3 @@
import crossFetch from "cross-fetch"
import wrapFetch from "fetch-retry"
import { MarkdownTrace, TraceOptions } from "./trace"
import {
@@ -53,9 +52,8 @@

    // Wrap the global fetch with the proxy agent when one is configured.
const crossFetchWithProxy: typeof fetch = agent
? (url, options) =>
crossFetch(url, { ...(options || {}), agent } as any)
: crossFetch
? (url, options) => global.fetch(url, { ...(options || {}), agent } as any)
: global.fetch

// Return the default fetch if no retry status codes are specified
if (!retryOn?.length) return crossFetchWithProxy
@@ -66,18 +64,21 @@
retries,
retryDelay: (attempt, error, response) => {
const code: string = (error as any)?.code as string
if (code === "ECONNRESET" ||
if (
code === "ECONNRESET" ||
code === "ENOTFOUND" ||
cancellationToken?.isCancellationRequested)
cancellationToken?.isCancellationRequested
)
// Return undefined for fatal errors or cancellations to stop retries
return undefined

const message = errorMessage(error)
const status = statusToMessage(response)
const delay = Math.min(
maxDelay,
Math.pow(FETCH_RETRY_GROWTH_FACTOR, attempt) * retryDelay
) *
const delay =
Math.min(
maxDelay,
Math.pow(FETCH_RETRY_GROWTH_FACTOR, attempt) * retryDelay
) *
                (1 + Math.random() / 20) // up to 5% jitter to spread out retries
const msg = toStringList(
`retry #${attempt + 1} in ${renderWithPrecision(Math.floor(delay) / 1000, 1)}s`,
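
As a worked example of the reformatted delay computation (a sketch under assumed values; FETCH_RETRY_GROWTH_FACTOR is defined in constants.ts and its actual value is not shown in this diff):

// assuming FETCH_RETRY_GROWTH_FACTOR = 2, retryDelay = 1000ms, maxDelay = 30000ms
for (let attempt = 0; attempt < 6; attempt++) {
    const delay =
        Math.min(30000, Math.pow(2, attempt) * 1000) *
        (1 + Math.random() / 20)
    // prints ~1s, ~2s, ~4s, ~8s, ~16s, then capped near 30s, each up to 5% longer
    console.log(`retry #${attempt + 1} in ~${Math.floor(delay)}ms`)
}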
12 changes: 3 additions & 9 deletions packages/core/src/image.ts
@@ -1,4 +1,5 @@
// Import necessary functions and types from other modules
import { resolveBufferLike } from "./bufferlike"
import { IMAGE_DETAIL_LOW_HEIGHT, IMAGE_DETAIL_LOW_WIDTH } from "./constants"
import { resolveFileBytes } from "./file"
import { TraceOptions } from "./trace"
@@ -29,18 +30,11 @@

// https://platform.openai.com/docs/guides/vision/calculating-costs#managing-images
// If the URL is a string, resolve it to a data URI
if (typeof url === "string")
url = Buffer.from(await resolveFileBytes(url, options))
else if (url instanceof Blob) url = Buffer.from(await url.arrayBuffer())
else if (url instanceof ReadableStream) {
const stream: ReadableStream = url
url = Buffer.from(await new Response(stream).arrayBuffer())
} else if (url instanceof ArrayBuffer) url = Buffer.from(url)

const buffer = await resolveBufferLike(url)
// Read the image using Jimp
const { Jimp, HorizontalAlign, VerticalAlign, ResizeStrategy } =
await import("jimp")
const img = await Jimp.read(url)
const img = await Jimp.read(buffer)
const { width, height } = img
if (crop) {
const x = Math.max(0, Math.min(width, crop.x ?? 0))
1 change: 1 addition & 0 deletions packages/core/src/llms.json
@@ -5,6 +5,7 @@
"id": "openai",
"detail": "OpenAI (or compatible)",
"bearerToken": true,
"transcribe": true,
"aliases": {
"large": "gpt-4o",
"small": "gpt-4o-mini",
1 change: 1 addition & 0 deletions packages/core/src/lm.ts
@@ -36,5 +36,6 @@ export function resolveLanguageModel(provider: string): LanguageModel {
return LocalOpenAICompatibleModel(provider, {
listModels: features?.listModels !== false,
pullModel: features?.pullModel,
transcribe: features?.transcribe
})
}
56 changes: 49 additions & 7 deletions packages/core/src/openai.ts
@@ -19,7 +19,12 @@ import {
TOOL_URL,
} from "./constants"
import { estimateTokens } from "./tokens"
import { ChatCompletionHandler, LanguageModel, PullModelFunction } from "./chat"
import {
ChatCompletionHandler,
CreateTranscriptionRequest,
LanguageModel,
PullModelFunction,
} from "./chat"
import { RequestError, errorMessage, serializeError } from "./error"
import { createFetch, iterateBody, traceFetchPost } from "./fetch"
import { parseModelIdentifier } from "./models"
@@ -44,6 +49,7 @@ import {
LanguageModelConfiguration,
LanguageModelInfo,
} from "./server/messages"
import prettyBytes from "pretty-bytes"

export function getConfigHeaders(cfg: LanguageModelConfiguration) {
let { token, type, base, provider } = cfg
@@ -82,17 +88,14 @@
const {
requestOptions,
partialCb,
cache: cacheOrName,
cacheName,
retry,
retryDelay,
maxDelay,
cancellationToken,
inner,
} = options
const { headers = {}, ...rest } = requestOptions || {}
const { token, source, ...cfgNoToken } = cfg
const { provider, model } = parseModelIdentifier(req.model)
const { model } = parseModelIdentifier(req.model)
const { encode: encoder } = await resolveTokenEncoder(model)

const postReq = structuredClone({
@@ -477,7 +480,7 @@ const pullModel: PullModelFunction = async (modelId, options) => {
return { ok: false, status: resPull.status }
}
0
for await (const chunk of iterateBody(resPull, { cancellationToken }))
for await (const {} of iterateBody(resPull, { cancellationToken }))
process.stderr.write(".")
process.stderr.write("\n")
return { ok: true }
@@ -488,16 +491,55 @@
}
}

const transcribe = async (
req: CreateTranscriptionRequest,
cfg: LanguageModelConfiguration,
options: TraceOptions & CancellationOptions
): Promise<TranscriptionResult> => {
const { trace } = options || {}
const fetch = await createFetch(options)
try {
logVerbose(`${cfg.provider}: transcribe with ${cfg.model}`)
const route = req.translate ? "translations" : "transcriptions"
const url = `${cfg.base}/audio/${route}`
        trace?.itemValue(`url`, `[${url}](${url})`)
const body = new FormData()
body.append("file", req.file)
body.append("model", req.model)
body.append("response_format", "verbose_json")
        if (req.temperature !== undefined)
body.append("temperature", req.temperature.toString())
if (req.language) body.append("language", req.language)

const res = await fetch(url, {
method: "POST",
headers: {
...getConfigHeaders(cfg),
                // do not set Content-Type manually: fetch derives the
                // multipart boundary from the FormData body
Accept: "application/json",
},
body: body,
})
const j = await res.json()
return j
} catch (e) {
logError(e)
trace?.error(e)
return { text: undefined, error: serializeError(e) }
}
}

export function LocalOpenAICompatibleModel(
providerId: string,
options: { listModels?: boolean; pullModel?: boolean }
options: { listModels?: boolean; pullModel?: boolean; transcribe?: boolean }
) {
return Object.freeze<LanguageModel>(
deleteUndefinedValues({
completer: OpenAIChatCompletion,
id: providerId,
listModels: options?.listModels ? listModels : undefined,
pullModel: options?.pullModel ? pullModel : undefined,
transcribe: options?.transcribe ? transcribe : undefined,
})
)
}
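
A minimal sketch of how the new flag reaches this factory (the flag values are illustrative; resolveLanguageModel in lm.ts makes the equivalent call from the MODEL_PROVIDERS feature flags):

// hypothetical: build an OpenAI-compatible driver with transcription enabled
const lm = LocalOpenAICompatibleModel("openai", {
    listModels: true,
    transcribe: true, // exposes the transcribe handler defined above
})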
14 changes: 13 additions & 1 deletion packages/core/src/runpromptcontext.ts
@@ -79,6 +79,9 @@ import { agentAddMemory, agentQueryMemory } from "./agent"
import { YAMLStringify } from "./yaml"
import { Project } from "./server/messages"
import { parametersToVars } from "./vars"
import { resolveBufferLike } from "./bufferlike"
import { fileTypeFromBuffer } from "file-type"
import prettyBytes from "pretty-bytes"

export function createChatTurnGenerationContext(
options: GenerationOptions,
@@ -626,7 +629,7 @@
}

const transcribe = async (
file: BufferLike,
audio: BufferLike,
options?: TranscriptionOptions
): Promise<TranscriptionResult> => {
const transcriptionTrace = trace.startTraceDetails("🎤 transcribe")
@@ -656,9 +659,18 @@
)
if (!transcribe)
throw new Error("model driver not found for " + info.model)
const data = await resolveBufferLike(audio, {
trace: transcriptionTrace,
})
const mimeType = await fileTypeFromBuffer(data)
            const file = new Blob([data], { type: mimeType?.mime })
trace.itemValue(`model`, configuration.model)
trace.itemValue(`file size`, prettyBytes(file.size))
trace.itemValue(`file type`, file.type)
const res = await transcribe(
{
file,
model: configuration.model,
language: options?.language,
translate: options?.translate,
},
2 changes: 2 additions & 0 deletions packages/sample/genaisrc/transcribe.genai.mjs
@@ -0,0 +1,2 @@
const res = await transcribe("src/audio/helloworld.mp3")
console.log(res)
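
A hedged variant of the sample exercising the options added in this commit (the source language is hypothetical for this audio file; language and translate come from TranscriptionOptions):

// hypothetical: request a translation to English with an explicit source language
const translated = await transcribe("src/audio/helloworld.mp3", {
    language: "fr",
    translate: true,
})
console.log(translated.text)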
Binary file added packages/sample/src/audio/helloworld.m4a
Binary file added packages/sample/src/audio/helloworld.mp3
9 changes: 1 addition & 8 deletions yarn.lock
@@ -4405,13 +4405,6 @@ cpu-features@~0.0.10:
buildcheck "~0.0.6"
nan "^2.19.0"

cross-fetch@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-4.1.0.tgz#8f69355007ee182e47fa692ecbaa37a52e43c3d2"
integrity sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==
dependencies:
node-fetch "^2.7.0"

cross-spawn@^6.0.5:
version "6.0.6"
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.6.tgz#30d0efa0712ddb7eb5a76e1e8721bffafa6b5d57"
Expand Down Expand Up @@ -8072,7 +8065,7 @@ [email protected]:
resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5"
integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==

node-fetch@^2.6.7, node-fetch@^2.7.0:
node-fetch@^2.6.7:
version "2.7.0"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.7.0.tgz#d0f0fa6e3e2dc1d27efcd8ad99d550bda94d187d"
integrity sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==
