
Commit

transcription
pelikhan committed Jan 9, 2025
1 parent 1206e96 commit c98be0b
Showing 16 changed files with 116 additions and 37 deletions.
4 changes: 4 additions & 0 deletions docs/public/schemas/llms.json
@@ -56,6 +56,10 @@
"type": "boolean",
"description": "Indicates if pulling models is supported"
},
"transcribe": {
"type": "boolean",
"description": "Indicates if speech transcription is supported"
},
"openaiCompatibility": {
"type": "string",
"description": "Uses OpenAI API compatibility layer documentation URL"
1 change: 0 additions & 1 deletion packages/core/package.json
@@ -44,7 +44,6 @@
"@types/semver": "^7.5.8",
"@types/shell-quote": "^1.7.5",
"ajv": "^8.17.1",
"cross-fetch": "^4.1.0",
"csv-parse": "^5.6.0",
"csv-stringify": "^6.5.2",
"diff": "^7.0.0",
27 changes: 27 additions & 0 deletions packages/core/src/bufferlike.ts
@@ -0,0 +1,27 @@
import { resolveFileBytes } from "./file"
import { TraceOptions } from "./trace"

export async function resolveBufferLike(
bufferLike: BufferLike,
options?: TraceOptions
): Promise<Buffer> {
    // If given a string (file path or URL), resolve it to raw bytes
if (typeof bufferLike === "string")
return Buffer.from(await resolveFileBytes(bufferLike, options))
else if (bufferLike instanceof Blob)
return Buffer.from(await bufferLike.arrayBuffer())
else if (bufferLike instanceof ReadableStream) {
const stream: ReadableStream = bufferLike
return Buffer.from(await new Response(stream).arrayBuffer())
} else if (bufferLike instanceof ArrayBuffer)
        return Buffer.from(bufferLike)
else if (
typeof bufferLike === "object" &&
(bufferLike as WorkspaceFile).content
)
return Buffer.from(
(bufferLike as WorkspaceFile).content,
(bufferLike as WorkspaceFile).encoding || "utf-8"
)
throw new Error("Unsupported buffer-like object")
}
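
For illustration, a minimal usage sketch of the new helper (the call sites below are hypothetical; BufferLike and WorkspaceFile are ambient types in this codebase, and the audio path is the sample file added by this commit):

// hypothetical call sites for resolveBufferLike
const fromPath = await resolveBufferLike("src/audio/helloworld.mp3") // string path or URL
const fromBlob = await resolveBufferLike(new Blob([fromPath])) // Blob
const fromWorkspaceFile = await resolveBufferLike({
    filename: "hello.txt",
    content: "hello world",
}) // WorkspaceFile with inline content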
3 changes: 2 additions & 1 deletion packages/core/src/chat.ts
@@ -141,7 +141,8 @@ export type PullModelFunction = (
) => Promise<{ ok: boolean; error?: SerializedError }>

export type CreateTranscriptionRequest = {
file: BufferLike
file: Blob
model: string
} & TranscriptionOptions

export type TranscribeFunction = (
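
For reference, a hypothetical literal matching the updated request shape ("whisper-1" is a placeholder model id, not a value from this commit; language comes from TranscriptionOptions):

// hypothetical request: a Blob payload plus an explicit model id
const audioBytes = await resolveBufferLike("src/audio/helloworld.mp3")
const req: CreateTranscriptionRequest = {
    file: new Blob([audioBytes], { type: "audio/mpeg" }),
    model: "whisper-1",
    language: "en",
}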
1 change: 1 addition & 0 deletions packages/core/src/constants.ts
@@ -195,6 +195,7 @@ export const MODEL_PROVIDERS = Object.freeze<
bearerToken?: boolean
listModels?: boolean
pullModel?: boolean
transcribe?: boolean
aliases?: Record<string, string>
}[]
>(CONFIGURATION_DATA.providers)
1 change: 1 addition & 0 deletions packages/core/src/crypto.ts
@@ -62,6 +62,7 @@ export async function hash(value: any, options?: HashOptions) {
h.push(sep)
await append(c)
}
else if (v instanceof Uint8Array) h.push(v)
else if (v instanceof Buffer) h.push(new Uint8Array(v))
else if (v instanceof ArrayBuffer) h.push(new Uint8Array(v))
else if (v instanceof Blob)
21 changes: 11 additions & 10 deletions packages/core/src/fetch.ts
@@ -1,4 +1,3 @@
import crossFetch from "cross-fetch"
import wrapFetch from "fetch-retry"
import { MarkdownTrace, TraceOptions } from "./trace"
import {
@@ -53,9 +52,8 @@

    // Wrap the global fetch with the proxy agent when one is configured.
const crossFetchWithProxy: typeof fetch = agent
? (url, options) =>
crossFetch(url, { ...(options || {}), agent } as any)
: crossFetch
? (url, options) => global.fetch(url, { ...(options || {}), agent } as any)
: global.fetch

// Return the default fetch if no retry status codes are specified
if (!retryOn?.length) return crossFetchWithProxy
@@ -66,18 +64,21 @@
retries,
retryDelay: (attempt, error, response) => {
const code: string = (error as any)?.code as string
if (code === "ECONNRESET" ||
if (
code === "ECONNRESET" ||
code === "ENOTFOUND" ||
cancellationToken?.isCancellationRequested)
cancellationToken?.isCancellationRequested
)
// Return undefined for fatal errors or cancellations to stop retries
return undefined

const message = errorMessage(error)
const status = statusToMessage(response)
const delay = Math.min(
maxDelay,
Math.pow(FETCH_RETRY_GROWTH_FACTOR, attempt) * retryDelay
) *
const delay =
Math.min(
maxDelay,
Math.pow(FETCH_RETRY_GROWTH_FACTOR, attempt) * retryDelay
) *
                (1 + Math.random() / 20) // up to 5% jitter to spread out retries
const msg = toStringList(
`retry #${attempt + 1} in ${renderWithPrecision(Math.floor(delay) / 1000, 1)}s`,
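
As a worked example of the reformatted delay computation (a sketch under assumed values; FETCH_RETRY_GROWTH_FACTOR is defined in constants.ts and its actual value is not shown in this diff):

// assuming FETCH_RETRY_GROWTH_FACTOR = 2, retryDelay = 1000ms, maxDelay = 30000ms
for (let attempt = 0; attempt < 6; attempt++) {
    const delay =
        Math.min(30000, Math.pow(2, attempt) * 1000) *
        (1 + Math.random() / 20)
    // prints ~1s, ~2s, ~4s, ~8s, ~16s, then capped near 30s, each up to 5% longer
    console.log(`retry #${attempt + 1} in ~${Math.floor(delay)}ms`)
}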
12 changes: 3 additions & 9 deletions packages/core/src/image.ts
@@ -1,4 +1,5 @@
// Import necessary functions and types from other modules
import { resolveBufferLike } from "./bufferlike"
import { IMAGE_DETAIL_LOW_HEIGHT, IMAGE_DETAIL_LOW_WIDTH } from "./constants"
import { resolveFileBytes } from "./file"
import { TraceOptions } from "./trace"
@@ -29,18 +30,11 @@

// https://platform.openai.com/docs/guides/vision/calculating-costs#managing-images
// If the URL is a string, resolve it to a data URI
if (typeof url === "string")
url = Buffer.from(await resolveFileBytes(url, options))
else if (url instanceof Blob) url = Buffer.from(await url.arrayBuffer())
else if (url instanceof ReadableStream) {
const stream: ReadableStream = url
url = Buffer.from(await new Response(stream).arrayBuffer())
} else if (url instanceof ArrayBuffer) url = Buffer.from(url)

const buffer = await resolveBufferLike(url)
// Read the image using Jimp
const { Jimp, HorizontalAlign, VerticalAlign, ResizeStrategy } =
await import("jimp")
const img = await Jimp.read(url)
const img = await Jimp.read(buffer)
const { width, height } = img
if (crop) {
const x = Math.max(0, Math.min(width, crop.x ?? 0))
1 change: 1 addition & 0 deletions packages/core/src/llms.json
@@ -5,6 +5,7 @@
"id": "openai",
"detail": "OpenAI (or compatible)",
"bearerToken": true,
"transcribe": true,
"aliases": {
"large": "gpt-4o",
"small": "gpt-4o-mini",
1 change: 1 addition & 0 deletions packages/core/src/lm.ts
@@ -36,5 +36,6 @@ export function resolveLanguageModel(provider: string): LanguageModel {
return LocalOpenAICompatibleModel(provider, {
listModels: features?.listModels !== false,
pullModel: features?.pullModel,
transcribe: features?.transcribe
})
}
56 changes: 49 additions & 7 deletions packages/core/src/openai.ts
@@ -19,7 +19,12 @@ import {
TOOL_URL,
} from "./constants"
import { estimateTokens } from "./tokens"
import { ChatCompletionHandler, LanguageModel, PullModelFunction } from "./chat"
import {
ChatCompletionHandler,
CreateTranscriptionRequest,
LanguageModel,
PullModelFunction,
} from "./chat"
import { RequestError, errorMessage, serializeError } from "./error"
import { createFetch, iterateBody, traceFetchPost } from "./fetch"
import { parseModelIdentifier } from "./models"
@@ -44,6 +49,7 @@ import {
LanguageModelConfiguration,
LanguageModelInfo,
} from "./server/messages"
import prettyBytes from "pretty-bytes"

export function getConfigHeaders(cfg: LanguageModelConfiguration) {
let { token, type, base, provider } = cfg
@@ -82,17 +88,14 @@
const {
requestOptions,
partialCb,
cache: cacheOrName,
cacheName,
retry,
retryDelay,
maxDelay,
cancellationToken,
inner,
} = options
const { headers = {}, ...rest } = requestOptions || {}
const { token, source, ...cfgNoToken } = cfg
const { provider, model } = parseModelIdentifier(req.model)
const { model } = parseModelIdentifier(req.model)
const { encode: encoder } = await resolveTokenEncoder(model)

const postReq = structuredClone({
@@ -477,7 +480,7 @@ const pullModel: PullModelFunction = async (modelId, options) => {
return { ok: false, status: resPull.status }
}
0
for await (const chunk of iterateBody(resPull, { cancellationToken }))
for await (const {} of iterateBody(resPull, { cancellationToken }))
process.stderr.write(".")
process.stderr.write("\n")
return { ok: true }
@@ -488,16 +491,55 @@
}
}

const transcribe = async (
req: CreateTranscriptionRequest,
cfg: LanguageModelConfiguration,
options: TraceOptions & CancellationOptions
): Promise<TranscriptionResult> => {
const { trace } = options || {}
const fetch = await createFetch(options)
try {
logVerbose(`${cfg.provider}: transcribe with ${cfg.model}`)
const route = req.translate ? "translations" : "transcriptions"
const url = `${cfg.base}/audio/${route}`
        trace?.itemValue(`url`, `[${url}](${url})`)
const body = new FormData()
body.append("file", req.file)
body.append("model", req.model)
body.append("response_format", "verbose_json")
        if (req.temperature !== undefined)
body.append("temperature", req.temperature.toString())
if (req.language) body.append("language", req.language)

const res = await fetch(url, {
method: "POST",
headers: {
...getConfigHeaders(cfg),
                // do not set Content-Type manually: fetch derives the
                // multipart boundary from the FormData body
Accept: "application/json",
},
body: body,
})
const j = await res.json()
return j
} catch (e) {
logError(e)
trace?.error(e)
return { text: undefined, error: serializeError(e) }
}
}

export function LocalOpenAICompatibleModel(
providerId: string,
options: { listModels?: boolean; pullModel?: boolean }
options: { listModels?: boolean; pullModel?: boolean; transcribe?: boolean }
) {
return Object.freeze<LanguageModel>(
deleteUndefinedValues({
completer: OpenAIChatCompletion,
id: providerId,
listModels: options?.listModels ? listModels : undefined,
pullModel: options?.pullModel ? pullModel : undefined,
transcribe: options?.transcribe ? transcribe : undefined,
})
)
}
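
A minimal sketch of how the new flag reaches this factory (the flag values are illustrative; resolveLanguageModel in lm.ts makes the equivalent call from the MODEL_PROVIDERS feature flags):

// hypothetical: build an OpenAI-compatible driver with transcription enabled
const lm = LocalOpenAICompatibleModel("openai", {
    listModels: true,
    transcribe: true, // exposes the transcribe handler defined above
})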
14 changes: 13 additions & 1 deletion packages/core/src/runpromptcontext.ts
@@ -79,6 +79,9 @@ import { agentAddMemory, agentQueryMemory } from "./agent"
import { YAMLStringify } from "./yaml"
import { Project } from "./server/messages"
import { parametersToVars } from "./vars"
import { resolveBufferLike } from "./bufferlike"
import { fileTypeFromBuffer } from "file-type"
import prettyBytes from "pretty-bytes"

export function createChatTurnGenerationContext(
options: GenerationOptions,
@@ -626,7 +629,7 @@
}

const transcribe = async (
file: BufferLike,
audio: BufferLike,
options?: TranscriptionOptions
): Promise<TranscriptionResult> => {
const transcriptionTrace = trace.startTraceDetails("🎤 transcribe")
@@ -656,9 +659,18 @@
)
if (!transcribe)
throw new Error("model driver not found for " + info.model)
const data = await resolveBufferLike(audio, {
trace: transcriptionTrace,
})
const mimeType = await fileTypeFromBuffer(data)
            const file = new Blob([data], { type: mimeType?.mime })
trace.itemValue(`model`, configuration.model)
trace.itemValue(`file size`, prettyBytes(file.size))
trace.itemValue(`file type`, file.type)
const res = await transcribe(
{
file,
model: configuration.model,
language: options?.language,
translate: options?.translate,
},
2 changes: 2 additions & 0 deletions packages/sample/genaisrc/transcribe.genai.mjs
@@ -0,0 +1,2 @@
const res = await transcribe("src/audio/helloworld.mp3")
console.log(res)
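
A hedged variant of the sample exercising the options added in this commit (the source language is hypothetical for this audio file; language and translate come from TranscriptionOptions):

// hypothetical: request a translation to English with an explicit source language
const translated = await transcribe("src/audio/helloworld.mp3", {
    language: "fr",
    translate: true,
})
console.log(translated.text)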
Binary file added packages/sample/src/audio/helloworld.m4a
Binary file added packages/sample/src/audio/helloworld.mp3
9 changes: 1 addition & 8 deletions yarn.lock
@@ -4405,13 +4405,6 @@ cpu-features@~0.0.10:
buildcheck "~0.0.6"
nan "^2.19.0"

cross-fetch@^4.1.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-4.1.0.tgz#8f69355007ee182e47fa692ecbaa37a52e43c3d2"
integrity sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw==
dependencies:
node-fetch "^2.7.0"

cross-spawn@^6.0.5:
version "6.0.6"
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.6.tgz#30d0efa0712ddb7eb5a76e1e8721bffafa6b5d57"
Expand Down Expand Up @@ -8072,7 +8065,7 @@ [email protected]:
resolved "https://registry.yarnpkg.com/node-domexception/-/node-domexception-1.0.0.tgz#6888db46a1f71c0b76b3f7555016b63fe64766e5"
integrity sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==

node-fetch@^2.6.7, node-fetch@^2.7.0:
node-fetch@^2.6.7:
version "2.7.0"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.7.0.tgz#d0f0fa6e3e2dc1d27efcd8ad99d550bda94d187d"
integrity sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==
