From 9e15430387406e59947ef19974ed71b2fc766107 Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Thu, 19 Dec 2024 19:45:54 +0200 Subject: [PATCH 1/3] fix(pipeline): add transcription for AGENT_SPEECH_COMMITTED (#218) --- .changeset/khaki-ties-design.md | 5 +++++ agents/src/pipeline/agent_output.ts | 21 ++++++++++++++------- agents/src/pipeline/pipeline_agent.ts | 8 ++------ 3 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 .changeset/khaki-ties-design.md diff --git a/.changeset/khaki-ties-design.md b/.changeset/khaki-ties-design.md new file mode 100644 index 00000000..04de819d --- /dev/null +++ b/.changeset/khaki-ties-design.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents": patch +--- + +fix(pipeline): add transcription for AGENT_SPEECH_COMMITTED diff --git a/agents/src/pipeline/agent_output.ts b/agents/src/pipeline/agent_output.ts index 38519f79..686a789b 100644 --- a/agents/src/pipeline/agent_output.ts +++ b/agents/src/pipeline/agent_output.ts @@ -13,6 +13,7 @@ export class SynthesisHandle { static readonly FLUSH_SENTINEL = Symbol('FLUSH_SENTINEL'); #speechId: string; + text?: string; ttsSource: SpeechSource; #agentPlayout: AgentPlayout; tts: TTS; @@ -97,7 +98,7 @@ export class AgentOutput { // eslint-disable-next-line @typescript-eslint/no-unused-vars return new CancellablePromise(async (resolve, _, onCancel) => { const ttsSource = await handle.ttsSource; - let task: CancellablePromise; + let task: CancellablePromise; if (typeof ttsSource === 'string') { task = stringSynthesisTask(ttsSource, handle); } else { @@ -113,6 +114,10 @@ export class AgentOutput { } finally { if (handle.intFut.done) { gracefullyCancel(task); + } else { + task.then((text) => { + handle.text = text; + }); } } @@ -121,9 +126,9 @@ export class AgentOutput { } } -const stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise => { +const stringSynthesisTask = (text: string, handle: SynthesisHandle): CancellablePromise => { // eslint-disable-next-line @typescript-eslint/no-unused-vars - return new CancellablePromise(async (resolve, _, onCancel) => { + return new CancellablePromise(async (resolve, _, onCancel) => { let cancelled = false; onCancel(() => { cancelled = true; @@ -141,16 +146,17 @@ const stringSynthesisTask = (text: string, handle: SynthesisHandle): Cancellable } handle.queue.put(SynthesisHandle.FLUSH_SENTINEL); - resolve(); + resolve(text); }); }; const streamSynthesisTask = ( stream: AsyncIterable, handle: SynthesisHandle, -): CancellablePromise => { +): CancellablePromise => { // eslint-disable-next-line @typescript-eslint/no-unused-vars - return new CancellablePromise(async (resolve, _, onCancel) => { + return new CancellablePromise(async (resolve, _, onCancel) => { + let fullText = ''; let cancelled = false; onCancel(() => { cancelled = true; @@ -170,12 +176,13 @@ const streamSynthesisTask = ( readGeneratedAudio(); for await (const text of stream) { + fullText += text; if (cancelled) break; ttsStream.pushText(text); } ttsStream.flush(); ttsStream.endInput(); - resolve(); + resolve(fullText); }); }; diff --git a/agents/src/pipeline/pipeline_agent.ts b/agents/src/pipeline/pipeline_agent.ts index a966ece2..435490b6 100644 --- a/agents/src/pipeline/pipeline_agent.ts +++ b/agents/src/pipeline/pipeline_agent.ts @@ -520,8 +520,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter< // add it to the chat context for this new reply synthesis copiedCtx.messages.push( ChatMessage.create({ - // TODO(nbsp): uhhh unsure where to get the played text here - // text: playingSpeech.synthesisHandle.(theres no ttsForwarder here) + text: playingSpeech.synthesisHandle.text, role: ChatRole.ASSISTANT, }), ); @@ -620,8 +619,7 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter< } commitUserQuestionIfNeeded(); - // TODO(nbsp): what goes here - let collectedText = ''; + const collectedText = handle.synthesisHandle.text; const isUsingTools = handle.source instanceof LLMStream && !!handle.source.functionCalls.length; const extraToolsMessages = []; // additional messages from the functions to add to the context let interrupted = handle.interrupted; @@ -685,8 +683,6 @@ export class VoicePipelineAgent extends (EventEmitter as new () => TypedEmitter< const playHandle = answerSynthesis.play(); await playHandle.join().await; - // TODO(nbsp): what text goes here - collectedText = ''; interrupted = answerSynthesis.interrupted; newFunctionCalls = answerLLMStream.functionCalls; From 0c307f5cc7eb37576a40828c0cbcada11e787d27 Mon Sep 17 00:00:00 2001 From: Sam Trost Date: Wed, 18 Dec 2024 14:53:30 -0500 Subject: [PATCH 2/3] fix(tts): add missing crypto import to OpenAI tts (#216) Co-authored-by: aoife cassidy --- .changeset/eight-mugs-glow.md | 5 +++++ plugins/openai/src/tts.ts | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 .changeset/eight-mugs-glow.md diff --git a/.changeset/eight-mugs-glow.md b/.changeset/eight-mugs-glow.md new file mode 100644 index 00000000..174ea415 --- /dev/null +++ b/.changeset/eight-mugs-glow.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-openai": patch +--- + +fix(tts): add missing crypto import to OpenAI tts diff --git a/plugins/openai/src/tts.ts b/plugins/openai/src/tts.ts index 6ccd928f..e5dea369 100644 --- a/plugins/openai/src/tts.ts +++ b/plugins/openai/src/tts.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import { AudioByteStream, tts } from '@livekit/agents'; +import { randomUUID } from 'crypto'; import { OpenAI } from 'openai'; import type { TTSModels, TTSVoices } from './models.js'; @@ -81,7 +82,7 @@ export class ChunkedStream extends tts.ChunkedStream { async #run(stream: Promise) { const buffer = await stream.then((r) => r.arrayBuffer()); - const requestId = crypto.randomUUID(); + const requestId = randomUUID(); const audioByteStream = new AudioByteStream(OPENAI_TTS_SAMPLE_RATE, OPENAI_TTS_CHANNELS); const frames = audioByteStream.write(buffer); From e20d82aa19eff08940438601ccc50b82e49af9f9 Mon Sep 17 00:00:00 2001 From: Sam Trost Date: Wed, 11 Dec 2024 08:35:27 -0500 Subject: [PATCH 3/3] groq: add support for llama 3.3 70b (#209) Co-authored-by: aoife cassidy --- .changeset/moody-poems-juggle.md | 5 +++++ plugins/openai/src/models.ts | 1 + 2 files changed, 6 insertions(+) create mode 100644 .changeset/moody-poems-juggle.md diff --git a/.changeset/moody-poems-juggle.md b/.changeset/moody-poems-juggle.md new file mode 100644 index 00000000..a48d5185 --- /dev/null +++ b/.changeset/moody-poems-juggle.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-openai": patch +--- + +groq: add support for llama 3.3 70b diff --git a/plugins/openai/src/models.ts b/plugins/openai/src/models.ts index 3ae3f100..f064b0b1 100644 --- a/plugins/openai/src/models.ts +++ b/plugins/openai/src/models.ts @@ -53,6 +53,7 @@ export type GroqChatModels = | 'llama-3.1-405b-reasoning' | 'llama-3.1-70b-versatile' | 'llama-3.1-8b-instant' + | 'llama-3.3-70b-versatile' | 'llama3-groq-70b-8192-tool-use-preview' | 'llama3-groq-8b-8192-tool-use-preview' | 'llama-guard-3-8b'