From 8dfb5e6daaf2b94b50fe992299fb60d5714c5b7d Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Tue, 10 Dec 2024 23:44:00 +0200 Subject: [PATCH] test(stt): add openai and deepgram tests (#205) --- .github/workflows/test.yml | 3 + REUSE.toml | 6 ++ plugins/deepgram/package.json | 2 + plugins/deepgram/src/stt.test.ts | 13 +++ plugins/openai/package.json | 1 + plugins/openai/src/stt.test.ts | 11 +++ plugins/test/.gitattributes | 1 + plugins/test/package.json | 2 +- plugins/test/src/index.ts | 1 + plugins/test/src/long.wav | 3 + plugins/test/src/stt.ts | 146 +++++++++++++++++++++++++++++++ pnpm-lock.yaml | 9 ++ 12 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 plugins/deepgram/src/stt.test.ts create mode 100644 plugins/openai/src/stt.test.ts create mode 100644 plugins/test/.gitattributes create mode 100644 plugins/test/src/long.wav create mode 100644 plugins/test/src/stt.ts diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5885f14d..b0cc5094 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,6 +17,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + lfs: true - uses: pnpm/action-setup@v4 - name: Setup node uses: actions/setup-node@v4 @@ -31,4 +33,5 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ELEVEN_API_KEY: ${{ secrets.ELEVEN_API_KEY }} + DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }} run: pnpm test diff --git a/REUSE.toml b/REUSE.toml index 8c4b40cf..51b34b9b 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -30,3 +30,9 @@ SPDX-License-Identifier = "Apache-2.0" path = ["**/*.onnx"] SPDX-FileCopyrightText = "2024 Silero Team" SPDX-License-Identifier = "CC-BY-NC-SA-4.0" + +# testing files +[[annotations]] +path = ["**/.gitattributes", "**.wav"] +SPDX-FileCopyrightText = "2024 LiveKit, Inc." 
+SPDX-License-Identifier = "Apache-2.0" diff --git a/plugins/deepgram/package.json b/plugins/deepgram/package.json index 35721c2a..596bfc4a 100644 --- a/plugins/deepgram/package.json +++ b/plugins/deepgram/package.json @@ -31,6 +31,8 @@ }, "devDependencies": { "@livekit/agents": "workspace:^x", + "@livekit/agents-plugin-silero": "workspace:^x", + "@livekit/agents-plugins-test": "workspace:^x", "@livekit/rtc-node": "^0.12.1", "@microsoft/api-extractor": "^7.35.0", "@types/ws": "^8.5.10", diff --git a/plugins/deepgram/src/stt.test.ts b/plugins/deepgram/src/stt.test.ts new file mode 100644 index 00000000..96caf388 --- /dev/null +++ b/plugins/deepgram/src/stt.test.ts @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { initializeLogger } from '@livekit/agents'; +import { VAD } from '@livekit/agents-plugin-silero'; +import { stt } from '@livekit/agents-plugins-test'; +import { describe } from 'vitest'; +import { STT } from './stt.js'; + +describe('Deepgram', async () => { + initializeLogger({ pretty: false }); + await stt(new STT(), await VAD.load(), { nonStreaming: false }); +}); diff --git a/plugins/openai/package.json b/plugins/openai/package.json index 87674a07..deb80012 100644 --- a/plugins/openai/package.json +++ b/plugins/openai/package.json @@ -31,6 +31,7 @@ }, "devDependencies": { "@livekit/agents": "workspace:^x", + "@livekit/agents-plugin-silero": "workspace:^x", "@livekit/agents-plugins-test": "workspace:^x", "@livekit/rtc-node": "^0.12.1", "@microsoft/api-extractor": "^7.35.0", diff --git a/plugins/openai/src/stt.test.ts b/plugins/openai/src/stt.test.ts new file mode 100644 index 00000000..678cfdb4 --- /dev/null +++ b/plugins/openai/src/stt.test.ts @@ -0,0 +1,11 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { VAD } from '@livekit/agents-plugin-silero'; +import { stt } from '@livekit/agents-plugins-test'; +import { describe } from 'vitest'; +import { STT } from './stt.js'; + +describe('OpenAI', async () => { + await stt(new STT(), await VAD.load(), { streaming: false }); +}); diff --git a/plugins/test/.gitattributes b/plugins/test/.gitattributes new file mode 100644 index 00000000..d899f655 --- /dev/null +++ b/plugins/test/.gitattributes @@ -0,0 +1 @@ +*.wav filter=lfs diff=lfs merge=lfs -text diff --git a/plugins/test/package.json b/plugins/test/package.json index 5ba6ecf0..ee24fd9b 100644 --- a/plugins/test/package.json +++ b/plugins/test/package.json @@ -23,7 +23,7 @@ "README.md" ], "scripts": { - "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"", + "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\" && cp src/long.wav dist/", "lint": "eslint -f unix \"src/**/*.{ts,js}\"" }, "devDependencies": { diff --git a/plugins/test/src/index.ts b/plugins/test/src/index.ts index 04f7aaa5..94d6457d 100644 --- a/plugins/test/src/index.ts +++ b/plugins/test/src/index.ts @@ -2,3 +2,4 @@ // // SPDX-License-Identifier: Apache-2.0 export { tts } from './tts.js'; +export { stt } from './stt.js'; diff --git a/plugins/test/src/long.wav b/plugins/test/src/long.wav new file mode 100644 index 00000000..40cdb174 --- /dev/null +++ b/plugins/test/src/long.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab87c695f2525c7553a4f0044335ab9466576f1115a4299e3fac3d3f8d9b795 +size 2398244 diff --git a/plugins/test/src/stt.ts b/plugins/test/src/stt.ts new file mode 100644 index 00000000..998d1692 --- /dev/null +++ b/plugins/test/src/stt.ts @@ -0,0 +1,146 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+import {
+  AudioByteStream,
+  type VAD,
+  initializeLogger,
+  mergeFrames,
+  stt as sttlib,
+} from '@livekit/agents';
+import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
+import { distance } from 'fastest-levenshtein';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { describe, expect, it } from 'vitest';
+
+const TRANSCRIPT =
+  'It could not have been ten seconds, and yet it seemed a long time that their hands were clasped together. ' +
+  'He had time to learn every detail of her hand. ' +
+  'He explored the long fingers, the shapely nails, the work-hardened palm with its row of callouses, the smooth flesh under the wrist. ' +
+  'Merely from feeling it he would have known it by sight. ' +
+  "In the same instant it occurred to him that he did not know what colour the girl's eyes were. " +
+  'They were probably brown, but people with dark hair sometimes had blue eyes. ' +
+  'To turn his head and look at her would have been inconceivable folly. ' +
+  'With hands locked together, invisible among the press of bodies, ' +
+  'they stared steadily in front of them, and instead of the eyes of the girl, the eyes of the aged prisoner gazed mournfully at Winston out of nests of hair.';
+
+const SAMPLE_RATES = [24000, 44100]; // every test runs once per rate
+const TIMEOUT_MS = 5 * 60 * 1000; // NOTE(review): generous guess for full transcription; tune
+
+/** Asserts that `text` is within a relative edit distance of `threshold` from `transcript`. */
+const validate = async (text: string, transcript: string, threshold: number) => {
+  text = text.toLowerCase().replace(/\s/g, ' ').trim();
+  transcript = transcript.toLowerCase().replace(/\s/g, ' ').trim();
+  expect(distance(text, transcript) / text.length).toBeLessThanOrEqual(threshold);
+};
+
+/**
+ * Registers a vitest `STT` suite that transcribes the bundled clip with `stt` and checks the
+ * result against the reference transcript.
+ * @param stt - implementation under test
+ * @param vad - wraps `stt` in a `StreamAdapter` when `supports.streaming` is false
+ * @param supports - capabilities of `stt`; both flags default to true
+ */
+export const stt = async (
+  stt: sttlib.STT,
+  vad: VAD,
+  supports: Partial<{ streaming: boolean; nonStreaming: boolean }> = {},
+) => {
+  initializeLogger({ pretty: false });
+  supports = { streaming: true, nonStreaming: true, ...supports };
+  // for...of instead of forEach: every rate is awaited before the test resolves
+  const transcribe = async () => {
+    for (const sampleRate of SAMPLE_RATES) {
+      const frames = makeTestSpeech(sampleRate);
+      const event = await stt.recognize(frames);
+      const text = event.alternatives![0].text;
+      await validate(text, TRANSCRIPT, 0.2);
+      expect(event.type).toStrictEqual(sttlib.SpeechEventType.FINAL_TRANSCRIPT);
+    }
+  };
+
+  const streamTranscribe = async () => {
+    for (const sampleRate of SAMPLE_RATES) {
+      const frames = makeTestSpeech(sampleRate, 10);
+      const stream = supports.streaming
+        ? stt.stream()
+        : new sttlib.StreamAdapter(stt, vad).stream();
+
+      const input = async () => {
+        for (const frame of frames) {
+          stream.pushFrame(frame);
+          await new Promise((resolve) => setTimeout(resolve, 5));
+        }
+        stream.endInput(); // only after the last frame has been pushed
+      };
+
+      const output = async () => {
+        let text = '';
+        let recvStart = false;
+        let recvEnd = true;
+        for await (const event of stream) {
+          switch (event.type) {
+            case sttlib.SpeechEventType.START_OF_SPEECH:
+              expect(recvEnd).toBeTruthy();
+              expect(recvStart).toBeFalsy();
+              recvEnd = false;
+              recvStart = true;
+              break;
+            case sttlib.SpeechEventType.FINAL_TRANSCRIPT:
+              text += event.alternatives![0].text;
+              break;
+            case sttlib.SpeechEventType.END_OF_SPEECH:
+              recvStart = false;
+              recvEnd = true;
+          }
+        }
+        await validate(text, TRANSCRIPT, 0.2);
+      };
+      // call both tasks and await them; bare function references never run
+      await Promise.all([input(), output()]);
+    }
+  };
+
+  describe('STT', async () => {
+    it.skipIf(!supports.nonStreaming)('should properly transcribe speech', transcribe, TIMEOUT_MS);
+    it('should properly stream transcribe speech', streamTranscribe, TIMEOUT_MS);
+  });
+};
+
+/**
+ * Loads `long.wav` and returns its audio as frames at `targetSampleRate`.
+ * NOTE(review): assumes a canonical 44-byte header and 16-bit PCM; confirm for other clips.
+ * @param targetSampleRate - output rate; the clip is resampled when its own rate differs
+ * @param chunkDuration - frame length in ms; when omitted a single merged frame is returned
+ */
+const makeTestSpeech = (targetSampleRate: number, chunkDuration?: number): AudioFrame[] => {
+  const sample = readFileSync(join(import.meta.dirname, './long.wav'));
+  const channels = sample.readUInt16LE(22);
+  const sampleRate = sample.readUInt32LE(24);
+  const dataStart = sample.byteOffset + 44; // start of WAVE data stream, in bytes
+  const dataSize = Math.trunc(Math.min(sample.readUInt32LE(40), sample.length - 44) / 2);
+  // copy only the payload so sample indices start at 0 rather than inside the header
+  const buffer = new Int16Array(sample.buffer.slice(dataStart, dataStart + dataSize * 2));
+
+  const numSamples = sampleRate * channels; // 1s of interleaved audio per frame
+  let frames: AudioFrame[] = [];
+  for (let written = 0; written < buffer.length; written += numSamples) {
+    const pcm = buffer.slice(written, written + numSamples);
+    frames.push(new AudioFrame(pcm, sampleRate, channels, Math.trunc(pcm.length / channels)));
+  }
+
+  if (sampleRate !== targetSampleRate) {
+    const resampler = new AudioResampler(sampleRate, targetSampleRate, channels);
+    frames = [...frames.flatMap((frame) => resampler.push(frame)), ...resampler.flush()];
+  }
+
+  const merged = mergeFrames(frames);
+  if (!chunkDuration) {
+    return [merged];
+  }
+
+  const chunkSize = (targetSampleRate * chunkDuration) / 1000;
+  const bstream = new AudioByteStream(targetSampleRate, channels, chunkSize);
+  return [...bstream.write(merged.data), ...bstream.flush()];
+};
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cca7ed48..fea9bed6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -167,6 +167,12 @@ importers: '@livekit/agents': specifier: workspace:^x version: 
link:../../agents + '@livekit/agents-plugin-silero': + specifier: workspace:^x + version: link:../silero + '@livekit/agents-plugins-test': + specifier: workspace:^x + version: link:../test '@livekit/rtc-node': specifier: ^0.12.1 version: 0.12.1 @@ -229,6 +235,9 @@ importers: '@livekit/agents': specifier: workspace:^x version: link:../../agents + '@livekit/agents-plugin-silero': + specifier: workspace:^x + version: link:../silero '@livekit/agents-plugins-test': specifier: workspace:^x version: link:../test