test(stt): add openai and deepgram tests (#205)

livekit · Dec 10, 2024 · 8dfb5e6 · 8dfb5e6
1 parent 3cab81d
commit 8dfb5e6
Show file tree

Hide file tree

Showing 12 changed files with 197 additions and 1 deletion.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -17,6 +17,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+        with:
+          lfs: true
       - uses: pnpm/action-setup@v4
       - name: Setup node
         uses: actions/setup-node@v4
@@ -31,4 +33,5 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ELEVEN_API_KEY: ${{ secrets.ELEVEN_API_KEY }}
+          DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }}
         run: pnpm test
diff --git a/REUSE.toml b/REUSE.toml
@@ -30,3 +30,9 @@ SPDX-License-Identifier = "Apache-2.0"
 path = ["**/*.onnx"]
 SPDX-FileCopyrightText = "2024 Silero Team"
 SPDX-License-Identifier = "CC-BY-NC-SA-4.0"
+
+# testing files
+[[annotations]]
+path = ["**/.gitattributes", "**.wav"]
+SPDX-FileCopyrightText = "2024 LiveKit, Inc."
+SPDX-License-Identifier = "Apache-2.0"
diff --git a/plugins/deepgram/package.json b/plugins/deepgram/package.json
@@ -31,6 +31,8 @@
   },
   "devDependencies": {
     "@livekit/agents": "workspace:^x",
+    "@livekit/agents-plugin-silero": "workspace:^x",
+    "@livekit/agents-plugins-test": "workspace:^x",
     "@livekit/rtc-node": "^0.12.1",
     "@microsoft/api-extractor": "^7.35.0",
     "@types/ws": "^8.5.10",

diff --git a/plugins/deepgram/src/stt.test.ts b/plugins/deepgram/src/stt.test.ts
@@ -0,0 +1,13 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { initializeLogger } from '@livekit/agents';
+import { VAD } from '@livekit/agents-plugin-silero';
+import { stt } from '@livekit/agents-plugins-test';
+import { describe } from 'vitest';
+import { STT } from './stt.js';
+
+// Runs the shared STT suite against the Deepgram plugin; the `nonStreaming: false` flag
+// makes the suite skip its one-shot recognition test.
+describe('Deepgram', async () => {
+  initializeLogger({ pretty: false });
+  const deepgram = new STT();
+  const vad = await VAD.load();
+  await stt(deepgram, vad, { nonStreaming: false });
+});
diff --git a/plugins/openai/package.json b/plugins/openai/package.json
@@ -31,6 +31,7 @@
   },
   "devDependencies": {
     "@livekit/agents": "workspace:^x",
+    "@livekit/agents-plugin-silero": "workspace:^x",
     "@livekit/agents-plugins-test": "workspace:^x",
     "@livekit/rtc-node": "^0.12.1",
     "@microsoft/api-extractor": "^7.35.0",

diff --git a/plugins/openai/src/stt.test.ts b/plugins/openai/src/stt.test.ts
@@ -0,0 +1,11 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { VAD } from '@livekit/agents-plugin-silero';
+import { stt } from '@livekit/agents-plugins-test';
+import { describe } from 'vitest';
+import { STT } from './stt.js';
+
+// Runs the shared STT suite against the OpenAI plugin; the `streaming: false` flag makes the
+// suite wrap the STT in a VAD-driven StreamAdapter for its streaming test.
+describe('OpenAI', async () => {
+  const openai = new STT();
+  const vad = await VAD.load();
+  await stt(openai, vad, { streaming: false });
+});
diff --git a/plugins/test/.gitattributes b/plugins/test/.gitattributes
@@ -0,0 +1 @@
+*.wav filter=lfs diff=lfs merge=lfs -text
diff --git a/plugins/test/package.json b/plugins/test/package.json
@@ -23,7 +23,7 @@
     "README.md"
   ],
   "scripts": {
-    "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
+    "build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\" && cp src/long.wav dist/",
     "lint": "eslint -f unix \"src/**/*.{ts,js}\""
   },
   "devDependencies": {

diff --git a/plugins/test/src/index.ts b/plugins/test/src/index.ts
@@ -2,3 +2,4 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 export { tts } from './tts.js';
+export { stt } from './stt.js';
diff --git a/plugins/test/src/long.wav b/plugins/test/src/long.wav
diff --git a/plugins/test/src/stt.ts b/plugins/test/src/stt.ts
@@ -0,0 +1,146 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import {
+  AudioByteStream,
+  type VAD,
+  initializeLogger,
+  mergeFrames,
+  stt as sttlib,
+} from '@livekit/agents';
+import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
+import { distance } from 'fastest-levenshtein';
+import { readFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { describe, expect, it } from 'vitest';
+
+const TRANSCRIPT =
+  'It could not have been ten seconds, and yet it seemed a long time that their hands were clasped together. ' +
+  'He had time to learn every detail of her hand. ' +
+  'He explored the long fingers, the shapely nails, the work-hardened palm with its row of callouses, the smooth flesh under the wrist. ' +
+  'Merely from feeling it he would have known it by sight. ' +
+  "In the same instant it occurred to him that he did not know what colour the girl's eyes were. " +
+  'They were probably brown, but people with dark hair sometimes had blue eyes. ' +
+  'To turn his head and look at her would have been inconceivable folly. ' +
+  'With hands locked together, invisible among the press of bodies, ' +
+  'they stared steadily in front of them, and instead of the eyes of the girl, the eyes of the aged prisoner gazed mournfully at Winston out of nests of hair.';
+
+/**
+ * Asserts that a recognised `text` is close enough to the reference `transcript`.
+ *
+ * Both strings are lower-cased, every whitespace character is replaced by a plain space and the
+ * ends are trimmed. The Levenshtein distance between them, divided by the length of the
+ * normalised `text`, must not exceed `threshold`.
+ */
+const validate = async (text: string, transcript: string, threshold: number) => {
+  const normalise = (raw: string) => raw.toLowerCase().replace(/\s/g, ' ').trim();
+  const heard = normalise(text);
+  const wanted = normalise(transcript);
+  const errorRate = distance(heard, wanted) / heard.length;
+  expect(errorRate).toBeLessThanOrEqual(threshold);
+};
+
+/**
+ * Registers a shared vitest suite that checks an STT implementation against the bundled recording.
+ *
+ * @param stt - STT implementation under test.
+ * @param vad - VAD handed to `StreamAdapter` when the STT is flagged as not supporting streaming.
+ * @param supports - Capability flags; both default to `true`. With `nonStreaming: false` the
+ * one-shot recognition test is skipped. With `streaming: false` the streaming test drives the STT
+ * through a `StreamAdapter` instead of `stt.stream()`.
+ */
+export const stt = async (
+  stt: sttlib.STT,
+  vad: VAD,
+  supports: Partial<{ streaming: boolean; nonStreaming: boolean }> = {},
+) => {
+  initializeLogger({ pretty: false });
+  supports = { streaming: true, nonStreaming: true, ...supports };
+
+  const sampleRates = [24000, 44100];
+  // The tests below genuinely wait for the provider to answer, so they are given an explicit,
+  // generous timeout (in milliseconds) rather than relying on the runner's default.
+  const timeout = 120000;
+
+  describe('STT', async () => {
+    it.skipIf(!supports.nonStreaming)(
+      'should properly transcribe speech',
+      async () => {
+        // A plain loop is required here: `forEach(async ...)` discards the returned promises, so
+        // the test would finish (and pass) before any assertion had a chance to run.
+        for (const sampleRate of sampleRates) {
+          const frames = makeTestSpeech(sampleRate);
+          const event = await stt.recognize(frames);
+          const text = event.alternatives![0].text;
+          await validate(text, TRANSCRIPT, 0.2);
+          expect(event.type).toStrictEqual(sttlib.SpeechEventType.FINAL_TRANSCRIPT);
+        }
+      },
+      timeout,
+    );
+
+    it(
+      'should properly stream transcribe speech',
+      async () => {
+        for (const sampleRate of sampleRates) {
+          const frames = makeTestSpeech(sampleRate, 10);
+          const stream: sttlib.SpeechStream = supports.streaming
+            ? stt.stream()
+            : new sttlib.StreamAdapter(stt, vad).stream();
+
+          // Feeds the audio in small chunks, then signals end of input exactly once after the
+          // last frame so the output loop below can terminate.
+          const input = async () => {
+            for (const frame of frames) {
+              stream.pushFrame(frame);
+              await new Promise((resolve) => setTimeout(resolve, 5));
+            }
+            stream.endInput();
+          };
+
+          // Consumes events, checks that speech start/end events alternate, and collects the
+          // final transcripts for comparison against the reference text.
+          const output = async () => {
+            const segments: string[] = [];
+            let recvStart = false;
+            let recvEnd = true;
+
+            for await (const event of stream) {
+              switch (event.type) {
+                case sttlib.SpeechEventType.START_OF_SPEECH:
+                  expect(recvEnd).toBeTruthy();
+                  expect(recvStart).toBeFalsy();
+                  recvEnd = false;
+                  recvStart = true;
+                  break;
+                case sttlib.SpeechEventType.FINAL_TRANSCRIPT:
+                  segments.push(event.alternatives![0].text);
+                  break;
+                case sttlib.SpeechEventType.END_OF_SPEECH:
+                  recvStart = false;
+                  recvEnd = true;
+                  break;
+              }
+            }
+
+            // Join with a space so the last word of one segment does not fuse with the first
+            // word of the next.
+            await validate(segments.join(' '), TRANSCRIPT, 0.2);
+          };
+
+          // Both coroutines must be invoked and awaited; passing the bare functions to
+          // Promise.all resolves immediately without running either of them.
+          await Promise.all([input(), output()]);
+        }
+      },
+      timeout,
+    );
+  });
+};
+
+/**
+ * Loads `long.wav` from next to this module and returns it as audio frames at `targetSampleRate`.
+ *
+ * NOTE(review): assumes a canonical 44-byte RIFF/WAVE header followed by 16-bit PCM samples —
+ * confirm if the fixture is ever replaced.
+ *
+ * @param targetSampleRate - Sample rate of the returned frames; the audio is resampled when the
+ * file's own rate differs.
+ * @param chunkDuration - Optional chunk length in milliseconds. When omitted (or 0) a single
+ * merged frame is returned; otherwise the audio is split into chunks of that duration.
+ * @returns The frames making up the whole recording.
+ */
+const makeTestSpeech = (targetSampleRate: number, chunkDuration?: number): AudioFrame[] => {
+  const sample = readFileSync(join(import.meta.dirname, './long.wav'));
+  const channels = sample.readUInt16LE(22);
+  const sampleRate = sample.readUInt32LE(24);
+
+  const HEADER_BYTES = 44; // start of WAVE data stream
+  const BYTES_PER_SAMPLE = 2;
+  // Never trust the declared data size beyond what the file actually contains.
+  const dataBytes = Math.min(sample.readUInt32LE(40), sample.length - HEADER_BYTES);
+  const totalSamples = Math.max(0, Math.floor(dataBytes / BYTES_PER_SAMPLE));
+
+  // Copy exactly the PCM payload into its own buffer. This honours the Buffer's byteOffset into
+  // its backing ArrayBuffer and keeps the header offset in bytes, rather than using it as an
+  // index into 16-bit samples.
+  const pcmStart = sample.byteOffset + HEADER_BYTES;
+  const pcm = new Int16Array(
+    sample.buffer.slice(pcmStart, pcmStart + totalSamples * BYTES_PER_SAMPLE),
+  );
+
+  const FRAME_DURATION = 1; // write 1s of audio at a time
+  // Interleaved samples per frame; a multiple of `channels` so no sample frame is split.
+  const samplesPerFrame = sampleRate * channels * FRAME_DURATION;
+  let frames: AudioFrame[] = [];
+  for (let offset = 0; offset < totalSamples; offset += samplesPerFrame) {
+    const frameSize = Math.min(samplesPerFrame, totalSamples - offset);
+    frames.push(
+      new AudioFrame(
+        pcm.slice(offset, offset + frameSize),
+        sampleRate,
+        channels,
+        Math.trunc(frameSize / channels),
+      ),
+    );
+  }
+
+  if (sampleRate !== targetSampleRate) {
+    const resampler = new AudioResampler(sampleRate, targetSampleRate, channels);
+    const output = [];
+    for (const frame of frames) {
+      output.push(...resampler.push(frame));
+    }
+    // Drain whatever the resampler is still holding after the last input frame.
+    output.push(...resampler.flush());
+    frames = output;
+  }
+
+  const merged = mergeFrames(frames);
+  if (!chunkDuration) {
+    return [merged];
+  }
+
+  // chunkDuration is in milliseconds, so this is the chunk length in samples.
+  const chunkSize = (targetSampleRate * chunkDuration) / 1000;
+  const bstream = new AudioByteStream(targetSampleRate, channels, chunkSize);
+  frames = bstream.write(merged.data);
+  frames.push(...bstream.flush());
+  return frames;
+};
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml