Skip to content

Commit

Permalink
test(stt): add openai and deepgram tests (#205)
Browse files Browse the repository at this point in the history
  • Loading branch information
nbsp authored Dec 10, 2024
1 parent 3cab81d commit 8dfb5e6
Show file tree
Hide file tree
Showing 12 changed files with 197 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
lfs: true
- uses: pnpm/action-setup@v4
- name: Setup node
uses: actions/setup-node@v4
Expand All @@ -31,4 +33,5 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ELEVEN_API_KEY: ${{ secrets.ELEVEN_API_KEY }}
DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }}
run: pnpm test
6 changes: 6 additions & 0 deletions REUSE.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ SPDX-License-Identifier = "Apache-2.0"
path = ["**/*.onnx"]
SPDX-FileCopyrightText = "2024 Silero Team"
SPDX-License-Identifier = "CC-BY-NC-SA-4.0"

# testing files
[[annotations]]
path = ["**/.gitattributes", "**.wav"]
SPDX-FileCopyrightText = "2024 LiveKit, Inc."
SPDX-License-Identifier = "Apache-2.0"
2 changes: 2 additions & 0 deletions plugins/deepgram/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
},
"devDependencies": {
"@livekit/agents": "workspace:^x",
"@livekit/agents-plugin-silero": "workspace:^x",
"@livekit/agents-plugins-test": "workspace:^x",
"@livekit/rtc-node": "^0.12.1",
"@microsoft/api-extractor": "^7.35.0",
"@types/ws": "^8.5.10",
Expand Down
13 changes: 13 additions & 0 deletions plugins/deepgram/src/stt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { initializeLogger } from '@livekit/agents';
import { VAD } from '@livekit/agents-plugin-silero';
import { stt } from '@livekit/agents-plugins-test';
import { describe } from 'vitest';
import { STT } from './stt.js';

// Runs the shared STT suite against the Deepgram plugin. `nonStreaming: false`
// is forwarded to the suite so it can skip the one-shot recognition case.
describe('Deepgram', async () => {
  initializeLogger({ pretty: false });

  const deepgram = new STT();
  const vad = await VAD.load();
  const capabilities = { nonStreaming: false };

  await stt(deepgram, vad, capabilities);
});
1 change: 1 addition & 0 deletions plugins/openai/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
},
"devDependencies": {
"@livekit/agents": "workspace:^x",
"@livekit/agents-plugin-silero": "workspace:^x",
"@livekit/agents-plugins-test": "workspace:^x",
"@livekit/rtc-node": "^0.12.1",
"@microsoft/api-extractor": "^7.35.0",
Expand Down
11 changes: 11 additions & 0 deletions plugins/openai/src/stt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { VAD } from '@livekit/agents-plugin-silero';
import { stt } from '@livekit/agents-plugins-test';
import { describe } from 'vitest';
import { STT } from './stt.js';

// Runs the shared STT suite against the OpenAI plugin. `streaming: false` is
// forwarded to the suite, which also receives a VAD instance to work with.
describe('OpenAI', async () => {
  const openai = new STT();
  const vad = await VAD.load();
  const capabilities = { streaming: false };

  await stt(openai, vad, capabilities);
});
1 change: 1 addition & 0 deletions plugins/test/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.wav filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion plugins/test/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"README.md"
],
"scripts": {
"build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
"build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\" && cp src/long.wav dist/",
"lint": "eslint -f unix \"src/**/*.{ts,js}\""
},
"devDependencies": {
Expand Down
1 change: 1 addition & 0 deletions plugins/test/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
//
// SPDX-License-Identifier: Apache-2.0
export { tts } from './tts.js';
export { stt } from './stt.js';
3 changes: 3 additions & 0 deletions plugins/test/src/long.wav
Git LFS file not shown
146 changes: 146 additions & 0 deletions plugins/test/src/stt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import {
AudioByteStream,
type VAD,
initializeLogger,
mergeFrames,
stt as sttlib,
} from '@livekit/agents';
import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
import { distance } from 'fastest-levenshtein';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';

// Reference text that recognized output is compared against (see `validate`).
// NOTE(review): presumably the words spoken in `long.wav` — confirm against the
// fixture before editing either one.
const TRANSCRIPT =
'It could not have been ten seconds, and yet it seemed a long time that their hands were clasped together. ' +
'He had time to learn every detail of her hand. ' +
'He explored the long fingers, the shapely nails, the work-hardened palm with its row of callouses, the smooth flesh under the wrist. ' +
'Merely from feeling it he would have known it by sight. ' +
"In the same instant it occurred to him that he did not know what colour the girl's eyes were. " +
'They were probably brown, but people with dark hair sometimes had blue eyes. ' +
'To turn his head and look at her would have been inconceivable folly. ' +
'With hands locked together, invisible among the press of bodies, ' +
'they stared steadily in front of them, and instead of the eyes of the girl, the eyes of the aged prisoner gazed mournfully at Winston out of nests of hair.';

/**
 * Asserts that `text` is close enough to `transcript`.
 *
 * Both strings are lower-cased, every whitespace character is replaced by a
 * plain space, and the ends are trimmed. The Levenshtein distance between the
 * results, divided by the length of the normalized `text`, must not exceed
 * `threshold`.
 *
 * @param text - Recognized text under test.
 * @param transcript - Reference text to compare against.
 * @param threshold - Maximum allowed distance-to-length ratio.
 */
const validate = async (text: string, transcript: string, threshold: number) => {
  const normalize = (value: string) => value.toLowerCase().replace(/\s/g, ' ').trim();

  const hypothesis = normalize(text);
  const reference = normalize(transcript);
  const errorRate = distance(hypothesis, reference) / hypothesis.length;

  expect(errorRate).toBeLessThanOrEqual(threshold);
};

/**
 * Registers a shared vitest suite that exercises an STT implementation.
 *
 * Two cases are registered under `describe('STT')`:
 *  - a one-shot case calling `stt.recognize`, skipped when
 *    `supports.nonStreaming` is false;
 *  - a streaming case that uses `stt.stream()` when `supports.streaming` is
 *    true and otherwise wraps the STT in a `StreamAdapter` driven by `vad`.
 *
 * Each case runs once per sample rate and checks the recognized text against
 * `TRANSCRIPT` with a 0.2 distance-to-length threshold.
 *
 * @param stt - STT implementation under test.
 * @param vad - VAD used only for the `StreamAdapter` fallback.
 * @param supports - Capability flags; both default to `true`.
 */
export const stt = async (
  stt: sttlib.STT,
  vad: VAD,
  supports: Partial<{ streaming: boolean; nonStreaming: boolean }> = {},
) => {
  initializeLogger({ pretty: false });
  supports = { streaming: true, nonStreaming: true, ...supports };

  const SAMPLE_RATES = [24000, 44100];
  // The cases below really wait for the recognizer now, and the streaming case
  // sleeps 5 ms per pushed frame, so vitest's default per-test timeout is too
  // short. NOTE(review): 5 minutes is a generous guess; tune to the fixture.
  const TEST_TIMEOUT_MS = 5 * 60 * 1000;

  describe('STT', async () => {
    it.skipIf(!supports.nonStreaming)(
      'should properly transcribe speech',
      async () => {
        // A plain loop (not `forEach(async ...)`) so every assertion settles
        // before the test is reported as finished.
        for (const sampleRate of SAMPLE_RATES) {
          const frames = makeTestSpeech(sampleRate);
          const event = await stt.recognize(frames);
          const text = event.alternatives![0].text;
          await validate(text, TRANSCRIPT, 0.2);
          expect(event.type).toStrictEqual(sttlib.SpeechEventType.FINAL_TRANSCRIPT);
        }
      },
      TEST_TIMEOUT_MS,
    );

    it(
      'should properly stream transcribe speech',
      async () => {
        for (const sampleRate of SAMPLE_RATES) {
          const frames = makeTestSpeech(sampleRate, 10);
          const stream: sttlib.SpeechStream = supports.streaming
            ? stt.stream()
            : new sttlib.StreamAdapter(stt, vad).stream();

          // Producer: push every frame, pausing 5 ms between them, and close
          // the input only after the last frame has been pushed.
          const input = async () => {
            for (const frame of frames) {
              stream.pushFrame(frame);
              await new Promise((resolve) => setTimeout(resolve, 5));
            }
            stream.endInput();
          };

          // Consumer: collect final transcripts and check that speech
          // start/end events alternate correctly.
          const output = async () => {
            let text = '';
            let recvStart = false;
            let recvEnd = true;

            for await (const event of stream) {
              switch (event.type) {
                case sttlib.SpeechEventType.START_OF_SPEECH:
                  expect(recvEnd).toBeTruthy();
                  expect(recvStart).toBeFalsy();
                  recvEnd = false;
                  recvStart = true;
                  break;
                case sttlib.SpeechEventType.FINAL_TRANSCRIPT:
                  text += event.alternatives![0].text;
                  break;
                case sttlib.SpeechEventType.END_OF_SPEECH:
                  recvStart = false;
                  recvEnd = true;
                  break;
              }
            }

            await validate(text, TRANSCRIPT, 0.2);
          };

          // Invoke both coroutines and wait for them; passing the bare
          // function references to Promise.all would run nothing.
          await Promise.all([input(), output()]);
        }
      },
      TEST_TIMEOUT_MS,
    );
  });
};

/**
 * Loads `long.wav` (next to this module) and returns it as audio frames at
 * `targetSampleRate`.
 *
 * The file is parsed as a canonical 44-byte-header WAV: channel count at byte
 * 22, sample rate at byte 24, data size (bytes) at byte 40.
 * NOTE(review): samples are assumed to be 16-bit little-endian PCM and the
 * header is assumed to have no extra chunks — confirm for the fixture.
 *
 * @param targetSampleRate - Sample rate of the returned frames; the audio is
 *   resampled when the file's rate differs.
 * @param chunkDuration - Optional chunk length in milliseconds. When omitted
 *   (or 0) a single merged frame is returned.
 * @returns One merged frame, or the audio split into `chunkDuration` chunks.
 * @throws Error when the file does not look like a usable WAV file.
 */
const makeTestSpeech = (targetSampleRate: number, chunkDuration?: number): AudioFrame[] => {
  const HEADER_BYTES = 44; // start of WAVE data stream, in bytes
  const FRAME_DURATION = 1; // write 1s of audio at a time

  const sample = readFileSync(join(import.meta.dirname, './long.wav'));
  if (sample.length < HEADER_BYTES || sample.toString('ascii', 0, 4) !== 'RIFF') {
    // A tiny non-RIFF file here is typically an un-fetched Git LFS pointer.
    throw new Error('long.wav is not a WAV file (is the Git LFS object checked out?)');
  }

  const channels = sample.readUInt16LE(22);
  const sampleRate = sample.readUInt32LE(24);
  if (channels === 0 || sampleRate === 0) {
    throw new Error('long.wav has an invalid header (zero channels or sample rate)');
  }

  // Clamp the declared data size to what is actually in the file and keep it
  // a whole number of 16-bit samples.
  const declaredBytes = sample.readUInt32LE(40);
  const availableBytes = sample.length - HEADER_BYTES;
  const clampedBytes = Math.min(declaredBytes, availableBytes);
  const dataBytes = clampedBytes - (clampedBytes % 2);

  // Copy just the PCM payload. Slicing the backing ArrayBuffer honours the
  // Buffer's byteOffset and yields storage that starts at the first sample,
  // so indices below are sample indices (not byte offsets into the file).
  const dataStart = sample.byteOffset + HEADER_BYTES;
  const pcm = new Int16Array(sample.buffer.slice(dataStart, dataStart + dataBytes));

  // Interleaved samples per frame; a multiple of `channels` so that frames
  // never split a multi-channel sample.
  const samplesPerFrame = sampleRate * channels * FRAME_DURATION;
  let frames: AudioFrame[] = [];
  for (let offset = 0; offset < pcm.length; offset += samplesPerFrame) {
    const chunk = pcm.slice(offset, offset + samplesPerFrame);
    frames.push(
      new AudioFrame(chunk, sampleRate, channels, Math.trunc(chunk.length / channels)),
    );
  }

  if (sampleRate !== targetSampleRate) {
    const resampler = new AudioResampler(sampleRate, targetSampleRate, channels);
    const output: AudioFrame[] = [];
    for (const frame of frames) {
      output.push(...resampler.push(frame));
    }
    output.push(...resampler.flush());
    frames = output;
  }

  const merged = mergeFrames(frames);
  if (!chunkDuration) {
    return [merged];
  }

  // chunkDuration is in milliseconds; convert to samples per channel.
  const chunkSize = (targetSampleRate * chunkDuration) / 1000;
  const bstream = new AudioByteStream(targetSampleRate, channels, chunkSize);
  // NOTE(review): `write` is handed the frame's typed array as-is; confirm
  // AudioByteStream expects that rather than the underlying ArrayBuffer.
  frames = bstream.write(merged.data);
  frames.push(...bstream.flush());
  return frames;
};
9 changes: 9 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 8dfb5e6

Please sign in to comment.