Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test(stt): add openai and deepgram tests #205

Merged
merged 3 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
lfs: true
- uses: pnpm/action-setup@v4
- name: Setup node
uses: actions/setup-node@v4
Expand All @@ -31,4 +33,5 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ELEVEN_API_KEY: ${{ secrets.ELEVEN_API_KEY }}
DEEPGRAM_API_KEY: ${{ secrets.DEEPGRAM_API_KEY }}
run: pnpm test
6 changes: 6 additions & 0 deletions REUSE.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,9 @@ SPDX-License-Identifier = "Apache-2.0"
path = ["**/*.onnx"]
SPDX-FileCopyrightText = "2024 Silero Team"
SPDX-License-Identifier = "CC-BY-NC-SA-4.0"

# testing files
[[annotations]]
path = ["**/.gitattributes", "**.wav"]
SPDX-FileCopyrightText = "2024 LiveKit, Inc."
SPDX-License-Identifier = "Apache-2.0"
2 changes: 2 additions & 0 deletions plugins/deepgram/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
},
"devDependencies": {
"@livekit/agents": "workspace:^x",
"@livekit/agents-plugin-silero": "workspace:^x",
"@livekit/agents-plugins-test": "workspace:^x",
"@livekit/rtc-node": "^0.12.1",
"@microsoft/api-extractor": "^7.35.0",
"@types/ws": "^8.5.10",
Expand Down
13 changes: 13 additions & 0 deletions plugins/deepgram/src/stt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { initializeLogger } from '@livekit/agents';
import { VAD } from '@livekit/agents-plugin-silero';
import { stt } from '@livekit/agents-plugins-test';
import { describe } from 'vitest';
import { STT } from './stt.js';

// Runs the shared STT test suite against the Deepgram plugin.
describe('Deepgram', async () => {
  // The logger is initialized before the plugin instance is constructed.
  initializeLogger({ pretty: false });
  const recognizer = new STT();
  const vad = await VAD.load();
  // `nonStreaming: false` makes the shared suite skip its one-shot recognize test.
  await stt(recognizer, vad, { nonStreaming: false });
});
1 change: 1 addition & 0 deletions plugins/openai/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
},
"devDependencies": {
"@livekit/agents": "workspace:^x",
"@livekit/agents-plugin-silero": "workspace:^x",
"@livekit/agents-plugins-test": "workspace:^x",
"@livekit/rtc-node": "^0.12.1",
"@microsoft/api-extractor": "^7.35.0",
Expand Down
11 changes: 11 additions & 0 deletions plugins/openai/src/stt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { VAD } from '@livekit/agents-plugin-silero';
import { stt } from '@livekit/agents-plugins-test';
import { describe } from 'vitest';
import { STT } from './stt.js';

// Runs the shared STT test suite against the OpenAI plugin.
describe('OpenAI', async () => {
  const recognizer = new STT();
  const vad = await VAD.load();
  // `streaming: false` makes the shared suite wrap the STT in a VAD-backed StreamAdapter.
  await stt(recognizer, vad, { streaming: false });
});
1 change: 1 addition & 0 deletions plugins/test/.gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.wav filter=lfs diff=lfs merge=lfs -text
2 changes: 1 addition & 1 deletion plugins/test/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"README.md"
],
"scripts": {
"build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\"",
"build": "tsup --onSuccess \"tsc --declaration --emitDeclarationOnly\" && cp src/long.wav dist/",
"lint": "eslint -f unix \"src/**/*.{ts,js}\""
},
"devDependencies": {
Expand Down
1 change: 1 addition & 0 deletions plugins/test/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
//
// SPDX-License-Identifier: Apache-2.0
export { tts } from './tts.js';
export { stt } from './stt.js';
3 changes: 3 additions & 0 deletions plugins/test/src/long.wav
Git LFS file not shown
146 changes: 146 additions & 0 deletions plugins/test/src/stt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import {
AudioByteStream,
type VAD,
initializeLogger,
mergeFrames,
stt as sttlib,
} from '@livekit/agents';
import { AudioFrame, AudioResampler } from '@livekit/rtc-node';
import { distance } from 'fastest-levenshtein';
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';

// Reference transcript that recognized text is compared against in the STT tests below.
// NOTE(review): presumably the spoken content of `long.wav` — confirm against the audio file.
const TRANSCRIPT =
'It could not have been ten seconds, and yet it seemed a long time that their hands were clasped together. ' +
'He had time to learn every detail of her hand. ' +
'He explored the long fingers, the shapely nails, the work-hardened palm with its row of callouses, the smooth flesh under the wrist. ' +
'Merely from feeling it he would have known it by sight. ' +
"In the same instant it occurred to him that he did not know what colour the girl's eyes were. " +
'They were probably brown, but people with dark hair sometimes had blue eyes. ' +
'To turn his head and look at her would have been inconceivable folly. ' +
'With hands locked together, invisible among the press of bodies, ' +
'they stared steadily in front of them, and instead of the eyes of the girl, the eyes of the aged prisoner gazed mournfully at Winston out of nests of hair.';

/**
 * Asserts that `text` is close enough to `transcript`.
 *
 * Both strings are lowercased, have every whitespace character replaced by a
 * plain space, and are trimmed. The Levenshtein distance between them, divided
 * by the length of the normalized `text`, must not exceed `threshold`.
 *
 * @param text - recognized text to check
 * @param transcript - reference text
 * @param threshold - maximum allowed distance-to-length ratio
 */
const validate = async (text: string, transcript: string, threshold: number) => {
  const normalize = (raw: string): string => raw.toLowerCase().replace(/\s/g, ' ').trim();
  const hypothesis = normalize(text);
  const reference = normalize(transcript);
  const errorRatio = distance(hypothesis, reference) / hypothesis.length;
  expect(errorRatio).toBeLessThanOrEqual(threshold);
};

/**
 * Registers a shared vitest suite that exercises a speech-to-text implementation.
 *
 * Two tests are declared:
 * - a one-shot `recognize` test, skipped when `supports.nonStreaming` is false;
 * - a streaming test, which uses `stt.stream()` when `supports.streaming` is true
 *   and otherwise wraps the STT in a `StreamAdapter` driven by `vad`.
 *
 * Each test runs once per sample rate, sequentially, and every iteration is
 * awaited so that assertion failures are attributed to the test itself.
 *
 * @param stt - STT implementation under test
 * @param vad - VAD used by the `StreamAdapter` fallback
 * @param supports - capability flags; both default to `true`
 */
export const stt = async (
  stt: sttlib.STT,
  vad: VAD,
  supports: Partial<{ streaming: boolean; nonStreaming: boolean }> = {},
) => {
  initializeLogger({ pretty: false });
  supports = { streaming: true, nonStreaming: true, ...supports };
  const sampleRates = [24000, 44100];

  describe('STT', async () => {
    it.skipIf(!supports.nonStreaming)('should properly transcribe speech', async () => {
      // A plain loop (not `forEach` with an async callback) so each iteration is awaited.
      for (const sampleRate of sampleRates) {
        const frames = makeTestSpeech(sampleRate);
        const event = await stt.recognize(frames);
        const text = event.alternatives![0].text;
        await validate(text, TRANSCRIPT, 0.2);
        expect(event.type).toStrictEqual(sttlib.SpeechEventType.FINAL_TRANSCRIPT);
      }
    });

    // NOTE(review): this test now actually waits for the whole stream; the vitest
    // timeout may need raising depending on the length of the audio — confirm.
    it('should properly stream transcribe speech', async () => {
      for (const sampleRate of sampleRates) {
        const frames = makeTestSpeech(sampleRate, 10);
        const stream: sttlib.SpeechStream = supports.streaming
          ? stt.stream()
          : new sttlib.StreamAdapter(stt, vad).stream();

        // Feeds every frame with a short pause between them, then closes the input once.
        const input = async () => {
          for (const frame of frames) {
            stream.pushFrame(frame);
            await new Promise((resolve) => setTimeout(resolve, 5));
          }
          stream.endInput();
        };

        // Collects final transcripts and checks that start/end events alternate.
        const output = async () => {
          let text = '';
          let recvStart = false;
          let recvEnd = true;

          for await (const event of stream) {
            switch (event.type) {
              case sttlib.SpeechEventType.START_OF_SPEECH:
                expect(recvEnd).toBeTruthy();
                expect(recvStart).toBeFalsy();
                recvEnd = false;
                recvStart = true;
                break;
              case sttlib.SpeechEventType.FINAL_TRANSCRIPT:
                text += event.alternatives![0].text;
                break;
              case sttlib.SpeechEventType.END_OF_SPEECH:
                recvStart = false;
                recvEnd = true;
                break;
            }
          }

          await validate(text, TRANSCRIPT, 0.2);
        };

        // Invoke both tasks and await them; passing the bare functions would run nothing.
        await Promise.all([input(), output()]);
      }
    });
  });
};

/**
 * Loads `long.wav` from this module's directory and returns it as audio frames
 * at the requested sample rate.
 *
 * Assumes a 16-bit PCM WAV file with a canonical 44-byte header (channel count
 * at byte 22, sample rate at byte 24, data size at byte 40).
 *
 * @param targetSampleRate - sample rate of the returned frames; the audio is
 *   resampled when it differs from the file's own rate
 * @param chunkDuration - optional chunk length in milliseconds; when omitted
 *   (or 0) a single merged frame is returned
 * @returns one merged frame, or the audio split into `chunkDuration` chunks
 */
const makeTestSpeech = (targetSampleRate: number, chunkDuration?: number): AudioFrame[] => {
  const WAV_HEADER_BYTES = 44; // start of WAVE data stream, in bytes
  const BYTES_PER_SAMPLE = 2; // 16-bit PCM
  const FRAME_DURATION = 1; // write 1s of audio at a time

  const sample = readFileSync(join(import.meta.dirname, './long.wav'));
  const channels = sample.readUInt16LE(22);
  const sampleRate = sample.readUInt32LE(24);

  // Number of 16-bit samples in the data chunk, clamped to what the file actually holds.
  const declaredSamples = Math.trunc(sample.readUInt32LE(40) / BYTES_PER_SAMPLE);
  const storedSamples = Math.max(
    0,
    Math.trunc((sample.length - WAV_HEADER_BYTES) / BYTES_PER_SAMPLE),
  );
  const dataSize = Math.min(declaredSamples, storedSamples);

  // Decode the payload explicitly: this skips exactly the header bytes, honours the
  // Buffer's own offset into its backing store, and does not depend on host endianness.
  const buffer = new Int16Array(dataSize);
  for (let i = 0; i < dataSize; i++) {
    buffer[i] = sample.readInt16LE(WAV_HEADER_BYTES + i * BYTES_PER_SAMPLE);
  }

  const numSamples = sampleRate * FRAME_DURATION;
  let frames: AudioFrame[] = [];
  let written = 0; // index into `buffer`, in samples
  while (written < dataSize) {
    const frameSize = Math.min(numSamples, dataSize - written);
    frames.push(
      new AudioFrame(
        buffer.slice(written, written + frameSize),
        sampleRate,
        channels,
        Math.trunc(frameSize / channels),
      ),
    );
    written += frameSize;
  }

  if (sampleRate !== targetSampleRate) {
    const resampler = new AudioResampler(sampleRate, targetSampleRate, channels);
    const output: AudioFrame[] = [];
    for (const frame of frames) {
      output.push(...resampler.push(frame));
    }
    output.push(...resampler.flush());
    frames = output;
  }

  const merged = mergeFrames(frames);
  if (!chunkDuration) {
    return [merged];
  }

  // chunkDuration is in milliseconds; convert to samples per channel.
  const chunkSize = (targetSampleRate * chunkDuration) / 1000;
  const bstream = new AudioByteStream(targetSampleRate, channels, chunkSize);
  frames = bstream.write(merged.data);
  frames.push(...bstream.flush());
  return frames;
};
9 changes: 9 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading