diff --git a/app/client/api.ts b/app/client/api.ts index 7a242ea99dd..fe08a8e4378 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -63,6 +63,16 @@ export interface SpeechOptions { onController?: (controller: AbortController) => void; } +export interface TranscriptionOptions { + model?: "whisper-1"; + file: Blob; + language?: string; + prompt?: string; + response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt"; + temperature?: number; + onController?: (controller: AbortController) => void; +} + export interface ChatOptions { messages: RequestMessage[]; config: LLMConfig; @@ -98,6 +108,7 @@ export interface LLMModelProvider { export abstract class LLMApi { abstract chat(options: ChatOptions): Promise<void>; abstract speech(options: SpeechOptions): Promise<ArrayBuffer>; + abstract transcription(options: TranscriptionOptions): Promise<string>; abstract usage(): Promise<LLMUsage>; abstract models(): Promise<LLMModel[]>; } diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index 86229a14705..57f8bc3fd39 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -13,6 +13,7 @@ import { LLMApi, LLMModel, SpeechOptions, + TranscriptionOptions, MultimodalContent, } from "../api"; import Locale from "../../locales"; @@ -89,6 +90,10 @@ export class QwenApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts index 1a83bd53aa1..24f76f31d46 100644 --- a/app/client/platforms/anthropic.ts +++ b/app/client/platforms/anthropic.ts @@ -1,5 +1,11 @@ import { Anthropic, ApiPath } from "@/app/constant"; -import { ChatOptions, getHeaders, LLMApi, SpeechOptions } from "../api"; +import { + ChatOptions, + getHeaders, + LLMApi, + SpeechOptions, + TranscriptionOptions, +} from "../api"; import { useAccessStore, useAppConfig, @@ -77,6 +83,10 @@ export class ClaudeApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + extractMessage(res: any) { console.log("[Response] claude response: ", res); diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts index 2511a696b9b..a809183c077 100644 --- a/app/client/platforms/baidu.ts +++ b/app/client/platforms/baidu.ts @@ -15,6 +15,7 @@ import { LLMModel, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -81,6 +82,10 @@ export class ErnieApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ // "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function", diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts index 000a9e278db..25b45e4853e 100644 --- a/app/client/platforms/bytedance.ts +++ b/app/client/platforms/bytedance.ts @@ -14,6 +14,7 @@ import { LLMModel, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -83,6 +84,10 @@ export class DoubaoApi implements LLMApi { throw new Error("Method
not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts index 7265a500b97..40d49373e7f 100644 --- a/app/client/platforms/google.ts +++ b/app/client/platforms/google.ts @@ -6,6 +6,7 @@ import { LLMModel, LLMUsage, SpeechOptions, + TranscriptionOptions, } from "../api"; import { useAccessStore, @@ -68,6 +69,10 @@ export class GeminiProApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions): Promise<void> { const apiClient = this; let multimodal = false; diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts index 55a39d0ccca..fcff72ca607 100644 --- a/app/client/platforms/iflytek.ts +++ b/app/client/platforms/iflytek.ts @@ -13,6 +13,7 @@ import { LLMApi, LLMModel, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -64,6 +65,10 @@ export class SparkApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts index e0ef3494fe7..b32e4213ae0 100644 --- a/app/client/platforms/moonshot.ts +++ b/app/client/platforms/moonshot.ts @@ -20,6 +20,7 @@ import { LLMApi, LLMModel, SpeechOptions, + TranscriptionOptions, } from "../api"; import { getClientConfig } from "@/app/config/client"; import { getMessageTextContent } from "@/app/utils"; @@ -63,6 +64,10 @@ export class MoonshotApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts index 76bac59e876..b9443905ecf 100644 --- a/app/client/platforms/openai.ts +++ b/app/client/platforms/openai.ts @@ -34,6 +34,7 @@ import { LLMUsage, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { getClientConfig } from "@/app/config/client"; @@ -180,6 +181,47 @@ export class ChatGPTApi implements LLMApi { } } + async transcription(options: TranscriptionOptions): Promise<string> { + const formData = new FormData(); + formData.append("file", options.file, "audio.wav"); + formData.append("model", options.model ?? "whisper-1"); + if (options.language) formData.append("language", options.language); + if (options.prompt) formData.append("prompt", options.prompt); + if (options.response_format) + formData.append("response_format", options.response_format); + if (options.temperature !== undefined) + formData.append("temperature", options.temperature.toString()); + + console.log("[Request] openai audio transcriptions payload: ", options); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const path = this.path(OpenaiPath.TranscriptionPath); + const headers = getHeaders(true); + const payload = { + method: "POST", + body: formData, + signal: controller.signal, + headers: headers, + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + const res = await fetch(path, payload); + clearTimeout(requestTimeoutId); + const json = await res.json(); + return json.text; + } catch (e) { + console.log("[Request] failed to make an audio transcription request", e); + throw e; + } + } + async chat(options: ChatOptions) { const modelConfig = { ...useAppConfig.getState().modelConfig, diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts index 3610fac0a48..67ad64dcb48 100644 --- a/app/client/platforms/tencent.ts +++ b/app/client/platforms/tencent.ts @@ -9,6 +9,7 @@ import { LLMModel, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -93,6 +94,10 @@ export class HunyuanApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); const messages = options.messages.map((v, index) => ({ diff --git a/app/components/chat.module.scss b/app/components/chat.module.scss index 73542fc67f1..b5fd4dffd18 100644 --- a/app/components/chat.module.scss +++ b/app/components/chat.module.scss @@ -75,6 +75,14 @@ pointer-events: none; } + &.listening { + width: var(--full-width); + .text { + opacity: 1; + transform: translate(0); + } + } + &:hover { --delay: 0.5s; width: var(--full-width); diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 3d5b6a4f2c4..73ebda4ebae 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -10,6 +10,8 @@ import React, { } from "react"; import SendWhiteIcon from "../icons/send-white.svg"; +import VoiceOpenIcon from "../icons/vioce-open.svg"; +import VoiceCloseIcon from "../icons/vioce-close.svg"; import BrainIcon from "../icons/brain.svg"; import RenameIcon from "../icons/rename.svg"; import ExportIcon from "../icons/share.svg"; @@ -72,6 +74,7 @@ import { isDalle3, showPlugins, safeLocalStorage, + isFirefox, } from "../utils"; import { uploadImage as uploadImageRemote } from "@/app/utils/chat"; @@ -98,7 +101,9 @@ import { import { useNavigate } from "react-router-dom"; import { CHAT_PAGE_SIZE, + DEFAULT_STT_ENGINE, DEFAULT_TTS_ENGINE, + FIREFOX_DEFAULT_STT_ENGINE, ModelProvider, Path, REQUEST_TIMEOUT_MS, @@ -117,6 +122,7 @@ import { MultimodalContent } from "../client/api"; import { ClientApi } from "../client/api"; import { createTTSPlayer } from "../utils/audio"; +import { OpenAITranscriptionApi, WebTranscriptionApi } from "../utils/speech"; import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; import { isEmpty } from "lodash-es"; @@ -367,6 +373,7 @@ export function 
ChatAction(props: { text: string; icon: JSX.Element; onClick: () => void; + isListening?: boolean; }) { const iconRef = useRef(null); const textRef = useRef(null); @@ -388,7 +395,9 @@ export function ChatAction(props: { return (
{ props.onClick(); setTimeout(updateWidth, 1); @@ -549,6 +558,61 @@ export function ChatActions(props: { } }, [chatStore, currentModel, models]); + const [isListening, setIsListening] = useState(false); + const [isTranscription, setIsTranscription] = useState(false); + const [speechApi, setSpeechApi] = useState(null); + + useEffect(() => { + if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; + const lang = config.sttConfig.lang; + setSpeechApi( + config.sttConfig.engine !== DEFAULT_STT_ENGINE + ? new WebTranscriptionApi( + (transcription) => onRecognitionEnd(transcription), + lang, + ) + : new OpenAITranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); + }, []); + + function playSound(fileName: string) { + const audio = new Audio(fileName); + audio.play().catch((error) => { + console.error("error:", error); + }); + } + + const startListening = async () => { + playSound("/Recordingstart.mp3"); + showToast(Locale.Chat.StartSpeak); + if (speechApi) { + await speechApi.start(); + setIsListening(true); + document.getElementById("chat-input")?.focus(); + } + }; + const stopListening = async () => { + showToast(Locale.Chat.CloseSpeak); + if (speechApi) { + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(true); + await speechApi.stop(); + setIsListening(false); + } + playSound("/Recordingdone.mp3"); + document.getElementById("chat-input")?.focus(); + }; + const onRecognitionEnd = (finalTranscript: string) => { + console.log(finalTranscript); + if (finalTranscript) { + props.setUserInput((prevInput) => prevInput + finalTranscript); + } + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(false); + }; + return (
{couldStop && ( @@ -783,6 +847,17 @@ export function ChatActions(props: { icon={} /> )} + + {config.sttConfig.enable && ( + + isListening ? await stopListening() : await startListening() + } + text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak} + icon={isListening ? : } + isListening={isListening} + /> + )}
); } @@ -1508,7 +1583,7 @@ function _Chat() { setAttachImages(images); } - // 快捷键 shortcut keys + // 快捷键 const [showShortcutKeyModal, setShowShortcutKeyModal] = useState(false); useEffect(() => { diff --git a/app/components/markdown.tsx b/app/components/markdown.tsx index 9841a196d27..b1083054fad 100644 --- a/app/components/markdown.tsx +++ b/app/components/markdown.tsx @@ -193,7 +193,9 @@ function CustomCode(props: { children: any; className?: string }) { const renderShowMoreButton = () => { if (showToggle && enableCodeFold && collapsed) { return ( -
+
); diff --git a/app/components/settings.tsx b/app/components/settings.tsx index e2464481341..d04931148df 100644 --- a/app/components/settings.tsx +++ b/app/components/settings.tsx @@ -83,6 +83,7 @@ import { nanoid } from "nanoid"; import { useMaskStore } from "../store/mask"; import { ProviderType } from "../utils/cloud"; import { TTSConfigList } from "./tts-config"; +import { STTConfigList } from "./stt-config"; function EditPromptModal(props: { id: string; onClose: () => void }) { const promptStore = usePromptStore(); @@ -1719,6 +1720,17 @@ export function Settings() { /> + + { + const sttConfig = { ...config.sttConfig }; + updater(sttConfig); + config.update((config) => (config.sttConfig = sttConfig)); + }} + /> + +
diff --git a/app/components/stt-config.tsx b/app/components/stt-config.tsx new file mode 100644 index 00000000000..01a63cd5f5c --- /dev/null +++ b/app/components/stt-config.tsx @@ -0,0 +1,75 @@ +import { STTConfig, STTConfigValidator } from "../store"; + +import Locale from "../locales"; +import { ListItem, Select } from "./ui-lib"; +import { DEFAULT_STT_ENGINES, DEFAULT_STT_LANGUAGES } from "../constant"; +import { isFirefox } from "../utils"; + +export function STTConfigList(props: { + sttConfig: STTConfig; + updateConfig: (updater: (config: STTConfig) => void) => void; +}) { + return ( + <> + + + props.updateConfig( + (config) => (config.enable = e.currentTarget.checked), + ) + } + > + + + + + {props.sttConfig.engine === "Web Speech API" && !isFirefox() && ( + + + + )} + + ); +} diff --git a/app/components/stt.module.scss b/app/components/stt.module.scss new file mode 100644 index 00000000000..ba9f382e40b --- /dev/null +++ b/app/components/stt.module.scss @@ -0,0 +1,119 @@ +@import "../styles/animation.scss"; +.plugin-page { + height: 100%; + display: flex; + flex-direction: column; + + .plugin-page-body { + padding: 20px; + overflow-y: auto; + + .plugin-filter { + width: 100%; + max-width: 100%; + margin-bottom: 20px; + animation: slide-in ease 0.3s; + height: 40px; + + display: flex; + + .search-bar { + flex-grow: 1; + max-width: 100%; + min-width: 0; + outline: none; + } + + .search-bar:focus { + border: 1px solid var(--primary); + } + + .plugin-filter-lang { + height: 100%; + margin-left: 10px; + } + + .plugin-create { + height: 100%; + margin-left: 10px; + box-sizing: border-box; + min-width: 80px; + } + } + + .plugin-item { + display: flex; + justify-content: space-between; + padding: 20px; + border: var(--border-in-light); + animation: slide-in ease 0.3s; + + &:not(:last-child) { + border-bottom: 0; + } + + &:first-child { + border-top-left-radius: 10px; + border-top-right-radius: 10px; + } + + &:last-child { + border-bottom-left-radius: 10px; + border-bottom-right-radius: 10px; + } + + .plugin-header { + display: flex; + align-items: center; + + .plugin-icon { + display: flex; + align-items: center; + justify-content: center; + margin-right: 10px; + } + + .plugin-title { + .plugin-name { + font-size: 14px; + font-weight: bold; + } + .plugin-info { + font-size: 12px; + } + .plugin-runtime-warning { + font-size: 12px; + color: #f86c6c; + } + } + } + + .plugin-actions { + display: flex; + flex-wrap: nowrap; + transition: all ease 0.3s; + justify-content: center; + align-items: center; + } + + @media screen and (max-width: 600px) { + display: flex; + flex-direction: column; + padding-bottom: 10px; + border-radius: 10px; + margin-bottom: 20px; + box-shadow: var(--card-shadow); + + &:not(:last-child) { + border-bottom: var(--border-in-light); + } + + .plugin-actions { + width: 100%; + justify-content: space-between; + padding-top: 10px; + } + } + } + } +} diff --git a/app/constant.ts b/app/constant.ts index a06b8f05062..44b53ac4d51 100644 --- a/app/constant.ts +++ b/app/constant.ts @@ -150,6 +150,7 @@ export const Anthropic = { export const OpenaiPath = { ChatPath: "v1/chat/completions", SpeechPath: "v1/audio/speech", + TranscriptionPath: "v1/audio/transcriptions", ImagePath: "v1/images/generations", UsagePath: "dashboard/billing/usage", SubsPath: "dashboard/billing/subscription", @@ -270,6 +271,24 @@ export const DEFAULT_TTS_VOICES = [ "shimmer", ]; +export const DEFAULT_STT_ENGINE = "OpenAI Whisper"; +export const DEFAULT_STT_ENGINES = ["OpenAI Whisper", "Web Speech API"]; +export 
const DEFAULT_STT_LANGUAGE = "zh-CN"; +export const DEFAULT_STT_LANGUAGES = [ + "zh-CN", // Chinese (Simplified) + "en-US", // English + "fr-FR", // French + "de-DE", // German + "es-ES", // Spanish + "it-IT", // Italian + "ja-JP", // Japanese + "ko-KR", // Korean + "ru-RU", // Russian + "pt-BR", // Portuguese (Brazil) + "ar-SA", // Arabic +]; +export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper"; + const openaiModels = [ "gpt-3.5-turbo", "gpt-3.5-turbo-1106", diff --git a/app/icons/vioce-close.svg b/app/icons/vioce-close.svg new file mode 100644 index 00000000000..9d162e1690d --- /dev/null +++ b/app/icons/vioce-close.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/icons/vioce-open.svg b/app/icons/vioce-open.svg new file mode 100644 index 00000000000..0028a1d9b56 --- /dev/null +++ b/app/icons/vioce-open.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/locales/cn.ts b/app/locales/cn.ts index b7debe80514..ba783c33d52 100644 --- a/app/locales/cn.ts +++ b/app/locales/cn.ts @@ -92,8 +92,9 @@ const cn = { return inputHints + ",/ 触发补全,: 触发命令"; }, Send: "发送", - StartSpeak: "说话", - StopSpeak: "停止", + StartSpeak: "启用语音输入", + CloseSpeak: "关闭语音输入", + StopSpeak: "录音中....点击结束", Config: { Reset: "清除记忆", SaveAs: "存为面具", @@ -538,6 +539,16 @@ const cn = { SubTitle: "生成语音的速度", }, }, + STT: { + Enable: { + Title: "启用语音转文本", + SubTitle: "启用语音转文本", + }, + Engine: { + Title: "转换引擎", + SubTitle: "音频转换引擎", + }, + }, }, Store: { DefaultTopic: "新的聊天", diff --git a/app/locales/en.ts b/app/locales/en.ts index 5cc296f1efd..b532ac0e7e9 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -94,7 +94,8 @@ const en: LocaleType = { }, Send: "Send", StartSpeak: "Start Speak", - StopSpeak: "Stop Speak", + CloseSpeak: "Stop Speak", + StopSpeak: "Recording...", Config: { Reset: "Reset to Default", SaveAs: "Save as Mask", @@ -546,6 +547,16 @@ const en: LocaleType = { }, Engine: "TTS Engine", }, + STT: { + Enable: { + Title: "Enable STT", + SubTitle: "Enable Speech-to-Text", + }, + Engine: { + Title: "STT Engine", + SubTitle: "Speech-to-Text Engine", + }, + }, }, Store: { DefaultTopic: "New Conversation", diff --git a/app/store/config.ts b/app/store/config.ts index f14793c287a..cdf1d6e1bea 100644 --- a/app/store/config.ts +++ b/app/store/config.ts @@ -5,6 +5,9 @@ import { DEFAULT_INPUT_TEMPLATE, DEFAULT_MODELS, DEFAULT_SIDEBAR_WIDTH, + DEFAULT_STT_ENGINE, + DEFAULT_STT_ENGINES, + DEFAULT_STT_LANGUAGE, DEFAULT_TTS_ENGINE, DEFAULT_TTS_ENGINES, DEFAULT_TTS_MODEL, @@ -20,6 +23,7 @@ export type ModelType = (typeof DEFAULT_MODELS)[number]["name"]; export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number]; export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number]; export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number]; +export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number]; export enum SubmitKey { Enter = "Enter", @@ -83,19 +87,25 @@ export const DEFAULT_CONFIG = { }, ttsConfig: { - enable: false, + enable: true, autoplay: false, engine: DEFAULT_TTS_ENGINE, model: DEFAULT_TTS_MODEL, voice: DEFAULT_TTS_VOICE, speed: 1.0, }, + sttConfig: { + enable: true, + engine: DEFAULT_STT_ENGINE, + lang: DEFAULT_STT_LANGUAGE, + }, }; export type ChatConfig = typeof DEFAULT_CONFIG; export type ModelConfig = ChatConfig["modelConfig"]; export type TTSConfig = ChatConfig["ttsConfig"]; +export type STTConfig = ChatConfig["sttConfig"]; export function limitNumber( x: number, @@ -125,6 +135,12 @@ export const TTSConfigValidator = { }, }; +export const STTConfigValidator = { + engine(x: string) { + return x as STTEngineType; + }, +}; + export 
const ModalConfigValidator = { model(x: string) { return x as ModelType; diff --git a/app/utils/speech.ts b/app/utils/speech.ts new file mode 100644 index 00000000000..1e69f603186 --- /dev/null +++ b/app/utils/speech.ts @@ -0,0 +1,126 @@ +import { ChatGPTApi } from "../client/platforms/openai"; +import { getSTTLang } from "../locales"; +import { isFirefox } from "../utils"; + +export type TranscriptionCallback = (transcription: string) => void; + +export abstract class SpeechApi { + protected onTranscription: TranscriptionCallback = () => {}; + + abstract isListening(): boolean; + abstract start(): Promise<void>; + abstract stop(): Promise<void>; + + onTranscriptionReceived(callback: TranscriptionCallback) { + this.onTranscription = callback; + } +} + +export class OpenAITranscriptionApi extends SpeechApi { + private listeningStatus = false; + private mediaRecorder: MediaRecorder | null = null; + private stream: MediaStream | null = null; + private audioChunks: Blob[] = []; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback) { + super(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + } + + async start(): Promise<void> { + // @ts-ignore + navigator.getUserMedia = + // @ts-ignore + navigator.getUserMedia || + // @ts-ignore + navigator.webkitGetUserMedia || + // @ts-ignore + navigator.mozGetUserMedia || + // @ts-ignore + navigator.msGetUserMedia; + if (navigator.mediaDevices) { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + this.mediaRecorder = new MediaRecorder(stream); + this.mediaRecorder.ondataavailable = (e) => { + if (e.data && e.data.size > 0) { + this.audioChunks.push(e.data); + } + }; + + this.stream = stream; + } else { + console.warn("Media devices are only available in secure (HTTPS) contexts"); + return; + } + + this.audioChunks = []; + + // this.recorder.addEventListener("dataavailable", (event) => { + // this.audioChunks.push(event.data); + // }); + + this.mediaRecorder.start(1000); + this.listeningStatus = true; + } + + async stop(): Promise<void> { + if (!this.mediaRecorder || !this.listeningStatus) { + return; + } + + return new Promise<void>((resolve) => { + this.mediaRecorder!.addEventListener("stop", async () => { + const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" }); + const llm = new ChatGPTApi(); + const transcription = await llm.transcription({ file: audioBlob }); + this.onTranscription(transcription); + this.listeningStatus = false; + resolve(); + }); + + this.mediaRecorder!.stop(); + }); + } +} + +export class WebTranscriptionApi extends SpeechApi { + private listeningStatus = false; + private recognitionInstance: any | null = null; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback, lang?: string) { + super(); + if (isFirefox()) return; + const SpeechRecognition = + (window as any).SpeechRecognition || + (window as any).webkitSpeechRecognition; + this.recognitionInstance = new SpeechRecognition(); + this.recognitionInstance.continuous = true; + this.recognitionInstance.interimResults = true; + this.recognitionInstance.lang = lang ?? 
getSTTLang(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + this.recognitionInstance.onresult = (event: any) => { + const result = event.results[event.results.length - 1]; + if (result.isFinal) { + this.onTranscription(result[0].transcript); + } + }; + } + + async start(): Promise<void> { + this.listeningStatus = true; + await this.recognitionInstance.start(); + } + + async stop(): Promise<void> { + this.listeningStatus = false; + await this.recognitionInstance.stop(); + } +} diff --git a/public/Recordingdone.mp3 b/public/Recordingdone.mp3 new file mode 100644 index 00000000000..71d80c709e4 Binary files /dev/null and b/public/Recordingdone.mp3 differ diff --git a/public/Recordingstart.mp3 b/public/Recordingstart.mp3 new file mode 100644 index 00000000000..dedc4d152a2 Binary files /dev/null and b/public/Recordingstart.mp3 differ
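A minimal usage sketch of the two speech-to-text engines added in app/utils/speech.ts, showing how a caller outside the chat UI could drive them. The relative import path and the demo() wrapper are illustrative assumptions; the class names, constructor arguments, and start()/stop() behaviour come from the patch above.

import { OpenAITranscriptionApi, WebTranscriptionApi } from "../utils/speech";

// Illustrative wrapper (not part of the patch): run one record/transcribe cycle.
// Both engines report results through the same TranscriptionCallback.
async function demo(useWhisper: boolean) {
  const onText = (text: string) => console.log("transcript:", text);

  const stt = useWhisper
    ? // Records audio with MediaRecorder, then posts the blob to
      // v1/audio/transcriptions via ChatGPTApi.transcription() when stop() is called.
      new OpenAITranscriptionApi(onText)
    : // In-browser recognition via the Web Speech API; takes a BCP-47 language tag.
      new WebTranscriptionApi(onText, "en-US");

  await stt.start(); // begin recording / recognition
  // ...the user speaks...
  await stt.stop(); // the Whisper path resolves after the transcription response arrives
}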