diff --git a/app/client/api.ts b/app/client/api.ts index 7a242ea99dd..fe08a8e4378 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -63,6 +63,16 @@ export interface SpeechOptions { onController?: (controller: AbortController) => void; } +export interface TranscriptionOptions { + model?: "whisper-1"; + file: Blob; + language?: string; + prompt?: string; + response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt"; + temperature?: number; + onController?: (controller: AbortController) => void; +} + export interface ChatOptions { messages: RequestMessage[]; config: LLMConfig; @@ -98,6 +108,7 @@ export interface LLMModelProvider { export abstract class LLMApi { abstract chat(options: ChatOptions): Promise<void>; abstract speech(options: SpeechOptions): Promise<ArrayBuffer>; + abstract transcription(options: TranscriptionOptions): Promise<string>; abstract usage(): Promise<LLMUsage>; abstract models(): Promise<LLMModel[]>; } diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index 86229a14705..57f8bc3fd39 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -13,6 +13,7 @@ import { LLMApi, LLMModel, SpeechOptions, + TranscriptionOptions, MultimodalContent, } from "../api"; import Locale from "../../locales"; @@ -89,6 +90,10 @@ export class QwenApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts index 1a83bd53aa1..24f76f31d46 100644 --- a/app/client/platforms/anthropic.ts +++ b/app/client/platforms/anthropic.ts @@ -1,5 +1,11 @@ import { Anthropic, ApiPath } from "@/app/constant"; -import { ChatOptions, getHeaders, LLMApi, SpeechOptions } from "../api"; +import { + ChatOptions, + getHeaders, + LLMApi, + SpeechOptions, + TranscriptionOptions, +} from "../api"; import { useAccessStore, useAppConfig, @@ -77,6 +83,10 @@ export class ClaudeApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + extractMessage(res: any) { console.log("[Response] claude response: ", res); diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts index 2511a696b9b..a809183c077 100644 --- a/app/client/platforms/baidu.ts +++ b/app/client/platforms/baidu.ts @@ -15,6 +15,7 @@ import { LLMModel, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -81,6 +82,10 @@ export class ErnieApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ // "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function", diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts index 000a9e278db..25b45e4853e 100644 --- a/app/client/platforms/bytedance.ts +++ b/app/client/platforms/bytedance.ts @@ -14,6 +14,7 @@ import { LLMModel, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -83,6 +84,10 @@ export class DoubaoApi implements LLMApi { throw new Error("Method
not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts index 7265a500b97..40d49373e7f 100644 --- a/app/client/platforms/google.ts +++ b/app/client/platforms/google.ts @@ -6,6 +6,7 @@ import { LLMModel, LLMUsage, SpeechOptions, + TranscriptionOptions, } from "../api"; import { useAccessStore, @@ -68,6 +69,10 @@ export class GeminiProApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions): Promise<void> { const apiClient = this; let multimodal = false; diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts index 55a39d0ccca..fcff72ca607 100644 --- a/app/client/platforms/iflytek.ts +++ b/app/client/platforms/iflytek.ts @@ -13,6 +13,7 @@ import { LLMApi, LLMModel, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -64,6 +65,10 @@ export class SparkApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts index e0ef3494fe7..b32e4213ae0 100644 --- a/app/client/platforms/moonshot.ts +++ b/app/client/platforms/moonshot.ts @@ -20,6 +20,7 @@ import { LLMApi, LLMModel, SpeechOptions, + TranscriptionOptions, } from "../api"; import { getClientConfig } from "@/app/config/client"; import { getMessageTextContent } from "@/app/utils"; @@ -63,6 +64,10 @@ export class MoonshotApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts index 76bac59e876..b9443905ecf 100644 --- a/app/client/platforms/openai.ts +++ b/app/client/platforms/openai.ts @@ -34,6 +34,7 @@ import { LLMUsage, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { getClientConfig } from "@/app/config/client"; @@ -180,6 +181,47 @@ export class ChatGPTApi implements LLMApi { } } + async transcription(options: TranscriptionOptions): Promise<string> { + const formData = new FormData(); + formData.append("file", options.file, "audio.wav"); + formData.append("model", options.model ?? "whisper-1"); + if (options.language) formData.append("language", options.language); + if (options.prompt) formData.append("prompt", options.prompt); + if (options.response_format) + formData.append("response_format", options.response_format); + if (options.temperature !== undefined) + formData.append("temperature", options.temperature.toString()); + + console.log("[Request] openai audio transcriptions payload: ", options); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const path = this.path(OpenaiPath.TranscriptionPath); + const headers = getHeaders(true); + const payload = { + method: "POST", + body: formData, + signal: controller.signal, + headers: headers, + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + const res = await fetch(path, payload); + clearTimeout(requestTimeoutId); + const json = await res.json(); + return json.text; + } catch (e) { + console.log("[Request] failed to make an audio transcription request", e); + throw e; + } + } + async chat(options: ChatOptions) { const modelConfig = { ...useAppConfig.getState().modelConfig, diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts index 3610fac0a48..67ad64dcb48 100644 --- a/app/client/platforms/tencent.ts +++ b/app/client/platforms/tencent.ts @@ -9,6 +9,7 @@ import { LLMModel, MultimodalContent, SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -93,6 +94,10 @@ export class HunyuanApi implements LLMApi { throw new Error("Method not implemented."); } + transcription(options: TranscriptionOptions): Promise<string> { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); const messages = options.messages.map((v, index) => ({ diff --git a/app/components/chat.module.scss b/app/components/chat.module.scss index 73542fc67f1..b5fd4dffd18 100644 --- a/app/components/chat.module.scss +++ b/app/components/chat.module.scss @@ -75,6 +75,14 @@ pointer-events: none; } + &.listening { + width: var(--full-width); + .text { + opacity: 1; + transform: translate(0); + } + } + &:hover { --delay: 0.5s; width: var(--full-width); diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 3d5b6a4f2c4..73ebda4ebae 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -10,6 +10,8 @@ import React, { } from "react"; import SendWhiteIcon from "../icons/send-white.svg"; +import VoiceOpenIcon from "../icons/vioce-open.svg"; +import VoiceCloseIcon from "../icons/vioce-close.svg"; import BrainIcon from "../icons/brain.svg"; import RenameIcon from "../icons/rename.svg"; import ExportIcon from "../icons/share.svg"; @@ -72,6 +74,7 @@ import { isDalle3, showPlugins, safeLocalStorage, + isFirefox, } from "../utils"; import { uploadImage as uploadImageRemote } from "@/app/utils/chat"; @@ -98,7 +101,9 @@ import { import { useNavigate } from "react-router-dom"; import { CHAT_PAGE_SIZE, + DEFAULT_STT_ENGINE, DEFAULT_TTS_ENGINE, + FIREFOX_DEFAULT_STT_ENGINE, ModelProvider, Path, REQUEST_TIMEOUT_MS, @@ -117,6 +122,7 @@ import { MultimodalContent } from "../client/api"; import { ClientApi } from "../client/api"; import { createTTSPlayer } from "../utils/audio"; +import { OpenAITranscriptionApi, WebTranscriptionApi } from "../utils/speech"; import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; import { isEmpty } from "lodash-es"; @@ -367,6 +373,7 @@ export function 
ChatAction(props: { text: string; icon: JSX.Element; onClick: () => void; + isListening?: boolean; }) { const iconRef = useRef(null); const textRef = useRef(null); @@ -388,7 +395,9 @@ export function ChatAction(props: { return (
{ props.onClick(); setTimeout(updateWidth, 1); @@ -549,6 +558,61 @@ export function ChatActions(props: { } }, [chatStore, currentModel, models]); + const [isListening, setIsListening] = useState(false); + const [isTranscription, setIsTranscription] = useState(false); + const [speechApi, setSpeechApi] = useState(null); + + useEffect(() => { + if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; + const lang = config.sttConfig.lang; + setSpeechApi( + config.sttConfig.engine !== DEFAULT_STT_ENGINE + ? new WebTranscriptionApi( + (transcription) => onRecognitionEnd(transcription), + lang, + ) + : new OpenAITranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); + }, []); + + function playSound(fileName: string) { + const audio = new Audio(fileName); + audio.play().catch((error) => { + console.error("error:", error); + }); + } + + const startListening = async () => { + playSound("/Recordingstart.mp3"); + showToast(Locale.Chat.StartSpeak); + if (speechApi) { + await speechApi.start(); + setIsListening(true); + document.getElementById("chat-input")?.focus(); + } + }; + const stopListening = async () => { + showToast(Locale.Chat.CloseSpeak); + if (speechApi) { + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(true); + await speechApi.stop(); + setIsListening(false); + } + playSound("/Recordingdone.mp3"); + document.getElementById("chat-input")?.focus(); + }; + const onRecognitionEnd = (finalTranscript: string) => { + console.log(finalTranscript); + if (finalTranscript) { + props.setUserInput((prevInput) => prevInput + finalTranscript); + } + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(false); + }; + return (
{couldStop && ( @@ -783,6 +847,17 @@ export function ChatActions(props: { icon={} /> )} + + {config.sttConfig.enable && ( + + isListening ? await stopListening() : await startListening() + } + text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak} + icon={isListening ? : } + isListening={isListening} + /> + )}
); } @@ -1508,7 +1583,7 @@ function _Chat() { setAttachImages(images); } - // 快捷键 shortcut keys + // 快捷键 const [showShortcutKeyModal, setShowShortcutKeyModal] = useState(false); useEffect(() => { diff --git a/app/components/markdown.tsx b/app/components/markdown.tsx index 9841a196d27..b1083054fad 100644 --- a/app/components/markdown.tsx +++ b/app/components/markdown.tsx @@ -193,7 +193,9 @@ function CustomCode(props: { children: any; className?: string }) { const renderShowMoreButton = () => { if (showToggle && enableCodeFold && collapsed) { return ( -
+
); diff --git a/app/components/settings.tsx b/app/components/settings.tsx index e2464481341..d04931148df 100644 --- a/app/components/settings.tsx +++ b/app/components/settings.tsx @@ -83,6 +83,7 @@ import { nanoid } from "nanoid"; import { useMaskStore } from "../store/mask"; import { ProviderType } from "../utils/cloud"; import { TTSConfigList } from "./tts-config"; +import { STTConfigList } from "./stt-config"; function EditPromptModal(props: { id: string; onClose: () => void }) { const promptStore = usePromptStore(); @@ -1719,6 +1720,17 @@ export function Settings() { /> + + { + const sttConfig = { ...config.sttConfig }; + updater(sttConfig); + config.update((config) => (config.sttConfig = sttConfig)); + }} + /> + +
diff --git a/app/components/stt-config.tsx b/app/components/stt-config.tsx new file mode 100644 index 00000000000..01a63cd5f5c --- /dev/null +++ b/app/components/stt-config.tsx @@ -0,0 +1,75 @@ +import { STTConfig, STTConfigValidator } from "../store"; + +import Locale from "../locales"; +import { ListItem, Select } from "./ui-lib"; +import { DEFAULT_STT_ENGINES, DEFAULT_STT_LANGUAGES } from "../constant"; +import { isFirefox } from "../utils"; + +export function STTConfigList(props: { + sttConfig: STTConfig; + updateConfig: (updater: (config: STTConfig) => void) => void; +}) { + return ( + <> + + + props.updateConfig( + (config) => (config.enable = e.currentTarget.checked), + ) + } + > + + + + + {props.sttConfig.engine === "Web Speech API" && !isFirefox() && ( + + + + )} + + ); +} diff --git a/app/components/stt.module.scss b/app/components/stt.module.scss new file mode 100644 index 00000000000..ba9f382e40b --- /dev/null +++ b/app/components/stt.module.scss @@ -0,0 +1,119 @@ +@import "../styles/animation.scss"; +.plugin-page { + height: 100%; + display: flex; + flex-direction: column; + + .plugin-page-body { + padding: 20px; + overflow-y: auto; + + .plugin-filter { + width: 100%; + max-width: 100%; + margin-bottom: 20px; + animation: slide-in ease 0.3s; + height: 40px; + + display: flex; + + .search-bar { + flex-grow: 1; + max-width: 100%; + min-width: 0; + outline: none; + } + + .search-bar:focus { + border: 1px solid var(--primary); + } + + .plugin-filter-lang { + height: 100%; + margin-left: 10px; + } + + .plugin-create { + height: 100%; + margin-left: 10px; + box-sizing: border-box; + min-width: 80px; + } + } + + .plugin-item { + display: flex; + justify-content: space-between; + padding: 20px; + border: var(--border-in-light); + animation: slide-in ease 0.3s; + + &:not(:last-child) { + border-bottom: 0; + } + + &:first-child { + border-top-left-radius: 10px; + border-top-right-radius: 10px; + } + + &:last-child { + border-bottom-left-radius: 10px; + border-bottom-right-radius: 10px; + } + + .plugin-header { + display: flex; + align-items: center; + + .plugin-icon { + display: flex; + align-items: center; + justify-content: center; + margin-right: 10px; + } + + .plugin-title { + .plugin-name { + font-size: 14px; + font-weight: bold; + } + .plugin-info { + font-size: 12px; + } + .plugin-runtime-warning { + font-size: 12px; + color: #f86c6c; + } + } + } + + .plugin-actions { + display: flex; + flex-wrap: nowrap; + transition: all ease 0.3s; + justify-content: center; + align-items: center; + } + + @media screen and (max-width: 600px) { + display: flex; + flex-direction: column; + padding-bottom: 10px; + border-radius: 10px; + margin-bottom: 20px; + box-shadow: var(--card-shadow); + + &:not(:last-child) { + border-bottom: var(--border-in-light); + } + + .plugin-actions { + width: 100%; + justify-content: space-between; + padding-top: 10px; + } + } + } + } +} diff --git a/app/constant.ts b/app/constant.ts index a06b8f05062..44b53ac4d51 100644 --- a/app/constant.ts +++ b/app/constant.ts @@ -150,6 +150,7 @@ export const Anthropic = { export const OpenaiPath = { ChatPath: "v1/chat/completions", SpeechPath: "v1/audio/speech", + TranscriptionPath: "v1/audio/transcriptions", ImagePath: "v1/images/generations", UsagePath: "dashboard/billing/usage", SubsPath: "dashboard/billing/subscription", @@ -270,6 +271,24 @@ export const DEFAULT_TTS_VOICES = [ "shimmer", ]; +export const DEFAULT_STT_ENGINE = "OpenAI Whisper"; +export const DEFAULT_STT_ENGINES = ["OpenAI Whisper", "Web Speech API"]; +export 
const DEFAULT_STT_LANGUAGE = "zh-CN"; +export const DEFAULT_STT_LANGUAGES = [ + "zh-CN", // Chinese (Simplified) + "en-US", // English + "fr-FR", // French + "de-DE", // German + "es-ES", // Spanish + "it-IT", // Italian + "ja-JP", // Japanese + "ko-KR", // Korean + "ru-RU", // Russian + "pt-BR", // Portuguese (Brazil) + "ar-SA", // Arabic +]; +export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper"; + const openaiModels = [ "gpt-3.5-turbo", "gpt-3.5-turbo-1106", diff --git a/app/icons/vioce-close.svg b/app/icons/vioce-close.svg new file mode 100644 index 00000000000..9d162e1690d --- /dev/null +++ b/app/icons/vioce-close.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/icons/vioce-open.svg b/app/icons/vioce-open.svg new file mode 100644 index 00000000000..0028a1d9b56 --- /dev/null +++ b/app/icons/vioce-open.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/locales/cn.ts b/app/locales/cn.ts index b7debe80514..ba783c33d52 100644 --- a/app/locales/cn.ts +++ b/app/locales/cn.ts @@ -92,8 +92,9 @@ const cn = { return inputHints + ",/ 触发补全,: 触发命令"; }, Send: "发送", - StartSpeak: "说话", - StopSpeak: "停止", + StartSpeak: "启用语音输入", + CloseSpeak: "关闭语音输入", + StopSpeak: "录音中....点击结束", Config: { Reset: "清除记忆", SaveAs: "存为面具", @@ -538,6 +539,16 @@ const cn = { SubTitle: "生成语音的速度", }, }, + STT: { + Enable: { + Title: "启用语音转文本", + SubTitle: "启用语音转文本", + }, + Engine: { + Title: "转换引擎", + SubTitle: "音频转换引擎", + }, + }, }, Store: { DefaultTopic: "新的聊天", diff --git a/app/locales/en.ts b/app/locales/en.ts index 5cc296f1efd..b532ac0e7e9 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -94,7 +94,8 @@ const en: LocaleType = { }, Send: "Send", StartSpeak: "Start Speak", - StopSpeak: "Stop Speak", + CloseSpeak: "Stop Speak", + StopSpeak: "Recording...", Config: { Reset: "Reset to Default", SaveAs: "Save as Mask", @@ -546,6 +547,16 @@ const en: LocaleType = { }, Engine: "TTS Engine", }, + STT: { + Enable: { + Title: "Enable STT", + SubTitle: "Enable Speech-to-Text", + }, + Engine: { + Title: "STT Engine", + SubTitle: "Speech-to-Text Engine", + }, + }, }, Store: { DefaultTopic: "New Conversation", diff --git a/app/store/config.ts b/app/store/config.ts index f14793c287a..cdf1d6e1bea 100644 --- a/app/store/config.ts +++ b/app/store/config.ts @@ -5,6 +5,9 @@ import { DEFAULT_INPUT_TEMPLATE, DEFAULT_MODELS, DEFAULT_SIDEBAR_WIDTH, + DEFAULT_STT_ENGINE, + DEFAULT_STT_ENGINES, + DEFAULT_STT_LANGUAGE, DEFAULT_TTS_ENGINE, DEFAULT_TTS_ENGINES, DEFAULT_TTS_MODEL, @@ -20,6 +23,7 @@ export type ModelType = (typeof DEFAULT_MODELS)[number]["name"]; export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number]; export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number]; export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number]; +export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number]; export enum SubmitKey { Enter = "Enter", @@ -83,19 +87,25 @@ export const DEFAULT_CONFIG = { }, ttsConfig: { - enable: false, + enable: true, autoplay: false, engine: DEFAULT_TTS_ENGINE, model: DEFAULT_TTS_MODEL, voice: DEFAULT_TTS_VOICE, speed: 1.0, }, + sttConfig: { + enable: true, + engine: DEFAULT_STT_ENGINE, + lang: DEFAULT_STT_LANGUAGE, + }, }; export type ChatConfig = typeof DEFAULT_CONFIG; export type ModelConfig = ChatConfig["modelConfig"]; export type TTSConfig = ChatConfig["ttsConfig"]; +export type STTConfig = ChatConfig["sttConfig"]; export function limitNumber( x: number, @@ -125,6 +135,12 @@ export const TTSConfigValidator = { }, }; +export const STTConfigValidator = { + engine(x: string) { + return x as STTEngineType; + }, +}; + export 
const ModalConfigValidator = { model(x: string) { return x as ModelType; diff --git a/app/utils/speech.ts b/app/utils/speech.ts new file mode 100644 index 00000000000..1e69f603186 --- /dev/null +++ b/app/utils/speech.ts @@ -0,0 +1,126 @@ +import { ChatGPTApi } from "../client/platforms/openai"; +import { getSTTLang } from "../locales"; +import { isFirefox } from "../utils"; + +export type TranscriptionCallback = (transcription: string) => void; + +export abstract class SpeechApi { + protected onTranscription: TranscriptionCallback = () => {}; + + abstract isListening(): boolean; + abstract start(): Promise<void>; + abstract stop(): Promise<void>; + + onTranscriptionReceived(callback: TranscriptionCallback) { + this.onTranscription = callback; + } +} + +export class OpenAITranscriptionApi extends SpeechApi { + private listeningStatus = false; + private mediaRecorder: MediaRecorder | null = null; + private stream: MediaStream | null = null; + private audioChunks: Blob[] = []; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback) { + super(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + } + + async start(): Promise<void> { + // @ts-ignore + navigator.getUserMedia = + // @ts-ignore + navigator.getUserMedia || + // @ts-ignore + navigator.webkitGetUserMedia || + // @ts-ignore + navigator.mozGetUserMedia || + // @ts-ignore + navigator.msGetUserMedia; + if (navigator.mediaDevices) { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + this.mediaRecorder = new MediaRecorder(stream); + this.mediaRecorder.ondataavailable = (e) => { + if (e.data && e.data.size > 0) { + this.audioChunks.push(e.data); + } + }; + + this.stream = stream; + } else { + console.warn("Media devices are only available in secure (HTTPS) contexts"); + return; + } + + this.audioChunks = []; + + // this.recorder.addEventListener("dataavailable", (event) => { + // this.audioChunks.push(event.data); + // }); + + this.mediaRecorder.start(1000); + this.listeningStatus = true; + } + + async stop(): Promise<void> { + if (!this.mediaRecorder || !this.listeningStatus) { + return; + } + + return new Promise<void>((resolve) => { + this.mediaRecorder!.addEventListener("stop", async () => { + const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" }); + const llm = new ChatGPTApi(); + const transcription = await llm.transcription({ file: audioBlob }); + this.onTranscription(transcription); + this.listeningStatus = false; + resolve(); + }); + + this.mediaRecorder!.stop(); + }); + } +} + +export class WebTranscriptionApi extends SpeechApi { + private listeningStatus = false; + private recognitionInstance: any | null = null; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback, lang?: string) { + super(); + if (isFirefox()) return; + const SpeechRecognition = + (window as any).SpeechRecognition || + (window as any).webkitSpeechRecognition; + this.recognitionInstance = new SpeechRecognition(); + this.recognitionInstance.continuous = true; + this.recognitionInstance.interimResults = true; + this.recognitionInstance.lang = lang ?? 
getSTTLang(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + this.recognitionInstance.onresult = (event: any) => { + const result = event.results[event.results.length - 1]; + if (result.isFinal) { + this.onTranscription(result[0].transcript); + } + }; + } + + async start(): Promise<void> { + this.listeningStatus = true; + await this.recognitionInstance.start(); + } + + async stop(): Promise<void> { + this.listeningStatus = false; + await this.recognitionInstance.stop(); + } +} diff --git a/public/Recordingdone.mp3 b/public/Recordingdone.mp3 new file mode 100644 index 00000000000..71d80c709e4 Binary files /dev/null and b/public/Recordingdone.mp3 differ diff --git a/public/Recordingstart.mp3 b/public/Recordingstart.mp3 new file mode 100644 index 00000000000..dedc4d152a2 Binary files /dev/null and b/public/Recordingstart.mp3 differ
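A minimal usage sketch of the two speech-to-text engines added in app/utils/speech.ts, showing how a caller outside the chat UI could drive them. The relative import path and the demo() wrapper are illustrative assumptions; the class names, constructor arguments, and start()/stop() behaviour come from the patch above.

import { OpenAITranscriptionApi, WebTranscriptionApi } from "../utils/speech";

// Illustrative wrapper (not part of the patch): run one record/transcribe cycle.
// Both engines report results through the same TranscriptionCallback.
async function demo(useWhisper: boolean) {
  const onText = (text: string) => console.log("transcript:", text);

  const stt = useWhisper
    ? // Records audio with MediaRecorder, then posts the blob to
      // v1/audio/transcriptions via ChatGPTApi.transcription() when stop() is called.
      new OpenAITranscriptionApi(onText)
    : // In-browser recognition via the Web Speech API; takes a BCP-47 language tag.
      new WebTranscriptionApi(onText, "en-US");

  await stt.start(); // begin recording / recognition
  // ...the user speaks...
  await stt.stop(); // the Whisper path resolves after the transcription response arrives
}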