Add openai talk to LLM example (#48)
mat-hek authored Oct 31, 2024
1 parent dd221df commit bd988cb
Showing 3 changed files with 262 additions and 10 deletions.
118 changes: 118 additions & 0 deletions boombox_examples_data/talk_to_llm.html
@@ -0,0 +1,118 @@
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta http-equiv="X-UA-Compatible" content="ie=edge">
  <title>Boombox talk to LLM demo</title>
</head>

<body
style="background-color: black; color: white; font-family: Arial, Helvetica, sans-serif; min-height: 100vh; margin: 0px; padding: 5px 0px 5px 0px">
<main>
<h1>Boombox talk to LLM demo</h1>
<div id="status">Connecting</div>
<audio id="audioPlayer"></audio>
</main>
<script>
const pcConfig = { iceServers: [{ urls: "stun:stun.l.google.com:19302" }] };
const mediaConstraints = { video: false, audio: true };

const proto = window.location.protocol === "https:" ? "wss:" : "ws:";
const wsBrowserToElixir = new WebSocket(`${proto}//${window.location.hostname}:8829`);
const connBrowserToElixirStatus = document.getElementById("status");
wsBrowserToElixir.onopen = (_) => start_connection_browser_to_elixir(wsBrowserToElixir);
wsBrowserToElixir.onclose = (event) => {
connBrowserToElixirStatus.innerHTML = "Disconnected";
console.log("WebSocket connection was terminated:", event);
};

const start_connection_browser_to_elixir = async (ws) => {
const localStream = await navigator.mediaDevices.getUserMedia(mediaConstraints);
const pc = new RTCPeerConnection(pcConfig);

pc.onicecandidate = (event) => {
if (event.candidate === null) return;
console.log("Sent ICE candidate:", event.candidate);
ws.send(JSON.stringify({ type: "ice_candidate", data: event.candidate }));
};

pc.onconnectionstatechange = () => {
if (pc.connectionState == "connected") {
const button = document.createElement("button");
button.innerHTML = "Disconnect";
button.onclick = () => {
ws.close();
localStream.getTracks().forEach((track) => track.stop());
};
connBrowserToElixirStatus.innerHTML = "Connected ";
connBrowserToElixirStatus.appendChild(button);
}
};

for (const track of localStream.getTracks()) {
pc.addTrack(track, localStream);
}

ws.onmessage = async (event) => {
const { type, data } = JSON.parse(event.data);

switch (type) {
case "sdp_answer":
console.log("Received SDP answer:", data);
await pc.setRemoteDescription(data);
break;
case "ice_candidate":
console.log("Recieved ICE candidate:", data);
await pc.addIceCandidate(data);
break;
}
};

const offer = await pc.createOffer();
await pc.setLocalDescription(offer);
console.log("Sent SDP offer:", offer);
ws.send(JSON.stringify({ type: "sdp_offer", data: offer }));
};

const audioPlayer = document.getElementById("audioPlayer");
const wsElixirToBrowser = new WebSocket(`${proto}//${window.location.hostname}:8830`);
wsElixirToBrowser.onopen = () => start_connection_elixir_to_browser(wsElixirToBrowser);
wsElixirToBrowser.onclose = (event) => console.log("WebSocket connection was terminated:", event);

const start_connection_elixir_to_browser = async (ws) => {
audioPlayer.srcObject = new MediaStream();

const pc = new RTCPeerConnection(pcConfig);
pc.ontrack = (event) => audioPlayer.srcObject.addTrack(event.track);
pc.onicecandidate = (event) => {
if (event.candidate === null) return;

console.log("Sent ICE candidate:", event.candidate);
ws.send(JSON.stringify({ type: "ice_candidate", data: event.candidate }));
};

ws.onmessage = async (event) => {
const { type, data } = JSON.parse(event.data);

switch (type) {
case "sdp_offer":
console.log("Received SDP offer:", data);
await pc.setRemoteDescription(data);
const answer = await pc.createAnswer();
await pc.setLocalDescription(answer);
ws.send(JSON.stringify({ type: "sdp_answer", data: answer }));
console.log("Sent SDP answer:", answer);
break;
case "ice_candidate":
console.log("Recieved ICE candidate:", data);
await pc.addIceCandidate(data);
}
};
};

</script>
</body>

</html>
138 changes: 136 additions & 2 deletions examples.livemd
@@ -9,7 +9,7 @@ System.put_env("PATH", "/opt/homebrew/bin:#{System.get_env("PATH")}")
# In case of problems installing Nx/EXLA/Bumblebee,
# you can remove them and the Nx backend config below.
# Examples that don't mention them should still work.
Mix.install([:boombox, :kino, :nx, :exla, :bumblebee])
Mix.install([:boombox, :kino, :nx, :exla, :bumblebee, :websockex])

Nx.global_default_backend(EXLA.Backend)
```
@@ -38,7 +38,7 @@ end
assets_url =
"https://raw.githubusercontent.com/membraneframework/boombox/master/boombox_examples_data"

for asset <- ["hls", "webrtc_from_browser", "webrtc_to_browser"],
for asset <- ["hls", "webrtc_from_browser", "webrtc_to_browser", "talk_to_llm"],
path = "#{data_dir}/#{asset}.html",
not File.exists?(path) do
%{status: 200, body: data} = Req.get!("#{assets_url}/#{asset}.html")
@@ -167,6 +167,140 @@ end)

<!-- livebook:{"branch_parent_index":0} -->

## Talk to ChatGPT

This example lets you have a natural voice conversation with ChatGPT. Boombox delivers the audio between the browser and the server over WebRTC, which is probably the best fit for this kind of low-latency, two-way audio.

The module below is a simple interface to the OpenAI realtime audio API. The API accepts PCM audio (1 channel, 24 kHz, s16le) and responds in the same format, so we don't need to do speech-to-text or text-to-speech ourselves. This keeps both the latency and the logic minimal.

If you prefer open-source solutions, there's [Ultravox](https://github.com/fixie-ai/ultravox); it accepts audio, but for now it outputs only text, so you'd need a separate TTS step. If there's anything else we should link here, please open a PR.
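
The `OpenAIWebSocket` module below relies on the API's default audio settings. If you ever need to pin the wire format explicitly, the realtime API also accepts a `session.update` event; the snippet below is only a sketch based on the OpenAI docs linked in the module (the event and field names are assumptions to verify against the current API version), not something this example requires.

```elixir
# Hedged sketch: explicitly request raw 16-bit PCM for both input and output.
# `session.update`, `input_audio_format` and `output_audio_format` are taken from
# the OpenAI realtime docs and may change between API versions.
configure_session = fn ws ->
  frame =
    Jason.encode!(%{
      type: "session.update",
      session: %{input_audio_format: "pcm16", output_audio_format: "pcm16"}
    })

  WebSockex.send_frame(ws, {:text, frame})
end
```

If you use it, call `configure_session.(ws)` right after `OpenAIWebSocket.start_link/1` in the cell further below.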

```elixir
defmodule OpenAIWebSocket do
  use WebSockex
  require Logger

  def start_link(opts) do
    # OpenAI API docs: https://platform.openai.com/docs/guides/realtime
    WebSockex.start_link(
      "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
      __MODULE__,
      %{response: <<>>},
      extra_headers: [
        {"Authorization", "Bearer " <> opts[:token]},
        {"OpenAI-Beta", "realtime=v1"}
      ]
    )
  end

  def send_audio(ws, audio) do
    audio = Base.encode64(audio)
    frame = %{type: "input_audio_buffer.append", audio: audio} |> Jason.encode!()
    WebSockex.send_frame(ws, {:text, frame})
  end

  def get_response_chunk(ws, chunk_byte_size) do
    # There's no 'call' in WebSockex, so we just send and receive
    send(ws, {:get_response_chunk, chunk_byte_size, self()})

    receive do
      {:response_chunk, chunk} -> chunk
    end
  end

  @impl true
  def handle_frame({:text, frame}, state) do
    case Jason.decode!(frame) do
      %{"type" => "response.audio.delta", "delta" => delta} ->
        audio_payload = Base.decode64!(delta)
        # Buffer the response audio
        response = state.response <> audio_payload
        {:ok, %{state | response: response}}

      %{"type" => "input_audio_buffer.speech_started"} ->
        # If the user speaks, they may interrupt the current response,
        # so we drop it and wait for a new one.
        {:ok, %{state | response: <<>>}}

      %{"type" => "response.audio_transcript.done", "transcript" => transcript} ->
        Logger.info("AI transcription: #{transcript}")
        {:ok, state}

      %{} = _event ->
        {:ok, state}
    end
  end

  @impl true
  def handle_frame(_frame, state), do: {:ok, state}

  @impl true
  def handle_info({:get_response_chunk, size, pid}, state) do
    case state.response do
      <<chunk::binary-size(size), rest::binary>> ->
        # If we have enough data, send it back
        send(pid, {:response_chunk, chunk})
        {:ok, %{state | response: rest}}

      chunk ->
        # Otherwise, send what we have, padded with silence
        silence = <<0::size(size - byte_size(chunk))-unit(8)>>
        send(pid, {:response_chunk, chunk <> silence})
        {:ok, %{state | response: <<>>}}
    end
  end
end
```

In the cell below, we receive the audio stream from the browser via WebRTC, feed it to the API, receive the response, and send it back to the browser. For this to work, you need to add your OpenAI API token as an `OPEN_AI_TOKEN` secret in Livebook. To connect via WebRTC, visit http://localhost:1234/talk_to_llm.html after running this cell.

```elixir
{:ok, ws} = OpenAIWebSocket.start_link(token: System.fetch_env!("LB_OPEN_AI_TOKEN"))

# Ingress part
Task.start_link(fn ->
  Boombox.run(
    # Connect to the browser via WebRTC, using WebSocket for session establishment
    input: {:webrtc, "ws://localhost:8829"},
    output: {
      :stream,
      # Audio format that the OpenAI API expects
      video: false, audio: :binary, audio_format: :s16le, audio_channels: 1, audio_rate: 24_000
    }
  )
  |> Enum.each(fn packet -> OpenAIWebSocket.send_audio(ws, packet.payload) end)
end)

# Egress part

# We send 20 millisecond chunks to Boombox
chunk_duration_ms = 20
# Samples per second * bytes per sample * chunk duration in seconds
chunk_byte_size = trunc(24_000 * 2 * chunk_duration_ms / 1_000)

Stream.interval(chunk_duration_ms)
# Stream.interval/1 emits consecutive integers (0, 1, 2, ...) every 20 ms;
# multiplying by the chunk duration turns them into timestamps in milliseconds (0, 20, 40, ...)
|> Stream.map(&(&1 * chunk_duration_ms))
|> Stream.map(fn time ->
  response_chunk = OpenAIWebSocket.get_response_chunk(ws, chunk_byte_size)

  %Boombox.Packet{
    payload: response_chunk,
    kind: :audio,
    pts: Membrane.Time.milliseconds(time),
    # Audio format that the OpenAI API outputs
    format: %{audio_format: :s16le, audio_channels: 1, audio_rate: 24_000}
  }
end)
|> Boombox.run(
  input: {:stream, audio: :binary, video: false},
  # Connect to the browser via WebRTC, using WebSocket for session establishment
  output: {:webrtc, "ws://localhost:8830"}
)
```
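
For reference, the chunk size used above works out to 960 bytes: 24_000 samples per second * 2 bytes per sample * 0.02 s. A tiny helper like the one below (just an illustration, assuming 16-bit mono PCM) makes the same arithmetic reusable for other sample rates or chunk durations:

```elixir
# bytes per chunk = sample rate [samples/s] * 2 [bytes/sample] * chunk duration [s]
chunk_bytes = fn sample_rate, chunk_ms -> trunc(sample_rate * 2 * chunk_ms / 1_000) end

chunk_bytes.(24_000, 20)
#=> 960
```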

<!-- livebook:{"branch_parent_index":0} -->

## Read speech audio from MP4 chunk-by-chunk, generate transcription

