Add openai talk to LLM example (#48)
mat-hek authored Oct 31, 2024
1 parent dd221df commit bd988cb
Showing 3 changed files with 262 additions and 10 deletions.
118 changes: 118 additions & 0 deletions boombox_examples_data/talk_to_llm.html
@@ -0,0 +1,118 @@
<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta http-equiv="X-UA-Compatible" content="ie=edge">
  <title>Boombox talk to LLM demo</title>
</head>

<body
style="background-color: black; color: white; font-family: Arial, Helvetica, sans-serif; min-height: 100vh; margin: 0px; padding: 5px 0px 5px 0px">
<main>
<h1>Boombox talk to LLM demo</h1>
<div id="status">Connecting</div>
<audio id="audioPlayer"></audio>
</main>
<script>
const pcConfig = { iceServers: [{ urls: "stun:stun.l.google.com:19302" }] };
const mediaConstraints = { video: false, audio: true };

const proto = window.location.protocol === "https:" ? "wss:" : "ws:";
const wsBrowserToElixir = new WebSocket(`${proto}//${window.location.hostname}:8829`);
const connBrowserToElixirStatus = document.getElementById("status");
wsBrowserToElixir.onopen = (_) => start_connection_browser_to_elixir(wsBrowserToElixir);
wsBrowserToElixir.onclose = (event) => {
connBrowserToElixirStatus.innerHTML = "Disconnected";
console.log("WebSocket connection was terminated:", event);
};

const start_connection_browser_to_elixir = async (ws) => {
const localStream = await navigator.mediaDevices.getUserMedia(mediaConstraints);
const pc = new RTCPeerConnection(pcConfig);

pc.onicecandidate = (event) => {
if (event.candidate === null) return;
console.log("Sent ICE candidate:", event.candidate);
ws.send(JSON.stringify({ type: "ice_candidate", data: event.candidate }));
};

pc.onconnectionstatechange = () => {
if (pc.connectionState == "connected") {
const button = document.createElement("button");
button.innerHTML = "Disconnect";
button.onclick = () => {
ws.close();
localStream.getTracks().forEach((track) => track.stop());
};
connBrowserToElixirStatus.innerHTML = "Connected ";
connBrowserToElixirStatus.appendChild(button);
}
};

for (const track of localStream.getTracks()) {
pc.addTrack(track, localStream);
}

ws.onmessage = async (event) => {
const { type, data } = JSON.parse(event.data);

switch (type) {
case "sdp_answer":
console.log("Received SDP answer:", data);
await pc.setRemoteDescription(data);
break;
case "ice_candidate":
console.log("Recieved ICE candidate:", data);
await pc.addIceCandidate(data);
break;
}
};

const offer = await pc.createOffer();
await pc.setLocalDescription(offer);
console.log("Sent SDP offer:", offer);
ws.send(JSON.stringify({ type: "sdp_offer", data: offer }));
};

const audioPlayer = document.getElementById("audioPlayer");
const wsElixirToBrowser = new WebSocket(`${proto}//${window.location.hostname}:8830`);
wsElixirToBrowser.onopen = () => start_connection_elixir_to_browser(wsElixirToBrowser);
wsElixirToBrowser.onclose = (event) => console.log("WebSocket connection was terminated:", event);

const start_connection_elixir_to_browser = async (ws) => {
audioPlayer.srcObject = new MediaStream();

const pc = new RTCPeerConnection(pcConfig);
pc.ontrack = (event) => audioPlayer.srcObject.addTrack(event.track);
pc.onicecandidate = (event) => {
if (event.candidate === null) return;

console.log("Sent ICE candidate:", event.candidate);
ws.send(JSON.stringify({ type: "ice_candidate", data: event.candidate }));
};

ws.onmessage = async (event) => {
const { type, data } = JSON.parse(event.data);

switch (type) {
case "sdp_offer":
console.log("Received SDP offer:", data);
await pc.setRemoteDescription(data);
const answer = await pc.createAnswer();
await pc.setLocalDescription(answer);
ws.send(JSON.stringify({ type: "sdp_answer", data: answer }));
console.log("Sent SDP answer:", answer);
break;
case "ice_candidate":
console.log("Recieved ICE candidate:", data);
await pc.addIceCandidate(data);
}
};
};

</script>
</body>

</html>
138 changes: 136 additions & 2 deletions examples.livemd
@@ -9,7 +9,7 @@ System.put_env("PATH", "/opt/homebrew/bin:#{System.get_env("PATH")}")
# In case of problems installing Nx/EXLA/Bumblebee,
# you can remove them and the Nx backend config below.
# Examples that don't mention them should still work.
Mix.install([:boombox, :kino, :nx, :exla, :bumblebee])
Mix.install([:boombox, :kino, :nx, :exla, :bumblebee, :websockex])

Nx.global_default_backend(EXLA.Backend)
```
@@ -38,7 +38,7 @@ end
assets_url =
"https://raw.githubusercontent.com/membraneframework/boombox/master/boombox_examples_data"

for asset <- ["hls", "webrtc_from_browser", "webrtc_to_browser"],
for asset <- ["hls", "webrtc_from_browser", "webrtc_to_browser", "talk_to_llm"],
path = "#{data_dir}/#{asset}.html",
not File.exists?(path) do
%{status: 200, body: data} = Req.get!("#{assets_url}/#{asset}.html")
@@ -167,6 +167,140 @@ end)

<!-- livebook:{"branch_parent_index":0} -->

## Talk to ChatGPT

This example lets you have a natural voice conversation with ChatGPT. Boombox delivers the audio between the browser and the server over WebRTC, which is probably the best fit for this kind of low-latency, two-way audio.

The module below is a simple interface to the OpenAI realtime audio API. The API accepts PCM audio (1 channel, 24 kHz, s16le) and responds in the same format, so we don't need to do speech-to-text or text-to-speech ourselves. This keeps both the latency and the logic minimal.

If you prefer open-source solutions, there's [Ultravox](https://github.com/fixie-ai/ultravox); it accepts audio, but for now it outputs only text, so you'd need a separate TTS step. If there's anything else we should link here, please open a PR.
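
The `OpenAIWebSocket` module below relies on the API's default audio settings. If you ever need to pin the wire format explicitly, the realtime API also accepts a `session.update` event; the snippet below is only a sketch based on the OpenAI docs linked in the module (the event and field names are assumptions to verify against the current API version), not something this example requires.

```elixir
# Hedged sketch: explicitly request raw 16-bit PCM for both input and output.
# `session.update`, `input_audio_format` and `output_audio_format` are taken from
# the OpenAI realtime docs and may change between API versions.
configure_session = fn ws ->
  frame =
    Jason.encode!(%{
      type: "session.update",
      session: %{input_audio_format: "pcm16", output_audio_format: "pcm16"}
    })

  WebSockex.send_frame(ws, {:text, frame})
end
```

If you use it, call `configure_session.(ws)` right after `OpenAIWebSocket.start_link/1` in the cell further below.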

```elixir
defmodule OpenAIWebSocket do
  use WebSockex
  require Logger

  def start_link(opts) do
    # OpenAI API docs: https://platform.openai.com/docs/guides/realtime
    WebSockex.start_link(
      "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01",
      __MODULE__,
      %{response: <<>>},
      extra_headers: [
        {"Authorization", "Bearer " <> opts[:token]},
        {"OpenAI-Beta", "realtime=v1"}
      ]
    )
  end

  def send_audio(ws, audio) do
    audio = Base.encode64(audio)
    frame = %{type: "input_audio_buffer.append", audio: audio} |> Jason.encode!()
    WebSockex.send_frame(ws, {:text, frame})
  end

  def get_response_chunk(ws, chunk_byte_size) do
    # There's no 'call' in WebSockex, so we just send and receive
    send(ws, {:get_response_chunk, chunk_byte_size, self()})

    receive do
      {:response_chunk, chunk} -> chunk
    end
  end

  @impl true
  def handle_frame({:text, frame}, state) do
    case Jason.decode!(frame) do
      %{"type" => "response.audio.delta", "delta" => delta} ->
        audio_payload = Base.decode64!(delta)
        # Buffer the response audio
        response = state.response <> audio_payload
        {:ok, %{state | response: response}}

      %{"type" => "input_audio_buffer.speech_started"} ->
        # If the user speaks, they may interrupt the current response,
        # so we drop it and wait for a new one.
        {:ok, %{state | response: <<>>}}

      %{"type" => "response.audio_transcript.done", "transcript" => transcript} ->
        Logger.info("AI transcription: #{transcript}")
        {:ok, state}

      %{} = _event ->
        {:ok, state}
    end
  end

  @impl true
  def handle_frame(_frame, state), do: {:ok, state}

  @impl true
  def handle_info({:get_response_chunk, size, pid}, state) do
    case state.response do
      <<chunk::binary-size(size), rest::binary>> ->
        # If we have enough data, send it back
        send(pid, {:response_chunk, chunk})
        {:ok, %{state | response: rest}}

      chunk ->
        # Otherwise, send what we have, padded with silence
        silence = <<0::size(size - byte_size(chunk))-unit(8)>>
        send(pid, {:response_chunk, chunk <> silence})
        {:ok, %{state | response: <<>>}}
    end
  end
end
```

In the cell below, we receive the audio stream from the browser via WebRTC, feed it to the API, receive the response, and send it back to the browser. For this to work, you need to add your OpenAI API token as an `OPEN_AI_TOKEN` secret in Livebook. To connect via WebRTC, visit http://localhost:1234/talk_to_llm.html after running this cell.

```elixir
{:ok, ws} = OpenAIWebSocket.start_link(token: System.fetch_env!("LB_OPEN_AI_TOKEN"))

# Ingress part
Task.start_link(fn ->
  Boombox.run(
    # Connect to the browser via WebRTC, using WebSocket for session establishment
    input: {:webrtc, "ws://localhost:8829"},
    output: {
      :stream,
      # Audio format that the OpenAI API expects
      video: false, audio: :binary, audio_format: :s16le, audio_channels: 1, audio_rate: 24_000
    }
  )
  |> Enum.each(fn packet -> OpenAIWebSocket.send_audio(ws, packet.payload) end)
end)

# Egress part

# We send 20 millisecond chunks to Boombox
chunk_duration_ms = 20
# Samples per second * bytes per sample * chunk duration in seconds
chunk_byte_size = trunc(24_000 * 2 * chunk_duration_ms / 1_000)

Stream.interval(chunk_duration_ms)
# Stream.interval/1 emits consecutive integers (0, 1, 2, ...) every 20 ms;
# multiplying by the chunk duration turns them into timestamps in milliseconds (0, 20, 40, ...)
|> Stream.map(&(&1 * chunk_duration_ms))
|> Stream.map(fn time ->
  response_chunk = OpenAIWebSocket.get_response_chunk(ws, chunk_byte_size)

  %Boombox.Packet{
    payload: response_chunk,
    kind: :audio,
    pts: Membrane.Time.milliseconds(time),
    # Audio format that the OpenAI API outputs
    format: %{audio_format: :s16le, audio_channels: 1, audio_rate: 24_000}
  }
end)
|> Boombox.run(
  input: {:stream, audio: :binary, video: false},
  # Connect to the browser via WebRTC, using WebSocket for session establishment
  output: {:webrtc, "ws://localhost:8830"}
)
```
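
For reference, the chunk size used above works out to 960 bytes: 24_000 samples per second * 2 bytes per sample * 0.02 s. A tiny helper like the one below (just an illustration, assuming 16-bit mono PCM) makes the same arithmetic reusable for other sample rates or chunk durations:

```elixir
# bytes per chunk = sample rate [samples/s] * 2 [bytes/sample] * chunk duration [s]
chunk_bytes = fn sample_rate, chunk_ms -> trunc(sample_rate * 2 * chunk_ms / 1_000) end

chunk_bytes.(24_000, 20)
#=> 960
```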

<!-- livebook:{"branch_parent_index":0} -->

## Read speech audio from MP4 chunk-by-chunk, generate transcription

