Skip to content

Commit

Permalink
com.openai.unity 8.4.4
Browse files Browse the repository at this point in the history
- refactored AudioEndpoint speech requests
  - added AudioEndpoint.GetSpeechAsync
  - deprecated AudioEndpoint.CreateSpeechAsync
  - deprecated AudioEndpoint.CreateSpeechStreamAsync
- added SpeechClip response
- added Realtime.ResponseAudioResponse.AudioSamples
- updated all sample scenes with new OnAudioFilterRead examples for better playback
- updated unit tests
  • Loading branch information
StephenHodgson committed Nov 25, 2024
1 parent 2e54704 commit 79a919a
Show file tree
Hide file tree
Showing 15 changed files with 755 additions and 669 deletions.
33 changes: 20 additions & 13 deletions OpenAI/Packages/com.openai.unity/Documentation~/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ The recommended installation method is through the unity package manager and [Ope
- [List Models](#list-models)
- [Retrieve Models](#retrieve-model)
- [Delete Fine Tuned Model](#delete-fine-tuned-model)
- [Realtime](#realtime) :new:
- [Create Realtime Session](#create-realtime-session) :new:
- [Client Events](#client-events) :new:
- [Sending Client Events](#sending-client-events) :new:
- [Server Events](#server-events) :new:
- [Receiving Server Events](#receiving-server-events) :new:
- [Realtime](#realtime)
- [Create Realtime Session](#create-realtime-session)
- [Client Events](#client-events)
- [Sending Client Events](#sending-client-events)
- [Server Events](#server-events)
- [Receiving Server Events](#receiving-server-events)
- [Assistants](#assistants)
- [List Assistants](#list-assistants)
- [Create Assistant](#create-assistant)
Expand Down Expand Up @@ -118,7 +118,7 @@ The recommended installation method is through the unity package manager and [Ope
- [Streaming](#chat-streaming)
- [Tools](#chat-tools)
- [Vision](#chat-vision)
- [Audio](#chat-audio) :new:
- [Audio](#chat-audio)
- [Structured Outputs](#chat-structured-outputs)
- [Json Mode](#chat-json-mode)
- [Audio](#audio)
Expand Down Expand Up @@ -1555,6 +1555,7 @@ Debug.Log($"{result.FirstChoice.Message.Role}: {result.FirstChoice} | Finish Rea
#### [Chat Audio](https://platform.openai.com/docs/guides/audio)

```csharp
var api = new OpenAIClient();
var messages = new List<Message>
{
new Message(Role.System, "You are a helpful assistant."),
Expand Down Expand Up @@ -1662,9 +1663,9 @@ Generates audio from the input text.
```csharp
var api = new OpenAIClient();
var request = new SpeechRequest("Hello world!");
var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request);
audioSource.PlayOneShot(clip);
Debug.Log(path);
var speechClip = await api.AudioEndpoint.CreateSpeechAsync(request);
audioSource.PlayOneShot(speechClip);
Debug.Log(speechClip);
```

##### [Stream Speech]
Expand All @@ -1673,11 +1674,17 @@ Generate streamed audio from the input text.

```csharp
var api = new OpenAIClient();
var request = new SpeechRequest("Hello world!");
var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip));
Debug.Log(path);
var request = new SpeechRequest("Hello world!", responseFormat: SpeechResponseFormat.PCM);
var speechClip = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip =>
{
audioSource.PlayOneShot(partialClip);
});
Debug.Log(speechClip);
```

> [!NOTE]
> Checkout any of the demo scenes for best practices on how to handle playback with `OnAudioFilterRead`.
#### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)

Transcribes audio into the input language.
Expand Down
81 changes: 24 additions & 57 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/AudioEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
using System.Threading;
using System.Threading.Tasks;
using UnityEngine;
using UnityEngine.Networking;
using Utilities.WebRequestRest;

namespace OpenAI.Audio
Expand All @@ -27,25 +26,29 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { }

private static readonly object mutex = new();

/// <summary>
/// Generates audio from the input text.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>A <see cref="Tuple{T1,T2}"/> of the cached path and the generated <see cref="AudioClip"/>.</returns>
[Function("Generates audio from the input text.")]
[Obsolete("use GetSpeechAsync")]
public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default)
{
    // Call GetSpeechAsync directly with no partial-clip callback instead of routing
    // through CreateSpeechStreamAsync: the stream overload's adapter lambda would
    // dereference the null callback (NullReferenceException) on the first chunk.
    var result = await GetSpeechAsync(request, null, cancellationToken);
    return Tuple.Create(result.CachePath, result.AudioClip);
}

/// <summary>
/// Generates streaming audio from the input text.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="partialClipCallback">Optional, partial <see cref="AudioClip"/> callback used to stream audio.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>A <see cref="Tuple{T1,T2}"/> of the cached path and the generated <see cref="AudioClip"/>.</returns>
[Obsolete("use GetSpeechAsync")]
public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
{
    // Only wrap the caller's delegate when one was actually provided:
    // 1. the original adapter lambda invoked a possibly-null delegate, throwing
    //    NullReferenceException on the first streamed chunk, and
    // 2. passing a non-null adapter for a null caller callback would make
    //    GetSpeechAsync believe streaming was requested, tripping its
    //    PCM-only response-format validation for non-PCM requests.
    Action<SpeechClip> adapter = partialClipCallback == null
        ? null
        : speechClip => partialClipCallback.Invoke(speechClip.AudioClip);
    var result = await GetSpeechAsync(request, adapter, cancellationToken);
    return Tuple.Create(result.CachePath, result.AudioClip);
}

/// <summary>
/// Generates streaming audio from the input text.
/// Generates audio from the input text.
/// </summary>
/// <param name="request"><see cref="SpeechRequest"/>.</param>
/// <param name="partialClipCallback">Optional, partial <see cref="AudioClip"/> callback used to stream audio.</param>
/// <param name="partialClipCallback">Optional, partial <see cref="SpeechClip"/> callback used to stream audio.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="AudioClip"/> and the cached path.</returns>
[Function("Generates streaming audio from the input text.")]
public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
/// <returns><see cref="SpeechClip"/></returns>
[Function("Generates audio from the input text.")]
public async Task<SpeechClip> GetSpeechAsync(SpeechRequest request, Action<SpeechClip> partialClipCallback = null, CancellationToken cancellationToken = default)
{
if (partialClipCallback != null && request.ResponseFormat != SpeechResponseFormat.PCM)
{
Expand All @@ -70,52 +73,16 @@ public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechReques

Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);

if (request.ResponseFormat == SpeechResponseFormat.PCM)
{
var part = 0;
var response = await Rest.PostAsync(
GetUrl("/speech"),
payload,
StreamCallback,
eventChunkSize: 8192,
new RestParameters(client.DefaultRequestHeaders),
cancellationToken);
response.Validate(EnableDebug);
var samples = Utilities.Audio.PCMEncoder.Decode(response.Data);
await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(true);
return new Tuple<string, AudioClip>(cachedPath, AudioClip.Create(clipName, samples.Length, 1, 24000, false));

void StreamCallback(Response partialResponse)
{
var chunk = Utilities.Audio.PCMEncoder.Decode(partialResponse.Data);
var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false);

if (!partialClip.SetData(chunk, 0))
{
Debug.LogError("Failed to set pcm data to partial clip.");
return;
}

partialClipCallback?.Invoke(partialClip);
}
}
var part = 0;
var pcmResponse = await Rest.PostAsync(GetUrl("/speech"), payload, StreamCallback, 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
pcmResponse.Validate(EnableDebug);
await File.WriteAllBytesAsync(cachedPath, pcmResponse.Data, cancellationToken).ConfigureAwait(true);
return new SpeechClip(clipName, cachedPath, new ReadOnlyMemory<byte>(pcmResponse.Data));

var audioFormat = request.ResponseFormat switch
void StreamCallback(Response partialResponse)
{
SpeechResponseFormat.MP3 => AudioType.MPEG,
SpeechResponseFormat.WAV => AudioType.WAV,
_ => throw new NotSupportedException(request.ResponseFormat.ToString())
};

var clip = await Rest.DownloadAudioClipAsync(
GetUrl("/speech"),
audioFormat,
UnityWebRequest.kHttpVerbPOST,
clipName,
payload,
parameters: new RestParameters(client.DefaultRequestHeaders, debug: EnableDebug),
cancellationToken: cancellationToken);
return new Tuple<string, AudioClip>(cachedPath, clip);
partialClipCallback?.Invoke(new SpeechClip($"{clipName}_{++part}", null, partialResponse.Data));
}
}

/// <summary>
Expand Down
56 changes: 56 additions & 0 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System;
using UnityEngine;
using UnityEngine.Scripting;
using Utilities.Audio;

namespace OpenAI.Audio
{
    /// <summary>
    /// A generated speech result: raw PCM audio data, its cached file path,
    /// and helpers to convert the data into a Unity <see cref="UnityEngine.AudioClip"/>.
    /// </summary>
    [Preserve]
    public sealed class SpeechClip
    {
        // Unity's default output sample rate; AudioSamples are resampled to this rate.
        private const int unityOutputSampleRate = 44100;

        [Preserve]
        internal SpeechClip(string name, string cachePath, ReadOnlyMemory<byte> audioData, int sampleRate = 24000)
        {
            Name = name;
            CachePath = cachePath;
            AudioData = audioData;
            SampleRate = sampleRate;
        }

        /// <summary>The name of the clip.</summary>
        [Preserve]
        public string Name { get; }

        /// <summary>Path to the cached audio file on disk, if any.</summary>
        [Preserve]
        public string CachePath { get; }

        /// <summary>Raw PCM-encoded audio bytes at <see cref="SampleRate"/>.</summary>
        [Preserve]
        public ReadOnlyMemory<byte> AudioData { get; }

        // Cached decoded samples. AudioData is immutable, so decoding/resampling
        // once is safe and avoids repeating the expensive conversion on every
        // access (the AudioClip getter also goes through this property).
        private float[] cachedSamples;

        /// <summary>Decoded audio samples, resampled from <see cref="SampleRate"/> to 44100 Hz.</summary>
        [Preserve]
        public float[] AudioSamples
            => cachedSamples ??= PCMEncoder.Resample(PCMEncoder.Decode(AudioData.ToArray()), SampleRate, unityOutputSampleRate);

        /// <summary>The sample rate of <see cref="AudioData"/>.</summary>
        [Preserve]
        public int SampleRate { get; }

        /// <summary>Creates a mono <see cref="UnityEngine.AudioClip"/> at 44100 Hz from the decoded samples.</summary>
        [Preserve]
        public AudioClip AudioClip
        {
            get
            {
                var samples = AudioSamples;
                var clip = AudioClip.Create(Name, samples.Length, 1, unityOutputSampleRate, false);
                clip.SetData(samples, 0);
                return clip;
            }
        }

        [Preserve]
        public static implicit operator AudioClip(SpeechClip clip) => clip?.AudioClip;

        [Preserve]
        public static implicit operator string(SpeechClip clip) => clip?.CachePath;
    }
}
11 changes: 11 additions & 0 deletions OpenAI/Packages/com.openai.unity/Runtime/Audio/SpeechClip.cs.meta

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using Newtonsoft.Json;
using System;
using UnityEngine;
using UnityEngine.Scripting;
using Utilities.Audio;
Expand Down Expand Up @@ -72,6 +73,11 @@ internal ResponseAudioResponse(
[JsonProperty("delta")]
public string Delta { get; }

[Preserve]
[JsonIgnore]
public float[] AudioSamples
=> PCMEncoder.Resample(PCMEncoder.Decode(Convert.FromBase64String(Delta)), 24000, 44100);

[Preserve]
[JsonIgnore]
public bool IsDelta => Type.EndsWith("delta");
Expand All @@ -83,8 +89,8 @@ internal ResponseAudioResponse(
[Preserve]
public static implicit operator AudioClip(ResponseAudioResponse response)
{
var audioSamples = PCMEncoder.Decode(System.Convert.FromBase64String(response.Delta));
var audioClip = AudioClip.Create($"{response.ItemId}_{response.OutputIndex}_delta", audioSamples.Length, 1, 24000, false);
var audioSamples = response.AudioSamples;
var audioClip = AudioClip.Create($"{response.ItemId}_{response.OutputIndex}_delta", audioSamples.Length, 1, 44100, false);
audioClip.SetData(audioSamples, 0);
return audioClip;
}
Expand Down
Loading

0 comments on commit 79a919a

Please sign in to comment.