Skip to content

Commit

Permalink
Refactor audio processing and update UI settings
Browse files Browse the repository at this point in the history
This commit includes several changes to improve the audio processing and user interface. The default value of `autoLanguageDetection` has been changed to `false`. The logic for handling speaking state and silence or short pause has been refactored into separate methods. The `ProcessAudioStreamAsync` method has been updated to handle partial transcriptions and no longer displays a message when no audio is detected. The UI update delay in the `UpdateUI` method has been increased. The `TranscribeAudioAsync` method now uses a ternary operator for setting the language parameter. Lastly, the bindings for the `IsChecked` and `Text` properties in `MainWindow.xaml` have been updated.
  • Loading branch information
BoiHanny committed Mar 21, 2024
1 parent f34a5d6 commit cb8cf7b
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 76 deletions.
117 changes: 60 additions & 57 deletions vrcosc-magicchatbox/Classes/Modules/WhisperModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public partial class WhisperModuleSettings : ObservableObject
private string selectedSpeechToTextLanguage;

[ObservableProperty]
private bool autoLanguageDetection = true;
private bool autoLanguageDetection = false;

[ObservableProperty]
private int silenceAutoTurnOffDuration = 3000;
Expand Down Expand Up @@ -325,74 +325,75 @@ private void OnDataAvailable(object sender, WaveInEventArgs e)

if (isLoudEnough)
{
if (!isCurrentlySpeaking)
{
speakingStartedTimestamp = DateTime.Now;
isCurrentlySpeaking = true;
}

var speakingDuration = (DateTime.Now - speakingStartedTimestamp).TotalSeconds;
UpdateUI($"Speaking detected, recording... (Duration: {speakingDuration:0.0}s)", true);
audioStream.Write(e.Buffer, 0, e.BytesRecorded);
lastSoundTimestamp = DateTime.Now;
HandleSpeakingState(e);
}
else if (isCurrentlySpeaking)
else
{
var silenceDuration = DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds;
ProcessSilenceOrShortPause();
}
}

if (silenceDuration > 500 && silenceDuration <= Settings.SilenceAutoTurnOffDuration)
{
if (!isProcessingShortPause)
{
isProcessingShortPause = true;
// Offload to a background task since we can't await in this event handler
Task.Run(() => ProcessShortPauseAsync()).ContinueWith(_ =>
{
// Use Dispatcher.Invoke to ensure that the following actions are performed on the UI thread.
Application.Current.Dispatcher.Invoke(() =>
{
// Actions to take after processing the short pause, ensuring thread safety for UI operations
isProcessingShortPause = false;
// Any other UI updates or state changes that need to be made safely on the UI thread
});
});

}
}
else if (silenceDuration > Settings.SilenceAutoTurnOffDuration)
/// <summary>
/// Handles an incoming audio buffer while the input level is above the speech
/// threshold. Marks the start of a speaking session, flushes any residual audio
/// from a previous pause as a partial transcription, and appends the new samples
/// to the active stream.
/// </summary>
/// <param name="e">Raw audio buffer captured by the wave-in device.</param>
private void HandleSpeakingState(WaveInEventArgs e)
{
    if (!isCurrentlySpeaking)
    {
        speakingStartedTimestamp = DateTime.Now;
        isCurrentlySpeaking = true;

        if (audioStream.Length > 0)
        {
            // Residual audio from a previous pause: transcribe it as a partial
            // segment so continuity is not lost, then start a fresh stream.
            // Fire-and-forget is deliberate — we are inside a synchronous audio
            // callback and cannot await; the discard makes that explicit.
            // NOTE(review): exceptions thrown by this task are unobserved —
            // consider attaching a logging continuation.
            _ = ProcessAudioStreamAsync(audioStream, partial: true);
            audioStream = new MemoryStream();
        }
    }

    audioStream.Write(e.Buffer, 0, e.BytesRecorded);
    lastSoundTimestamp = DateTime.Now;

    var speakingDuration = (DateTime.Now - speakingStartedTimestamp).TotalSeconds;
    UpdateUI($"Speaking... Duration: {speakingDuration:0.0}s", true);
}

private async Task ProcessShortPauseAsync()
/// <summary>
/// Handles a buffer below the speech threshold. Distinguishes a short pause
/// (transcribe the accumulated audio as a partial segment and keep listening)
/// from prolonged silence (stop recording and auto-disable the STT session).
/// </summary>
private void ProcessSilenceOrShortPause()
{
    var silenceDuration = DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds;

    // Ignore very short gaps, or silence when nothing was being recorded.
    if (!isCurrentlySpeaking || silenceDuration < 500) return;

    if (silenceDuration <= Settings.SilenceAutoTurnOffDuration)
    {
        if (!isProcessingShortPause)
        {
            // Short pause: transcribe what we have without ending the session.
            isProcessingShortPause = true;
            // Fire-and-forget on purpose (synchronous caller); discard makes it
            // explicit. NOTE(review): task exceptions are unobserved.
            _ = ProcessAudioStreamAsync(audioStream, partial: true);
            audioStream = new MemoryStream(); // keep recording into a fresh stream

            // Debounce: allow the next partial transcription after 500 ms.
            // NOTE(review): the flag is flipped on a thread-pool thread; if
            // readers assume UI-thread affinity, marshal via the dispatcher.
            Task.Delay(500).ContinueWith(_ => isProcessingShortPause = false);
        }
    }
    else
    {
        // Prolonged silence: auto-disable the STT session.
        // (The earlier guard already ensured isCurrentlySpeaking, so no need
        // to re-test it here.)
        isCurrentlySpeaking = false;
        isProcessingShortPause = false; // clear any in-flight pause state
        StopRecording();
        audioStream = new MemoryStream(); // reset for a future session
        UpdateUI($"Silence detected for more than {Settings.SilenceAutoTurnOffDuration / 1000.0} seconds, auto-disabling STT session...", false);
    }
}






private async void UpdateUI(string message, bool isVisible)
{
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabelTxt = message;
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabel = isVisible;

if (!isVisible)
{
await Task.Delay(1200);
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabel = true;
await Task.Delay(2500);
App.Current.Dispatcher.Invoke(() =>
{
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabel = false;
Expand All @@ -408,26 +409,28 @@ private float CalculateMaxAmplitude(byte[] buffer, int bytesRecorded)
return samples.Max(sample => Math.Abs(sample / 32768f));
}

private async Task ProcessAudioStreamAsync(MemoryStream stream)
/// <summary>
/// Sends recorded audio to the transcription backend and raises
/// <see cref="TranscriptionReceived"/> with the result.
/// </summary>
/// <param name="stream">Audio data to transcribe; rewound to position 0 before upload.</param>
/// <param name="partial">
/// True when transcribing a mid-session segment (short pause). The shared
/// <c>audioStream</c> field is reset only after the final (non-partial) segment,
/// because partial callers swap in a fresh stream themselves.
/// </param>
private async Task ProcessAudioStreamAsync(MemoryStream stream, bool partial = false)
{
    // Nothing captured — skip silently (no UI message for empty buffers).
    if (stream.Length == 0) return;

    stream.Position = 0;
    UpdateUI(partial ? "Transcribing part of your speech..." : "Transcribing with OpenAI...", true);

    string transcription = await TranscribeAudioAsync(stream);
    if (!string.IsNullOrEmpty(transcription))
    {
        TranscriptionReceived?.Invoke(transcription);
        UpdateUI("Transcription complete.", false);
    }
    else
    {
        UpdateUI("Error transcribing audio.", false);
    }

    if (!partial)
    {
        // Reset the stream only when processing the final segment.
        audioStream = new MemoryStream();
    }
}


Expand All @@ -442,7 +445,7 @@ private async Task<string> TranscribeAudioAsync(Stream audioStream)
await audioStream.CopyToAsync(writer);
}

var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath, language: Settings.AutoLanguageDetection?null:Settings.SelectedSpeechToTextLanguage));
var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath, language: Settings.AutoLanguageDetection ? null:Settings.SelectedSpeechToTextLanguage));

return response;
}
Expand Down
38 changes: 19 additions & 19 deletions vrcosc-magicchatbox/MainWindow.xaml
Original file line number Diff line number Diff line change
Expand Up @@ -2195,7 +2195,7 @@
FontFamily="Comfortaa Light"
FontSize="18"
Foreground="#FFB9B5C1"
IsChecked="{Binding IntelliChatModule.Settings.AutolanguageSelection, Mode=TwoWay}"
IsChecked="{Binding WhisperModule.Settings.AutoLanguageDetection, Mode=TwoWay}"
Style="{DynamicResource SettingsCheckbox}">
Automatically detect language
</CheckBox>
Expand Down Expand Up @@ -2227,25 +2227,25 @@

<StackPanel Orientation="Horizontal">
<TextBlock
Width="auto"
Padding="3,10,0,0"
FontFamily="Albert Sans Thin"
FontSize="15"
Foreground="#FF9983AD"
Text="Disable after" />
Width="auto"
Padding="3,10,0,0"
FontFamily="Albert Sans Thin"
FontSize="15"
Foreground="#FF9983AD"
Text="Disable after" />
<TextBox
x:Name="SilenceAutoTurnOffDurationINT"
Width="auto"
Height="26"
Margin="4,3,3,0"
Padding="3,0,3,0"
HorizontalAlignment="Left"
VerticalContentAlignment="Center"
Background="#FF7B7195"
BorderThickness="0"
FontSize="15"
Foreground="#FF240E54"
Text="{Binding WhisperModule.Settings.SilenceAutoTurnOffDuration, Mode=TwoWay, UpdateSourceTrigger=PropertyChanged}" />
x:Name="SilenceAutoTurnOffDurationINT"
Width="auto"
Height="26"
Margin="4,3,3,0"
Padding="3,0,3,0"
HorizontalAlignment="Left"
VerticalContentAlignment="Center"
Background="#FF7B7195"
BorderThickness="0"
FontSize="15"
Foreground="#FF240E54"
Text="{Binding WhisperModule.Settings.SilenceAutoTurnOffDuration, Mode=TwoWay, UpdateSourceTrigger=PropertyChanged}" />
<TextBlock
Width="auto"
Padding="0,10,0,0"
Expand Down

0 comments on commit cb8cf7b

Please sign in to comment.