Skip to content

Commit

Permalink
Refactor audio processing and update UI settings
Browse files Browse the repository at this point in the history
This commit includes several changes to improve the audio processing and user interface. The default value of `autoLanguageDetection` has been changed to `false`. The logic for handling speaking state and silence or short pause has been refactored into separate methods. The `ProcessAudioStreamAsync` method has been updated to handle partial transcriptions and no longer displays a message when no audio is detected. The UI update delay in the `UpdateUI` method has been increased. The `TranscribeAudioAsync` method now uses a ternary operator for setting the language parameter. Lastly, the bindings for the `IsChecked` and `Text` properties in `MainWindow.xaml` have been updated.
  • Loading branch information
BoiHanny committed Mar 21, 2024
1 parent f34a5d6 commit cb8cf7b
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 76 deletions.
117 changes: 60 additions & 57 deletions vrcosc-magicchatbox/Classes/Modules/WhisperModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public partial class WhisperModuleSettings : ObservableObject
private string selectedSpeechToTextLanguage;

[ObservableProperty]
private bool autoLanguageDetection = true;
private bool autoLanguageDetection = false;

[ObservableProperty]
private int silenceAutoTurnOffDuration = 3000;
Expand Down Expand Up @@ -325,74 +325,75 @@ private void OnDataAvailable(object sender, WaveInEventArgs e)

if (isLoudEnough)
{
if (!isCurrentlySpeaking)
{
speakingStartedTimestamp = DateTime.Now;
isCurrentlySpeaking = true;
}

var speakingDuration = (DateTime.Now - speakingStartedTimestamp).TotalSeconds;
UpdateUI($"Speaking detected, recording... (Duration: {speakingDuration:0.0}s)", true);
audioStream.Write(e.Buffer, 0, e.BytesRecorded);
lastSoundTimestamp = DateTime.Now;
HandleSpeakingState(e);
}
else if (isCurrentlySpeaking)
else
{
var silenceDuration = DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds;
ProcessSilenceOrShortPause();
}
}

if (silenceDuration > 500 && silenceDuration <= Settings.SilenceAutoTurnOffDuration)
{
if (!isProcessingShortPause)
{
isProcessingShortPause = true;
// Offload to a background task since we can't await in this event handler
Task.Run(() => ProcessShortPauseAsync()).ContinueWith(_ =>
{
// Use Dispatcher.Invoke to ensure that the following actions are performed on the UI thread.
Application.Current.Dispatcher.Invoke(() =>
{
// Actions to take after processing the short pause, ensuring thread safety for UI operations
isProcessingShortPause = false;
// Any other UI updates or state changes that need to be made safely on the UI thread
});
});

}
}
else if (silenceDuration > Settings.SilenceAutoTurnOffDuration)
/// <summary>
/// Handles an incoming audio buffer while the input level is above the speech
/// threshold. Marks the start of a speaking session, flushes any residual audio
/// from a previous pause as a partial transcription, and appends the new samples
/// to the active stream.
/// </summary>
/// <param name="e">Raw audio buffer captured by the wave-in device.</param>
private void HandleSpeakingState(WaveInEventArgs e)
{
    if (!isCurrentlySpeaking)
    {
        speakingStartedTimestamp = DateTime.Now;
        isCurrentlySpeaking = true;

        if (audioStream.Length > 0)
        {
            // Residual audio from a previous pause: transcribe it as a partial
            // segment so continuity is not lost, then start a fresh stream.
            // Fire-and-forget is deliberate — we are inside a synchronous audio
            // callback and cannot await; the discard makes that explicit.
            // NOTE(review): exceptions thrown by this task are unobserved —
            // consider attaching a logging continuation.
            _ = ProcessAudioStreamAsync(audioStream, partial: true);
            audioStream = new MemoryStream();
        }
    }

    audioStream.Write(e.Buffer, 0, e.BytesRecorded);
    lastSoundTimestamp = DateTime.Now;

    var speakingDuration = (DateTime.Now - speakingStartedTimestamp).TotalSeconds;
    UpdateUI($"Speaking... Duration: {speakingDuration:0.0}s", true);
}

private async Task ProcessShortPauseAsync()
/// <summary>
/// Handles a buffer below the speech threshold. Distinguishes a short pause
/// (transcribe the accumulated audio as a partial segment and keep listening)
/// from prolonged silence (stop recording and auto-disable the STT session).
/// </summary>
private void ProcessSilenceOrShortPause()
{
    var silenceDuration = DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds;

    // Ignore very short gaps, or silence when nothing was being recorded.
    if (!isCurrentlySpeaking || silenceDuration < 500) return;

    if (silenceDuration <= Settings.SilenceAutoTurnOffDuration)
    {
        if (!isProcessingShortPause)
        {
            // Short pause: transcribe what we have without ending the session.
            isProcessingShortPause = true;
            // Fire-and-forget on purpose (synchronous caller); discard makes it
            // explicit. NOTE(review): task exceptions are unobserved.
            _ = ProcessAudioStreamAsync(audioStream, partial: true);
            audioStream = new MemoryStream(); // keep recording into a fresh stream

            // Debounce: allow the next partial transcription after 500 ms.
            // NOTE(review): the flag is flipped on a thread-pool thread; if
            // readers assume UI-thread affinity, marshal via the dispatcher.
            Task.Delay(500).ContinueWith(_ => isProcessingShortPause = false);
        }
    }
    else
    {
        // Prolonged silence: auto-disable the STT session.
        // (The earlier guard already ensured isCurrentlySpeaking, so no need
        // to re-test it here.)
        isCurrentlySpeaking = false;
        isProcessingShortPause = false; // clear any in-flight pause state
        StopRecording();
        audioStream = new MemoryStream(); // reset for a future session
        UpdateUI($"Silence detected for more than {Settings.SilenceAutoTurnOffDuration / 1000.0} seconds, auto-disabling STT session...", false);
    }
}






private async void UpdateUI(string message, bool isVisible)
{
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabelTxt = message;
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabel = isVisible;

if (!isVisible)
{
await Task.Delay(1200);
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabel = true;
await Task.Delay(2500);
App.Current.Dispatcher.Invoke(() =>
{
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabel = false;
Expand All @@ -408,26 +409,28 @@ private float CalculateMaxAmplitude(byte[] buffer, int bytesRecorded)
return samples.Max(sample => Math.Abs(sample / 32768f));
}

private async Task ProcessAudioStreamAsync(MemoryStream stream)
/// <summary>
/// Sends recorded audio to the transcription backend and raises
/// <see cref="TranscriptionReceived"/> with the result.
/// </summary>
/// <param name="stream">Audio data to transcribe; rewound to position 0 before upload.</param>
/// <param name="partial">
/// True when transcribing a mid-session segment (short pause). The shared
/// <c>audioStream</c> field is reset only after the final (non-partial) segment,
/// because partial callers swap in a fresh stream themselves.
/// </param>
private async Task ProcessAudioStreamAsync(MemoryStream stream, bool partial = false)
{
    // Nothing captured — skip silently (no UI message for empty buffers).
    if (stream.Length == 0) return;

    stream.Position = 0;
    UpdateUI(partial ? "Transcribing part of your speech..." : "Transcribing with OpenAI...", true);

    string transcription = await TranscribeAudioAsync(stream);
    if (!string.IsNullOrEmpty(transcription))
    {
        TranscriptionReceived?.Invoke(transcription);
        UpdateUI("Transcription complete.", false);
    }
    else
    {
        UpdateUI("Error transcribing audio.", false);
    }

    if (!partial)
    {
        // Reset the stream only when processing the final segment.
        audioStream = new MemoryStream();
    }
}


Expand All @@ -442,7 +445,7 @@ private async Task<string> TranscribeAudioAsync(Stream audioStream)
await audioStream.CopyToAsync(writer);
}

var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath, language: Settings.AutoLanguageDetection?null:Settings.SelectedSpeechToTextLanguage));
var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath, language: Settings.AutoLanguageDetection ? null:Settings.SelectedSpeechToTextLanguage));

return response;
}
Expand Down
38 changes: 19 additions & 19 deletions vrcosc-magicchatbox/MainWindow.xaml
Original file line number Diff line number Diff line change
Expand Up @@ -2195,7 +2195,7 @@
FontFamily="Comfortaa Light"
FontSize="18"
Foreground="#FFB9B5C1"
IsChecked="{Binding IntelliChatModule.Settings.AutolanguageSelection, Mode=TwoWay}"
IsChecked="{Binding WhisperModule.Settings.AutoLanguageDetection, Mode=TwoWay}"
Style="{DynamicResource SettingsCheckbox}">
Automatically detect language
</CheckBox>
Expand Down Expand Up @@ -2227,25 +2227,25 @@

<StackPanel Orientation="Horizontal">
<TextBlock
Width="auto"
Padding="3,10,0,0"
FontFamily="Albert Sans Thin"
FontSize="15"
Foreground="#FF9983AD"
Text="Disable after" />
Width="auto"
Padding="3,10,0,0"
FontFamily="Albert Sans Thin"
FontSize="15"
Foreground="#FF9983AD"
Text="Disable after" />
<TextBox
x:Name="SilenceAutoTurnOffDurationINT"
Width="auto"
Height="26"
Margin="4,3,3,0"
Padding="3,0,3,0"
HorizontalAlignment="Left"
VerticalContentAlignment="Center"
Background="#FF7B7195"
BorderThickness="0"
FontSize="15"
Foreground="#FF240E54"
Text="{Binding WhisperModule.Settings.SilenceAutoTurnOffDuration, Mode=TwoWay, UpdateSourceTrigger=PropertyChanged}" />
x:Name="SilenceAutoTurnOffDurationINT"
Width="auto"
Height="26"
Margin="4,3,3,0"
Padding="3,0,3,0"
HorizontalAlignment="Left"
VerticalContentAlignment="Center"
Background="#FF7B7195"
BorderThickness="0"
FontSize="15"
Foreground="#FF240E54"
Text="{Binding WhisperModule.Settings.SilenceAutoTurnOffDuration, Mode=TwoWay, UpdateSourceTrigger=PropertyChanged}" />
<TextBlock
Width="auto"
Padding="0,10,0,0"
Expand Down

0 comments on commit cb8cf7b

Please sign in to comment.