Skip to content

Commit

Permalink
refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks (#164)
Browse files Browse the repository at this point in the history

* refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks

* feat: Refactor whisper-processing.cpp for improved VAD segmentation and token buffer thread

* feat: Update prebuilt Whispercpp version to 0.0.6

* refactor: Remove trailing whitespace in translation-language-utils.h

* refactor: Add case-insensitive flag to regex in set_text_callback

The code change adds the `std::regex_constants::icase` flag to the regex used in the `set_text_callback` function in `transcription-filter-callbacks.cpp`. This allows for case-insensitive matching when replacing filter words in the `str_copy` string.

Refactor the code to improve VAD segmentation and token buffer thread in whisper-processing.cpp

The code change refactors the `whisper-processing.cpp` file to improve the VAD (Voice Activity Detection) segmentation and token buffer thread. This aims to enhance the performance and accuracy of the transcription filtering process.

refactor: Add prepopulated filter options and corresponding map entries in FilterReplaceDialog

The code change adds prepopulated filter options, such as "English Swear Words," "English Hallucinations," and "Korean Hallucinations," to the `FilterReplaceDialog` UI. It also adds the corresponding map entries to the `filter_words_replace` map, allowing users to easily add predefined filter patterns and replacement values.

refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks

The code change updates the version to 0.3.5 and clears the current caption in the transcription filter callbacks. This ensures that the correct version is displayed and any previous captions are removed.

refactor: Remove trailing whitespace in translation-language-utils.h

The code change removes trailing whitespace in the `translation-language-utils.h` file, improving code readability and consistency.
  • Loading branch information
royshil authored Sep 13, 2024
1 parent abe678b commit 0245023
Show file tree
Hide file tree
Showing 12 changed files with 138 additions and 47 deletions.
2 changes: 1 addition & 1 deletion buildspec.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
},
"name": "obs-localvocal",
"displayName": "OBS Localvocal",
"version": "0.3.4",
"version": "0.3.5",
"author": "Roy Shilkrot",
"website": "https://github.com/occ-ai/obs-localvocal",
"email": "[email protected]",
Expand Down
12 changes: 6 additions & 6 deletions cmake/BuildWhispercpp.cmake
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
include(ExternalProject)
include(FetchContent)

set(PREBUILT_WHISPERCPP_VERSION "0.0.5")
set(PREBUILT_WHISPERCPP_VERSION "0.0.6")
set(PREBUILT_WHISPERCPP_URL_BASE
"https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}")

if(APPLE)
# check the "MACOS_ARCH" env var to figure out if this is x86 or arm64
if($ENV{MACOS_ARCH} STREQUAL "x86_64")
set(WHISPER_CPP_HASH "da61500b9a37f8630b9e4ed49bc3fe7858729d7a28a2e80bf6cfa4cb97523546")
set(WHISPER_CPP_HASH "454abee900a96a0a10a91f631ff797bdbdf2df0d2a819479a409634c9be1e12c")
elseif($ENV{MACOS_ARCH} STREQUAL "arm64")
set(WHISPER_CPP_HASH "ef1e2628ba09414c0848d58c471440f38b8393cb5d428edf82b9e78aeeecdd15")
set(WHISPER_CPP_HASH "f726388cc494f6fca864c860af6c1bc2932c3dc823ef92197b1e29f088425668")
else()
message(
FATAL_ERROR
Expand Down Expand Up @@ -54,13 +54,13 @@ elseif(WIN32)
set(WHISPER_CPP_URL
"${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip")
if(${ACCELERATION} STREQUAL "cpu")
set(WHISPER_CPP_HASH "2b1cfa0dd764132c4cde60e112a8e6328d28d158d91a8845080baa3e9d2dcdcd")
set(WHISPER_CPP_HASH "126c5d859e902b4cd0f2cd09304a68750f1dbc6a7aa62e280cfd56c51a6a1c95")
add_compile_definitions("LOCALVOCAL_WITH_CPU")
elseif(${ACCELERATION} STREQUAL "cuda")
set(WHISPER_CPP_HASH "011e813742fddf0911c4a36d2080d7a388cf78738081297088e7d50023e4f9bc")
set(WHISPER_CPP_HASH "5b9592c311a7f1612894ca0b36f6bd4effb6a46acd03d33924df56c52f566779")
add_compile_definitions("LOCALVOCAL_WITH_CUDA")
elseif(${ACCELERATION} STREQUAL "hipblas")
set(WHISPER_CPP_HASH "f2980d6cd3df9cac464378d26d2c19d827bcac995c8d0398a39230a9be936013")
set(WHISPER_CPP_HASH "c306ecce16cd10f377fdefbf7bb252abac8e6638a2637f82b1f1f32dd2cb4e39")
add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS")
else()
message(
Expand Down
11 changes: 9 additions & 2 deletions src/transcription-filter-callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ void set_text_callback(struct transcription_filter_data *gf,
for (const auto &filter_words : gf->filter_words_replace) {
// if filter exists within str_copy, replace it with the replacement
str_copy = std::regex_replace(str_copy,
std::regex(std::get<0>(filter_words)),
std::regex(std::get<0>(filter_words),
std::regex_constants::icase),
std::get<1>(filter_words));
}
// if the text was modified, log the original and modified text
Expand Down Expand Up @@ -322,7 +323,7 @@ void recording_state_callback(enum obs_frontend_event event, void *data)
}
}

void reset_caption_state(transcription_filter_data *gf_)
void clear_current_caption(transcription_filter_data *gf_)
{
if (gf_->captions_monitor.isEnabled()) {
gf_->captions_monitor.clear();
Expand All @@ -336,6 +337,12 @@ void reset_caption_state(transcription_filter_data *gf_)
gf_->translation_ctx.last_input_tokens.clear();
gf_->translation_ctx.last_translation_tokens.clear();
gf_->last_transcription_sentence.clear();
gf_->cleared_last_sub = true;
}

void reset_caption_state(transcription_filter_data *gf_)
{
clear_current_caption(gf_);
// flush the buffer
{
std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
Expand Down
2 changes: 2 additions & 0 deletions src/transcription-filter-callbacks.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
void set_text_callback(struct transcription_filter_data *gf,
const DetectionResultWithText &resultIn);

void clear_current_caption(transcription_filter_data *gf_);

void recording_state_callback(enum obs_frontend_event event, void *data);

void media_play_callback(void *data_, calldata_t *cd);
Expand Down
1 change: 1 addition & 0 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ struct transcription_filter_audio_info {

// Callback sent when the transcription has a new result
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
void clear_current_caption(transcription_filter_data *gf_);

// Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1
// The audio chunk is in 32-bit float format
Expand Down
2 changes: 1 addition & 1 deletion src/transcription-filter-properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_double(s, "thold_ptsum", 0.01);
obs_data_set_default_int(s, "max_len", 0);
obs_data_set_default_bool(s, "split_on_word", true);
obs_data_set_default_int(s, "max_tokens", 0);
obs_data_set_default_int(s, "max_tokens", 50);
obs_data_set_default_bool(s, "suppress_blank", false);
obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
obs_data_set_default_double(s, "temperature", 0.1);
Expand Down
1 change: 1 addition & 0 deletions src/transcription-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
gf->whisper_params.no_timestamps = true;

if (gf->vad) {
const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");
Expand Down
2 changes: 1 addition & 1 deletion src/translation/translation-language-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@

std::string remove_start_punctuation(const std::string &text);

#endif // TRANSLATION_LANGUAGE_UTILS_H
#endif // TRANSLATION_LANGUAGE_UTILS_H
28 changes: 28 additions & 0 deletions src/ui/filter-replace-dialog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ FilterReplaceDialog::FilterReplaceDialog(QWidget *parent, transcription_filter_d
// connect edit triggers
connect(ui->tableWidget, &QTableWidget::itemChanged, this,
&FilterReplaceDialog::editFilter);
// connect toolButton_addPrepopulatedFilter
connect(ui->toolButton_addPrepopulatedFilter, &QToolButton::clicked, this,
&FilterReplaceDialog::addPrepopulatedFilter);
}

FilterReplaceDialog::~FilterReplaceDialog()
Expand Down Expand Up @@ -73,3 +76,28 @@ void FilterReplaceDialog::editFilter(QTableWidgetItem *item)
// use the row number to update the filter_words_replace map
ctx->filter_words_replace[item->row()] = std::make_tuple(key, value);
}

/**
 * Slot: append the preset that is currently selected in
 * comboBox_selectPrepopulatedFilter to the filter/replace list.
 *
 * The combo box text is matched against the known preset labels to obtain a
 * regex pattern and a replacement value. The pair is appended to
 * ctx->filter_words_replace and shown in a new last row of the table widget
 * (column 0: pattern, column 1: replacement value).
 *
 * If the text matches none of the known labels, an entry with an empty pattern
 * and an empty replacement value is still appended.
 */
void FilterReplaceDialog::addPrepopulatedFilter()
{
	// Known presets: combo box label -> regex pattern + replacement value.
	struct PresetFilter {
		const char *label;
		const char *pattern;
		const char *replacement;
	};
	static const PresetFilter presets[] = {
		{"English Swear Words", "(fuck|shit|bitch|cunt|cock|dick|pussy)", "****"},
		{"English Hallucinations", "(Thank you|Thanks for watching|Please subscribe)", ""},
		{"Korean Hallucinations", "MBC.*", ""},
	};

	const std::string chosen =
		ui->comboBox_selectPrepopulatedFilter->currentText().toStdString();

	// Both strings stay empty when no preset label matches.
	std::string pattern;
	std::string replacement;
	for (const PresetFilter &preset : presets) {
		if (chosen == preset.label) {
			pattern = preset.pattern;
			replacement = preset.replacement;
			break;
		}
	}

	// Store the entry before touching the table.
	// NOTE(review): setItem presumably emits itemChanged, which is connected to
	// editFilter; that slot indexes filter_words_replace by row, so the entry
	// has to exist first - confirm before reordering.
	ctx->filter_words_replace.push_back(std::make_tuple(pattern, replacement));

	// Mirror the new entry in a row appended at the end of the table.
	auto *const table = ui->tableWidget;
	const int new_row = table->rowCount();
	table->insertRow(new_row);
	table->setItem(new_row, 0, new QTableWidgetItem(QString::fromStdString(pattern)));
	table->setItem(new_row, 1, new QTableWidgetItem(QString::fromStdString(replacement)));
}
1 change: 1 addition & 0 deletions src/ui/filter-replace-dialog.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ private slots:
void addFilter();
void removeFilter();
void editFilter(QTableWidgetItem *item);
void addPrepopulatedFilter();
};

#endif // FILTERREPLACEDIALOG_H
100 changes: 72 additions & 28 deletions src/ui/filter-replace-dialog.ui
Original file line number Diff line number Diff line change
Expand Up @@ -14,33 +14,7 @@
<string>Filter and Replace</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0">
<widget class="QTableWidget" name="tableWidget">
<property name="rowCount">
<number>0</number>
</property>
<attribute name="horizontalHeaderDefaultSectionSize">
<number>180</number>
</attribute>
<attribute name="horizontalHeaderStretchLastSection">
<bool>true</bool>
</attribute>
<attribute name="verticalHeaderVisible">
<bool>false</bool>
</attribute>
<column>
<property name="text">
<string>Word / Phrase (Regex)</string>
</property>
</column>
<column>
<property name="text">
<string>Replace Value</string>
</property>
</column>
</widget>
</item>
<item row="2" column="0">
<item row="3" column="0">
<widget class="QWidget" name="widget" native="true">
<layout class="QHBoxLayout" name="horizontalLayout">
<property name="spacing">
Expand Down Expand Up @@ -85,13 +59,83 @@
</layout>
</widget>
</item>
<item row="1" column="0">
<item row="0" column="0">
<widget class="QTableWidget" name="tableWidget">
<property name="rowCount">
<number>0</number>
</property>
<attribute name="horizontalHeaderDefaultSectionSize">
<number>180</number>
</attribute>
<attribute name="horizontalHeaderStretchLastSection">
<bool>true</bool>
</attribute>
<attribute name="verticalHeaderVisible">
<bool>false</bool>
</attribute>
<column>
<property name="text">
<string>Word / Phrase (Regex)</string>
</property>
</column>
<column>
<property name="text">
<string>Replace Value</string>
</property>
</column>
</widget>
</item>
<item row="2" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>Regex enabled. Use empty Replace Value to filter.</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QWidget" name="widget_2" native="true">
<layout class="QHBoxLayout" name="horizontalLayout_2">
<property name="leftMargin">
<number>0</number>
</property>
<property name="topMargin">
<number>0</number>
</property>
<property name="rightMargin">
<number>0</number>
</property>
<property name="bottomMargin">
<number>0</number>
</property>
<item>
<widget class="QComboBox" name="comboBox_selectPrepopulatedFilter">
<item>
<property name="text">
<string>English Swear Words</string>
</property>
</item>
<item>
<property name="text">
<string>English Hallucinations</string>
</property>
</item>
<item>
<property name="text">
<string>Korean Hallucinations</string>
</property>
</item>
</widget>
</item>
<item>
<widget class="QToolButton" name="toolButton_addPrepopulatedFilter">
<property name="text">
<string>Add</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
</layout>
</widget>
<resources/>
Expand Down
23 changes: 15 additions & 8 deletions src/whisper-utils/whisper-processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter

if (pcm32f_num_samples < WHISPER_SAMPLE_RATE) {
obs_log(gf->log_level,
"Speech segment is less than 1 second, padding with zeros to 1 second");
"Speech segment is less than 1 second, padding with white noise to 1 second");
const size_t new_size = (size_t)(1.01f * (float)(WHISPER_SAMPLE_RATE));
// create a new buffer and copy the data to it in the middle
pcm32f_data = (float *)bzalloc(new_size * sizeof(float));
memset(pcm32f_data, 0, new_size * sizeof(float));

// add low volume white noise
const float noise_level = 0.01f;
for (size_t i = 0; i < new_size; ++i) {
pcm32f_data[i] =
noise_level * ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f);
}

memcpy(pcm32f_data + (new_size - pcm32f_num_samples) / 2, pcm32f_data_,
pcm32f_num_samples * sizeof(float));
pcm32f_size = new_size;
Expand Down Expand Up @@ -234,10 +241,11 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
// get token
whisper_token_data token =
whisper_full_get_token_data(gf->whisper_context, n_segment, j);
const char *token_str = whisper_token_to_str(gf->whisper_context, token.id);
const std::string token_str =
whisper_token_to_str(gf->whisper_context, token.id);
bool keep = true;
// if the token starts with '[' and ends with ']', don't keep it
if (token_str[0] == '[' && token_str[strlen(token_str) - 1] == ']') {
if (token_str[0] == '[' && token_str[token_str.size() - 1] == ']') {
keep = false;
}
// if this is a special token, don't keep it
Expand Down Expand Up @@ -271,8 +279,8 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
text += token_str;
tokens.push_back(token);
}
obs_log(gf->log_level, "S %d, T %d: %d\t%s\tp: %.3f [keep: %d]", n_segment,
j, token.id, token_str, token.p, keep);
obs_log(gf->log_level, "S %d, T %2d: %5d\t%s\tp: %.3f [keep: %d]",
n_segment, j, token.id, token_str.c_str(), token.p, keep);
}
}
sentence_p /= (float)tokens.size();
Expand Down Expand Up @@ -379,8 +387,7 @@ void whisper_loop(void *data)
obs_log(gf->log_level,
"Clearing current subtitle. now: %lu ms, last: %lu ms", now,
gf->last_sub_render_time);
set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}});
gf->cleared_last_sub = true;
clear_current_caption(gf);
}
}

Expand Down

0 comments on commit 0245023

Please sign in to comment.