From 024502333a52b59f3f0795108574cd020a50ede9 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 12 Sep 2024 20:06:26 -0400 Subject: [PATCH] =?UTF-8?q?refactor:=20Update=20version=20to=200.3.5=20and?= =?UTF-8?q?=20clear=20current=20caption=20in=20transc=E2=80=A6=20(#164)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks * feat: Refactor whisper-processing.cpp for improved VAD segmentation and token buffer thread * feat: Update prebuilt Whispercpp version to 0.0.6 * refactor: Remove trailing whitespace in translation-language-utils.h * refactor: Add case-insensitive flag to regex in set_text_callback The code change adds the `std::regex_constants::icase` flag to the regex used in the `set_text_callback` function in `transcription-filter-callbacks.cpp`. This allows for case-insensitive matching when replacing filter words in the `str_copy` string. Refactor the code to improve VAD segmentation and token buffer thread in whisper-processing.cpp The code change refactors the `whisper-processing.cpp` file to improve the VAD (Voice Activity Detection) segmentation and token buffer thread. This aims to enhance the performance and accuracy of the transcription filtering process. refactor: Add prepopulated filter options and corresponding map entries in FilterReplaceDialog The code change adds prepopulated filter options, such as "English Swear Words," "English Hallucinations," and "Korean Hallucinations," to the `FilterReplaceDialog` UI. It also adds the corresponding map entries to the `filter_words_replace` map, allowing users to easily add predefined filter patterns and replacement values. refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks The code change updates the version to 0.3.5 and clears the current caption in the transcription filter callbacks. This ensures that the correct version is displayed and any previous captions are removed. refactor: Remove trailing whitespace in translation-language-utils.h The code change removes trailing whitespace in the `translation-language-utils.h` file, improving code readability and consistency. --- buildspec.json | 2 +- cmake/BuildWhispercpp.cmake | 12 +-- src/transcription-filter-callbacks.cpp | 11 +- src/transcription-filter-callbacks.h | 2 + src/transcription-filter-data.h | 1 + src/transcription-filter-properties.cpp | 2 +- src/transcription-filter.cpp | 1 + src/translation/translation-language-utils.h | 2 +- src/ui/filter-replace-dialog.cpp | 28 ++++++ src/ui/filter-replace-dialog.h | 1 + src/ui/filter-replace-dialog.ui | 100 +++++++++++++------ src/whisper-utils/whisper-processing.cpp | 23 +++-- 12 files changed, 138 insertions(+), 47 deletions(-) diff --git a/buildspec.json b/buildspec.json index 22a0b17..fd6c6f7 100644 --- a/buildspec.json +++ b/buildspec.json @@ -38,7 +38,7 @@ }, "name": "obs-localvocal", "displayName": "OBS Localvocal", - "version": "0.3.4", + "version": "0.3.5", "author": "Roy Shilkrot", "website": "https://github.com/occ-ai/obs-localvocal", "email": "roy.shil@gmail.com", diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 25d69d1..66e0f0b 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -1,16 +1,16 @@ include(ExternalProject) include(FetchContent) -set(PREBUILT_WHISPERCPP_VERSION "0.0.5") +set(PREBUILT_WHISPERCPP_VERSION "0.0.6") set(PREBUILT_WHISPERCPP_URL_BASE "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}") if(APPLE) # check the "MACOS_ARCH" env var to figure out if this is x86 or arm64 if($ENV{MACOS_ARCH} STREQUAL "x86_64") - set(WHISPER_CPP_HASH "da61500b9a37f8630b9e4ed49bc3fe7858729d7a28a2e80bf6cfa4cb97523546") + set(WHISPER_CPP_HASH "454abee900a96a0a10a91f631ff797bdbdf2df0d2a819479a409634c9be1e12c") elseif($ENV{MACOS_ARCH} STREQUAL "arm64") - set(WHISPER_CPP_HASH "ef1e2628ba09414c0848d58c471440f38b8393cb5d428edf82b9e78aeeecdd15") + set(WHISPER_CPP_HASH "f726388cc494f6fca864c860af6c1bc2932c3dc823ef92197b1e29f088425668") else() message( FATAL_ERROR @@ -54,13 +54,13 @@ elseif(WIN32) set(WHISPER_CPP_URL "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip") if(${ACCELERATION} STREQUAL "cpu") - set(WHISPER_CPP_HASH "2b1cfa0dd764132c4cde60e112a8e6328d28d158d91a8845080baa3e9d2dcdcd") + set(WHISPER_CPP_HASH "126c5d859e902b4cd0f2cd09304a68750f1dbc6a7aa62e280cfd56c51a6a1c95") add_compile_definitions("LOCALVOCAL_WITH_CPU") elseif(${ACCELERATION} STREQUAL "cuda") - set(WHISPER_CPP_HASH "011e813742fddf0911c4a36d2080d7a388cf78738081297088e7d50023e4f9bc") + set(WHISPER_CPP_HASH "5b9592c311a7f1612894ca0b36f6bd4effb6a46acd03d33924df56c52f566779") add_compile_definitions("LOCALVOCAL_WITH_CUDA") elseif(${ACCELERATION} STREQUAL "hipblas") - set(WHISPER_CPP_HASH "f2980d6cd3df9cac464378d26d2c19d827bcac995c8d0398a39230a9be936013") + set(WHISPER_CPP_HASH "c306ecce16cd10f377fdefbf7bb252abac8e6638a2637f82b1f1f32dd2cb4e39") add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS") else() message( diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index 7b8208f..ff204b4 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -218,7 +218,8 @@ void set_text_callback(struct transcription_filter_data *gf, for (const auto &filter_words : gf->filter_words_replace) { // if filter exists within str_copy, replace it with the replacement str_copy = std::regex_replace(str_copy, - std::regex(std::get<0>(filter_words)), + std::regex(std::get<0>(filter_words), + std::regex_constants::icase), std::get<1>(filter_words)); } // if the text was modified, log the original and modified text @@ -322,7 +323,7 @@ void recording_state_callback(enum obs_frontend_event event, void *data) } } -void reset_caption_state(transcription_filter_data *gf_) +void clear_current_caption(transcription_filter_data *gf_) { if (gf_->captions_monitor.isEnabled()) { gf_->captions_monitor.clear(); @@ -336,6 +337,12 @@ void reset_caption_state(transcription_filter_data *gf_) gf_->translation_ctx.last_input_tokens.clear(); gf_->translation_ctx.last_translation_tokens.clear(); gf_->last_transcription_sentence.clear(); + gf_->cleared_last_sub = true; +} + +void reset_caption_state(transcription_filter_data *gf_) +{ + clear_current_caption(gf_); // flush the buffer { std::lock_guard lock(gf_->whisper_buf_mutex); diff --git a/src/transcription-filter-callbacks.h b/src/transcription-filter-callbacks.h index e8bdf3b..1f11ad8 100644 --- a/src/transcription-filter-callbacks.h +++ b/src/transcription-filter-callbacks.h @@ -17,6 +17,8 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &resultIn); +void clear_current_caption(transcription_filter_data *gf_); + void recording_state_callback(enum obs_frontend_event event, void *data); void media_play_callback(void *data_, calldata_t *cd); diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index e1af694..e8990be 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -152,6 +152,7 @@ struct transcription_filter_audio_info { // Callback sent when the transcription has a new result void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str); +void clear_current_caption(transcription_filter_data *gf_); // Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1 // The audio chunk is in 32-bit float format diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index 815b9e3..a2c9da1 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -622,7 +622,7 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_double(s, "thold_ptsum", 0.01); obs_data_set_default_int(s, "max_len", 0); obs_data_set_default_bool(s, "split_on_word", true); - obs_data_set_default_int(s, "max_tokens", 0); + obs_data_set_default_int(s, "max_tokens", 50); obs_data_set_default_bool(s, "suppress_blank", false); obs_data_set_default_bool(s, "suppress_non_speech_tokens", true); obs_data_set_default_double(s, "temperature", 0.1); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 90e67eb..65ae072 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -396,6 +396,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature"); gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts"); gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty"); + gf->whisper_params.no_timestamps = true; if (gf->vad) { const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold"); diff --git a/src/translation/translation-language-utils.h b/src/translation/translation-language-utils.h index 44b450a..d2f4c47 100644 --- a/src/translation/translation-language-utils.h +++ b/src/translation/translation-language-utils.h @@ -5,4 +5,4 @@ std::string remove_start_punctuation(const std::string &text); -#endif // TRANSLATION_LANGUAGE_UTILS_H \ No newline at end of file +#endif // TRANSLATION_LANGUAGE_UTILS_H diff --git a/src/ui/filter-replace-dialog.cpp b/src/ui/filter-replace-dialog.cpp index b491a31..d3d0f84 100644 --- a/src/ui/filter-replace-dialog.cpp +++ b/src/ui/filter-replace-dialog.cpp @@ -27,6 +27,9 @@ FilterReplaceDialog::FilterReplaceDialog(QWidget *parent, transcription_filter_d // connect edit triggers connect(ui->tableWidget, &QTableWidget::itemChanged, this, &FilterReplaceDialog::editFilter); + // connect toolButton_addPrepopulatedFilter + connect(ui->toolButton_addPrepopulatedFilter, &QToolButton::clicked, this, + &FilterReplaceDialog::addPrepopulatedFilter); } FilterReplaceDialog::~FilterReplaceDialog() @@ -73,3 +76,28 @@ void FilterReplaceDialog::editFilter(QTableWidgetItem *item) // use the row number to update the filter_words_replace map ctx->filter_words_replace[item->row()] = std::make_tuple(key, value); } + +void FilterReplaceDialog::addPrepopulatedFilter() +{ + // add a prepopulated filter_words_replace map entry + // check the value of the comboBox_selectPrepopulatedFilter + // and add the corresponding filter_words_replace map entry + std::string replace_value = ""; + std::string replace_pattern; + const std::string selected = + ui->comboBox_selectPrepopulatedFilter->currentText().toStdString(); + if (selected == "English Swear Words") { + replace_pattern = "(fuck|shit|bitch|cunt|cock|dick|pussy)"; + replace_value = "****"; + } else if (selected == "English Hallucinations") { + replace_pattern = "(Thank you|Thanks for watching|Please subscribe)"; + } else if (selected == "Korean Hallucinations") { + replace_pattern = "MBC.*"; + } + ctx->filter_words_replace.push_back(std::make_tuple(replace_pattern, replace_value)); + ui->tableWidget->insertRow(ui->tableWidget->rowCount()); + ui->tableWidget->setItem(ui->tableWidget->rowCount() - 1, 0, + new QTableWidgetItem(QString::fromStdString(replace_pattern))); + ui->tableWidget->setItem(ui->tableWidget->rowCount() - 1, 1, + new QTableWidgetItem(QString::fromStdString(replace_value))); +} diff --git a/src/ui/filter-replace-dialog.h b/src/ui/filter-replace-dialog.h index d392a80..5e85b1e 100644 --- a/src/ui/filter-replace-dialog.h +++ b/src/ui/filter-replace-dialog.h @@ -25,6 +25,7 @@ private slots: void addFilter(); void removeFilter(); void editFilter(QTableWidgetItem *item); + void addPrepopulatedFilter(); }; #endif // FILTERREPLACEDIALOG_H diff --git a/src/ui/filter-replace-dialog.ui b/src/ui/filter-replace-dialog.ui index d67830c..ccc99f3 100644 --- a/src/ui/filter-replace-dialog.ui +++ b/src/ui/filter-replace-dialog.ui @@ -14,33 +14,7 @@ Filter and Replace - - - - 0 - - - 180 - - - true - - - false - - - - Word / Phrase (Regex) - - - - - Replace Value - - - - - + @@ -85,13 +59,83 @@ - + + + + 0 + + + 180 + + + true + + + false + + + + Word / Phrase (Regex) + + + + + Replace Value + + + + + Regex enabled. Use empty Replace Value to filter. + + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + + English Swear Words + + + + + English Hallucinations + + + + + Korean Hallucinations + + + + + + + + Add + + + + + + diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 6da91d9..3518edf 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -161,11 +161,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter if (pcm32f_num_samples < WHISPER_SAMPLE_RATE) { obs_log(gf->log_level, - "Speech segment is less than 1 second, padding with zeros to 1 second"); + "Speech segment is less than 1 second, padding with white noise to 1 second"); const size_t new_size = (size_t)(1.01f * (float)(WHISPER_SAMPLE_RATE)); // create a new buffer and copy the data to it in the middle pcm32f_data = (float *)bzalloc(new_size * sizeof(float)); - memset(pcm32f_data, 0, new_size * sizeof(float)); + + // add low volume white noise + const float noise_level = 0.01f; + for (size_t i = 0; i < new_size; ++i) { + pcm32f_data[i] = + noise_level * ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f); + } + memcpy(pcm32f_data + (new_size - pcm32f_num_samples) / 2, pcm32f_data_, pcm32f_num_samples * sizeof(float)); pcm32f_size = new_size; @@ -234,10 +241,11 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter // get token whisper_token_data token = whisper_full_get_token_data(gf->whisper_context, n_segment, j); - const char *token_str = whisper_token_to_str(gf->whisper_context, token.id); + const std::string token_str = + whisper_token_to_str(gf->whisper_context, token.id); bool keep = true; // if the token starts with '[' and ends with ']', don't keep it - if (token_str[0] == '[' && token_str[strlen(token_str) - 1] == ']') { + if (token_str[0] == '[' && token_str[token_str.size() - 1] == ']') { keep = false; } // if this is a special token, don't keep it @@ -271,8 +279,8 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter text += token_str; tokens.push_back(token); } - obs_log(gf->log_level, "S %d, T %d: %d\t%s\tp: %.3f [keep: %d]", n_segment, - j, token.id, token_str, token.p, keep); + obs_log(gf->log_level, "S %d, T %2d: %5d\t%s\tp: %.3f [keep: %d]", + n_segment, j, token.id, token_str.c_str(), token.p, keep); } } sentence_p /= (float)tokens.size(); @@ -379,8 +387,7 @@ void whisper_loop(void *data) obs_log(gf->log_level, "Clearing current subtitle. now: %lu ms, last: %lu ms", now, gf->last_sub_render_time); - set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}}); - gf->cleared_last_sub = true; + clear_current_caption(gf); } }