diff --git a/buildspec.json b/buildspec.json index 22a0b17..fd6c6f7 100644 --- a/buildspec.json +++ b/buildspec.json @@ -38,7 +38,7 @@ }, "name": "obs-localvocal", "displayName": "OBS Localvocal", - "version": "0.3.4", + "version": "0.3.5", "author": "Roy Shilkrot", "website": "https://github.com/occ-ai/obs-localvocal", "email": "roy.shil@gmail.com", diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 25d69d1..66e0f0b 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -1,16 +1,16 @@ include(ExternalProject) include(FetchContent) -set(PREBUILT_WHISPERCPP_VERSION "0.0.5") +set(PREBUILT_WHISPERCPP_VERSION "0.0.6") set(PREBUILT_WHISPERCPP_URL_BASE "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}") if(APPLE) # check the "MACOS_ARCH" env var to figure out if this is x86 or arm64 if($ENV{MACOS_ARCH} STREQUAL "x86_64") - set(WHISPER_CPP_HASH "da61500b9a37f8630b9e4ed49bc3fe7858729d7a28a2e80bf6cfa4cb97523546") + set(WHISPER_CPP_HASH "454abee900a96a0a10a91f631ff797bdbdf2df0d2a819479a409634c9be1e12c") elseif($ENV{MACOS_ARCH} STREQUAL "arm64") - set(WHISPER_CPP_HASH "ef1e2628ba09414c0848d58c471440f38b8393cb5d428edf82b9e78aeeecdd15") + set(WHISPER_CPP_HASH "f726388cc494f6fca864c860af6c1bc2932c3dc823ef92197b1e29f088425668") else() message( FATAL_ERROR @@ -54,13 +54,13 @@ elseif(WIN32) set(WHISPER_CPP_URL "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip") if(${ACCELERATION} STREQUAL "cpu") - set(WHISPER_CPP_HASH "2b1cfa0dd764132c4cde60e112a8e6328d28d158d91a8845080baa3e9d2dcdcd") + set(WHISPER_CPP_HASH "126c5d859e902b4cd0f2cd09304a68750f1dbc6a7aa62e280cfd56c51a6a1c95") add_compile_definitions("LOCALVOCAL_WITH_CPU") elseif(${ACCELERATION} STREQUAL "cuda") - set(WHISPER_CPP_HASH "011e813742fddf0911c4a36d2080d7a388cf78738081297088e7d50023e4f9bc") + set(WHISPER_CPP_HASH 
"5b9592c311a7f1612894ca0b36f6bd4effb6a46acd03d33924df56c52f566779") add_compile_definitions("LOCALVOCAL_WITH_CUDA") elseif(${ACCELERATION} STREQUAL "hipblas") - set(WHISPER_CPP_HASH "f2980d6cd3df9cac464378d26d2c19d827bcac995c8d0398a39230a9be936013") + set(WHISPER_CPP_HASH "c306ecce16cd10f377fdefbf7bb252abac8e6638a2637f82b1f1f32dd2cb4e39") add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS") else() message( diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index 7b8208f..ff204b4 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -218,7 +218,8 @@ void set_text_callback(struct transcription_filter_data *gf, for (const auto &filter_words : gf->filter_words_replace) { // if filter exists within str_copy, replace it with the replacement str_copy = std::regex_replace(str_copy, - std::regex(std::get<0>(filter_words)), + std::regex(std::get<0>(filter_words), + std::regex_constants::icase), std::get<1>(filter_words)); } // if the text was modified, log the original and modified text @@ -322,7 +323,7 @@ void recording_state_callback(enum obs_frontend_event event, void *data) } } -void reset_caption_state(transcription_filter_data *gf_) +void clear_current_caption(transcription_filter_data *gf_) { if (gf_->captions_monitor.isEnabled()) { gf_->captions_monitor.clear(); @@ -336,6 +337,12 @@ void reset_caption_state(transcription_filter_data *gf_) gf_->translation_ctx.last_input_tokens.clear(); gf_->translation_ctx.last_translation_tokens.clear(); gf_->last_transcription_sentence.clear(); + gf_->cleared_last_sub = true; +} + +void reset_caption_state(transcription_filter_data *gf_) +{ + clear_current_caption(gf_); // flush the buffer { std::lock_guard lock(gf_->whisper_buf_mutex); diff --git a/src/transcription-filter-callbacks.h b/src/transcription-filter-callbacks.h index e8bdf3b..1f11ad8 100644 --- a/src/transcription-filter-callbacks.h +++ b/src/transcription-filter-callbacks.h @@ -17,6 
+17,8 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &resultIn); +void clear_current_caption(transcription_filter_data *gf_); + void recording_state_callback(enum obs_frontend_event event, void *data); void media_play_callback(void *data_, calldata_t *cd); diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index e1af694..e8990be 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -152,6 +152,7 @@ struct transcription_filter_audio_info { // Callback sent when the transcription has a new result void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str); +void clear_current_caption(transcription_filter_data *gf_); // Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1 // The audio chunk is in 32-bit float format diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index 815b9e3..a2c9da1 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -622,7 +622,7 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_double(s, "thold_ptsum", 0.01); obs_data_set_default_int(s, "max_len", 0); obs_data_set_default_bool(s, "split_on_word", true); - obs_data_set_default_int(s, "max_tokens", 0); + obs_data_set_default_int(s, "max_tokens", 50); obs_data_set_default_bool(s, "suppress_blank", false); obs_data_set_default_bool(s, "suppress_non_speech_tokens", true); obs_data_set_default_double(s, "temperature", 0.1); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 90e67eb..65ae072 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -396,6 +396,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->whisper_params.temperature = 
(float)obs_data_get_double(s, "temperature"); gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts"); gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty"); + gf->whisper_params.no_timestamps = true; if (gf->vad) { const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold"); diff --git a/src/translation/translation-language-utils.h b/src/translation/translation-language-utils.h index 44b450a..d2f4c47 100644 --- a/src/translation/translation-language-utils.h +++ b/src/translation/translation-language-utils.h @@ -5,4 +5,4 @@ std::string remove_start_punctuation(const std::string &text); -#endif // TRANSLATION_LANGUAGE_UTILS_H \ No newline at end of file +#endif // TRANSLATION_LANGUAGE_UTILS_H diff --git a/src/ui/filter-replace-dialog.cpp b/src/ui/filter-replace-dialog.cpp index b491a31..d3d0f84 100644 --- a/src/ui/filter-replace-dialog.cpp +++ b/src/ui/filter-replace-dialog.cpp @@ -27,6 +27,9 @@ FilterReplaceDialog::FilterReplaceDialog(QWidget *parent, transcription_filter_d // connect edit triggers connect(ui->tableWidget, &QTableWidget::itemChanged, this, &FilterReplaceDialog::editFilter); + // connect toolButton_addPrepopulatedFilter + connect(ui->toolButton_addPrepopulatedFilter, &QToolButton::clicked, this, + &FilterReplaceDialog::addPrepopulatedFilter); } FilterReplaceDialog::~FilterReplaceDialog() @@ -73,3 +76,35 @@ void FilterReplaceDialog::editFilter(QTableWidgetItem *item) // use the row number to update the filter_words_replace map ctx->filter_words_replace[item->row()] = std::make_tuple(key, value); } + +void FilterReplaceDialog::addPrepopulatedFilter() +{ + // Append the preset picked in comboBox_selectPrepopulatedFilter as a new + // (regex pattern, replace value) entry in ctx->filter_words_replace and + // mirror it as a new row in the table widget. + // replace_value stays empty for presets that only filter text out. + std::string replace_value = ""; + std::string replace_pattern; + const std::string selected =
ui->comboBox_selectPrepopulatedFilter->currentText().toStdString(); + if (selected == "English Swear Words") { + replace_pattern = "(fuck|shit|bitch|cunt|cock|dick|pussy)"; + replace_value = "****"; + } else if (selected == "English Hallucinations") { + replace_pattern = "(Thank you|Thanks for watching|Please subscribe)"; + } else if (selected == "Korean Hallucinations") { + replace_pattern = "MBC.*"; + } else { + // unrecognized preset text: add nothing rather than an empty-pattern entry + return; + } + // NOTE(review): itemChanged is connected to editFilter, which indexes + // filter_words_replace by row - keep the push_back ahead of setItem. + ctx->filter_words_replace.push_back(std::make_tuple(replace_pattern, replace_value)); + const int row = ui->tableWidget->rowCount(); + ui->tableWidget->insertRow(row); + ui->tableWidget->setItem(row, 0, + new QTableWidgetItem(QString::fromStdString(replace_pattern))); + ui->tableWidget->setItem(row, 1, + new QTableWidgetItem(QString::fromStdString(replace_value))); +} diff --git a/src/ui/filter-replace-dialog.h b/src/ui/filter-replace-dialog.h index d392a80..5e85b1e 100644 --- a/src/ui/filter-replace-dialog.h +++ b/src/ui/filter-replace-dialog.h @@ -25,6 +25,7 @@ private slots: void addFilter(); void removeFilter(); void editFilter(QTableWidgetItem *item); + void addPrepopulatedFilter(); }; #endif // FILTERREPLACEDIALOG_H diff --git a/src/ui/filter-replace-dialog.ui b/src/ui/filter-replace-dialog.ui index d67830c..ccc99f3 100644 --- a/src/ui/filter-replace-dialog.ui +++ b/src/ui/filter-replace-dialog.ui @@ -14,33 +14,7 @@ Filter and Replace - - - - 0 - - - 180 - - - true - - - false - - - - Word / Phrase (Regex) - - - - - Replace Value - - - - - + @@ -85,13 +59,83 @@ - + + + + 0 + + + 180 + + + true + + + false + + + + Word / Phrase (Regex) + + + + + Replace Value + + + + + Regex enabled. Use empty Replace Value to filter. 
+ + + + + 0 + + + 0 + + + 0 + + + 0 + + + + + + English Swear Words + + + + + English Hallucinations + + + + + Korean Hallucinations + + + + + + + + Add + + + + + + diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 6da91d9..3518edf 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -161,11 +161,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter if (pcm32f_num_samples < WHISPER_SAMPLE_RATE) { obs_log(gf->log_level, - "Speech segment is less than 1 second, padding with zeros to 1 second"); + "Speech segment is less than 1 second, padding with white noise to 1 second"); const size_t new_size = (size_t)(1.01f * (float)(WHISPER_SAMPLE_RATE)); // create a new buffer and copy the data to it in the middle pcm32f_data = (float *)bzalloc(new_size * sizeof(float)); - memset(pcm32f_data, 0, new_size * sizeof(float)); + + // add low volume white noise + const float noise_level = 0.01f; + for (size_t i = 0; i < new_size; ++i) { + pcm32f_data[i] = + noise_level * ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f); + } + memcpy(pcm32f_data + (new_size - pcm32f_num_samples) / 2, pcm32f_data_, pcm32f_num_samples * sizeof(float)); pcm32f_size = new_size; @@ -234,10 +241,11 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter // get token whisper_token_data token = whisper_full_get_token_data(gf->whisper_context, n_segment, j); - const char *token_str = whisper_token_to_str(gf->whisper_context, token.id); + const std::string token_str = + whisper_token_to_str(gf->whisper_context, token.id); bool keep = true; // if the token starts with '[' and ends with ']', don't keep it - if (token_str[0] == '[' && token_str[strlen(token_str) - 1] == ']') { + if (token_str[0] == '[' && token_str[token_str.size() - 1] == ']') { keep = false; } // if this is a special token, don't keep it @@ -271,8 +279,8 @@ struct 
DetectionResultWithText run_whisper_inference(struct transcription_filter text += token_str; tokens.push_back(token); } - obs_log(gf->log_level, "S %d, T %d: %d\t%s\tp: %.3f [keep: %d]", n_segment, - j, token.id, token_str, token.p, keep); + obs_log(gf->log_level, "S %d, T %2d: %5d\t%s\tp: %.3f [keep: %d]", + n_segment, j, token.id, token_str.c_str(), token.p, keep); } } sentence_p /= (float)tokens.size(); @@ -379,8 +387,7 @@ void whisper_loop(void *data) obs_log(gf->log_level, "Clearing current subtitle. now: %lu ms, last: %lu ms", now, gf->last_sub_render_time); - set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}}); - gf->cleared_last_sub = true; + clear_current_caption(gf); } }