refactor: Update version to 0.3.5 and clear current caption in transc… (#164)

* refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks

* feat: Refactor whisper-processing.cpp for improved VAD segmentation and token buffer thread

* feat: Update prebuilt Whispercpp version to 0.0.6

* refactor: Remove trailing whitespace in translation-language-utils.h

* refactor: Add case-insensitive flag to regex in set_text_callback

The code change adds the `std::regex_constants::icase` flag to the regex used in the `set_text_callback` function in `transcription-filter-callbacks.cpp`. This allows for case-insensitive matching when replacing filter words in the `str_copy` string.

Refactor the code to improve VAD segmentation and token buffer thread in whisper-processing.cpp

The code change refactors the `whisper-processing.cpp` file to improve the VAD (Voice Activity Detection) segmentation and token buffer thread. This aims to enhance the performance and accuracy of the transcription filtering process.

refactor: Add prepopulated filter options and corresponding map entries in FilterReplaceDialog

The code change adds prepopulated filter options, such as "English Swear Words," "English Hallucinations," and "Korean Hallucinations," to the `FilterReplaceDialog` UI. It also adds the corresponding map entries to the `filter_words_replace` map, allowing users to easily add predefined filter patterns and replacement values.

refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks

The code change updates the version to 0.3.5 and clears the current caption in the transcription filter callbacks. This ensures that the correct version is displayed and any previous captions are removed.

refactor: Remove trailing whitespace in translation-language-utils.h

The code change removes trailing whitespace in the `translation-language-utils.h` file, improving code readability and consistency.
This commit is contained in:
Roy Shilkrot 2024-09-12 20:06:26 -04:00 committed by GitHub
parent abe678bbb1
commit 024502333a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 138 additions and 47 deletions

View File

@ -38,7 +38,7 @@
},
"name": "obs-localvocal",
"displayName": "OBS Localvocal",
"version": "0.3.4",
"version": "0.3.5",
"author": "Roy Shilkrot",
"website": "https://github.com/occ-ai/obs-localvocal",
"email": "roy.shil@gmail.com",

View File

@ -1,16 +1,16 @@
include(ExternalProject)
include(FetchContent)
set(PREBUILT_WHISPERCPP_VERSION "0.0.5")
set(PREBUILT_WHISPERCPP_VERSION "0.0.6")
set(PREBUILT_WHISPERCPP_URL_BASE
"https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}")
if(APPLE)
# check the "MACOS_ARCH" env var to figure out if this is x86 or arm64
if($ENV{MACOS_ARCH} STREQUAL "x86_64")
set(WHISPER_CPP_HASH "da61500b9a37f8630b9e4ed49bc3fe7858729d7a28a2e80bf6cfa4cb97523546")
set(WHISPER_CPP_HASH "454abee900a96a0a10a91f631ff797bdbdf2df0d2a819479a409634c9be1e12c")
elseif($ENV{MACOS_ARCH} STREQUAL "arm64")
set(WHISPER_CPP_HASH "ef1e2628ba09414c0848d58c471440f38b8393cb5d428edf82b9e78aeeecdd15")
set(WHISPER_CPP_HASH "f726388cc494f6fca864c860af6c1bc2932c3dc823ef92197b1e29f088425668")
else()
message(
FATAL_ERROR
@ -54,13 +54,13 @@ elseif(WIN32)
set(WHISPER_CPP_URL
"${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip")
if(${ACCELERATION} STREQUAL "cpu")
set(WHISPER_CPP_HASH "2b1cfa0dd764132c4cde60e112a8e6328d28d158d91a8845080baa3e9d2dcdcd")
set(WHISPER_CPP_HASH "126c5d859e902b4cd0f2cd09304a68750f1dbc6a7aa62e280cfd56c51a6a1c95")
add_compile_definitions("LOCALVOCAL_WITH_CPU")
elseif(${ACCELERATION} STREQUAL "cuda")
set(WHISPER_CPP_HASH "011e813742fddf0911c4a36d2080d7a388cf78738081297088e7d50023e4f9bc")
set(WHISPER_CPP_HASH "5b9592c311a7f1612894ca0b36f6bd4effb6a46acd03d33924df56c52f566779")
add_compile_definitions("LOCALVOCAL_WITH_CUDA")
elseif(${ACCELERATION} STREQUAL "hipblas")
set(WHISPER_CPP_HASH "f2980d6cd3df9cac464378d26d2c19d827bcac995c8d0398a39230a9be936013")
set(WHISPER_CPP_HASH "c306ecce16cd10f377fdefbf7bb252abac8e6638a2637f82b1f1f32dd2cb4e39")
add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS")
else()
message(

View File

@ -218,7 +218,8 @@ void set_text_callback(struct transcription_filter_data *gf,
for (const auto &filter_words : gf->filter_words_replace) {
// if filter exists within str_copy, replace it with the replacement
str_copy = std::regex_replace(str_copy,
std::regex(std::get<0>(filter_words)),
std::regex(std::get<0>(filter_words),
std::regex_constants::icase),
std::get<1>(filter_words));
}
// if the text was modified, log the original and modified text
@ -322,7 +323,7 @@ void recording_state_callback(enum obs_frontend_event event, void *data)
}
}
void reset_caption_state(transcription_filter_data *gf_)
void clear_current_caption(transcription_filter_data *gf_)
{
if (gf_->captions_monitor.isEnabled()) {
gf_->captions_monitor.clear();
@ -336,6 +337,12 @@ void reset_caption_state(transcription_filter_data *gf_)
gf_->translation_ctx.last_input_tokens.clear();
gf_->translation_ctx.last_translation_tokens.clear();
gf_->last_transcription_sentence.clear();
gf_->cleared_last_sub = true;
}
void reset_caption_state(transcription_filter_data *gf_)
{
clear_current_caption(gf_);
// flush the buffer
{
std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);

View File

@ -17,6 +17,8 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
void set_text_callback(struct transcription_filter_data *gf,
const DetectionResultWithText &resultIn);
void clear_current_caption(transcription_filter_data *gf_);
void recording_state_callback(enum obs_frontend_event event, void *data);
void media_play_callback(void *data_, calldata_t *cd);

View File

@ -152,6 +152,7 @@ struct transcription_filter_audio_info {
// Callback sent when the transcription has a new result
void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
void clear_current_caption(transcription_filter_data *gf_);
// Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1
// The audio chunk is in 32-bit float format

View File

@ -622,7 +622,7 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_double(s, "thold_ptsum", 0.01);
obs_data_set_default_int(s, "max_len", 0);
obs_data_set_default_bool(s, "split_on_word", true);
obs_data_set_default_int(s, "max_tokens", 0);
obs_data_set_default_int(s, "max_tokens", 50);
obs_data_set_default_bool(s, "suppress_blank", false);
obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
obs_data_set_default_double(s, "temperature", 0.1);

View File

@ -396,6 +396,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
gf->whisper_params.no_timestamps = true;
if (gf->vad) {
const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");

View File

@ -5,4 +5,4 @@
std::string remove_start_punctuation(const std::string &text);
#endif // TRANSLATION_LANGUAGE_UTILS_H
#endif // TRANSLATION_LANGUAGE_UTILS_H

View File

@ -27,6 +27,9 @@ FilterReplaceDialog::FilterReplaceDialog(QWidget *parent, transcription_filter_d
// connect edit triggers
connect(ui->tableWidget, &QTableWidget::itemChanged, this,
&FilterReplaceDialog::editFilter);
// connect toolButton_addPrepopulatedFilter
connect(ui->toolButton_addPrepopulatedFilter, &QToolButton::clicked, this,
&FilterReplaceDialog::addPrepopulatedFilter);
}
FilterReplaceDialog::~FilterReplaceDialog()
@ -73,3 +76,28 @@ void FilterReplaceDialog::editFilter(QTableWidgetItem *item)
// use the row number to update the filter_words_replace map
ctx->filter_words_replace[item->row()] = std::make_tuple(key, value);
}
/**
 * Slot for the "Add" button of the prepopulated-filter row.
 *
 * Looks up the preset currently selected in comboBox_selectPrepopulatedFilter,
 * appends its (regex pattern, replacement) pair to ctx->filter_words_replace
 * and shows the pair as a new last row of the table widget.
 *
 * If the selection does not correspond to a known preset (for example the
 * combo box is empty or an item was renamed), nothing is added.
 */
void FilterReplaceDialog::addPrepopulatedFilter()
{
	// Presets are identified by the combo box's display text, so the strings
	// compared below must stay in sync with the items declared in the .ui file.
	const std::string selected =
		ui->comboBox_selectPrepopulatedFilter->currentText().toStdString();

	std::string replace_pattern;
	// Left empty for presets whose matches should simply be removed.
	std::string replace_value;

	if (selected == "English Swear Words") {
		replace_pattern = "(fuck|shit|bitch|cunt|cock|dick|pussy)";
		replace_value = "****";
	} else if (selected == "English Hallucinations") {
		replace_pattern = "(Thank you|Thanks for watching|Please subscribe)";
	} else if (selected == "Korean Hallucinations") {
		replace_pattern = "MBC.*";
	} else {
		// Unknown or empty selection: do not append a blank rule and an
		// empty table row that the user never asked for.
		return;
	}

	// Append to the context BEFORE touching the table: setItem() emits
	// itemChanged, which is connected to editFilter(), and editFilter()
	// writes to filter_words_replace[row] - that entry must already exist.
	ctx->filter_words_replace.push_back(std::make_tuple(replace_pattern, replace_value));

	const int new_row = ui->tableWidget->rowCount();
	ui->tableWidget->insertRow(new_row);
	ui->tableWidget->setItem(new_row, 0,
				 new QTableWidgetItem(QString::fromStdString(replace_pattern)));
	ui->tableWidget->setItem(new_row, 1,
				 new QTableWidgetItem(QString::fromStdString(replace_value)));
}

View File

@ -25,6 +25,7 @@ private slots:
void addFilter();
void removeFilter();
void editFilter(QTableWidgetItem *item);
void addPrepopulatedFilter();
};
#endif // FILTERREPLACEDIALOG_H

View File

@ -14,33 +14,7 @@
<string>Filter and Replace</string>
</property>
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0">
<widget class="QTableWidget" name="tableWidget">
<property name="rowCount">
<number>0</number>
</property>
<attribute name="horizontalHeaderDefaultSectionSize">
<number>180</number>
</attribute>
<attribute name="horizontalHeaderStretchLastSection">
<bool>true</bool>
</attribute>
<attribute name="verticalHeaderVisible">
<bool>false</bool>
</attribute>
<column>
<property name="text">
<string>Word / Phrase (Regex)</string>
</property>
</column>
<column>
<property name="text">
<string>Replace Value</string>
</property>
</column>
</widget>
</item>
<item row="2" column="0">
<item row="3" column="0">
<widget class="QWidget" name="widget" native="true">
<layout class="QHBoxLayout" name="horizontalLayout">
<property name="spacing">
@ -85,13 +59,83 @@
</layout>
</widget>
</item>
<item row="1" column="0">
<item row="0" column="0">
<widget class="QTableWidget" name="tableWidget">
<property name="rowCount">
<number>0</number>
</property>
<attribute name="horizontalHeaderDefaultSectionSize">
<number>180</number>
</attribute>
<attribute name="horizontalHeaderStretchLastSection">
<bool>true</bool>
</attribute>
<attribute name="verticalHeaderVisible">
<bool>false</bool>
</attribute>
<column>
<property name="text">
<string>Word / Phrase (Regex)</string>
</property>
</column>
<column>
<property name="text">
<string>Replace Value</string>
</property>
</column>
</widget>
</item>
<item row="2" column="0">
<widget class="QLabel" name="label">
<property name="text">
<string>Regex enabled. Use empty Replace Value to filter.</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QWidget" name="widget_2" native="true">
<layout class="QHBoxLayout" name="horizontalLayout_2">
<property name="leftMargin">
<number>0</number>
</property>
<property name="topMargin">
<number>0</number>
</property>
<property name="rightMargin">
<number>0</number>
</property>
<property name="bottomMargin">
<number>0</number>
</property>
<item>
<widget class="QComboBox" name="comboBox_selectPrepopulatedFilter">
<item>
<property name="text">
<string>English Swear Words</string>
</property>
</item>
<item>
<property name="text">
<string>English Hallucinations</string>
</property>
</item>
<item>
<property name="text">
<string>Korean Hallucinations</string>
</property>
</item>
</widget>
</item>
<item>
<widget class="QToolButton" name="toolButton_addPrepopulatedFilter">
<property name="text">
<string>Add</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>
</layout>
</widget>
<resources/>

View File

@ -161,11 +161,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
if (pcm32f_num_samples < WHISPER_SAMPLE_RATE) {
obs_log(gf->log_level,
"Speech segment is less than 1 second, padding with zeros to 1 second");
"Speech segment is less than 1 second, padding with white noise to 1 second");
const size_t new_size = (size_t)(1.01f * (float)(WHISPER_SAMPLE_RATE));
// create a new buffer and copy the data to it in the middle
pcm32f_data = (float *)bzalloc(new_size * sizeof(float));
memset(pcm32f_data, 0, new_size * sizeof(float));
// add low volume white noise
const float noise_level = 0.01f;
for (size_t i = 0; i < new_size; ++i) {
pcm32f_data[i] =
noise_level * ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f);
}
memcpy(pcm32f_data + (new_size - pcm32f_num_samples) / 2, pcm32f_data_,
pcm32f_num_samples * sizeof(float));
pcm32f_size = new_size;
@ -234,10 +241,11 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
// get token
whisper_token_data token =
whisper_full_get_token_data(gf->whisper_context, n_segment, j);
const char *token_str = whisper_token_to_str(gf->whisper_context, token.id);
const std::string token_str =
whisper_token_to_str(gf->whisper_context, token.id);
bool keep = true;
// if the token starts with '[' and ends with ']', don't keep it
if (token_str[0] == '[' && token_str[strlen(token_str) - 1] == ']') {
if (token_str[0] == '[' && token_str[token_str.size() - 1] == ']') {
keep = false;
}
// if this is a special token, don't keep it
@ -271,8 +279,8 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
text += token_str;
tokens.push_back(token);
}
obs_log(gf->log_level, "S %d, T %d: %d\t%s\tp: %.3f [keep: %d]", n_segment,
j, token.id, token_str, token.p, keep);
obs_log(gf->log_level, "S %d, T %2d: %5d\t%s\tp: %.3f [keep: %d]",
n_segment, j, token.id, token_str.c_str(), token.p, keep);
}
}
sentence_p /= (float)tokens.size();
@ -379,8 +387,7 @@ void whisper_loop(void *data)
obs_log(gf->log_level,
"Clearing current subtitle. now: %lu ms, last: %lu ms", now,
gf->last_sub_render_time);
set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}});
gf->cleared_last_sub = true;
clear_current_caption(gf);
}
}