From 024502333a52b59f3f0795108574cd020a50ede9 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 12 Sep 2024 20:06:26 -0400
Subject: [PATCH] =?UTF-8?q?refactor:=20Update=20version=20to=200.3.5=20and?=
 =?UTF-8?q?=20clear=20current=20caption=20in=20transc=E2=80=A6=20(#164)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks

* feat: Refactor whisper-processing.cpp for improved VAD segmentation and token buffer thread

* feat: Update prebuilt Whispercpp version to 0.0.6

* refactor: Remove trailing whitespace in translation-language-utils.h

* refactor: Add case-insensitive flag to regex in set_text_callback

The code change adds the `std::regex_constants::icase` flag to the regex used in the `set_text_callback` function in `transcription-filter-callbacks.cpp`. This allows for case-insensitive matching when replacing filter words in the `str_copy` string.

Refactor the code to improve VAD segmentation and token buffer thread in whisper-processing.cpp

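The code change refactors `whisper-processing.cpp` to improve VAD (Voice Activity Detection) segment handling and the token buffer thread: speech segments shorter than one second are now padded with low-volume white noise instead of zeros, token strings are held as `std::string`, and stale subtitles are cleared through the new `clear_current_caption` helper. This aims to enhance the performance and accuracy of the transcription filtering process.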

refactor: Add prepopulated filter options and corresponding map entries in FilterReplaceDialog

The code change adds prepopulated filter options ("English Swear Words", "English Hallucinations", and "Korean Hallucinations") to the `FilterReplaceDialog` UI. Selecting one and clicking "Add" appends the corresponding regex pattern and replacement value to the `filter_words_replace` list, allowing users to easily add predefined filter patterns and replacement values.

refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks

The code change updates the version to 0.3.5 and introduces a `clear_current_caption` function in the transcription filter callbacks, which clears the current caption and marks the last subtitle as cleared. This ensures that the correct version is displayed and any previous captions are removed.

refactor: Remove trailing whitespace in translation-language-utils.h

The code change removes trailing whitespace in the `translation-language-utils.h` file, improving code readability and consistency.
---
 buildspec.json                               |   2 +-
 cmake/BuildWhispercpp.cmake                  |  12 +--
 src/transcription-filter-callbacks.cpp       |  11 +-
 src/transcription-filter-callbacks.h         |   2 +
 src/transcription-filter-data.h              |   1 +
 src/transcription-filter-properties.cpp      |   2 +-
 src/transcription-filter.cpp                 |   1 +
 src/translation/translation-language-utils.h |   2 +-
 src/ui/filter-replace-dialog.cpp             |  28 ++++++
 src/ui/filter-replace-dialog.h               |   1 +
 src/ui/filter-replace-dialog.ui              | 100 +++++++++++++------
 src/whisper-utils/whisper-processing.cpp     |  23 +++--
 12 files changed, 138 insertions(+), 47 deletions(-)

diff --git a/buildspec.json b/buildspec.json
index 22a0b17..fd6c6f7 100644
--- a/buildspec.json
+++ b/buildspec.json
@@ -38,7 +38,7 @@
     },
     "name": "obs-localvocal",
     "displayName": "OBS Localvocal",
-    "version": "0.3.4",
+    "version": "0.3.5",
     "author": "Roy Shilkrot",
     "website": "https://github.com/occ-ai/obs-localvocal",
     "email": "roy.shil@gmail.com",
diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake
index 25d69d1..66e0f0b 100644
--- a/cmake/BuildWhispercpp.cmake
+++ b/cmake/BuildWhispercpp.cmake
@@ -1,16 +1,16 @@
 include(ExternalProject)
 include(FetchContent)
 
-set(PREBUILT_WHISPERCPP_VERSION "0.0.5")
+set(PREBUILT_WHISPERCPP_VERSION "0.0.6")
 set(PREBUILT_WHISPERCPP_URL_BASE
     "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}")
 
 if(APPLE)
   # check the "MACOS_ARCH" env var to figure out if this is x86 or arm64
   if($ENV{MACOS_ARCH} STREQUAL "x86_64")
-    set(WHISPER_CPP_HASH "da61500b9a37f8630b9e4ed49bc3fe7858729d7a28a2e80bf6cfa4cb97523546")
+    set(WHISPER_CPP_HASH "454abee900a96a0a10a91f631ff797bdbdf2df0d2a819479a409634c9be1e12c")
   elseif($ENV{MACOS_ARCH} STREQUAL "arm64")
-    set(WHISPER_CPP_HASH "ef1e2628ba09414c0848d58c471440f38b8393cb5d428edf82b9e78aeeecdd15")
+    set(WHISPER_CPP_HASH "f726388cc494f6fca864c860af6c1bc2932c3dc823ef92197b1e29f088425668")
   else()
     message(
       FATAL_ERROR
@@ -54,13 +54,13 @@ elseif(WIN32)
   set(WHISPER_CPP_URL
       "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip")
   if(${ACCELERATION} STREQUAL "cpu")
-    set(WHISPER_CPP_HASH "2b1cfa0dd764132c4cde60e112a8e6328d28d158d91a8845080baa3e9d2dcdcd")
+    set(WHISPER_CPP_HASH "126c5d859e902b4cd0f2cd09304a68750f1dbc6a7aa62e280cfd56c51a6a1c95")
     add_compile_definitions("LOCALVOCAL_WITH_CPU")
   elseif(${ACCELERATION} STREQUAL "cuda")
-    set(WHISPER_CPP_HASH "011e813742fddf0911c4a36d2080d7a388cf78738081297088e7d50023e4f9bc")
+    set(WHISPER_CPP_HASH "5b9592c311a7f1612894ca0b36f6bd4effb6a46acd03d33924df56c52f566779")
     add_compile_definitions("LOCALVOCAL_WITH_CUDA")
   elseif(${ACCELERATION} STREQUAL "hipblas")
-    set(WHISPER_CPP_HASH "f2980d6cd3df9cac464378d26d2c19d827bcac995c8d0398a39230a9be936013")
+    set(WHISPER_CPP_HASH "c306ecce16cd10f377fdefbf7bb252abac8e6638a2637f82b1f1f32dd2cb4e39")
     add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS")
   else()
     message(
diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index 7b8208f..ff204b4 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -218,7 +218,8 @@ void set_text_callback(struct transcription_filter_data *gf,
 		for (const auto &filter_words : gf->filter_words_replace) {
 			// if filter exists within str_copy, replace it with the replacement
 			str_copy = std::regex_replace(str_copy,
-						      std::regex(std::get<0>(filter_words)),
+						      std::regex(std::get<0>(filter_words),
+								 std::regex_constants::icase),
 						      std::get<1>(filter_words));
 		}
 		// if the text was modified, log the original and modified text
@@ -322,7 +323,7 @@ void recording_state_callback(enum obs_frontend_event event, void *data)
 	}
 }
 
-void reset_caption_state(transcription_filter_data *gf_)
+void clear_current_caption(transcription_filter_data *gf_)
 {
 	if (gf_->captions_monitor.isEnabled()) {
 		gf_->captions_monitor.clear();
@@ -336,6 +337,12 @@ void reset_caption_state(transcription_filter_data *gf_)
 	gf_->translation_ctx.last_input_tokens.clear();
 	gf_->translation_ctx.last_translation_tokens.clear();
 	gf_->last_transcription_sentence.clear();
+	gf_->cleared_last_sub = true;
+}
+
+void reset_caption_state(transcription_filter_data *gf_)
+{
+	clear_current_caption(gf_);
 	// flush the buffer
 	{
 		std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
diff --git a/src/transcription-filter-callbacks.h b/src/transcription-filter-callbacks.h
index e8bdf3b..1f11ad8 100644
--- a/src/transcription-filter-callbacks.h
+++ b/src/transcription-filter-callbacks.h
@@ -17,6 +17,8 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 void set_text_callback(struct transcription_filter_data *gf,
 		       const DetectionResultWithText &resultIn);
 
+void clear_current_caption(transcription_filter_data *gf_);
+
 void recording_state_callback(enum obs_frontend_event event, void *data);
 
 void media_play_callback(void *data_, calldata_t *cd);
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index e1af694..e8990be 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -152,6 +152,7 @@ struct transcription_filter_audio_info {
 
 // Callback sent when the transcription has a new result
 void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
+void clear_current_caption(transcription_filter_data *gf_);
 
 // Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1
 // The audio chunk is in 32-bit float format
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index 815b9e3..a2c9da1 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -622,7 +622,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_double(s, "thold_ptsum", 0.01);
 	obs_data_set_default_int(s, "max_len", 0);
 	obs_data_set_default_bool(s, "split_on_word", true);
-	obs_data_set_default_int(s, "max_tokens", 0);
+	obs_data_set_default_int(s, "max_tokens", 50);
 	obs_data_set_default_bool(s, "suppress_blank", false);
 	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
 	obs_data_set_default_double(s, "temperature", 0.1);
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 90e67eb..65ae072 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -396,6 +396,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
 		gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
 		gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
+		gf->whisper_params.no_timestamps = true;
 
 		if (gf->vad) {
 			const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");
diff --git a/src/translation/translation-language-utils.h b/src/translation/translation-language-utils.h
index 44b450a..d2f4c47 100644
--- a/src/translation/translation-language-utils.h
+++ b/src/translation/translation-language-utils.h
@@ -5,4 +5,4 @@
 
 std::string remove_start_punctuation(const std::string &text);
 
-#endif // TRANSLATION_LANGUAGE_UTILS_H
\ No newline at end of file
+#endif // TRANSLATION_LANGUAGE_UTILS_H
diff --git a/src/ui/filter-replace-dialog.cpp b/src/ui/filter-replace-dialog.cpp
index b491a31..d3d0f84 100644
--- a/src/ui/filter-replace-dialog.cpp
+++ b/src/ui/filter-replace-dialog.cpp
@@ -27,6 +27,9 @@ FilterReplaceDialog::FilterReplaceDialog(QWidget *parent, transcription_filter_d
 	// connect edit triggers
 	connect(ui->tableWidget, &QTableWidget::itemChanged, this,
 		&FilterReplaceDialog::editFilter);
+	// connect toolButton_addPrepopulatedFilter
+	connect(ui->toolButton_addPrepopulatedFilter, &QToolButton::clicked, this,
+		&FilterReplaceDialog::addPrepopulatedFilter);
 }
 
 FilterReplaceDialog::~FilterReplaceDialog()
@@ -73,3 +76,28 @@ void FilterReplaceDialog::editFilter(QTableWidgetItem *item)
 	// use the row number to update the filter_words_replace map
 	ctx->filter_words_replace[item->row()] = std::make_tuple(key, value);
 }
+
+void FilterReplaceDialog::addPrepopulatedFilter()
+{
+	// add a prepopulated filter_words_replace map entry
+	// check the value of the comboBox_selectPrepopulatedFilter
+	// and add the corresponding filter_words_replace map entry
+	std::string replace_value = "";
+	std::string replace_pattern;
+	const std::string selected =
+		ui->comboBox_selectPrepopulatedFilter->currentText().toStdString();
+	if (selected == "English Swear Words") {
+		replace_pattern = "(fuck|shit|bitch|cunt|cock|dick|pussy)";
+		replace_value = "****";
+	} else if (selected == "English Hallucinations") {
+		replace_pattern = "(Thank you|Thanks for watching|Please subscribe)";
+	} else if (selected == "Korean Hallucinations") {
+		replace_pattern = "MBC.*";
+	}
+	ctx->filter_words_replace.push_back(std::make_tuple(replace_pattern, replace_value));
+	ui->tableWidget->insertRow(ui->tableWidget->rowCount());
+	ui->tableWidget->setItem(ui->tableWidget->rowCount() - 1, 0,
+				 new QTableWidgetItem(QString::fromStdString(replace_pattern)));
+	ui->tableWidget->setItem(ui->tableWidget->rowCount() - 1, 1,
+				 new QTableWidgetItem(QString::fromStdString(replace_value)));
+}
diff --git a/src/ui/filter-replace-dialog.h b/src/ui/filter-replace-dialog.h
index d392a80..5e85b1e 100644
--- a/src/ui/filter-replace-dialog.h
+++ b/src/ui/filter-replace-dialog.h
@@ -25,6 +25,7 @@ private slots:
 	void addFilter();
 	void removeFilter();
 	void editFilter(QTableWidgetItem *item);
+	void addPrepopulatedFilter();
 };
 
 #endif // FILTERREPLACEDIALOG_H
diff --git a/src/ui/filter-replace-dialog.ui b/src/ui/filter-replace-dialog.ui
index d67830c..ccc99f3 100644
--- a/src/ui/filter-replace-dialog.ui
+++ b/src/ui/filter-replace-dialog.ui
@@ -14,33 +14,7 @@
    <string>Filter and Replace</string>
   </property>
   <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="0">
-    <widget class="QTableWidget" name="tableWidget">
-     <property name="rowCount">
-      <number>0</number>
-     </property>
-     <attribute name="horizontalHeaderDefaultSectionSize">
-      <number>180</number>
-     </attribute>
-     <attribute name="horizontalHeaderStretchLastSection">
-      <bool>true</bool>
-     </attribute>
-     <attribute name="verticalHeaderVisible">
-      <bool>false</bool>
-     </attribute>
-     <column>
-      <property name="text">
-       <string>Word / Phrase (Regex)</string>
-      </property>
-     </column>
-     <column>
-      <property name="text">
-       <string>Replace Value</string>
-      </property>
-     </column>
-    </widget>
-   </item>
-   <item row="2" column="0">
+   <item row="3" column="0">
     <widget class="QWidget" name="widget" native="true">
      <layout class="QHBoxLayout" name="horizontalLayout">
       <property name="spacing">
@@ -85,13 +59,83 @@
      </layout>
     </widget>
    </item>
-   <item row="1" column="0">
+   <item row="0" column="0">
+    <widget class="QTableWidget" name="tableWidget">
+     <property name="rowCount">
+      <number>0</number>
+     </property>
+     <attribute name="horizontalHeaderDefaultSectionSize">
+      <number>180</number>
+     </attribute>
+     <attribute name="horizontalHeaderStretchLastSection">
+      <bool>true</bool>
+     </attribute>
+     <attribute name="verticalHeaderVisible">
+      <bool>false</bool>
+     </attribute>
+     <column>
+      <property name="text">
+       <string>Word / Phrase (Regex)</string>
+      </property>
+     </column>
+     <column>
+      <property name="text">
+       <string>Replace Value</string>
+      </property>
+     </column>
+    </widget>
+   </item>
+   <item row="2" column="0">
     <widget class="QLabel" name="label">
      <property name="text">
       <string>Regex enabled. Use empty Replace Value to filter.</string>
      </property>
     </widget>
    </item>
+   <item row="1" column="0">
+    <widget class="QWidget" name="widget_2" native="true">
+     <layout class="QHBoxLayout" name="horizontalLayout_2">
+      <property name="leftMargin">
+       <number>0</number>
+      </property>
+      <property name="topMargin">
+       <number>0</number>
+      </property>
+      <property name="rightMargin">
+       <number>0</number>
+      </property>
+      <property name="bottomMargin">
+       <number>0</number>
+      </property>
+      <item>
+       <widget class="QComboBox" name="comboBox_selectPrepopulatedFilter">
+        <item>
+         <property name="text">
+          <string>English Swear Words</string>
+         </property>
+        </item>
+        <item>
+         <property name="text">
+          <string>English Hallucinations</string>
+         </property>
+        </item>
+        <item>
+         <property name="text">
+          <string>Korean Hallucinations</string>
+         </property>
+        </item>
+       </widget>
+      </item>
+      <item>
+       <widget class="QToolButton" name="toolButton_addPrepopulatedFilter">
+        <property name="text">
+         <string>Add</string>
+        </property>
+       </widget>
+      </item>
+     </layout>
+    </widget>
+   </item>
   </layout>
  </widget>
  <resources/>
diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index 6da91d9..3518edf 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -161,11 +161,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 
 	if (pcm32f_num_samples < WHISPER_SAMPLE_RATE) {
 		obs_log(gf->log_level,
-			"Speech segment is less than 1 second, padding with zeros to 1 second");
+			"Speech segment is less than 1 second, padding with white noise to 1 second");
 		const size_t new_size = (size_t)(1.01f * (float)(WHISPER_SAMPLE_RATE));
 		// create a new buffer and copy the data to it in the middle
 		pcm32f_data = (float *)bzalloc(new_size * sizeof(float));
-		memset(pcm32f_data, 0, new_size * sizeof(float));
+
+		// add low volume white noise
+		const float noise_level = 0.01f;
+		for (size_t i = 0; i < new_size; ++i) {
+			pcm32f_data[i] =
+				noise_level * ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f);
+		}
+
 		memcpy(pcm32f_data + (new_size - pcm32f_num_samples) / 2, pcm32f_data_,
 		       pcm32f_num_samples * sizeof(float));
 		pcm32f_size = new_size;
@@ -234,10 +241,11 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 			// get token
 			whisper_token_data token =
 				whisper_full_get_token_data(gf->whisper_context, n_segment, j);
-			const char *token_str = whisper_token_to_str(gf->whisper_context, token.id);
+			const std::string token_str =
+				whisper_token_to_str(gf->whisper_context, token.id);
 			bool keep = true;
 			// if the token starts with '[' and ends with ']', don't keep it
-			if (token_str[0] == '[' && token_str[strlen(token_str) - 1] == ']') {
+			if (token_str[0] == '[' && token_str[token_str.size() - 1] == ']') {
 				keep = false;
 			}
 			// if this is a special token, don't keep it
@@ -271,8 +279,8 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 				text += token_str;
 				tokens.push_back(token);
 			}
-			obs_log(gf->log_level, "S %d, T %d: %d\t%s\tp: %.3f [keep: %d]", n_segment,
-				j, token.id, token_str, token.p, keep);
+			obs_log(gf->log_level, "S %d, T %2d: %5d\t%s\tp: %.3f [keep: %d]",
+				n_segment, j, token.id, token_str.c_str(), token.p, keep);
 		}
 	}
 	sentence_p /= (float)tokens.size();
@@ -379,8 +387,7 @@ void whisper_loop(void *data)
 				obs_log(gf->log_level,
 					"Clearing current subtitle. now: %lu ms, last: %lu ms", now,
 					gf->last_sub_render_time);
-				set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}});
-				gf->cleared_last_sub = true;
+				clear_current_caption(gf);
 			}
 		}