refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks (#164)

* refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks
* feat: Refactor whisper-processing.cpp for improved VAD segmentation and token buffer thread
* feat: Update prebuilt Whispercpp version to 0.0.6
* refactor: Remove trailing whitespace in translation-language-utils.h
* refactor: Add case-insensitive flag to regex in set_text_callback

The code change adds the `std::regex_constants::icase` flag to the regex used in the `set_text_callback` function in `transcription-filter-callbacks.cpp`. This allows for case-insensitive matching when replacing filter words in the `str_copy` string.

Refactor the code to improve VAD segmentation and token buffer thread in whisper-processing.cpp

The code change refactors the `whisper-processing.cpp` file to improve the VAD (Voice Activity Detection) segmentation and token buffer thread. This aims to enhance the performance and accuracy of the transcription filtering process.

refactor: Add prepopulated filter options and corresponding map entries in FilterReplaceDialog

The code change adds prepopulated filter options, such as "English Swear Words," "English Hallucinations," and "Korean Hallucinations," to the `FilterReplaceDialog` UI. It also adds the corresponding map entries to the `filter_words_replace` map, allowing users to easily add predefined filter patterns and replacement values.

refactor: Update version to 0.3.5 and clear current caption in transcription filter callbacks

The code change updates the version to 0.3.5 and clears the current caption in the transcription filter callbacks. This ensures that the correct version is displayed and any previous captions are removed.

refactor: Remove trailing whitespace in translation-language-utils.h

The code change removes trailing whitespace in the `translation-language-utils.h` file, improving code readability and consistency.
2024-11-07 02:40:12 +00:00 · 2024-09-12 20:06:26 -04:00 · 2024-09-12 20:06:26 -04:00 · 024502333a
commit 024502333a
parent abe678bbb1
12 changed files with 138 additions and 47 deletions
--- a/buildspec.json
+++ b/buildspec.json
@ -38,7 +38,7 @@
    },
    "name": "obs-localvocal",
    "displayName": "OBS Localvocal",
-    "version": "0.3.4",
+    "version": "0.3.5",
    "author": "Roy Shilkrot",
    "website": "https://github.com/occ-ai/obs-localvocal",
    "email": "roy.shil@gmail.com",
--- a/cmake/BuildWhispercpp.cmake
+++ b/cmake/BuildWhispercpp.cmake
@ -1,16 +1,16 @@
 include(ExternalProject)
 include(FetchContent)

-set(PREBUILT_WHISPERCPP_VERSION "0.0.5")
+set(PREBUILT_WHISPERCPP_VERSION "0.0.6")
 set(PREBUILT_WHISPERCPP_URL_BASE
    "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/${PREBUILT_WHISPERCPP_VERSION}")

 if(APPLE)
  # check the "MACOS_ARCH" env var to figure out if this is x86 or arm64
  if($ENV{MACOS_ARCH} STREQUAL "x86_64")
-    set(WHISPER_CPP_HASH "da61500b9a37f8630b9e4ed49bc3fe7858729d7a28a2e80bf6cfa4cb97523546")
+    set(WHISPER_CPP_HASH "454abee900a96a0a10a91f631ff797bdbdf2df0d2a819479a409634c9be1e12c")
  elseif($ENV{MACOS_ARCH} STREQUAL "arm64")
-    set(WHISPER_CPP_HASH "ef1e2628ba09414c0848d58c471440f38b8393cb5d428edf82b9e78aeeecdd15")
+    set(WHISPER_CPP_HASH "f726388cc494f6fca864c860af6c1bc2932c3dc823ef92197b1e29f088425668")
  else()
    message(
      FATAL_ERROR
@ -54,13 +54,13 @@ elseif(WIN32)
  set(WHISPER_CPP_URL
      "${PREBUILT_WHISPERCPP_URL_BASE}/whispercpp-windows-${ARCH_PREFIX}-${PREBUILT_WHISPERCPP_VERSION}.zip")
  if(${ACCELERATION} STREQUAL "cpu")
-    set(WHISPER_CPP_HASH "2b1cfa0dd764132c4cde60e112a8e6328d28d158d91a8845080baa3e9d2dcdcd")
+    set(WHISPER_CPP_HASH "126c5d859e902b4cd0f2cd09304a68750f1dbc6a7aa62e280cfd56c51a6a1c95")
    add_compile_definitions("LOCALVOCAL_WITH_CPU")
  elseif(${ACCELERATION} STREQUAL "cuda")
-    set(WHISPER_CPP_HASH "011e813742fddf0911c4a36d2080d7a388cf78738081297088e7d50023e4f9bc")
+    set(WHISPER_CPP_HASH "5b9592c311a7f1612894ca0b36f6bd4effb6a46acd03d33924df56c52f566779")
    add_compile_definitions("LOCALVOCAL_WITH_CUDA")
  elseif(${ACCELERATION} STREQUAL "hipblas")
-    set(WHISPER_CPP_HASH "f2980d6cd3df9cac464378d26d2c19d827bcac995c8d0398a39230a9be936013")
+    set(WHISPER_CPP_HASH "c306ecce16cd10f377fdefbf7bb252abac8e6638a2637f82b1f1f32dd2cb4e39")
    add_compile_definitions("LOCALVOCAL_WITH_HIPBLAS")
  else()
    message(
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@ -218,7 +218,8 @@ void set_text_callback(struct transcription_filter_data *gf,
 		for (const auto &filter_words : gf->filter_words_replace) {
 			// if filter exists within str_copy, replace it with the replacement
 			str_copy = std::regex_replace(str_copy,
-						      std::regex(std::get<0>(filter_words)),
+						      std::regex(std::get<0>(filter_words),
+								 std::regex_constants::icase),
 						      std::get<1>(filter_words));
 		}
 		// if the text was modified, log the original and modified text
@ -322,7 +323,7 @@ void recording_state_callback(enum obs_frontend_event event, void *data)
 	}
 }

-void reset_caption_state(transcription_filter_data *gf_)
+void clear_current_caption(transcription_filter_data *gf_)
 {
 	if (gf_->captions_monitor.isEnabled()) {
 		gf_->captions_monitor.clear();
@ -336,6 +337,12 @@ void reset_caption_state(transcription_filter_data *gf_)
 	gf_->translation_ctx.last_input_tokens.clear();
 	gf_->translation_ctx.last_translation_tokens.clear();
 	gf_->last_transcription_sentence.clear();
+	gf_->cleared_last_sub = true;
+}
+
+void reset_caption_state(transcription_filter_data *gf_)
+{
+	clear_current_caption(gf_);
 	// flush the buffer
 	{
 		std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);
--- a/src/transcription-filter-callbacks.h
+++ b/src/transcription-filter-callbacks.h
@ -17,6 +17,8 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 void set_text_callback(struct transcription_filter_data *gf,
 		       const DetectionResultWithText &resultIn);

+void clear_current_caption(transcription_filter_data *gf_);
+
 void recording_state_callback(enum obs_frontend_event event, void *data);

 void media_play_callback(void *data_, calldata_t *cd);
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@ -152,6 +152,7 @@ struct transcription_filter_audio_info {

 // Callback sent when the transcription has a new result
 void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
+void clear_current_caption(transcription_filter_data *gf_);

 // Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1
 // The audio chunk is in 32-bit float format
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@ -622,7 +622,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_double(s, "thold_ptsum", 0.01);
 	obs_data_set_default_int(s, "max_len", 0);
 	obs_data_set_default_bool(s, "split_on_word", true);
-	obs_data_set_default_int(s, "max_tokens", 0);
+	obs_data_set_default_int(s, "max_tokens", 50);
 	obs_data_set_default_bool(s, "suppress_blank", false);
 	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
 	obs_data_set_default_double(s, "temperature", 0.1);
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@ -396,6 +396,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		gf->whisper_params.temperature = (float)obs_data_get_double(s, "temperature");
 		gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts");
 		gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty");
+		gf->whisper_params.no_timestamps = true;

 		if (gf->vad) {
 			const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold");
--- a/src/translation/translation-language-utils.h
+++ b/src/translation/translation-language-utils.h
@ -5,4 +5,4 @@

 std::string remove_start_punctuation(const std::string &text);

-#endif // TRANSLATION_LANGUAGE_UTILS_H
+#endif // TRANSLATION_LANGUAGE_UTILS_H
--- a/src/ui/filter-replace-dialog.cpp
+++ b/src/ui/filter-replace-dialog.cpp
@ -27,6 +27,9 @@ FilterReplaceDialog::FilterReplaceDialog(QWidget *parent, transcription_filter_d
 	// connect edit triggers
 	connect(ui->tableWidget, &QTableWidget::itemChanged, this,
 		&FilterReplaceDialog::editFilter);
+	// connect toolButton_addPrepopulatedFilter
+	connect(ui->toolButton_addPrepopulatedFilter, &QToolButton::clicked, this,
+		&FilterReplaceDialog::addPrepopulatedFilter);
 }

 FilterReplaceDialog::~FilterReplaceDialog()
@ -73,3 +76,28 @@ void FilterReplaceDialog::editFilter(QTableWidgetItem *item)
 	// use the row number to update the filter_words_replace map
 	ctx->filter_words_replace[item->row()] = std::make_tuple(key, value);
 }
+
+// Slot for toolButton_addPrepopulatedFilter: appends the preset currently selected
+// in comboBox_selectPrepopulatedFilter to ctx->filter_words_replace and shows it
+// as a new bottom row of the table widget.
+// A selection that matches none of the known presets adds nothing.
+void FilterReplaceDialog::addPrepopulatedFilter()
+{
+	const std::string selected =
+		ui->comboBox_selectPrepopulatedFilter->currentText().toStdString();
+
+	// regex pattern and replacement text of the chosen preset; the replacement
+	// stays empty for presets that should simply remove the matched text
+	std::string replace_pattern;
+	std::string replace_value;
+	if (selected == "English Swear Words") {
+		replace_pattern = "(fuck|shit|bitch|cunt|cock|dick|pussy)";
+		replace_value = "****";
+	} else if (selected == "English Hallucinations") {
+		replace_pattern = "(Thank you|Thanks for watching|Please subscribe)";
+	} else if (selected == "Korean Hallucinations") {
+		replace_pattern = "MBC.*";
+	} else {
+		// Unknown or empty selection: do not store an empty regex pattern or
+		// add a blank table row.
+		return;
+	}
+
+	// Keep the push_back ahead of the setItem calls: itemChanged is connected to
+	// editFilter, which indexes filter_words_replace by the item's row.
+	ctx->filter_words_replace.push_back(std::make_tuple(replace_pattern, replace_value));
+
+	const int new_row = ui->tableWidget->rowCount();
+	ui->tableWidget->insertRow(new_row);
+	ui->tableWidget->setItem(new_row, 0,
+				 new QTableWidgetItem(QString::fromStdString(replace_pattern)));
+	ui->tableWidget->setItem(new_row, 1,
+				 new QTableWidgetItem(QString::fromStdString(replace_value)));
+}
--- a/src/ui/filter-replace-dialog.h
+++ b/src/ui/filter-replace-dialog.h
@ -25,6 +25,7 @@ private slots:
 	void addFilter();
 	void removeFilter();
 	void editFilter(QTableWidgetItem *item);
+	void addPrepopulatedFilter();
 };

 #endif // FILTERREPLACEDIALOG_H
--- a/src/ui/filter-replace-dialog.ui
+++ b/src/ui/filter-replace-dialog.ui
@ -14,33 +14,7 @@
   <string>Filter and Replace</string>
  </property>
  <layout class="QGridLayout" name="gridLayout">
-   <item row="0" column="0">
-    <widget class="QTableWidget" name="tableWidget">
-     <property name="rowCount">
-      <number>0</number>
-     </property>
-     <attribute name="horizontalHeaderDefaultSectionSize">
-      <number>180</number>
-     </attribute>
-     <attribute name="horizontalHeaderStretchLastSection">
-      <bool>true</bool>
-     </attribute>
-     <attribute name="verticalHeaderVisible">
-      <bool>false</bool>
-     </attribute>
-     <column>
-      <property name="text">
-       <string>Word / Phrase (Regex)</string>
-      </property>
-     </column>
-     <column>
-      <property name="text">
-       <string>Replace Value</string>
-      </property>
-     </column>
-    </widget>
-   </item>
-   <item row="2" column="0">
+   <item row="3" column="0">
    <widget class="QWidget" name="widget" native="true">
     <layout class="QHBoxLayout" name="horizontalLayout">
      <property name="spacing">
@ -85,13 +59,83 @@
     </layout>
    </widget>
   </item>
-   <item row="1" column="0">
+   <item row="0" column="0">
+    <widget class="QTableWidget" name="tableWidget">
+     <property name="rowCount">
+      <number>0</number>
+     </property>
+     <attribute name="horizontalHeaderDefaultSectionSize">
+      <number>180</number>
+     </attribute>
+     <attribute name="horizontalHeaderStretchLastSection">
+      <bool>true</bool>
+     </attribute>
+     <attribute name="verticalHeaderVisible">
+      <bool>false</bool>
+     </attribute>
+     <column>
+      <property name="text">
+       <string>Word / Phrase (Regex)</string>
+      </property>
+     </column>
+     <column>
+      <property name="text">
+       <string>Replace Value</string>
+      </property>
+     </column>
+    </widget>
+   </item>
+   <item row="2" column="0">
    <widget class="QLabel" name="label">
     <property name="text">
      <string>Regex enabled. Use empty Replace Value to filter.</string>
     </property>
    </widget>
   </item>
+   <item row="1" column="0">
+    <widget class="QWidget" name="widget_2" native="true">
+     <layout class="QHBoxLayout" name="horizontalLayout_2">
+      <property name="leftMargin">
+       <number>0</number>
+      </property>
+      <property name="topMargin">
+       <number>0</number>
+      </property>
+      <property name="rightMargin">
+       <number>0</number>
+      </property>
+      <property name="bottomMargin">
+       <number>0</number>
+      </property>
+      <item>
+       <widget class="QComboBox" name="comboBox_selectPrepopulatedFilter">
+        <item>
+         <property name="text">
+          <string>English Swear Words</string>
+         </property>
+        </item>
+        <item>
+         <property name="text">
+          <string>English Hallucinations</string>
+         </property>
+        </item>
+        <item>
+         <property name="text">
+          <string>Korean Hallucinations</string>
+         </property>
+        </item>
+       </widget>
+      </item>
+      <item>
+       <widget class="QToolButton" name="toolButton_addPrepopulatedFilter">
+        <property name="text">
+         <string>Add</string>
+        </property>
+       </widget>
+      </item>
+     </layout>
+    </widget>
+   </item>
  </layout>
 </widget>
 <resources/>
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@ -161,11 +161,18 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter

 	if (pcm32f_num_samples < WHISPER_SAMPLE_RATE) {
 		obs_log(gf->log_level,
-			"Speech segment is less than 1 second, padding with zeros to 1 second");
+			"Speech segment is less than 1 second, padding with white noise to 1 second");
 		const size_t new_size = (size_t)(1.01f * (float)(WHISPER_SAMPLE_RATE));
 		// create a new buffer and copy the data to it in the middle
 		pcm32f_data = (float *)bzalloc(new_size * sizeof(float));
-		memset(pcm32f_data, 0, new_size * sizeof(float));
+
+		// add low volume white noise
+		const float noise_level = 0.01f;
+		for (size_t i = 0; i < new_size; ++i) {
+			pcm32f_data[i] =
+				noise_level * ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f);
+		}
+
 		memcpy(pcm32f_data + (new_size - pcm32f_num_samples) / 2, pcm32f_data_,
 		       pcm32f_num_samples * sizeof(float));
 		pcm32f_size = new_size;
@ -234,10 +241,11 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 			// get token
 			whisper_token_data token =
 				whisper_full_get_token_data(gf->whisper_context, n_segment, j);
-			const char *token_str = whisper_token_to_str(gf->whisper_context, token.id);
+			const std::string token_str =
+				whisper_token_to_str(gf->whisper_context, token.id);
 			bool keep = true;
 			// if the token starts with '[' and ends with ']', don't keep it
-			if (token_str[0] == '[' && token_str[strlen(token_str) - 1] == ']') {
+			if (token_str[0] == '[' && token_str[token_str.size() - 1] == ']') {
 				keep = false;
 			}
 			// if this is a special token, don't keep it
@ -271,8 +279,8 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 				text += token_str;
 				tokens.push_back(token);
 			}
-			obs_log(gf->log_level, "S %d, T %d: %d\t%s\tp: %.3f [keep: %d]", n_segment,
-				j, token.id, token_str, token.p, keep);
+			obs_log(gf->log_level, "S %d, T %2d: %5d\t%s\tp: %.3f [keep: %d]",
+				n_segment, j, token.id, token_str.c_str(), token.p, keep);
 		}
 	}
 	sentence_p /= (float)tokens.size();
@ -379,8 +387,7 @@ void whisper_loop(void *data)
 				obs_log(gf->log_level,
 					"Clearing current subtitle. now: %lu ms, last: %lu ms", now,
 					gf->last_sub_render_time);
-				set_text_callback(gf, {DETECTION_RESULT_UNKNOWN, "", 0, 0, {}});
-				gf->cleared_last_sub = true;
+				clear_current_caption(gf);
 			}
 		}