Merge pull request #31 from obs-ai/roy.fix_russian_win32_and_model_load

fix model load, fix Russian utf8 on Windows
2024-11-08 03:08:07 +00:00 · 2023-10-04 00:01:57 -04:00 · 2023-10-04 00:01:57 -04:00 · 3c4c1c3ddd
commit 3c4c1c3ddd
parent 2e78d47aa6 f90270b5fe
2 changed files with 91 additions and 31 deletions
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@ -43,6 +43,7 @@ struct transcription_filter_data {
 	/* whisper */
 	char *whisper_model_path = nullptr;
 	std::string whisper_model_file_currently_loaded = "";
 	struct whisper_context *whisper_context = nullptr;
 	whisper_full_params whisper_params;
@ -50,7 +51,7 @@ struct transcription_filter_data {
 	bool do_silence;
 	bool vad_enabled;
-	int log_level;
+	int log_level = LOG_DEBUG;
 	bool log_words;
 	bool caption_to_stream;
 	bool active = false;
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@ -10,6 +10,12 @@
 #include <algorithm>
 #include <fstream>
 #include <sstream>
 #include <iomanip>
 #include <bitset>
 #ifdef _WIN32
 #include <Windows.h>
 #endif
 inline enum speaker_layout convert_speaker_layout(uint8_t channels)
 {
@ -175,6 +181,21 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
 void set_text_callback(struct transcription_filter_data *gf, const std::string &str)
 {
 #ifdef _WIN32
 	// Russian UTF8 charset on Windows output has a bug, instead of 0xd? it outputs
 	// 0xf?, so we need to replace it. This doesn't affect any other charset, which
 	// outputs the correct UTF8 output. (Except maybe for Greek?)
 	std::string str_copy = str;
 	for (size_t i = 0; i < str_copy.size(); ++i) {
 		// if the char MSBs starts with 0xf replace the MSBs with 0xd
 		if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
 			str_copy[i] = (str_copy.c_str()[i] & 0x0f) | 0xd0;
 		}
 	}
 #else
 	std::string str_copy = str;
 #endif
 	if (gf->caption_to_stream) {
 		obs_output_t *streaming_output = obs_frontend_get_streaming_output();
 		if (streaming_output) {
@ -210,7 +231,7 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string &
 			return;
 		}
 		auto text_settings = obs_source_get_settings(target);
-		obs_data_set_string(text_settings, "text", str.c_str());
+		obs_data_set_string(text_settings, "text", str_copy.c_str());
 		obs_source_update(target, text_settings);
 		obs_source_release(target);
 	}
@ -218,6 +239,7 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string &
 void shutdown_whisper_thread(struct transcription_filter_data *gf)
 {
 	obs_log(gf->log_level, "shutdown_whisper_thread");
 	if (gf->whisper_context != nullptr) {
 		// acquire the mutex before freeing the context
 		if (!gf->whisper_ctx_mutex || !gf->wshiper_thread_cv) {
@ -232,6 +254,28 @@ void shutdown_whisper_thread(struct transcription_filter_data *gf)
 	if (gf->whisper_thread.joinable()) {
 		gf->whisper_thread.join();
 	}
 	if (gf->whisper_model_path != nullptr) {
 		bfree(gf->whisper_model_path);
 		gf->whisper_model_path = nullptr;
 	}
 }
 void start_whisper_thread_with_path(struct transcription_filter_data *gf, const std::string &path)
 {
 	obs_log(gf->log_level, "start_whisper_thread_with_path: %s", path.c_str());
 	if (!gf->whisper_ctx_mutex) {
 		obs_log(LOG_ERROR, "cannot init whisper: whisper_ctx_mutex is null");
 		return;
 	}
 	std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
 	if (gf->whisper_context != nullptr) {
 		obs_log(LOG_ERROR, "cannot init whisper: whisper_context is not null");
 		return;
 	}
 	gf->whisper_context = init_whisper_context(path);
 	gf->whisper_model_file_currently_loaded = path;
 	std::thread new_whisper_thread(whisper_loop, gf);
 	gf->whisper_thread.swap(new_whisper_thread);
 }
 void transcription_filter_update(void *data, obs_data_t *s)
@ -239,8 +283,9 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);
 	obs_log(gf->log_level, "transcription_filter_update");
 	gf->log_level = (int)obs_data_get_int(s, "log_level");
 	obs_log(gf->log_level, "transcription_filter_update");
 	gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
 	gf->log_words = obs_data_get_bool(s, "log_words");
 	gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
@ -310,20 +355,21 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	obs_log(gf->log_level, "transcription_filter: update whisper model");
 	// update the whisper model path
 	std::string new_model_path = obs_data_get_string(s, "whisper_model_path");
 	const bool is_external_model = new_model_path.find("!!!external!!!") != std::string::npos;
 	if (gf->whisper_model_path == nullptr ||
-	    strcmp(new_model_path.c_str(), gf->whisper_model_path) != 0) {
+	    strcmp(new_model_path.c_str(), gf->whisper_model_path) != 0 || is_external_model) {
 		// model path changed, reload the model
-		obs_log(LOG_INFO, "model path changed, reloading model");
+		obs_log(gf->log_level, "model path changed from %s to %s", gf->whisper_model_path,
-		shutdown_whisper_thread(gf);
+			new_model_path.c_str());
 		if (gf->whisper_model_path != nullptr) {
 			bfree(gf->whisper_model_path);
 		}
 		gf->whisper_model_path = bstrdup(new_model_path.c_str());
 		// check if the new model is external file
-		if (new_model_path.find("!!!external!!!") == std::string::npos) {
+		if (!is_external_model) {
 			// new model is not external file
 			shutdown_whisper_thread(gf);
 			gf->whisper_model_path = bstrdup(new_model_path.c_str());
 			// check if the model exists, if not, download it
 			std::string model_file_found = find_model_file(gf->whisper_model_path);
 			if (model_file_found == "") {
@ -334,29 +380,39 @@ void transcription_filter_update(void *data, obs_data_t *s)
 						if (download_status == 0) {
 							obs_log(LOG_INFO,
 								"Model download complete");
-							gf->whisper_context =
+							start_whisper_thread_with_path(gf, path);
 								init_whisper_context(path);
 							std::thread new_whisper_thread(whisper_loop,
 										       gf);
 							gf->whisper_thread.swap(new_whisper_thread);
 						} else {
 							obs_log(LOG_ERROR, "Model download failed");
 						}
 					});
 			} else {
 				// Model exists, just load it
-				gf->whisper_context = init_whisper_context(model_file_found);
+				start_whisper_thread_with_path(gf, model_file_found);
 				std::thread new_whisper_thread(whisper_loop, gf);
 				gf->whisper_thread.swap(new_whisper_thread);
 			}
 		} else {
-			// new model is local file, get file location from file property
+			// new model is external file, get file location from file property
 			std::string external_model_file_path =
 				obs_data_get_string(s, "whisper_model_path_external");
-			gf->whisper_context = init_whisper_context(external_model_file_path);
+			if (external_model_file_path.empty()) {
-			std::thread new_whisper_thread(whisper_loop, gf);
+				obs_log(LOG_WARNING, "External model file path is empty");
-			gf->whisper_thread.swap(new_whisper_thread);
+			} else {
 				// check if the external model file is not currently loaded
 				if (gf->whisper_model_file_currently_loaded ==
 				    external_model_file_path) {
 					obs_log(LOG_INFO, "External model file is already loaded");
 					return;
 				} else {
 					shutdown_whisper_thread(gf);
 					gf->whisper_model_path = bstrdup(new_model_path.c_str());
 					start_whisper_thread_with_path(gf,
 								       external_model_file_path);
 				}
 			}
 		}
 	} else {
 		// model path did not change
 		obs_log(LOG_INFO, "model path did not change: %s == %s", gf->whisper_model_path,
 			new_model_path.c_str());
 	}
 	if (!gf->whisper_ctx_mutex) {
@ -409,6 +465,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->step_size_msec = step_by_step_processing
 				     ? (int)obs_data_get_int(settings, "step_size_msec")
 				     : BUFFER_SIZE_MSEC;
 	gf->log_level = (int)obs_data_get_int(settings, "log_level");
 	for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
 		circlebuf_init(&gf->input_buffers[i]);
@ -423,7 +480,6 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	}
 	gf->context = filter;
 	gf->whisper_model_path = nullptr; // The update function will set the model path
 	gf->overlap_ms = OVERLAP_SIZE_MSEC;
 	gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms));
@ -450,6 +506,9 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->text_source = nullptr;
 	gf->text_source_name = bstrdup(obs_data_get_string(settings, "subtitle_sources"));
 	gf->output_file_path = std::string("");
 	gf->whisper_model_path = nullptr; // The update function will set the model path
 	gf->whisper_context = nullptr;
 	gf->whisper_model_file_currently_loaded = "";
 	obs_log(gf->log_level, "transcription_filter: run update");
 	// get the settings updated on the filter data struct
@ -479,6 +538,8 @@ void transcription_filter_deactivate(void *data)
 void transcription_filter_defaults(obs_data_t *s)
 {
 	obs_log(LOG_INFO, "transcription_filter_defaults");
 	obs_data_set_default_bool(s, "vad_enabled", true);
 	obs_data_set_default_int(s, "log_level", LOG_DEBUG);
 	obs_data_set_default_bool(s, "log_words", true);
@ -505,7 +566,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_double(s, "thold_pt", 0.01);
 	obs_data_set_default_double(s, "thold_ptsum", 0.01);
 	obs_data_set_default_int(s, "max_len", 0);
-	obs_data_set_default_bool(s, "split_on_word", false);
+	obs_data_set_default_bool(s, "split_on_word", true);
 	obs_data_set_default_int(s, "max_tokens", 32);
 	obs_data_set_default_bool(s, "speed_up", false);
 	obs_data_set_default_bool(s, "suppress_blank", false);
@ -517,6 +578,8 @@ void transcription_filter_defaults(obs_data_t *s)
 obs_properties_t *transcription_filter_properties(void *data)
 {
 	obs_log(LOG_INFO, "transcription_filter_properties");
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);
@ -603,16 +666,12 @@ obs_properties_t *transcription_filter_properties(void *data)
 		whisper_model_path_external,
 		[](void *data_, obs_properties_t *props, obs_property_t *property,
 		   obs_data_t *settings) {
 			obs_log(LOG_INFO, "whisper_model_path_external modified");
 			UNUSED_PARAMETER(property);
 			UNUSED_PARAMETER(props);
 			struct transcription_filter_data *gf_ =
 				static_cast<struct transcription_filter_data *>(data_);
-			shutdown_whisper_thread(gf_);
+			transcription_filter_update(gf_, settings);
 			std::string external_model_file_path =
 				obs_data_get_string(settings, "whisper_model_path_external");
 			gf_->whisper_context = init_whisper_context(external_model_file_path);
 			std::thread new_whisper_thread(whisper_loop, gf_);
 			gf_->whisper_thread.swap(new_whisper_thread);
 			return true;
 		},
 		gf);