Merge pull request #31 from obs-ai/roy.fix_russian_win32_and_model_load

fix model load, fix Russian utf8 on Windows
This commit is contained in:
Roy Shilkrot 2023-10-04 00:01:57 -04:00 committed by GitHub
commit 3c4c1c3ddd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 91 additions and 31 deletions

View File

@ -43,6 +43,7 @@ struct transcription_filter_data {
/* whisper */ /* whisper */
char *whisper_model_path = nullptr; char *whisper_model_path = nullptr;
std::string whisper_model_file_currently_loaded = "";
struct whisper_context *whisper_context = nullptr; struct whisper_context *whisper_context = nullptr;
whisper_full_params whisper_params; whisper_full_params whisper_params;
@ -50,7 +51,7 @@ struct transcription_filter_data {
bool do_silence; bool do_silence;
bool vad_enabled; bool vad_enabled;
int log_level; int log_level = LOG_DEBUG;
bool log_words; bool log_words;
bool caption_to_stream; bool caption_to_stream;
bool active = false; bool active = false;

View File

@ -10,6 +10,12 @@
#include <algorithm> #include <algorithm>
#include <fstream> #include <fstream>
#include <sstream>
#include <iomanip>
#include <bitset>
#ifdef _WIN32
#include <Windows.h>
#endif
inline enum speaker_layout convert_speaker_layout(uint8_t channels) inline enum speaker_layout convert_speaker_layout(uint8_t channels)
{ {
@ -175,6 +181,21 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
void set_text_callback(struct transcription_filter_data *gf, const std::string &str) void set_text_callback(struct transcription_filter_data *gf, const std::string &str)
{ {
#ifdef _WIN32
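// On Windows, the Russian (Cyrillic) UTF-8 output has a bug: bytes that should be
// 0xd? come out as 0xf?, so we rewrite them here. Other charsets are believed to
// produce correct UTF-8 output and be unaffected. (Except maybe for Greek?)
// NOTE(review): the check below matches every byte whose high nibble is 0xf, which
// also covers the lead bytes of valid 4-byte UTF-8 sequences (e.g. emoji) - confirm
// those never appear in this output.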
std::string str_copy = str;
for (size_t i = 0; i < str_copy.size(); ++i) {
// if the byte's high nibble is 0xf, replace the high nibble with 0xd
if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
str_copy[i] = (str_copy.c_str()[i] & 0x0f) | 0xd0;
}
}
#else
std::string str_copy = str;
#endif
if (gf->caption_to_stream) { if (gf->caption_to_stream) {
obs_output_t *streaming_output = obs_frontend_get_streaming_output(); obs_output_t *streaming_output = obs_frontend_get_streaming_output();
if (streaming_output) { if (streaming_output) {
@ -210,7 +231,7 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string &
return; return;
} }
auto text_settings = obs_source_get_settings(target); auto text_settings = obs_source_get_settings(target);
obs_data_set_string(text_settings, "text", str.c_str()); obs_data_set_string(text_settings, "text", str_copy.c_str());
obs_source_update(target, text_settings); obs_source_update(target, text_settings);
obs_source_release(target); obs_source_release(target);
} }
@ -218,6 +239,7 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string &
void shutdown_whisper_thread(struct transcription_filter_data *gf) void shutdown_whisper_thread(struct transcription_filter_data *gf)
{ {
obs_log(gf->log_level, "shutdown_whisper_thread");
if (gf->whisper_context != nullptr) { if (gf->whisper_context != nullptr) {
// acquire the mutex before freeing the context // acquire the mutex before freeing the context
if (!gf->whisper_ctx_mutex || !gf->wshiper_thread_cv) { if (!gf->whisper_ctx_mutex || !gf->wshiper_thread_cv) {
@ -232,6 +254,28 @@ void shutdown_whisper_thread(struct transcription_filter_data *gf)
if (gf->whisper_thread.joinable()) { if (gf->whisper_thread.joinable()) {
gf->whisper_thread.join(); gf->whisper_thread.join();
} }
if (gf->whisper_model_path != nullptr) {
bfree(gf->whisper_model_path);
gf->whisper_model_path = nullptr;
}
}
// Initialize a whisper context from the model file at `path` and start the
// whisper worker thread for the given filter instance.
//
// gf:   filter state; its whisper_ctx_mutex must be non-null and its
//       whisper_context must be null, otherwise an error is logged and the
//       function returns without doing anything.
// path: model file path handed to init_whisper_context() and remembered in
//       gf->whisper_model_file_currently_loaded.
void start_whisper_thread_with_path(struct transcription_filter_data *gf, const std::string &path)
{
obs_log(gf->log_level, "start_whisper_thread_with_path: %s", path.c_str());
// The mutex is dereferenced below, so bail out if it was never created.
if (!gf->whisper_ctx_mutex) {
obs_log(LOG_ERROR, "cannot init whisper: whisper_ctx_mutex is null");
return;
}
// Held until the function returns, i.e. across context creation and thread start.
std::lock_guard<std::mutex> lock(*gf->whisper_ctx_mutex);
// Refuse to overwrite an existing context; the caller is expected to have
// shut the previous one down first.
if (gf->whisper_context != nullptr) {
obs_log(LOG_ERROR, "cannot init whisper: whisper_context is not null");
return;
}
// NOTE(review): the result of init_whisper_context() is not checked here; if it
// can return nullptr on failure, the thread is still started and the path is
// still recorded as loaded - confirm against init_whisper_context().
gf->whisper_context = init_whisper_context(path);
gf->whisper_model_file_currently_loaded = path;
// Start the worker running whisper_loop(gf) and move it into gf->whisper_thread.
// NOTE(review): swap() leaves the previous gf->whisper_thread in
// new_whisper_thread; if that thread is still joinable, its destructor at the
// end of this scope calls std::terminate - confirm it is always joined first.
std::thread new_whisper_thread(whisper_loop, gf);
gf->whisper_thread.swap(new_whisper_thread);
} }
void transcription_filter_update(void *data, obs_data_t *s) void transcription_filter_update(void *data, obs_data_t *s)
@ -239,8 +283,9 @@ void transcription_filter_update(void *data, obs_data_t *s)
struct transcription_filter_data *gf = struct transcription_filter_data *gf =
static_cast<struct transcription_filter_data *>(data); static_cast<struct transcription_filter_data *>(data);
obs_log(gf->log_level, "transcription_filter_update");
gf->log_level = (int)obs_data_get_int(s, "log_level"); gf->log_level = (int)obs_data_get_int(s, "log_level");
obs_log(gf->log_level, "transcription_filter_update");
gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
gf->log_words = obs_data_get_bool(s, "log_words"); gf->log_words = obs_data_get_bool(s, "log_words");
gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream"); gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
@ -310,20 +355,21 @@ void transcription_filter_update(void *data, obs_data_t *s)
obs_log(gf->log_level, "transcription_filter: update whisper model"); obs_log(gf->log_level, "transcription_filter: update whisper model");
// update the whisper model path // update the whisper model path
std::string new_model_path = obs_data_get_string(s, "whisper_model_path"); std::string new_model_path = obs_data_get_string(s, "whisper_model_path");
const bool is_external_model = new_model_path.find("!!!external!!!") != std::string::npos;
if (gf->whisper_model_path == nullptr || if (gf->whisper_model_path == nullptr ||
strcmp(new_model_path.c_str(), gf->whisper_model_path) != 0) { strcmp(new_model_path.c_str(), gf->whisper_model_path) != 0 || is_external_model) {
// model path changed, reload the model // model path changed, reload the model
obs_log(LOG_INFO, "model path changed, reloading model"); obs_log(gf->log_level, "model path changed from %s to %s", gf->whisper_model_path,
shutdown_whisper_thread(gf); new_model_path.c_str());
if (gf->whisper_model_path != nullptr) {
bfree(gf->whisper_model_path);
}
gf->whisper_model_path = bstrdup(new_model_path.c_str());
// check if the new model is external file // check if the new model is external file
if (new_model_path.find("!!!external!!!") == std::string::npos) { if (!is_external_model) {
// new model is not external file // new model is not external file
shutdown_whisper_thread(gf);
gf->whisper_model_path = bstrdup(new_model_path.c_str());
// check if the model exists, if not, download it // check if the model exists, if not, download it
std::string model_file_found = find_model_file(gf->whisper_model_path); std::string model_file_found = find_model_file(gf->whisper_model_path);
if (model_file_found == "") { if (model_file_found == "") {
@ -334,29 +380,39 @@ void transcription_filter_update(void *data, obs_data_t *s)
if (download_status == 0) { if (download_status == 0) {
obs_log(LOG_INFO, obs_log(LOG_INFO,
"Model download complete"); "Model download complete");
gf->whisper_context = start_whisper_thread_with_path(gf, path);
init_whisper_context(path);
std::thread new_whisper_thread(whisper_loop,
gf);
gf->whisper_thread.swap(new_whisper_thread);
} else { } else {
obs_log(LOG_ERROR, "Model download failed"); obs_log(LOG_ERROR, "Model download failed");
} }
}); });
} else { } else {
// Model exists, just load it // Model exists, just load it
gf->whisper_context = init_whisper_context(model_file_found); start_whisper_thread_with_path(gf, model_file_found);
std::thread new_whisper_thread(whisper_loop, gf);
gf->whisper_thread.swap(new_whisper_thread);
} }
} else { } else {
// new model is local file, get file location from file property // new model is external file, get file location from file property
std::string external_model_file_path = std::string external_model_file_path =
obs_data_get_string(s, "whisper_model_path_external"); obs_data_get_string(s, "whisper_model_path_external");
gf->whisper_context = init_whisper_context(external_model_file_path); if (external_model_file_path.empty()) {
std::thread new_whisper_thread(whisper_loop, gf); obs_log(LOG_WARNING, "External model file path is empty");
gf->whisper_thread.swap(new_whisper_thread); } else {
// check if the external model file is not currently loaded
if (gf->whisper_model_file_currently_loaded ==
external_model_file_path) {
obs_log(LOG_INFO, "External model file is already loaded");
return;
} else {
shutdown_whisper_thread(gf);
gf->whisper_model_path = bstrdup(new_model_path.c_str());
start_whisper_thread_with_path(gf,
external_model_file_path);
}
}
} }
} else {
// model path did not change
obs_log(LOG_INFO, "model path did not change: %s == %s", gf->whisper_model_path,
new_model_path.c_str());
} }
if (!gf->whisper_ctx_mutex) { if (!gf->whisper_ctx_mutex) {
@ -409,6 +465,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
gf->step_size_msec = step_by_step_processing gf->step_size_msec = step_by_step_processing
? (int)obs_data_get_int(settings, "step_size_msec") ? (int)obs_data_get_int(settings, "step_size_msec")
: BUFFER_SIZE_MSEC; : BUFFER_SIZE_MSEC;
gf->log_level = (int)obs_data_get_int(settings, "log_level");
for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) { for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
circlebuf_init(&gf->input_buffers[i]); circlebuf_init(&gf->input_buffers[i]);
@ -423,7 +480,6 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
} }
gf->context = filter; gf->context = filter;
gf->whisper_model_path = nullptr; // The update function will set the model path
gf->overlap_ms = OVERLAP_SIZE_MSEC; gf->overlap_ms = OVERLAP_SIZE_MSEC;
gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms)); gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms));
@ -450,6 +506,9 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
gf->text_source = nullptr; gf->text_source = nullptr;
gf->text_source_name = bstrdup(obs_data_get_string(settings, "subtitle_sources")); gf->text_source_name = bstrdup(obs_data_get_string(settings, "subtitle_sources"));
gf->output_file_path = std::string(""); gf->output_file_path = std::string("");
gf->whisper_model_path = nullptr; // The update function will set the model path
gf->whisper_context = nullptr;
gf->whisper_model_file_currently_loaded = "";
obs_log(gf->log_level, "transcription_filter: run update"); obs_log(gf->log_level, "transcription_filter: run update");
// get the settings updated on the filter data struct // get the settings updated on the filter data struct
@ -479,6 +538,8 @@ void transcription_filter_deactivate(void *data)
void transcription_filter_defaults(obs_data_t *s) void transcription_filter_defaults(obs_data_t *s)
{ {
obs_log(LOG_INFO, "transcription_filter_defaults");
obs_data_set_default_bool(s, "vad_enabled", true); obs_data_set_default_bool(s, "vad_enabled", true);
obs_data_set_default_int(s, "log_level", LOG_DEBUG); obs_data_set_default_int(s, "log_level", LOG_DEBUG);
obs_data_set_default_bool(s, "log_words", true); obs_data_set_default_bool(s, "log_words", true);
@ -505,7 +566,7 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_double(s, "thold_pt", 0.01); obs_data_set_default_double(s, "thold_pt", 0.01);
obs_data_set_default_double(s, "thold_ptsum", 0.01); obs_data_set_default_double(s, "thold_ptsum", 0.01);
obs_data_set_default_int(s, "max_len", 0); obs_data_set_default_int(s, "max_len", 0);
obs_data_set_default_bool(s, "split_on_word", false); obs_data_set_default_bool(s, "split_on_word", true);
obs_data_set_default_int(s, "max_tokens", 32); obs_data_set_default_int(s, "max_tokens", 32);
obs_data_set_default_bool(s, "speed_up", false); obs_data_set_default_bool(s, "speed_up", false);
obs_data_set_default_bool(s, "suppress_blank", false); obs_data_set_default_bool(s, "suppress_blank", false);
@ -517,6 +578,8 @@ void transcription_filter_defaults(obs_data_t *s)
obs_properties_t *transcription_filter_properties(void *data) obs_properties_t *transcription_filter_properties(void *data)
{ {
obs_log(LOG_INFO, "transcription_filter_properties");
struct transcription_filter_data *gf = struct transcription_filter_data *gf =
static_cast<struct transcription_filter_data *>(data); static_cast<struct transcription_filter_data *>(data);
@ -603,16 +666,12 @@ obs_properties_t *transcription_filter_properties(void *data)
whisper_model_path_external, whisper_model_path_external,
[](void *data_, obs_properties_t *props, obs_property_t *property, [](void *data_, obs_properties_t *props, obs_property_t *property,
obs_data_t *settings) { obs_data_t *settings) {
obs_log(LOG_INFO, "whisper_model_path_external modified");
UNUSED_PARAMETER(property); UNUSED_PARAMETER(property);
UNUSED_PARAMETER(props); UNUSED_PARAMETER(props);
struct transcription_filter_data *gf_ = struct transcription_filter_data *gf_ =
static_cast<struct transcription_filter_data *>(data_); static_cast<struct transcription_filter_data *>(data_);
shutdown_whisper_thread(gf_); transcription_filter_update(gf_, settings);
std::string external_model_file_path =
obs_data_get_string(settings, "whisper_model_path_external");
gf_->whisper_context = init_whisper_context(external_model_file_path);
std::thread new_whisper_thread(whisper_loop, gf_);
gf_->whisper_thread.swap(new_whisper_thread);
return true; return true;
}, },
gf); gf);