diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 2e1af16..a7226f1 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -1 +1,39 @@ +LocalVocalPlugin="LocalVocal Plugin" transcription_filterAudioFilter="LocalVocal Transcription" +vad_enabled="VAD Enabled" +log_level="Log Level" +log_words="Log Output Words" +caption_to_stream="Stream Captions" +step_by_step_processing="Step-by-step processing (⚠️ processing will increase)" +step_size_msec="Step size (ms)" +subtitle_sources="Subtitles Output" +none_no_output="None / No output" +text_file_output="Text File output" +output_filename="Output filename" +whisper_model="Whisper Model" +external_model_file="External model file" +whisper_parameters="Whisper Parameters" +language="Language" +whisper_sampling_method="Whisper Sampling Method" +n_threads="Number of threads" +n_max_text_ctx="Max text context" +translate="Translate" +no_context="No context" +single_segment="Single segment" +print_special="Print special" +print_progress="Print progress" +print_realtime="Print realtime" +print_timestamps="Print timestamps" +token_timestamps="Token timestamps" +thold_pt="Token prob. threshold" +thold_ptsum="Token sum prob. 
threshold" +max_len="Max length in chars" +split_on_word="Split on word" +max_tokens="Max tokens" +speed_up="Speed up" +initial_prompt="Initial prompt" +suppress_blank="Suppress blank" +suppress_non_speech_tokens="Suppress non-speech tokens" +temperature="Temperature" +max_initial_ts="Max initial timestamps" +length_penalty="Length penalty" \ No newline at end of file diff --git a/data/locale/pt_BR.ini b/data/locale/pt_BR.ini new file mode 100644 index 0000000..7960409 --- /dev/null +++ b/data/locale/pt_BR.ini @@ -0,0 +1,40 @@ +# Portuguese (Brazil) translation +LocalVocalPlugin="Plugin LocalVocal" +transcription_filterAudioFilter="LocalVocal Transcrição" +vad_enabled="VAD Habilitado" +log_level="Nível de log" +log_words="Log Palavras" +caption_to_stream="Legendas de fluxo" +step_by_step_processing="Processamento passo a passo (⚠️ o processamento aumentará)" +step_size_msec="Tamanho do passo (ms)" +subtitle_sources="Legendas de saída" +none_no_output="Nenhum / Sem saída" +text_file_output="Saída do arquivo de texto" +output_filename="Nome do arquivo de saída" +whisper_model="Modelo Whisper" +external_model_file="Arquivo de modelo externo" +whisper_parameters="Parâmetros Whisper" +language="Língua" +whisper_sampling_method="Método de amostragem Whisper" +n_threads="Número de threads" +n_max_text_ctx="Máximo de contexto de texto" +translate="Traduzir" +no_context="Sem contexto" +single_segment="Segmento único" +print_special="Imprimir especial" +print_progress="Imprimir progresso" +print_realtime="Imprimir em tempo real" +print_timestamps="Imprimir carimbos de data" +token_timestamps="Carimbos de data" +thold_pt="Limiar token probabilidade" +thold_ptsum="Limiar token soma probabilidade" +max_len="Comprimento máximo em caracteres" +split_on_word="Dividir na palavra" +max_tokens="Máximo de tokens" +speed_up="Acelerar" +initial_prompt="Prompt inicial" +suppress_blank="Suprimir em branco" +suppress_non_speech_tokens="Suprimir tokens não fala" +temperature="Temperatura" 
+max_initial_ts="Tempo inicial máximo" +length_penalty="Penalidade de comprimento" diff --git a/data/locale/ru_RU.ini b/data/locale/ru_RU.ini new file mode 100644 index 0000000..860611c --- /dev/null +++ b/data/locale/ru_RU.ini @@ -0,0 +1,39 @@ +LocalVocalPlugin="Плагин LocalVocal" +transcription_filterAudioFilter="LocalVocal Транскрипция" +vad_enabled="VAD Включен" +log_level="Уровень логирования" +log_words="Логировать слова" +caption_to_stream="Потоковые субтитры" +step_by_step_processing="Пошаговая обработка (⚠️ обработка будет увеличена)" +step_size_msec="Размер шага (мс)" +subtitle_sources="Выходные субтитры" +none_no_output="Нет / Нет выхода" +text_file_output="Выходной текстовый файл" +output_filename="Имя выходного файла" +whisper_model="Whisper Модель" +external_model_file="Внешний файл модели" +whisper_parameters="Параметры Whisper" +language="Язык" +whisper_sampling_method="Метод выборки Whisper" +n_threads="Количество потоков" +n_max_text_ctx="Максимальный текстовый контекст" +translate="Перевести" +no_context="Нет контекста" +single_segment="Одиночный сегмент" +print_special="Печать специальных" +print_progress="Печать прогресса" +print_realtime="Печать в реальном времени" +print_timestamps="Печать временных меток" +token_timestamps="Временные метки токенов" +thold_pt="Порог вероятности токена" +thold_ptsum="Порог суммы вероятностей токена" +max_len="Максимальная длина в символах" +split_on_word="Разделить по слову" +max_tokens="Максимальное количество токенов" +speed_up="Ускорить" +initial_prompt="Начальное приглашение" +suppress_blank="Подавить пустое" +suppress_non_speech_tokens="Подавить токены, не относящиеся к речи" +temperature="Температура" +max_initial_ts="Максимальное начальное время" +length_penalty="Штраф за длину" diff --git a/src/plugin-main.c b/src/plugin-main.c index ec8f74f..49cdca9 100644 --- a/src/plugin-main.c +++ b/src/plugin-main.c @@ -1,6 +1,6 @@ /* -Plugin Name -Copyright (C) +obs-localvocal +Copyright (C) 2023 Roy Shilkrot 
roy.shil@gmail.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -32,7 +32,7 @@ extern struct obs_source_info transcription_filter_info; bool obs_module_load(void) { obs_register_source(&transcription_filter_info); - blog(LOG_INFO, "plugin loaded successfully (version %s)", PLUGIN_VERSION); + obs_log(LOG_INFO, "plugin loaded successfully (version %s)", PLUGIN_VERSION); return true; } diff --git a/src/plugin-support.c.in b/src/plugin-support.c.in index f78a593..df69a54 100644 --- a/src/plugin-support.c.in +++ b/src/plugin-support.c.in @@ -1,6 +1,6 @@ /* -Plugin Name -Copyright (C) +obs-localvocal +Copyright (C) 2023 Roy Shilkrot roy.shil@gmail.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/plugin-support.h b/src/plugin-support.h index 6959fcf..088c628 100644 --- a/src/plugin-support.h +++ b/src/plugin-support.h @@ -1,6 +1,6 @@ /* -Plugin Name -Copyright (C) +obs-localvocal +Copyright (C) 2023 Roy Shilkrot roy.shil@gmail.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index f6e90de..3933047 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -43,7 +43,6 @@ struct transcription_filter_data { /* whisper */ char *whisper_model_path = nullptr; - std::string whisper_model_file_currently_loaded = ""; struct whisper_context *whisper_context = nullptr; whisper_full_params whisper_params; @@ -63,7 +62,8 @@ struct transcription_filter_data { // Callback to set the text in the output text source (subtitles) std::function setTextCallback; // Output file path to write the subtitles - std::string output_file_path; + std::string output_file_path = ""; + std::string 
whisper_model_file_currently_loaded = ""; // Use std for thread and mutex std::thread whisper_thread; diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index d0cb600..33cee78 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -159,7 +159,7 @@ void transcription_filter_destroy(void *data) void acquire_weak_text_source_ref(struct transcription_filter_data *gf) { if (!gf->text_source_name) { - obs_log(LOG_ERROR, "text_source_name is null"); + obs_log(gf->log_level, "text_source_name is null"); return; } @@ -222,12 +222,12 @@ void set_text_callback(struct transcription_filter_data *gf, const std::string & std::lock_guard lock(*gf->text_source_mutex); if (!gf->text_source) { - obs_log(LOG_ERROR, "text_source is null"); + obs_log(gf->log_level, "text_source is null"); return; } auto target = obs_weak_source_get_source(gf->text_source); if (!target) { - obs_log(LOG_ERROR, "text_source target is null"); + obs_log(gf->log_level, "text_source target is null"); return; } auto text_settings = obs_source_get_settings(target); @@ -454,6 +454,8 @@ void transcription_filter_update(void *data, obs_data_t *s) void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) { + obs_log(LOG_INFO, "transcription filter create"); + struct transcription_filter_data *gf = new transcription_filter_data; // Get the number of channels for the input source @@ -503,12 +505,19 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->whisper_ctx_mutex = new std::mutex(); gf->wshiper_thread_cv = new std::condition_variable(); gf->text_source_mutex = new std::mutex(); + obs_log(gf->log_level, "transcription_filter: clear text source data"); gf->text_source = nullptr; - gf->text_source_name = bstrdup(obs_data_get_string(settings, "subtitle_sources")); + const char* subtitle_sources = obs_data_get_string(settings, "subtitle_sources"); + if (subtitle_sources != nullptr) { + gf->text_source_name = 
bstrdup(subtitle_sources); + } else { + gf->text_source_name = nullptr; + } + obs_log(gf->log_level, "transcription_filter: clear paths and whisper context"); + gf->whisper_model_file_currently_loaded = ""; gf->output_file_path = std::string(""); gf->whisper_model_path = nullptr; // The update function will set the model path gf->whisper_context = nullptr; - gf->whisper_model_file_currently_loaded = ""; obs_log(gf->log_level, "transcription_filter: run update"); // get the settings updated on the filter data struct @@ -585,18 +594,18 @@ obs_properties_t *transcription_filter_properties(void *data) obs_properties_t *ppts = obs_properties_create(); - obs_properties_add_bool(ppts, "vad_enabled", "VAD Enabled"); - obs_property_t *list = obs_properties_add_list(ppts, "log_level", "Log level", + obs_properties_add_bool(ppts, "vad_enabled", MT_("vad_enabled")); + obs_property_t *list = obs_properties_add_list(ppts, "log_level", MT_("log_level"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); obs_property_list_add_int(list, "DEBUG", LOG_DEBUG); obs_property_list_add_int(list, "INFO", LOG_INFO); obs_property_list_add_int(list, "WARNING", LOG_WARNING); - obs_properties_add_bool(ppts, "log_words", "Log output words"); - obs_properties_add_bool(ppts, "caption_to_stream", "Stream captions"); + obs_properties_add_bool(ppts, "log_words", MT_("log_words")); + obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream")); obs_property_t *step_by_step_processing = obs_properties_add_bool( ppts, "step_by_step_processing", - "Step-by-step processing (⚠️ processing will increase)"); - obs_properties_add_int_slider(ppts, "step_size_msec", "Step size (ms)", 1000, + MT_("step_by_step_processing")); + obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 1000, BUFFER_SIZE_MSEC, 50); obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props, @@ -610,15 +619,15 @@ obs_properties_t *transcription_filter_properties(void *data) }); 
obs_property_t *subs_output = - obs_properties_add_list(ppts, "subtitle_sources", "Subtitles Output", + obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); // Add "none" option - obs_property_list_add_string(subs_output, "None / No output", "none"); - obs_property_list_add_string(subs_output, "Text File output", "text_file"); + obs_property_list_add_string(subs_output, MT_("none_no_output"), "none"); + obs_property_list_add_string(subs_output, MT_("text_file_output"), "text_file"); // Add text sources obs_enum_sources(add_sources_to_list, subs_output); - obs_properties_add_path(ppts, "subtitle_output_filename", "Output filename", + obs_properties_add_path(ppts, "subtitle_output_filename", MT_("output_filename"), OBS_PATH_FILE_SAVE, "Text (*.txt)", NULL); obs_property_set_modified_callback(subs_output, [](obs_properties_t *props, @@ -640,7 +649,7 @@ obs_properties_t *transcription_filter_properties(void *data) // Add a list of available whisper models to download obs_property_t *whisper_models_list = - obs_properties_add_list(ppts, "whisper_model_path", "Whisper Model", + obs_properties_add_list(ppts, "whisper_model_path", MT_("whisper_model"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); obs_property_list_add_string(whisper_models_list, "Tiny (Eng) 75Mb", @@ -657,7 +666,7 @@ obs_properties_t *transcription_filter_properties(void *data) // Add a file selection input to select an external model file obs_property_t *whisper_model_path_external = - obs_properties_add_path(ppts, "whisper_model_path_external", "External model file", + obs_properties_add_path(ppts, "whisper_model_path_external", MT_("external_model_file"), OBS_PATH_FILE, "Model (*.bin)", NULL); // Hide the external model file selection input obs_property_set_visible(obs_properties_get(ppts, "whisper_model_path_external"), false); @@ -695,12 +704,12 @@ obs_properties_t *transcription_filter_properties(void *data) }); obs_properties_t 
*whisper_params_group = obs_properties_create(); - obs_properties_add_group(ppts, "whisper_params_group", "Whisper Parameters", + obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"), OBS_GROUP_NORMAL, whisper_params_group); // Add language selector obs_property_t *whisper_language_select_list = - obs_properties_add_list(whisper_params_group, "whisper_language_select", "Language", + obs_properties_add_list(whisper_params_group, "whisper_language_select", MT_("language"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); // sort the languages by flipping the map std::map whisper_available_lang_flip; @@ -718,65 +727,65 @@ obs_properties_t *transcription_filter_properties(void *data) } obs_property_t *whisper_sampling_method_list = obs_properties_add_list( - whisper_params_group, "whisper_sampling_method", "whisper_sampling_method", + whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); obs_property_list_add_int(whisper_sampling_method_list, "Beam search", WHISPER_SAMPLING_BEAM_SEARCH); obs_property_list_add_int(whisper_sampling_method_list, "Greedy", WHISPER_SAMPLING_GREEDY); // int n_threads; - obs_properties_add_int_slider(whisper_params_group, "n_threads", "n_threads", 1, 8, 1); + obs_properties_add_int_slider(whisper_params_group, "n_threads", MT_("n_threads"), 1, 8, 1); // int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder - obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx", "n_max_text_ctx", 0, + obs_properties_add_int_slider(whisper_params_group, "n_max_text_ctx", MT_("n_max_text_ctx"), 0, 16384, 100); // int offset_ms; // start offset in ms // int duration_ms; // audio duration to process in ms // bool translate; - obs_properties_add_bool(whisper_params_group, "translate", "translate"); + obs_properties_add_bool(whisper_params_group, "translate", MT_("translate")); // bool no_context; // do not use past 
transcription (if any) as initial prompt for the decoder - obs_properties_add_bool(whisper_params_group, "no_context", "no_context"); + obs_properties_add_bool(whisper_params_group, "no_context", MT_("no_context")); // bool single_segment; // force single segment output (useful for streaming) - obs_properties_add_bool(whisper_params_group, "single_segment", "single_segment"); + obs_properties_add_bool(whisper_params_group, "single_segment", MT_("single_segment")); // bool print_special; // print special tokens (e.g. , , , etc.) - obs_properties_add_bool(whisper_params_group, "print_special", "print_special"); + obs_properties_add_bool(whisper_params_group, "print_special", MT_("print_special")); // bool print_progress; // print progress information - obs_properties_add_bool(whisper_params_group, "print_progress", "print_progress"); + obs_properties_add_bool(whisper_params_group, "print_progress", MT_("print_progress")); // bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead) - obs_properties_add_bool(whisper_params_group, "print_realtime", "print_realtime"); + obs_properties_add_bool(whisper_params_group, "print_realtime", MT_("print_realtime")); // bool print_timestamps; // print timestamps for each text segment when printing realtime - obs_properties_add_bool(whisper_params_group, "print_timestamps", "print_timestamps"); + obs_properties_add_bool(whisper_params_group, "print_timestamps", MT_("print_timestamps")); // bool token_timestamps; // enable token-level timestamps - obs_properties_add_bool(whisper_params_group, "token_timestamps", "token_timestamps"); + obs_properties_add_bool(whisper_params_group, "token_timestamps", MT_("token_timestamps")); // float thold_pt; // timestamp token probability threshold (~0.01) - obs_properties_add_float_slider(whisper_params_group, "thold_pt", "thold_pt", 0.0f, 1.0f, + obs_properties_add_float_slider(whisper_params_group, "thold_pt", MT_("thold_pt"), 0.0f, 1.0f, 0.05f); // float 
thold_ptsum; // timestamp token sum probability threshold (~0.01) - obs_properties_add_float_slider(whisper_params_group, "thold_ptsum", "thold_ptsum", 0.0f, + obs_properties_add_float_slider(whisper_params_group, "thold_ptsum", MT_("thold_ptsum"), 0.0f, 1.0f, 0.05f); // int max_len; // max segment length in characters - obs_properties_add_int_slider(whisper_params_group, "max_len", "max_len", 0, 100, 1); + obs_properties_add_int_slider(whisper_params_group, "max_len", MT_("max_len"), 0, 100, 1); // bool split_on_word; // split on word rather than on token (when used with max_len) - obs_properties_add_bool(whisper_params_group, "split_on_word", "split_on_word"); + obs_properties_add_bool(whisper_params_group, "split_on_word", MT_("split_on_word")); // int max_tokens; // max tokens per segment (0 = no limit) - obs_properties_add_int_slider(whisper_params_group, "max_tokens", "max_tokens", 0, 100, 1); + obs_properties_add_int_slider(whisper_params_group, "max_tokens", MT_("max_tokens"), 0, 100, 1); // bool speed_up; // speed-up the audio by 2x using Phase Vocoder - obs_properties_add_bool(whisper_params_group, "speed_up", "speed_up"); + obs_properties_add_bool(whisper_params_group, "speed_up", MT_("speed_up")); // const char * initial_prompt; - obs_properties_add_text(whisper_params_group, "initial_prompt", "initial_prompt", + obs_properties_add_text(whisper_params_group, "initial_prompt", MT_("initial_prompt"), OBS_TEXT_DEFAULT); // bool suppress_blank - obs_properties_add_bool(whisper_params_group, "suppress_blank", "suppress_blank"); + obs_properties_add_bool(whisper_params_group, "suppress_blank", MT_("suppress_blank")); // bool suppress_non_speech_tokens obs_properties_add_bool(whisper_params_group, "suppress_non_speech_tokens", - "suppress_non_speech_tokens"); + MT_("suppress_non_speech_tokens")); // float temperature - obs_properties_add_float_slider(whisper_params_group, "temperature", "temperature", 0.0f, + 
obs_properties_add_float_slider(whisper_params_group, "temperature", MT_("temperature"), 0.0f, 1.0f, 0.05f); // float max_initial_ts - obs_properties_add_float_slider(whisper_params_group, "max_initial_ts", "max_initial_ts", + obs_properties_add_float_slider(whisper_params_group, "max_initial_ts", MT_("max_initial_ts"), 0.0f, 1.0f, 0.05f); // float length_penalty - obs_properties_add_float_slider(whisper_params_group, "length_penalty", "length_penalty", + obs_properties_add_float_slider(whisper_params_group, "length_penalty", MT_("length_penalty"), -1.0f, 1.0f, 0.1f); UNUSED_PARAMETER(data); diff --git a/src/whisper-processing.cpp b/src/whisper-processing.cpp index 9e012ea..0befd3e 100644 --- a/src/whisper-processing.cpp +++ b/src/whisper-processing.cpp @@ -59,7 +59,7 @@ bool vad_simple(float *pcmf32, size_t pcm32f_size, uint32_t sample_rate, float v energy_all /= (float)n_samples; if (verbose) { - blog(LOG_INFO, "%s: energy_all: %f, vad_thold: %f, freq_thold: %f", __func__, + obs_log(LOG_INFO, "%s: energy_all: %f, vad_thold: %f, freq_thold: %f", __func__, energy_all, vad_thold, freq_thold); }