From 465193a12b781ec789ddde5509282387e6f4126c Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 29 Oct 2023 00:26:36 -0400 Subject: [PATCH 1/2] adding min sub duration --- data/locale/en-US.ini | 1 + data/locale/pt_BR.ini | 1 + data/locale/ru_RU.ini | 1 + src/transcription-filter-data.h | 4 ++++ src/transcription-filter.cpp | 26 ++++++++++++++++++++++++-- 5 files changed, 31 insertions(+), 2 deletions(-) diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index b5e95dd..cb74454 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -41,3 +41,4 @@ save_srt="Save in SRT format (no file truncation)" only_while_recording="Write output only while recording" process_while_muted="Process speech while source is muted" rename_file_to_match_recording="Rename file to match recording" +min_sub_duration="Minimal subtitle duration (msec)" diff --git a/data/locale/pt_BR.ini b/data/locale/pt_BR.ini index acfdf8b..21acd9c 100644 --- a/data/locale/pt_BR.ini +++ b/data/locale/pt_BR.ini @@ -42,3 +42,4 @@ save_srt="Salvar no formato SRT" only_while_recording="Escreva durante a gravação" process_while_muted="Processar enquanto está silenciada" rename_file_to_match_recording="Renomear arquivo para corresponder à gravação" +min_sub_duration="Duração mínima da legenda (msec)" diff --git a/data/locale/ru_RU.ini b/data/locale/ru_RU.ini index 57740da..e4b5bac 100644 --- a/data/locale/ru_RU.ini +++ b/data/locale/ru_RU.ini @@ -41,3 +41,4 @@ save_srt="Сохранить в формате SRT" only_while_recording="Записывать вывод только во время записи" process_while_muted="Обрабатывать речь, пока источник отключен" rename_file_to_match_recording="Переименовать файл, чтобы соответствовать записи" +min_sub_duration="Минимальная длительность субтитров (мс)" diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 5c9561b..b6ca1ad 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -49,6 +49,10 @@ struct transcription_filter_data 
{ uint64_t start_timestamp_ms; // Sentence counter for srt size_t sentence_number; + // Minimal subtitle duration in ms + size_t min_sub_duration; + // Last time a subtitle was rendered + uint64_t last_sub_render_time; /* PCM buffers */ float *copy_buffers[MAX_PREPROC_CHANNELS]; diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 6ec573b..d8c8ef2 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -193,8 +193,23 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf) } } -void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result) +void set_text_callback(struct transcription_filter_data *gf, + const DetectionResultWithText &resultIn) { + DetectionResultWithText result = resultIn; + uint64_t now = now_ms(); + if (result.text.empty() || result.result != DETECTION_RESULT_SPEECH) { + // check if we should clear the current sub depending on the minimum subtitle duration + if ((now - gf->last_sub_render_time) > gf->min_sub_duration) { + // clear the current sub, run an empty sub + result.text = ""; + } else { + // nothing to do, the incoming sub is empty + return; + } + } + gf->last_sub_render_time = now; + #ifdef _WIN32 // Some UTF8 charsets on Windows output have a bug, instead of 0xd? it outputs // 0xf?, and 0xc? becomes 0xe?, so we need to replace it. 
@@ -356,6 +371,8 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->start_timestamp_ms = now_ms(); gf->sentence_number = 1; gf->process_while_muted = obs_data_get_bool(s, "process_while_muted"); + gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration"); + gf->last_sub_render_time = 0; obs_log(gf->log_level, "transcription_filter: update text source"); // update the text source @@ -475,7 +492,7 @@ void transcription_filter_update(void *data, obs_data_t *s) } } else { // model path did not change - obs_log(LOG_INFO, "model path did not change: %s == %s", gf->whisper_model_path, + obs_log(LOG_DEBUG, "model path did not change: %s == %s", gf->whisper_model_path, new_model_path.c_str()); } @@ -531,6 +548,8 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(settings, "step_size_msec") : BUFFER_SIZE_MSEC; + gf->min_sub_duration = (int)obs_data_get_int(settings, "min_sub_duration"); + gf->last_sub_render_time = 0; gf->log_level = (int)obs_data_get_int(settings, "log_level"); gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt"); gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording"); @@ -673,6 +692,7 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_bool(s, "only_while_recording", false); obs_data_set_default_bool(s, "rename_file_to_match_recording", true); obs_data_set_default_int(s, "step_size_msec", 1000); + obs_data_set_default_int(s, "min_sub_duration", 3000); // Whisper parameters obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH); @@ -721,6 +741,8 @@ obs_properties_t *transcription_filter_properties(void *data) ppts, "step_by_step_processing", MT_("step_by_step_processing")); obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 1000, BUFFER_SIZE_MSEC, 50); + obs_properties_add_int_slider(ppts, "min_sub_duration", 
MT_("min_sub_duration"), 1000, 5000, + 50); obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props, obs_property_t *property, From 8d924d0cb1cca8d3371e52ed13bd2c8e371fdbb2 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 31 Oct 2023 09:45:46 -0400 Subject: [PATCH 2/2] Update README.md --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 455f3d0..ee1cf5d 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@
-[![GitHub](https://img.shields.io/github/license/obs-ai/obs-localvocal)](https://github.com/obs-ai/obs-localvocal/blob/main/LICENSE) -[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/obs-ai/obs-localvocal/push.yaml)](https://github.com/obs-ai/obs-localvocal/actions/workflows/push.yaml) -[![Total downloads](https://img.shields.io/github/downloads/obs-ai/obs-localvocal/total)](https://github.com/obs-ai/obs-localvocal/releases) -[![GitHub release (latest by date)](https://img.shields.io/github/v/release/obs-ai/obs-localvocal)](https://github.com/obs-ai/obs-localvocal/releases) +[![GitHub](https://img.shields.io/github/license/occ-ai/obs-localvocal)](https://github.com/occ-ai/obs-localvocal/blob/main/LICENSE) +[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/occ-ai/obs-localvocal/push.yaml)](https://github.com/occ-ai/obs-localvocal/actions/workflows/push.yaml) +[![Total downloads](https://img.shields.io/github/downloads/occ-ai/obs-localvocal/total)](https://github.com/occ-ai/obs-localvocal/releases) +[![GitHub release (latest by date)](https://img.shields.io/github/v/release/occ-ai/obs-localvocal)](https://github.com/occ-ai/obs-localvocal/releases)
@@ -44,13 +44,13 @@ Internally the plugin is running a neural network ([OpenAI Whisper](https://gith It's using the [Whisper.cpp](https://github.com/ggerganov/whisper.cpp) project from [ggerganov](https://github.com/ggerganov) to run the Whisper network in a very efficient way on CPUs and GPUs. Check out our other plugins: -- [Background Removal](https://github.com/royshil/obs-backgroundremoval) removes background from webcam without a green screen. -- 🚧 Experimental 🚧 [CleanStream](https://github.com/obs-ai/obs-cleanstream) for real-time filler word (uh,um) and profanity removal from live audio stream -- [URL/API Source](https://github.com/obs-ai/obs-urlsource) that allows fetching live data from an API and displaying it in OBS. -- [Polyglot](https://github.com/obs-ai/obs-polyglot) translation AI plugin for real-time, local translation to hunderds of languages +- [Background Removal](https://github.com/occ-ai/obs-backgroundremoval) removes background from webcam without a green screen. +- 🚧 Experimental 🚧 [CleanStream](https://github.com/occ-ai/obs-cleanstream) for real-time filler word (uh,um) and profanity removal from live audio stream +- [URL/API Source](https://github.com/occ-ai/obs-urlsource) that allows fetching live data from an API and displaying it in OBS. +- [Polyglot](https://github.com/occ-ai/obs-polyglot) translation AI plugin for real-time, local translation to hundreds of languages ## Download -Check out the [latest releases](https://github.com/obs-ai/obs-localvocal/releases) for downloads and install instructions. +Check out the [latest releases](https://github.com/occ-ai/obs-localvocal/releases) for downloads and install instructions. ## Building