diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index 22ac69f..7f3ff83 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -48,6 +48,101 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 	// stub
 }
 
+std::string send_sentence_to_translation(const std::string &sentence,
+					 struct transcription_filter_data *gf)
+{
+	const std::string last_text = gf->last_text;
+	gf->last_text = sentence;
+	if (gf->translate && !sentence.empty() && sentence != last_text) {
+		obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(),
+			gf->target_lang.c_str());
+		std::string translated_text;
+		if (translate(gf->translation_ctx, sentence, gf->source_lang, gf->target_lang,
+			      translated_text) == OBS_POLYGLOT_TRANSLATION_SUCCESS) {
+			if (gf->log_words) {
+				obs_log(LOG_INFO, "Translation: '%s' -> '%s'", sentence.c_str(),
+					translated_text.c_str());
+			}
+			if (gf->translation_output == "none") {
+				// overwrite the original text with the translated text
+				return translated_text;
+			} else {
+				// send the translation to the selected source
+				send_caption_to_source(gf->translation_output, translated_text, gf);
+			}
+		} else {
+			obs_log(gf->log_level, "Failed to translate text");
+		}
+	}
+	return sentence;
+}
+
+void send_sentence_to_file(struct transcription_filter_data *gf,
+			   const DetectionResultWithText &result, const std::string &str_copy)
+{
+	// Check if we should save the sentence
+	if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
+		// We are not recording, do not save the sentence to file
+		return;
+	}
+	// should the file be truncated?
+	std::ios_base::openmode openmode = std::ios::out;
+	if (gf->truncate_output_file) {
+		openmode |= std::ios::trunc;
+	} else {
+		openmode |= std::ios::app;
+	}
+	if (!gf->save_srt) {
+		// Write raw sentence to file
+		std::ofstream output_file(gf->output_file_path, openmode);
+		output_file << str_copy << std::endl;
+		output_file.close();
+	} else {
+		obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
+			gf->output_file_path.c_str(), gf->sentence_number);
+		// Append sentence to file in .srt format
+		std::ofstream output_file(gf->output_file_path, openmode);
+		output_file << gf->sentence_number << std::endl;
+		// use the start and end timestamps to calculate the start and end time in srt format
+		auto format_ts_for_srt = [&output_file](uint64_t ts) {
+			uint64_t time_s = ts / 1000;
+			uint64_t time_m = time_s / 60;
+			uint64_t time_h = time_m / 60;
+			uint64_t time_ms_rem = ts % 1000;
+			uint64_t time_s_rem = time_s % 60;
+			uint64_t time_m_rem = time_m % 60;
+			uint64_t time_h_rem = time_h % 60;
+			output_file << std::setfill('0') << std::setw(2) << time_h_rem << ":"
+				    << std::setfill('0') << std::setw(2) << time_m_rem << ":"
+				    << std::setfill('0') << std::setw(2) << time_s_rem << ","
+				    << std::setfill('0') << std::setw(3) << time_ms_rem;
+		};
+		format_ts_for_srt(result.start_timestamp_ms);
+		output_file << " --> ";
+		format_ts_for_srt(result.end_timestamp_ms);
+		output_file << std::endl;
+
+		output_file << str_copy << std::endl;
+		output_file << std::endl;
+		output_file.close();
+		gf->sentence_number++;
+	}
+}
+
+void send_caption_to_stream(DetectionResultWithText result, const std::string &str_copy,
+			    struct transcription_filter_data *gf)
+{
+	obs_output_t *streaming_output = obs_frontend_get_streaming_output();
+	if (streaming_output) {
+		// calculate the duration in seconds
+		const uint64_t duration = result.end_timestamp_ms - result.start_timestamp_ms;
+		obs_log(gf->log_level, "Sending caption to streaming output: %s", str_copy.c_str());
+		obs_output_output_caption_text2(streaming_output, str_copy.c_str(),
+						(double)duration / 1000.0);
+		obs_output_release(streaming_output);
+	}
+}
+
 void set_text_callback(struct transcription_filter_data *gf,
 		       const DetectionResultWithText &resultIn)
 {
@@ -98,103 +193,25 @@ void set_text_callback(struct transcription_filter_data *gf,
 		}
 	}
 
-	if (gf->translate && !str_copy.empty() && str_copy != gf->last_text &&
-	    result.result == DETECTION_RESULT_SPEECH) {
-		obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(),
-			gf->target_lang.c_str());
-		std::string translated_text;
-		if (translate(gf->translation_ctx, str_copy, gf->source_lang, gf->target_lang,
-			      translated_text) == OBS_POLYGLOT_TRANSLATION_SUCCESS) {
-			if (gf->log_words) {
-				obs_log(LOG_INFO, "Translation: '%s' -> '%s'", str_copy.c_str(),
-					translated_text.c_str());
-			}
-			if (gf->translation_output == "none") {
-				// overwrite the original text with the translated text
-				str_copy = translated_text;
-			} else {
-				// send the translation to the selected source
-				send_caption_to_source(gf->translation_output, translated_text, gf);
-			}
-		} else {
-			obs_log(gf->log_level, "Failed to translate text");
-		}
-	}
-
-	gf->last_text = str_copy;
-
 	if (gf->buffered_output) {
 		gf->captions_monitor.addSentence(str_copy);
+	} else {
+		// non-buffered output
+		if (gf->translate) {
+			// send the sentence to translation (if enabled)
+			str_copy = send_sentence_to_translation(str_copy, gf);
+		} else {
+			// send the sentence to the selected source
+			send_caption_to_source(gf->text_source_name, str_copy, gf);
+		}
 	}
 
 	if (gf->caption_to_stream) {
-		obs_output_t *streaming_output = obs_frontend_get_streaming_output();
-		if (streaming_output) {
-			// calculate the duration in seconds
-			const uint64_t duration =
-				result.end_timestamp_ms - result.start_timestamp_ms;
-			obs_log(gf->log_level, "Sending caption to streaming output: %s",
-				str_copy.c_str());
-			obs_output_output_caption_text2(streaming_output, str_copy.c_str(),
-							(double)duration / 1000.0);
-			obs_output_release(streaming_output);
-		}
+		send_caption_to_stream(result, str_copy, gf);
 	}
 
 	if (gf->output_file_path != "" && gf->text_source_name.empty()) {
-		// Check if we should save the sentence
-		if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
-			// We are not recording, do not save the sentence to file
-			return;
-		}
-		// should the file be truncated?
-		std::ios_base::openmode openmode = std::ios::out;
-		if (gf->truncate_output_file) {
-			openmode |= std::ios::trunc;
-		} else {
-			openmode |= std::ios::app;
-		}
-		if (!gf->save_srt) {
-			// Write raw sentence to file
-			std::ofstream output_file(gf->output_file_path, openmode);
-			output_file << str_copy << std::endl;
-			output_file.close();
-		} else {
-			obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
-				gf->output_file_path.c_str(), gf->sentence_number);
-			// Append sentence to file in .srt format
-			std::ofstream output_file(gf->output_file_path, openmode);
-			output_file << gf->sentence_number << std::endl;
-			// use the start and end timestamps to calculate the start and end time in srt format
-			auto format_ts_for_srt = [&output_file](uint64_t ts) {
-				uint64_t time_s = ts / 1000;
-				uint64_t time_m = time_s / 60;
-				uint64_t time_h = time_m / 60;
-				uint64_t time_ms_rem = ts % 1000;
-				uint64_t time_s_rem = time_s % 60;
-				uint64_t time_m_rem = time_m % 60;
-				uint64_t time_h_rem = time_h % 60;
-				output_file << std::setfill('0') << std::setw(2) << time_h_rem
-					    << ":" << std::setfill('0') << std::setw(2)
-					    << time_m_rem << ":" << std::setfill('0')
-					    << std::setw(2) << time_s_rem << ","
-					    << std::setfill('0') << std::setw(3) << time_ms_rem;
-			};
-			format_ts_for_srt(result.start_timestamp_ms);
-			output_file << " --> ";
-			format_ts_for_srt(result.end_timestamp_ms);
-			output_file << std::endl;
-
-			output_file << str_copy << std::endl;
-			output_file << std::endl;
-			output_file.close();
-			gf->sentence_number++;
-		}
-	} else {
-		if (!gf->buffered_output) {
-			// Send the caption to the text source
-			send_caption_to_source(gf->text_source_name, str_copy, gf);
-		}
+		send_sentence_to_file(gf, result, str_copy);
 	}
 };
diff --git a/src/transcription-filter-callbacks.h b/src/transcription-filter-callbacks.h
index 138cf1e..e8bdf3b 100644
--- a/src/transcription-filter-callbacks.h
+++ b/src/transcription-filter-callbacks.h
@@ -8,6 +8,8 @@
 void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
 			    struct transcription_filter_data *gf);
+std::string send_sentence_to_translation(const std::string &sentence,
+					 struct transcription_filter_data *gf);
 void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm32f_data,
 			  size_t frames, int vad_state, const DetectionResultWithText &result);
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index e62e5b9..ff47135 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -204,6 +204,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
 					gf);
 			}
 		},
+		[gf](const std::string &sentence) {
+			obs_log(LOG_INFO, "sentence: %s", sentence.c_str());
+			if (gf->buffered_output && gf->translate) {
+				send_sentence_to_translation(sentence, gf);
+			}
+		},
 		new_buffer_num_lines, new_buffer_num_chars_per_line, std::chrono::seconds(3),
 		new_buffer_output_type);
 	} else {
diff --git a/src/whisper-utils/token-buffer-thread.cpp b/src/whisper-utils/token-buffer-thread.cpp
index b2a4093..dfdd221 100644
--- a/src/whisper-utils/token-buffer-thread.cpp
+++ b/src/whisper-utils/token-buffer-thread.cpp
@@ -34,20 +34,24 @@ TokenBufferThread::~TokenBufferThread()
 	stopThread();
 }
 
-void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
-				   std::function<void(const std::string &)> callback_,
-				   size_t numSentences_, size_t numPerSentence_,
-				   std::chrono::seconds maxTime_,
-				   TokenBufferSegmentation segmentation_)
+void TokenBufferThread::initialize(
+	struct transcription_filter_data *gf_,
+	std::function<void(const std::string &)> captionPresentationCallback_,
+	std::function<void(const std::string &)> sentenceOutputCallback_, size_t numSentences_,
+	size_t numPerSentence_, std::chrono::seconds maxTime_,
+	TokenBufferSegmentation segmentation_)
 {
 	this->gf = gf_;
-	this->callback = callback_;
+	this->captionPresentationCallback = captionPresentationCallback_;
+	this->sentenceOutputCallback = sentenceOutputCallback_;
 	this->numSentences = numSentences_;
 	this->numPerSentence = numPerSentence_;
 	this->segmentation = segmentation_;
 	this->maxTime = maxTime_;
 	this->stop = false;
 	this->workerThread = std::thread(&TokenBufferThread::monitor, this);
+	this->lastContributionTime = std::chrono::steady_clock::now();
+	this->lastCaptionTime = std::chrono::steady_clock::now();
 }
 
 void TokenBufferThread::stopThread()
@@ -80,26 +84,29 @@ void TokenBufferThread::addSentence(const std::string &sentence)
 	std::wstring sentence_ws(count, 0);
 	MultiByteToWideChar(CP_UTF8, 0, sentence.c_str(), (int)sentence.length(), &sentence_ws[0],
 			    count);
-	// split to characters
-	std::vector<std::wstring> characters;
-	for (const auto &c : sentence_ws) {
-		characters.push_back(std::wstring(1, c));
-	}
 #else
-	// split to characters
-	std::vector<std::string> characters;
-	for (const auto &c : sentence) {
-		characters.push_back(std::string(1, c));
-	}
+	std::string sentence_ws = sentence;
 #endif
+	// split to characters
+	std::vector<TokenBufferString> characters;
+	for (const auto &c : sentence_ws) {
+		characters.push_back(TokenBufferString(1, c));
+	}
 
 	std::lock_guard<std::mutex> lock(inputQueueMutex);
 
-	// add the reconstructed sentence to the wordQueue
+	// add the characters to the inputQueue
 	for (const auto &character : characters) {
 		inputQueue.push_back(character);
 	}
 	inputQueue.push_back(SPACE);
+
+	// add to the contribution queue as well
+	for (const auto &character : characters) {
+		contributionQueue.push_back(character);
+	}
+	contributionQueue.push_back(SPACE);
+	this->lastContributionTime = std::chrono::steady_clock::now();
 }
 
 void TokenBufferThread::clear()
@@ -114,14 +121,14 @@ void TokenBufferThread::clear()
 	}
 	this->lastCaption = "";
 	this->lastCaptionTime = std::chrono::steady_clock::now();
-	this->callback("");
+	this->captionPresentationCallback("");
 }
 
 void TokenBufferThread::monitor()
 {
 	obs_log(LOG_INFO, "TokenBufferThread::monitor");
 
-	this->callback("");
+	this->captionPresentationCallback("");
 
 	while (true) {
 		std::string caption_out;
@@ -152,11 +159,13 @@ void TokenBufferThread::monitor()
 
 			if (!inputQueue.empty()) {
 				// if there are token on the input queue
+				// then add to the presentation queue based on the segmentation
 				if (this->segmentation == SEGMENTATION_SENTENCE) {
 					// add all the tokens from the input queue to the presentation queue
 					for (const auto &token : inputQueue) {
 						presentationQueue.push_back(token);
 					}
+					inputQueue.clear();
 				} else if (this->segmentation == SEGMENTATION_TOKEN) {
 					// add one token to the presentation queue
 					presentationQueue.push_back(inputQueue.front());
@@ -259,29 +268,65 @@ void TokenBufferThread::monitor()
 			break;
 		}
 
+		const auto now = std::chrono::steady_clock::now();
+
+		// check if enough time passed since last contribution (debounce)
+		const auto durationSinceLastContribution =
+			std::chrono::duration_cast<std::chrono::milliseconds>(
+				now - this->lastContributionTime);
+		if (durationSinceLastContribution > std::chrono::milliseconds(500)) {
+			if (!lastContributionIsSent) {
+				// take the contribution queue and send it to the output
+				TokenBufferString contribution;
+				for (const auto &token : contributionQueue) {
+					contribution += token;
+				}
+				contributionQueue.clear();
+#ifdef _WIN32
+				// convert caption to multibyte for obs
+				int count = WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
								(int)contribution.length(), NULL, 0,
								NULL, NULL);
+				std::string contribution_out = std::string(count, 0);
+				WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
						    (int)contribution.length(),
						    &contribution_out[0], count, NULL, NULL);
+#else
+				std::string contribution_out(contribution.begin(),
							     contribution.end());
+#endif
+
+				obs_log(LOG_INFO, "TokenBufferThread::monitor: output '%s'",
+					contribution_out.c_str());
+				this->sentenceOutputCallback(contribution_out);
+				lastContributionIsSent = true;
+			}
+		} else {
+			lastContributionIsSent = false;
+		}
+
 		if (caption_out.empty()) {
 			// if no caption was built, sleep for a while
 			this->lastCaption = "";
-			this->lastCaptionTime = std::chrono::steady_clock::now();
+			this->lastCaptionTime = now;
 			std::this_thread::sleep_for(std::chrono::milliseconds(100));
 			continue;
 		}
 
 		if (caption_out == lastCaption) {
 			// if it has been max_time since the last caption - clear the presentation queue
+			const auto duration = std::chrono::duration_cast<std::chrono::seconds>(
+				now - this->lastCaptionTime);
 			if (this->maxTime.count() > 0) {
-				auto now = std::chrono::steady_clock::now();
-				auto duration = std::chrono::duration_cast<std::chrono::seconds>(
-					now - this->lastCaptionTime);
 				if (duration > this->maxTime) {
 					this->clear();
 				}
 			}
 		} else {
 			// emit the caption
-			this->callback(caption_out);
+			this->captionPresentationCallback(caption_out);
 			this->lastCaption = caption_out;
-			this->lastCaptionTime = std::chrono::steady_clock::now();
+			this->lastCaptionTime = now;
 		}
 
 		// check the input queue size (iqs), if it's big - sleep less
diff --git a/src/whisper-utils/token-buffer-thread.h b/src/whisper-utils/token-buffer-thread.h
index a27b244..13be208 100644
--- a/src/whisper-utils/token-buffer-thread.h
+++ b/src/whisper-utils/token-buffer-thread.h
@@ -25,6 +25,8 @@ struct transcription_filter_data;
 enum TokenBufferSegmentation { SEGMENTATION_WORD = 0, SEGMENTATION_TOKEN, SEGMENTATION_SENTENCE };
 enum TokenBufferSpeed { SPEED_SLOW = 0, SPEED_NORMAL, SPEED_FAST };
 
+typedef std::chrono::time_point<std::chrono::steady_clock> TokenBufferTimePoint;
+
 class TokenBufferThread {
 public:
 	// default constructor
@@ -32,8 +34,10 @@ public:
 	~TokenBufferThread();
 
 	void initialize(struct transcription_filter_data *gf,
-			std::function<void(const std::string &)> callback_, size_t numSentences_,
-			size_t numTokensPerSentence_, std::chrono::seconds maxTime_,
+			std::function<void(const std::string &)> captionPresentationCallback_,
+			std::function<void(const std::string &)> sentenceOutputCallback_,
+			size_t numSentences_, size_t numTokensPerSentence_,
+			std::chrono::seconds maxTime_,
 			TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);
 
 	void addSentence(const std::string &sentence);
@@ -57,10 +61,12 @@ private:
 	struct transcription_filter_data *gf;
 	std::deque<TokenBufferString> inputQueue;
 	std::deque<TokenBufferString> presentationQueue;
+	std::deque<TokenBufferString> contributionQueue;
 	std::thread workerThread;
 	std::mutex inputQueueMutex;
 	std::mutex presentationQueueMutex;
-	std::function<void(const std::string &)> callback;
+	std::function<void(const std::string &)> captionPresentationCallback;
+	std::function<void(const std::string &)> sentenceOutputCallback;
 	std::condition_variable cv;
 	std::chrono::seconds maxTime;
 	std::atomic<bool> stop;
@@ -69,7 +75,10 @@ private:
 	size_t numPerSentence;
 	TokenBufferSegmentation segmentation;
 	// timestamp of the last caption
-	std::chrono::time_point<std::chrono::steady_clock> lastCaptionTime;
+	TokenBufferTimePoint lastCaptionTime;
+	// timestamp of the last contribution
+	TokenBufferTimePoint lastContributionTime;
+	bool lastContributionIsSent = false;
 	std::string lastCaption;
 };