refactor: Update TokenBufferThread to use TokenBufferString for sentence output (#122)

Roy Shilkrot 2024-07-01 22:00:01 -04:00 committed by GitHub
parent 958266fb4e
commit a2244c2157
5 changed files with 197 additions and 118 deletions


@@ -48,6 +48,101 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
// stub
}
std::string send_sentence_to_translation(const std::string &sentence,
struct transcription_filter_data *gf)
{
const std::string last_text = gf->last_text;
gf->last_text = sentence;
if (gf->translate && !sentence.empty() && sentence != last_text) {
obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(),
gf->target_lang.c_str());
std::string translated_text;
if (translate(gf->translation_ctx, sentence, gf->source_lang, gf->target_lang,
translated_text) == OBS_POLYGLOT_TRANSLATION_SUCCESS) {
if (gf->log_words) {
obs_log(LOG_INFO, "Translation: '%s' -> '%s'", sentence.c_str(),
translated_text.c_str());
}
if (gf->translation_output == "none") {
// overwrite the original text with the translated text
return translated_text;
} else {
// send the translation to the selected source
send_caption_to_source(gf->translation_output, translated_text, gf);
}
} else {
obs_log(gf->log_level, "Failed to translate text");
}
}
return sentence;
}
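For reference, the helper's contract as written above: when gf->translation_output is "none" it returns the translated text so the caller can overwrite the caption; otherwise it routes the translation to the named source and returns the original sentence unchanged. A minimal caller sketch (hypothetical, not part of this commit):

// hypothetical caller: fold translation into the caption when configured
std::string caption = send_sentence_to_translation(sentence, gf);
// caption holds the translation if translation_output == "none";
// otherwise the translation already went to its own source
send_caption_to_source(gf->text_source_name, caption, gf);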
void send_sentence_to_file(struct transcription_filter_data *gf,
const DetectionResultWithText &result, const std::string &str_copy)
{
// Check if we should save the sentence
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
// We are not recording, do not save the sentence to file
return;
}
// should the file be truncated?
std::ios_base::openmode openmode = std::ios::out;
if (gf->truncate_output_file) {
openmode |= std::ios::trunc;
} else {
openmode |= std::ios::app;
}
if (!gf->save_srt) {
// Write raw sentence to file
std::ofstream output_file(gf->output_file_path, openmode);
output_file << str_copy << std::endl;
output_file.close();
} else {
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
gf->output_file_path.c_str(), gf->sentence_number);
// Append sentence to file in .srt format
std::ofstream output_file(gf->output_file_path, openmode);
output_file << gf->sentence_number << std::endl;
// use the start and end timestamps to calculate the start and end time in srt format
auto format_ts_for_srt = [&output_file](uint64_t ts) {
uint64_t time_s = ts / 1000;
uint64_t time_m = time_s / 60;
uint64_t time_h = time_m / 60;
uint64_t time_ms_rem = ts % 1000;
uint64_t time_s_rem = time_s % 60;
uint64_t time_m_rem = time_m % 60;
uint64_t time_h_rem = time_h % 60;
output_file << std::setfill('0') << std::setw(2) << time_h_rem << ":"
<< std::setfill('0') << std::setw(2) << time_m_rem << ":"
<< std::setfill('0') << std::setw(2) << time_s_rem << ","
<< std::setfill('0') << std::setw(3) << time_ms_rem;
};
format_ts_for_srt(result.start_timestamp_ms);
output_file << " --> ";
format_ts_for_srt(result.end_timestamp_ms);
output_file << std::endl;
output_file << str_copy << std::endl;
output_file << std::endl;
output_file.close();
gf->sentence_number++;
}
}
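A worked example of the timestamp math in format_ts_for_srt: for ts = 3725042 ms, time_s = 3725, time_m = 62, time_h = 1, and the remainders are 42 ms, 5 s, 2 m, 1 h, producing 01:02:05,042. With an end timestamp of 3727500 ms the function writes a standard SRT cue:

1
01:02:05,042 --> 01:02:07,500
Hello world.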
void send_caption_to_stream(DetectionResultWithText result, const std::string &str_copy,
struct transcription_filter_data *gf)
{
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
if (streaming_output) {
// calculate the duration in seconds
const uint64_t duration = result.end_timestamp_ms - result.start_timestamp_ms;
obs_log(gf->log_level, "Sending caption to streaming output: %s", str_copy.c_str());
obs_output_output_caption_text2(streaming_output, str_copy.c_str(),
(double)duration / 1000.0);
obs_output_release(streaming_output);
}
}
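Note that obs_output_output_caption_text2 takes the display duration in seconds, hence the (double)duration / 1000.0 conversion: a result spanning 3000 ms to 5200 ms gives duration = 2200 and a 2.2 s caption.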
void set_text_callback(struct transcription_filter_data *gf,
const DetectionResultWithText &resultIn)
{
@@ -98,103 +193,25 @@ void set_text_callback(struct transcription_filter_data *gf,
}
}
gf->last_text = str_copy;
if (gf->buffered_output) {
gf->captions_monitor.addSentence(str_copy);
} else {
// non-buffered output
if (gf->translate) {
// send the sentence to translation (if enabled)
str_copy = send_sentence_to_translation(str_copy, gf);
} else {
// send the sentence to the selected source
send_caption_to_source(gf->text_source_name, str_copy, gf);
}
}
if (gf->caption_to_stream) {
send_caption_to_stream(result, str_copy, gf);
}
if (gf->output_file_path != "" && gf->text_source_name.empty()) {
send_sentence_to_file(gf, result, str_copy);
}
};


@@ -8,6 +8,8 @@
void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
struct transcription_filter_data *gf);
std::string send_sentence_to_translation(const std::string &sentence,
struct transcription_filter_data *gf);
void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm32f_data,
size_t frames, int vad_state, const DetectionResultWithText &result);


@@ -204,6 +204,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf);
}
},
[gf](const std::string &sentence) {
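// new sentence-output callback: receives the complete, debounced sentence
// from TokenBufferThread, unlike the incremental caption callback above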
obs_log(LOG_INFO, "sentence: %s", sentence.c_str());
if (gf->buffered_output && gf->translate) {
send_sentence_to_translation(sentence, gf);
}
},
new_buffer_num_lines, new_buffer_num_chars_per_line,
std::chrono::seconds(3), new_buffer_output_type);
} else {


@@ -34,20 +34,24 @@ TokenBufferThread::~TokenBufferThread()
stopThread();
}
void TokenBufferThread::initialize(
struct transcription_filter_data *gf_,
std::function<void(const std::string &)> captionPresentationCallback_,
std::function<void(const std::string &)> sentenceOutputCallback_, size_t numSentences_,
size_t numPerSentence_, std::chrono::seconds maxTime_,
TokenBufferSegmentation segmentation_)
{
this->gf = gf_;
this->captionPresentationCallback = captionPresentationCallback_;
this->sentenceOutputCallback = sentenceOutputCallback_;
this->numSentences = numSentences_;
this->numPerSentence = numPerSentence_;
this->segmentation = segmentation_;
this->maxTime = maxTime_;
this->stop = false;
this->workerThread = std::thread(&TokenBufferThread::monitor, this);
this->lastContributionTime = std::chrono::steady_clock::now();
this->lastCaptionTime = std::chrono::steady_clock::now();
}
void TokenBufferThread::stopThread()
@@ -80,26 +84,29 @@ void TokenBufferThread::addSentence(const std::string &sentence)
std::wstring sentence_ws(count, 0);
MultiByteToWideChar(CP_UTF8, 0, sentence.c_str(), (int)sentence.length(), &sentence_ws[0],
count);
#else
std::string sentence_ws = sentence;
#endif
// split to characters
std::vector<TokenBufferString> characters;
for (const auto &c : sentence_ws) {
characters.push_back(TokenBufferString(1, c));
}
std::lock_guard<std::mutex> lock(inputQueueMutex);
// add the characters to the inputQueue
for (const auto &character : characters) {
inputQueue.push_back(character);
}
inputQueue.push_back(SPACE);
// add to the contribution queue as well
for (const auto &character : characters) {
contributionQueue.push_back(character);
}
contributionQueue.push_back(SPACE);
this->lastContributionTime = std::chrono::steady_clock::now();
}
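The unified split works because TokenBufferString abstracts the platform string type; presumably (an assumption consistent with the two branches above, not shown in this diff) it is declared roughly as:

#ifdef _WIN32
typedef std::wstring TokenBufferString; // UTF-16 code units on Windows
#else
typedef std::string TokenBufferString; // UTF-8 bytes elsewhere
#endif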
void TokenBufferThread::clear()
@@ -114,14 +121,14 @@ void TokenBufferThread::clear()
}
this->lastCaption = "";
this->lastCaptionTime = std::chrono::steady_clock::now();
this->callback("");
this->captionPresentationCallback("");
}
void TokenBufferThread::monitor()
{
obs_log(LOG_INFO, "TokenBufferThread::monitor");
this->callback("");
this->captionPresentationCallback("");
while (true) {
std::string caption_out;
@@ -152,11 +159,13 @@ void TokenBufferThread::monitor()
if (!inputQueue.empty()) {
// if there are tokens on the input queue
// then add to the presentation queue based on the segmentation
if (this->segmentation == SEGMENTATION_SENTENCE) {
// add all the tokens from the input queue to the presentation queue
for (const auto &token : inputQueue) {
presentationQueue.push_back(token);
}
inputQueue.clear();
} else if (this->segmentation == SEGMENTATION_TOKEN) {
// add one token to the presentation queue
presentationQueue.push_back(inputQueue.front());
@@ -259,29 +268,65 @@ void TokenBufferThread::monitor()
break;
}
const auto now = std::chrono::steady_clock::now();
// check if enough time passed since last contribution (debounce)
const auto durationSinceLastContribution =
std::chrono::duration_cast<std::chrono::milliseconds>(
now - this->lastContributionTime);
if (durationSinceLastContribution > std::chrono::milliseconds(500)) {
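// debounce: the sentence is considered final only after 500 ms with no
// new tokens (the cast above uses milliseconds so sub-second gaps
// compare correctly); lastContributionIsSent emits each sentence once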
if (!lastContributionIsSent) {
// take the contribution queue and send it to the output
TokenBufferString contribution;
for (const auto &token : contributionQueue) {
contribution += token;
}
contributionQueue.clear();
#ifdef _WIN32
// convert caption to multibyte for obs
int count = WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
(int)contribution.length(), NULL, 0,
NULL, NULL);
std::string contribution_out = std::string(count, 0);
WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
(int)contribution.length(),
&contribution_out[0], count, NULL, NULL);
#else
std::string contribution_out(contribution.begin(),
contribution.end());
#endif
obs_log(LOG_INFO, "TokenBufferThread::monitor: output '%s'",
contribution_out.c_str());
this->sentenceOutputCallback(contribution_out);
lastContributionIsSent = true;
}
} else {
lastContributionIsSent = false;
}
if (caption_out.empty()) {
// if no caption was built, sleep for a while
this->lastCaption = "";
this->lastCaptionTime = now;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
continue;
}
if (caption_out == lastCaption) {
// if it has been max_time since the last caption - clear the presentation queue
const auto duration = std::chrono::duration_cast<std::chrono::seconds>(
now - this->lastCaptionTime);
if (this->maxTime.count() > 0) {
if (duration > this->maxTime) {
this->clear();
}
}
} else {
// emit the caption
this->captionPresentationCallback(caption_out);
this->lastCaption = caption_out;
this->lastCaptionTime = now;
}
// check the input queue size (iqs); if it's large, sleep for less time


@@ -25,6 +25,8 @@ struct transcription_filter_data;
enum TokenBufferSegmentation { SEGMENTATION_WORD = 0, SEGMENTATION_TOKEN, SEGMENTATION_SENTENCE };
enum TokenBufferSpeed { SPEED_SLOW = 0, SPEED_NORMAL, SPEED_FAST };
typedef std::chrono::time_point<std::chrono::steady_clock> TokenBufferTimePoint;
class TokenBufferThread {
public:
// default constructor
@@ -32,8 +34,10 @@ public:
~TokenBufferThread();
void initialize(struct transcription_filter_data *gf,
std::function<void(const std::string &)> captionPresentationCallback_,
std::function<void(const std::string &)> sentenceOutputCallback_,
size_t numSentences_, size_t numTokensPerSentence_,
std::chrono::seconds maxTime_,
TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);
void addSentence(const std::string &sentence);
@@ -57,10 +61,12 @@ private:
struct transcription_filter_data *gf;
std::deque<TokenBufferString> inputQueue;
std::deque<TokenBufferString> presentationQueue;
std::deque<TokenBufferString> contributionQueue;
std::thread workerThread;
std::mutex inputQueueMutex;
std::mutex presentationQueueMutex;
std::function<void(std::string)> captionPresentationCallback;
std::function<void(std::string)> sentenceOutputCallback;
std::condition_variable cv;
std::chrono::seconds maxTime;
std::atomic<bool> stop;
@@ -69,7 +75,10 @@ private:
size_t numPerSentence;
TokenBufferSegmentation segmentation;
// timestamp of the last caption
TokenBufferTimePoint lastCaptionTime;
// timestamp of the last contribution
TokenBufferTimePoint lastContributionTime;
bool lastContributionIsSent = false;
std::string lastCaption;
};
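
Putting it together, a hypothetical wiring sketch (argument values are illustrative assumptions; only the names come from this diff):

// sketch: the presentation callback drives the on-screen caption,
// the sentence callback forwards debounced sentences to translation
TokenBufferThread monitor;
monitor.initialize(
	gf,
	[gf](const std::string &caption) {
		send_caption_to_source(gf->text_source_name, caption, gf);
	},
	[gf](const std::string &sentence) {
		send_sentence_to_translation(sentence, gf);
	},
	2 /* numSentences */, 30 /* numTokensPerSentence */,
	std::chrono::seconds(3), SEGMENTATION_TOKEN);
monitor.addSentence("hello world");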