refactor: Update TokenBufferThread to use TokenBufferString for sentence output (#122)

Roy Shilkrot 2024-07-01 22:00:01 -04:00 committed by GitHub
parent 958266fb4e
commit a2244c2157
5 changed files with 197 additions and 118 deletions


@@ -48,6 +48,101 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
// stub
}
std::string send_sentence_to_translation(const std::string &sentence,
struct transcription_filter_data *gf)
{
const std::string last_text = gf->last_text;
gf->last_text = sentence;
if (gf->translate && !sentence.empty() && sentence != last_text) {
obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(),
gf->target_lang.c_str());
std::string translated_text;
if (translate(gf->translation_ctx, sentence, gf->source_lang, gf->target_lang,
translated_text) == OBS_POLYGLOT_TRANSLATION_SUCCESS) {
if (gf->log_words) {
obs_log(LOG_INFO, "Translation: '%s' -> '%s'", sentence.c_str(),
translated_text.c_str());
}
if (gf->translation_output == "none") {
// overwrite the original text with the translated text
return translated_text;
} else {
// send the translation to the selected source
send_caption_to_source(gf->translation_output, translated_text, gf);
}
} else {
obs_log(gf->log_level, "Failed to translate text");
}
}
return sentence;
}
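For reference, the helper's contract as written above: when gf->translation_output is "none" it returns the translated text so the caller can overwrite the caption; otherwise it routes the translation to the named source and returns the original sentence unchanged. A minimal caller sketch (hypothetical, not part of this commit):

// hypothetical caller: fold translation into the caption when configured
std::string caption = send_sentence_to_translation(sentence, gf);
// caption holds the translation if translation_output == "none";
// otherwise the translation already went to its own source
send_caption_to_source(gf->text_source_name, caption, gf);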
void send_sentence_to_file(struct transcription_filter_data *gf,
const DetectionResultWithText &result, const std::string &str_copy)
{
// Check if we should save the sentence
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
// We are not recording, do not save the sentence to file
return;
}
// should the file be truncated?
std::ios_base::openmode openmode = std::ios::out;
if (gf->truncate_output_file) {
openmode |= std::ios::trunc;
} else {
openmode |= std::ios::app;
}
if (!gf->save_srt) {
// Write raw sentence to file
std::ofstream output_file(gf->output_file_path, openmode);
output_file << str_copy << std::endl;
output_file.close();
} else {
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
gf->output_file_path.c_str(), gf->sentence_number);
// Append sentence to file in .srt format
std::ofstream output_file(gf->output_file_path, openmode);
output_file << gf->sentence_number << std::endl;
// use the start and end timestamps to calculate the start and end time in srt format
auto format_ts_for_srt = [&output_file](uint64_t ts) {
uint64_t time_s = ts / 1000;
uint64_t time_m = time_s / 60;
uint64_t time_h = time_m / 60;
uint64_t time_ms_rem = ts % 1000;
uint64_t time_s_rem = time_s % 60;
uint64_t time_m_rem = time_m % 60;
uint64_t time_h_rem = time_h % 60;
output_file << std::setfill('0') << std::setw(2) << time_h_rem << ":"
<< std::setfill('0') << std::setw(2) << time_m_rem << ":"
<< std::setfill('0') << std::setw(2) << time_s_rem << ","
<< std::setfill('0') << std::setw(3) << time_ms_rem;
};
format_ts_for_srt(result.start_timestamp_ms);
output_file << " --> ";
format_ts_for_srt(result.end_timestamp_ms);
output_file << std::endl;
output_file << str_copy << std::endl;
output_file << std::endl;
output_file.close();
gf->sentence_number++;
}
}
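A worked example of the timestamp math in format_ts_for_srt: for ts = 3725042 ms, time_s = 3725, time_m = 62, time_h = 1, and the remainders are 42 ms, 5 s, 2 m, 1 h, producing 01:02:05,042. With an end timestamp of 3727500 ms the function writes a standard SRT cue:

1
01:02:05,042 --> 01:02:07,500
Hello world.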
void send_caption_to_stream(DetectionResultWithText result, const std::string &str_copy,
struct transcription_filter_data *gf)
{
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
if (streaming_output) {
// calculate the duration in seconds
const uint64_t duration = result.end_timestamp_ms - result.start_timestamp_ms;
obs_log(gf->log_level, "Sending caption to streaming output: %s", str_copy.c_str());
obs_output_output_caption_text2(streaming_output, str_copy.c_str(),
(double)duration / 1000.0);
obs_output_release(streaming_output);
}
}
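Note that obs_output_output_caption_text2 takes the display duration in seconds, hence the (double)duration / 1000.0 conversion: a result spanning 3000 ms to 5200 ms gives duration = 2200 and a 2.2 s caption.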
void set_text_callback(struct transcription_filter_data *gf,
const DetectionResultWithText &resultIn)
{
@@ -98,103 +193,25 @@ void set_text_callback(struct transcription_filter_data *gf,
}
}
gf->last_text = str_copy;
if (gf->buffered_output) {
gf->captions_monitor.addSentence(str_copy);
} else {
// non-buffered output
if (gf->translate) {
// send the sentence to translation (if enabled)
str_copy = send_sentence_to_translation(str_copy, gf);
} else {
// send the sentence to the selected source
send_caption_to_source(gf->text_source_name, str_copy, gf);
}
}
if (gf->caption_to_stream) {
send_caption_to_stream(result, str_copy, gf);
}
if (gf->output_file_path != "" && gf->text_source_name.empty()) {
send_sentence_to_file(gf, result, str_copy);
}
};


@@ -8,6 +8,8 @@
void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
struct transcription_filter_data *gf);
std::string send_sentence_to_translation(const std::string &sentence,
struct transcription_filter_data *gf);
void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm32f_data,
size_t frames, int vad_state, const DetectionResultWithText &result);


@@ -204,6 +204,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
gf);
}
},
[gf](const std::string &sentence) {
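// new sentence-output callback: receives the complete, debounced sentence
// from TokenBufferThread, unlike the incremental caption callback above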
obs_log(LOG_INFO, "sentence: %s", sentence.c_str());
if (gf->buffered_output && gf->translate) {
send_sentence_to_translation(sentence, gf);
}
},
new_buffer_num_lines, new_buffer_num_chars_per_line,
std::chrono::seconds(3), new_buffer_output_type);
} else {


@@ -34,20 +34,24 @@ TokenBufferThread::~TokenBufferThread()
stopThread();
}
void TokenBufferThread::initialize(
struct transcription_filter_data *gf_,
std::function<void(const std::string &)> captionPresentationCallback_,
std::function<void(const std::string &)> sentenceOutputCallback_, size_t numSentences_,
size_t numPerSentence_, std::chrono::seconds maxTime_,
TokenBufferSegmentation segmentation_)
{
this->gf = gf_;
this->captionPresentationCallback = captionPresentationCallback_;
this->sentenceOutputCallback = sentenceOutputCallback_;
this->numSentences = numSentences_;
this->numPerSentence = numPerSentence_;
this->segmentation = segmentation_;
this->maxTime = maxTime_;
this->stop = false;
this->workerThread = std::thread(&TokenBufferThread::monitor, this);
this->lastContributionTime = std::chrono::steady_clock::now();
this->lastCaptionTime = std::chrono::steady_clock::now();
}
void TokenBufferThread::stopThread()
@@ -80,26 +84,29 @@ void TokenBufferThread::addSentence(const std::string &sentence)
std::wstring sentence_ws(count, 0);
MultiByteToWideChar(CP_UTF8, 0, sentence.c_str(), (int)sentence.length(), &sentence_ws[0],
count);
#else
std::string sentence_ws = sentence;
#endif
// split to characters
std::vector<TokenBufferString> characters;
for (const auto &c : sentence_ws) {
characters.push_back(TokenBufferString(1, c));
}
std::lock_guard<std::mutex> lock(inputQueueMutex);
// add the characters to the inputQueue
for (const auto &character : characters) {
inputQueue.push_back(character);
}
inputQueue.push_back(SPACE);
// add to the contribution queue as well
for (const auto &character : characters) {
contributionQueue.push_back(character);
}
contributionQueue.push_back(SPACE);
this->lastContributionTime = std::chrono::steady_clock::now();
}
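The unified split works because TokenBufferString abstracts the platform string type; presumably (an assumption consistent with the two branches above, not shown in this diff) it is declared roughly as:

#ifdef _WIN32
typedef std::wstring TokenBufferString; // UTF-16 code units on Windows
#else
typedef std::string TokenBufferString; // UTF-8 bytes elsewhere
#endif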
void TokenBufferThread::clear()
@@ -114,14 +121,14 @@ void TokenBufferThread::clear()
}
this->lastCaption = "";
this->lastCaptionTime = std::chrono::steady_clock::now();
this->callback("");
this->captionPresentationCallback("");
}
void TokenBufferThread::monitor()
{
obs_log(LOG_INFO, "TokenBufferThread::monitor");
this->callback("");
this->captionPresentationCallback("");
while (true) {
std::string caption_out;
@@ -152,11 +159,13 @@ void TokenBufferThread::monitor()
if (!inputQueue.empty()) {
// if there are tokens on the input queue
// then add to the presentation queue based on the segmentation
if (this->segmentation == SEGMENTATION_SENTENCE) {
// add all the tokens from the input queue to the presentation queue
for (const auto &token : inputQueue) {
presentationQueue.push_back(token);
}
inputQueue.clear();
} else if (this->segmentation == SEGMENTATION_TOKEN) {
// add one token to the presentation queue
presentationQueue.push_back(inputQueue.front());
@@ -259,29 +268,65 @@ void TokenBufferThread::monitor()
break;
}
const auto now = std::chrono::steady_clock::now();
// check if enough time passed since last contribution (debounce)
const auto durationSinceLastContribution =
std::chrono::duration_cast<std::chrono::milliseconds>(
now - this->lastContributionTime);
if (durationSinceLastContribution > std::chrono::milliseconds(500)) {
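// debounce: the sentence is considered final only after 500 ms with no
// new tokens (the cast above uses milliseconds so sub-second gaps
// compare correctly); lastContributionIsSent emits each sentence once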
if (!lastContributionIsSent) {
// take the contribution queue and send it to the output
TokenBufferString contribution;
for (const auto &token : contributionQueue) {
contribution += token;
}
contributionQueue.clear();
#ifdef _WIN32
// convert caption to multibyte for obs
int count = WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
(int)contribution.length(), NULL, 0,
NULL, NULL);
std::string contribution_out = std::string(count, 0);
WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
(int)contribution.length(),
&contribution_out[0], count, NULL, NULL);
#else
std::string contribution_out(contribution.begin(),
contribution.end());
#endif
obs_log(LOG_INFO, "TokenBufferThread::monitor: output '%s'",
contribution_out.c_str());
this->sentenceOutputCallback(contribution_out);
lastContributionIsSent = true;
}
} else {
lastContributionIsSent = false;
}
if (caption_out.empty()) {
// if no caption was built, sleep for a while
this->lastCaption = "";
this->lastCaptionTime = now;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
continue;
}
if (caption_out == lastCaption) {
// if it has been max_time since the last caption - clear the presentation queue
const auto duration = std::chrono::duration_cast<std::chrono::seconds>(
now - this->lastCaptionTime);
if (this->maxTime.count() > 0) {
if (duration > this->maxTime) {
this->clear();
}
}
} else {
// emit the caption
this->captionPresentationCallback(caption_out);
this->lastCaption = caption_out;
this->lastCaptionTime = now;
}
// check the input queue size (iqs); if it's large, sleep for less time


@@ -25,6 +25,8 @@ struct transcription_filter_data;
enum TokenBufferSegmentation { SEGMENTATION_WORD = 0, SEGMENTATION_TOKEN, SEGMENTATION_SENTENCE };
enum TokenBufferSpeed { SPEED_SLOW = 0, SPEED_NORMAL, SPEED_FAST };
typedef std::chrono::time_point<std::chrono::steady_clock> TokenBufferTimePoint;
class TokenBufferThread {
public:
// default constructor
@@ -32,8 +34,10 @@ public:
~TokenBufferThread();
void initialize(struct transcription_filter_data *gf,
std::function<void(const std::string &)> captionPresentationCallback_,
std::function<void(const std::string &)> sentenceOutputCallback_,
size_t numSentences_, size_t numTokensPerSentence_,
std::chrono::seconds maxTime_,
TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);
void addSentence(const std::string &sentence);
@@ -57,10 +61,12 @@ private:
struct transcription_filter_data *gf;
std::deque<TokenBufferString> inputQueue;
std::deque<TokenBufferString> presentationQueue;
std::deque<TokenBufferString> contributionQueue;
std::thread workerThread;
std::mutex inputQueueMutex;
std::mutex presentationQueueMutex;
std::function<void(std::string)> captionPresentationCallback;
std::function<void(std::string)> sentenceOutputCallback;
std::condition_variable cv;
std::chrono::seconds maxTime;
std::atomic<bool> stop;
@@ -69,7 +75,10 @@ private:
size_t numPerSentence;
TokenBufferSegmentation segmentation;
// timestamp of the last caption
TokenBufferTimePoint lastCaptionTime;
// timestamp of the last contribution
TokenBufferTimePoint lastContributionTime;
bool lastContributionIsSent = false;
std::string lastCaption;
};
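
Putting it together, a hypothetical wiring sketch (argument values are illustrative assumptions; only the names come from this diff):

// sketch: the presentation callback drives the on-screen caption,
// the sentence callback forwards debounced sentences to translation
TokenBufferThread monitor;
monitor.initialize(
	gf,
	[gf](const std::string &caption) {
		send_caption_to_source(gf->text_source_name, caption, gf);
	},
	[gf](const std::string &sentence) {
		send_sentence_to_translation(sentence, gf);
	},
	2 /* numSentences */, 30 /* numTokensPerSentence */,
	std::chrono::seconds(3), SEGMENTATION_TOKEN);
monitor.addSentence("hello world");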