mirror of
https://github.com/occ-ai/obs-localvocal
synced 2024-11-07 18:57:14 +00:00
refactor: Update TokenBufferThread to use TokenBufferString for sentence output (#122)
This commit is contained in:
parent
958266fb4e
commit
a2244c2157
@ -48,6 +48,101 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
|
||||
// stub
|
||||
}
|
||||
|
||||
std::string send_sentence_to_translation(const std::string &sentence,
|
||||
struct transcription_filter_data *gf)
|
||||
{
|
||||
const std::string last_text = gf->last_text;
|
||||
gf->last_text = sentence;
|
||||
if (gf->translate && !sentence.empty() && sentence != last_text) {
|
||||
obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(),
|
||||
gf->target_lang.c_str());
|
||||
std::string translated_text;
|
||||
if (translate(gf->translation_ctx, sentence, gf->source_lang, gf->target_lang,
|
||||
translated_text) == OBS_POLYGLOT_TRANSLATION_SUCCESS) {
|
||||
if (gf->log_words) {
|
||||
obs_log(LOG_INFO, "Translation: '%s' -> '%s'", sentence.c_str(),
|
||||
translated_text.c_str());
|
||||
}
|
||||
if (gf->translation_output == "none") {
|
||||
// overwrite the original text with the translated text
|
||||
return translated_text;
|
||||
} else {
|
||||
// send the translation to the selected source
|
||||
send_caption_to_source(gf->translation_output, translated_text, gf);
|
||||
}
|
||||
} else {
|
||||
obs_log(gf->log_level, "Failed to translate text");
|
||||
}
|
||||
}
|
||||
return sentence;
|
||||
}
|
||||
|
||||
void send_sentence_to_file(struct transcription_filter_data *gf,
|
||||
const DetectionResultWithText &result, const std::string &str_copy)
|
||||
{
|
||||
// Check if we should save the sentence
|
||||
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
|
||||
// We are not recording, do not save the sentence to file
|
||||
return;
|
||||
}
|
||||
// should the file be truncated?
|
||||
std::ios_base::openmode openmode = std::ios::out;
|
||||
if (gf->truncate_output_file) {
|
||||
openmode |= std::ios::trunc;
|
||||
} else {
|
||||
openmode |= std::ios::app;
|
||||
}
|
||||
if (!gf->save_srt) {
|
||||
// Write raw sentence to file
|
||||
std::ofstream output_file(gf->output_file_path, openmode);
|
||||
output_file << str_copy << std::endl;
|
||||
output_file.close();
|
||||
} else {
|
||||
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
|
||||
gf->output_file_path.c_str(), gf->sentence_number);
|
||||
// Append sentence to file in .srt format
|
||||
std::ofstream output_file(gf->output_file_path, openmode);
|
||||
output_file << gf->sentence_number << std::endl;
|
||||
// use the start and end timestamps to calculate the start and end time in srt format
|
||||
auto format_ts_for_srt = [&output_file](uint64_t ts) {
|
||||
uint64_t time_s = ts / 1000;
|
||||
uint64_t time_m = time_s / 60;
|
||||
uint64_t time_h = time_m / 60;
|
||||
uint64_t time_ms_rem = ts % 1000;
|
||||
uint64_t time_s_rem = time_s % 60;
|
||||
uint64_t time_m_rem = time_m % 60;
|
||||
uint64_t time_h_rem = time_h % 60;
|
||||
output_file << std::setfill('0') << std::setw(2) << time_h_rem << ":"
|
||||
<< std::setfill('0') << std::setw(2) << time_m_rem << ":"
|
||||
<< std::setfill('0') << std::setw(2) << time_s_rem << ","
|
||||
<< std::setfill('0') << std::setw(3) << time_ms_rem;
|
||||
};
|
||||
format_ts_for_srt(result.start_timestamp_ms);
|
||||
output_file << " --> ";
|
||||
format_ts_for_srt(result.end_timestamp_ms);
|
||||
output_file << std::endl;
|
||||
|
||||
output_file << str_copy << std::endl;
|
||||
output_file << std::endl;
|
||||
output_file.close();
|
||||
gf->sentence_number++;
|
||||
}
|
||||
}
|
||||
|
||||
void send_caption_to_stream(DetectionResultWithText result, const std::string &str_copy,
|
||||
struct transcription_filter_data *gf)
|
||||
{
|
||||
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
|
||||
if (streaming_output) {
|
||||
// calculate the duration in seconds
|
||||
const uint64_t duration = result.end_timestamp_ms - result.start_timestamp_ms;
|
||||
obs_log(gf->log_level, "Sending caption to streaming output: %s", str_copy.c_str());
|
||||
obs_output_output_caption_text2(streaming_output, str_copy.c_str(),
|
||||
(double)duration / 1000.0);
|
||||
obs_output_release(streaming_output);
|
||||
}
|
||||
}
|
||||
|
||||
void set_text_callback(struct transcription_filter_data *gf,
|
||||
const DetectionResultWithText &resultIn)
|
||||
{
|
||||
@ -98,103 +193,25 @@ void set_text_callback(struct transcription_filter_data *gf,
|
||||
}
|
||||
}
|
||||
|
||||
if (gf->translate && !str_copy.empty() && str_copy != gf->last_text &&
|
||||
result.result == DETECTION_RESULT_SPEECH) {
|
||||
obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(),
|
||||
gf->target_lang.c_str());
|
||||
std::string translated_text;
|
||||
if (translate(gf->translation_ctx, str_copy, gf->source_lang, gf->target_lang,
|
||||
translated_text) == OBS_POLYGLOT_TRANSLATION_SUCCESS) {
|
||||
if (gf->log_words) {
|
||||
obs_log(LOG_INFO, "Translation: '%s' -> '%s'", str_copy.c_str(),
|
||||
translated_text.c_str());
|
||||
}
|
||||
if (gf->translation_output == "none") {
|
||||
// overwrite the original text with the translated text
|
||||
str_copy = translated_text;
|
||||
} else {
|
||||
// send the translation to the selected source
|
||||
send_caption_to_source(gf->translation_output, translated_text, gf);
|
||||
}
|
||||
} else {
|
||||
obs_log(gf->log_level, "Failed to translate text");
|
||||
}
|
||||
}
|
||||
|
||||
gf->last_text = str_copy;
|
||||
|
||||
if (gf->buffered_output) {
|
||||
gf->captions_monitor.addSentence(str_copy);
|
||||
} else {
|
||||
// non-buffered output
|
||||
if (gf->translate) {
|
||||
// send the sentence to translation (if enabled)
|
||||
str_copy = send_sentence_to_translation(str_copy, gf);
|
||||
} else {
|
||||
// send the sentence to the selected source
|
||||
send_caption_to_source(gf->text_source_name, str_copy, gf);
|
||||
}
|
||||
}
|
||||
|
||||
if (gf->caption_to_stream) {
|
||||
obs_output_t *streaming_output = obs_frontend_get_streaming_output();
|
||||
if (streaming_output) {
|
||||
// calculate the duration in seconds
|
||||
const uint64_t duration =
|
||||
result.end_timestamp_ms - result.start_timestamp_ms;
|
||||
obs_log(gf->log_level, "Sending caption to streaming output: %s",
|
||||
str_copy.c_str());
|
||||
obs_output_output_caption_text2(streaming_output, str_copy.c_str(),
|
||||
(double)duration / 1000.0);
|
||||
obs_output_release(streaming_output);
|
||||
}
|
||||
send_caption_to_stream(result, str_copy, gf);
|
||||
}
|
||||
|
||||
if (gf->output_file_path != "" && gf->text_source_name.empty()) {
|
||||
// Check if we should save the sentence
|
||||
if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
|
||||
// We are not recording, do not save the sentence to file
|
||||
return;
|
||||
}
|
||||
// should the file be truncated?
|
||||
std::ios_base::openmode openmode = std::ios::out;
|
||||
if (gf->truncate_output_file) {
|
||||
openmode |= std::ios::trunc;
|
||||
} else {
|
||||
openmode |= std::ios::app;
|
||||
}
|
||||
if (!gf->save_srt) {
|
||||
// Write raw sentence to file
|
||||
std::ofstream output_file(gf->output_file_path, openmode);
|
||||
output_file << str_copy << std::endl;
|
||||
output_file.close();
|
||||
} else {
|
||||
obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
|
||||
gf->output_file_path.c_str(), gf->sentence_number);
|
||||
// Append sentence to file in .srt format
|
||||
std::ofstream output_file(gf->output_file_path, openmode);
|
||||
output_file << gf->sentence_number << std::endl;
|
||||
// use the start and end timestamps to calculate the start and end time in srt format
|
||||
auto format_ts_for_srt = [&output_file](uint64_t ts) {
|
||||
uint64_t time_s = ts / 1000;
|
||||
uint64_t time_m = time_s / 60;
|
||||
uint64_t time_h = time_m / 60;
|
||||
uint64_t time_ms_rem = ts % 1000;
|
||||
uint64_t time_s_rem = time_s % 60;
|
||||
uint64_t time_m_rem = time_m % 60;
|
||||
uint64_t time_h_rem = time_h % 60;
|
||||
output_file << std::setfill('0') << std::setw(2) << time_h_rem
|
||||
<< ":" << std::setfill('0') << std::setw(2)
|
||||
<< time_m_rem << ":" << std::setfill('0')
|
||||
<< std::setw(2) << time_s_rem << ","
|
||||
<< std::setfill('0') << std::setw(3) << time_ms_rem;
|
||||
};
|
||||
format_ts_for_srt(result.start_timestamp_ms);
|
||||
output_file << " --> ";
|
||||
format_ts_for_srt(result.end_timestamp_ms);
|
||||
output_file << std::endl;
|
||||
|
||||
output_file << str_copy << std::endl;
|
||||
output_file << std::endl;
|
||||
output_file.close();
|
||||
gf->sentence_number++;
|
||||
}
|
||||
} else {
|
||||
if (!gf->buffered_output) {
|
||||
// Send the caption to the text source
|
||||
send_caption_to_source(gf->text_source_name, str_copy, gf);
|
||||
}
|
||||
send_sentence_to_file(gf, result, str_copy);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -8,6 +8,8 @@
|
||||
|
||||
void send_caption_to_source(const std::string &target_source_name, const std::string &str_copy,
|
||||
struct transcription_filter_data *gf);
|
||||
std::string send_sentence_to_translation(const std::string &sentence,
|
||||
struct transcription_filter_data *gf);
|
||||
|
||||
void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm32f_data,
|
||||
size_t frames, int vad_state, const DetectionResultWithText &result);
|
||||
|
@ -204,6 +204,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
|
||||
gf);
|
||||
}
|
||||
},
|
||||
[gf](const std::string &sentence) {
|
||||
obs_log(LOG_INFO, "sentence: %s", sentence.c_str());
|
||||
if (gf->buffered_output && gf->translate) {
|
||||
send_sentence_to_translation(sentence, gf);
|
||||
}
|
||||
},
|
||||
new_buffer_num_lines, new_buffer_num_chars_per_line,
|
||||
std::chrono::seconds(3), new_buffer_output_type);
|
||||
} else {
|
||||
|
@ -34,20 +34,24 @@ TokenBufferThread::~TokenBufferThread()
|
||||
stopThread();
|
||||
}
|
||||
|
||||
void TokenBufferThread::initialize(struct transcription_filter_data *gf_,
|
||||
std::function<void(const std::string &)> callback_,
|
||||
size_t numSentences_, size_t numPerSentence_,
|
||||
std::chrono::seconds maxTime_,
|
||||
TokenBufferSegmentation segmentation_)
|
||||
void TokenBufferThread::initialize(
|
||||
struct transcription_filter_data *gf_,
|
||||
std::function<void(const std::string &)> captionPresentationCallback_,
|
||||
std::function<void(const std::string &)> sentenceOutputCallback_, size_t numSentences_,
|
||||
size_t numPerSentence_, std::chrono::seconds maxTime_,
|
||||
TokenBufferSegmentation segmentation_)
|
||||
{
|
||||
this->gf = gf_;
|
||||
this->callback = callback_;
|
||||
this->captionPresentationCallback = captionPresentationCallback_;
|
||||
this->sentenceOutputCallback = sentenceOutputCallback_;
|
||||
this->numSentences = numSentences_;
|
||||
this->numPerSentence = numPerSentence_;
|
||||
this->segmentation = segmentation_;
|
||||
this->maxTime = maxTime_;
|
||||
this->stop = false;
|
||||
this->workerThread = std::thread(&TokenBufferThread::monitor, this);
|
||||
this->lastContributionTime = std::chrono::steady_clock::now();
|
||||
this->lastCaptionTime = std::chrono::steady_clock::now();
|
||||
}
|
||||
|
||||
void TokenBufferThread::stopThread()
|
||||
@ -80,26 +84,29 @@ void TokenBufferThread::addSentence(const std::string &sentence)
|
||||
std::wstring sentence_ws(count, 0);
|
||||
MultiByteToWideChar(CP_UTF8, 0, sentence.c_str(), (int)sentence.length(), &sentence_ws[0],
|
||||
count);
|
||||
// split to characters
|
||||
std::vector<std::wstring> characters;
|
||||
for (const auto &c : sentence_ws) {
|
||||
characters.push_back(std::wstring(1, c));
|
||||
}
|
||||
#else
|
||||
// split to characters
|
||||
std::vector<std::string> characters;
|
||||
for (const auto &c : sentence) {
|
||||
characters.push_back(std::string(1, c));
|
||||
}
|
||||
std::string sentence_ws = sentence;
|
||||
#endif
|
||||
// split to characters
|
||||
std::vector<TokenBufferString> characters;
|
||||
for (const auto &c : sentence_ws) {
|
||||
characters.push_back(TokenBufferString(1, c));
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(inputQueueMutex);
|
||||
|
||||
// add the reconstructed sentence to the wordQueue
|
||||
// add the characters to the inputQueue
|
||||
for (const auto &character : characters) {
|
||||
inputQueue.push_back(character);
|
||||
}
|
||||
inputQueue.push_back(SPACE);
|
||||
|
||||
// add to the contribution queue as well
|
||||
for (const auto &character : characters) {
|
||||
contributionQueue.push_back(character);
|
||||
}
|
||||
contributionQueue.push_back(SPACE);
|
||||
this->lastContributionTime = std::chrono::steady_clock::now();
|
||||
}
|
||||
|
||||
void TokenBufferThread::clear()
|
||||
@ -114,14 +121,14 @@ void TokenBufferThread::clear()
|
||||
}
|
||||
this->lastCaption = "";
|
||||
this->lastCaptionTime = std::chrono::steady_clock::now();
|
||||
this->callback("");
|
||||
this->captionPresentationCallback("");
|
||||
}
|
||||
|
||||
void TokenBufferThread::monitor()
|
||||
{
|
||||
obs_log(LOG_INFO, "TokenBufferThread::monitor");
|
||||
|
||||
this->callback("");
|
||||
this->captionPresentationCallback("");
|
||||
|
||||
while (true) {
|
||||
std::string caption_out;
|
||||
@ -152,11 +159,13 @@ void TokenBufferThread::monitor()
|
||||
|
||||
if (!inputQueue.empty()) {
|
||||
// if there are token on the input queue
|
||||
// then add to the presentation queue based on the segmentation
|
||||
if (this->segmentation == SEGMENTATION_SENTENCE) {
|
||||
// add all the tokens from the input queue to the presentation queue
|
||||
for (const auto &token : inputQueue) {
|
||||
presentationQueue.push_back(token);
|
||||
}
|
||||
inputQueue.clear();
|
||||
} else if (this->segmentation == SEGMENTATION_TOKEN) {
|
||||
// add one token to the presentation queue
|
||||
presentationQueue.push_back(inputQueue.front());
|
||||
@ -259,29 +268,65 @@ void TokenBufferThread::monitor()
|
||||
break;
|
||||
}
|
||||
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
|
||||
// check if enough time passed since last contribution (debounce)
|
||||
const auto durationSinceLastContribution =
|
||||
std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - this->lastContributionTime);
|
||||
if (durationSinceLastContribution > std::chrono::milliseconds(500)) {
|
||||
if (!lastContributionIsSent) {
|
||||
// take the contribution queue and send it to the output
|
||||
TokenBufferString contribution;
|
||||
for (const auto &token : contributionQueue) {
|
||||
contribution += token;
|
||||
}
|
||||
contributionQueue.clear();
|
||||
#ifdef _WIN32
|
||||
// convert caption to multibyte for obs
|
||||
int count = WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
|
||||
(int)contribution.length(), NULL, 0,
|
||||
NULL, NULL);
|
||||
std::string contribution_out = std::string(count, 0);
|
||||
WideCharToMultiByte(CP_UTF8, 0, contribution.c_str(),
|
||||
(int)contribution.length(),
|
||||
&contribution_out[0], count, NULL, NULL);
|
||||
#else
|
||||
std::string contribution_out(contribution.begin(),
|
||||
contribution.end());
|
||||
#endif
|
||||
|
||||
obs_log(LOG_INFO, "TokenBufferThread::monitor: output '%s'",
|
||||
contribution_out.c_str());
|
||||
this->sentenceOutputCallback(contribution_out);
|
||||
lastContributionIsSent = true;
|
||||
}
|
||||
} else {
|
||||
lastContributionIsSent = false;
|
||||
}
|
||||
|
||||
if (caption_out.empty()) {
|
||||
// if no caption was built, sleep for a while
|
||||
this->lastCaption = "";
|
||||
this->lastCaptionTime = std::chrono::steady_clock::now();
|
||||
this->lastCaptionTime = now;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (caption_out == lastCaption) {
|
||||
// if it has been max_time since the last caption - clear the presentation queue
|
||||
const auto duration = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - this->lastCaptionTime);
|
||||
if (this->maxTime.count() > 0) {
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::seconds>(
|
||||
now - this->lastCaptionTime);
|
||||
if (duration > this->maxTime) {
|
||||
this->clear();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// emit the caption
|
||||
this->callback(caption_out);
|
||||
this->captionPresentationCallback(caption_out);
|
||||
this->lastCaption = caption_out;
|
||||
this->lastCaptionTime = std::chrono::steady_clock::now();
|
||||
this->lastCaptionTime = now;
|
||||
}
|
||||
|
||||
// check the input queue size (iqs), if it's big - sleep less
|
||||
|
@ -25,6 +25,8 @@ struct transcription_filter_data;
|
||||
enum TokenBufferSegmentation { SEGMENTATION_WORD = 0, SEGMENTATION_TOKEN, SEGMENTATION_SENTENCE };
|
||||
enum TokenBufferSpeed { SPEED_SLOW = 0, SPEED_NORMAL, SPEED_FAST };
|
||||
|
||||
typedef std::chrono::time_point<std::chrono::steady_clock> TokenBufferTimePoint;
|
||||
|
||||
class TokenBufferThread {
|
||||
public:
|
||||
// default constructor
|
||||
@ -32,8 +34,10 @@ public:
|
||||
|
||||
~TokenBufferThread();
|
||||
void initialize(struct transcription_filter_data *gf,
|
||||
std::function<void(const std::string &)> callback_, size_t numSentences_,
|
||||
size_t numTokensPerSentence_, std::chrono::seconds maxTime_,
|
||||
std::function<void(const std::string &)> captionPresentationCallback_,
|
||||
std::function<void(const std::string &)> sentenceOutputCallback_,
|
||||
size_t numSentences_, size_t numTokensPerSentence_,
|
||||
std::chrono::seconds maxTime_,
|
||||
TokenBufferSegmentation segmentation_ = SEGMENTATION_TOKEN);
|
||||
|
||||
void addSentence(const std::string &sentence);
|
||||
@ -57,10 +61,12 @@ private:
|
||||
struct transcription_filter_data *gf;
|
||||
std::deque<TokenBufferString> inputQueue;
|
||||
std::deque<TokenBufferString> presentationQueue;
|
||||
std::deque<TokenBufferString> contributionQueue;
|
||||
std::thread workerThread;
|
||||
std::mutex inputQueueMutex;
|
||||
std::mutex presentationQueueMutex;
|
||||
std::function<void(std::string)> callback;
|
||||
std::function<void(std::string)> captionPresentationCallback;
|
||||
std::function<void(std::string)> sentenceOutputCallback;
|
||||
std::condition_variable cv;
|
||||
std::chrono::seconds maxTime;
|
||||
std::atomic<bool> stop;
|
||||
@ -69,7 +75,10 @@ private:
|
||||
size_t numPerSentence;
|
||||
TokenBufferSegmentation segmentation;
|
||||
// timestamp of the last caption
|
||||
std::chrono::time_point<std::chrono::steady_clock> lastCaptionTime;
|
||||
TokenBufferTimePoint lastCaptionTime;
|
||||
// timestamp of the last contribution
|
||||
TokenBufferTimePoint lastContributionTime;
|
||||
bool lastContributionIsSent = false;
|
||||
std::string lastCaption;
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user