mirror of
https://github.com/occ-ai/obs-localvocal
synced 2024-11-07 18:57:14 +00:00
Merge pull request #18 from royshil/roy.step_by_step_realtime_processing
Add step-by-step processing
This commit is contained in:
commit
adefe4c92c
@ -30,6 +30,8 @@ struct transcription_filter_data {
|
|||||||
size_t overlap_ms;
|
size_t overlap_ms;
|
||||||
// How many frames were processed in the last whisper frame (this is dynamic)
|
// How many frames were processed in the last whisper frame (this is dynamic)
|
||||||
size_t last_num_frames;
|
size_t last_num_frames;
|
||||||
|
// Milliseconds per processing step (e.g. rest of the whisper buffer may be filled with silence)
|
||||||
|
size_t step_size_msec;
|
||||||
|
|
||||||
/* PCM buffers */
|
/* PCM buffers */
|
||||||
float *copy_buffers[MAX_PREPROC_CHANNELS];
|
float *copy_buffers[MAX_PREPROC_CHANNELS];
|
||||||
|
@ -77,9 +77,6 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
|
|||||||
|
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(*gf->whisper_buf_mutex); // scoped lock
|
std::lock_guard<std::mutex> lock(*gf->whisper_buf_mutex); // scoped lock
|
||||||
obs_log(gf->log_level,
|
|
||||||
"pushing %lu frames to input buffer. current size: %lu (bytes)",
|
|
||||||
(size_t)(audio->frames), gf->input_buffers[0].size);
|
|
||||||
// push back current audio data to input circlebuf
|
// push back current audio data to input circlebuf
|
||||||
for (size_t c = 0; c < gf->channels; c++) {
|
for (size_t c = 0; c < gf->channels; c++) {
|
||||||
circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
|
circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
|
||||||
@ -229,6 +226,9 @@ void transcription_filter_update(void *data, obs_data_t *s)
|
|||||||
gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
|
gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
|
||||||
gf->log_words = obs_data_get_bool(s, "log_words");
|
gf->log_words = obs_data_get_bool(s, "log_words");
|
||||||
gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
|
gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream");
|
||||||
|
bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing");
|
||||||
|
gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec")
|
||||||
|
: BUFFER_SIZE_MSEC;
|
||||||
|
|
||||||
obs_log(gf->log_level, "transcription_filter: update text source");
|
obs_log(gf->log_level, "transcription_filter: update text source");
|
||||||
// update the text source
|
// update the text source
|
||||||
@ -383,6 +383,10 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
|
|||||||
gf->sample_rate = audio_output_get_sample_rate(obs_get_audio());
|
gf->sample_rate = audio_output_get_sample_rate(obs_get_audio());
|
||||||
gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)BUFFER_SIZE_MSEC));
|
gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)BUFFER_SIZE_MSEC));
|
||||||
gf->last_num_frames = 0;
|
gf->last_num_frames = 0;
|
||||||
|
bool step_by_step_processing = obs_data_get_bool(settings, "step_by_step_processing");
|
||||||
|
gf->step_size_msec = step_by_step_processing
|
||||||
|
? (int)obs_data_get_int(settings, "step_size_msec")
|
||||||
|
: BUFFER_SIZE_MSEC;
|
||||||
|
|
||||||
for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
|
for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
|
||||||
circlebuf_init(&gf->input_buffers[i]);
|
circlebuf_init(&gf->input_buffers[i]);
|
||||||
@ -460,6 +464,8 @@ void transcription_filter_defaults(obs_data_t *s)
|
|||||||
obs_data_set_default_string(s, "whisper_model_path", "models/ggml-tiny.en.bin");
|
obs_data_set_default_string(s, "whisper_model_path", "models/ggml-tiny.en.bin");
|
||||||
obs_data_set_default_string(s, "whisper_language_select", "en");
|
obs_data_set_default_string(s, "whisper_language_select", "en");
|
||||||
obs_data_set_default_string(s, "subtitle_sources", "none");
|
obs_data_set_default_string(s, "subtitle_sources", "none");
|
||||||
|
obs_data_set_default_bool(s, "step_by_step_processing", false);
|
||||||
|
obs_data_set_default_int(s, "step_size_msec", 1000);
|
||||||
|
|
||||||
// Whisper parameters
|
// Whisper parameters
|
||||||
obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
|
obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
|
||||||
@ -499,6 +505,21 @@ obs_properties_t *transcription_filter_properties(void *data)
|
|||||||
obs_property_list_add_int(list, "WARNING", LOG_WARNING);
|
obs_property_list_add_int(list, "WARNING", LOG_WARNING);
|
||||||
obs_properties_add_bool(ppts, "log_words", "Log output words");
|
obs_properties_add_bool(ppts, "log_words", "Log output words");
|
||||||
obs_properties_add_bool(ppts, "caption_to_stream", "Stream captions");
|
obs_properties_add_bool(ppts, "caption_to_stream", "Stream captions");
|
||||||
|
obs_property_t *step_by_step_processing = obs_properties_add_bool(
|
||||||
|
ppts, "step_by_step_processing",
|
||||||
|
"Step-by-step processing (⚠️ processing will increase)");
|
||||||
|
obs_properties_add_int_slider(ppts, "step_size_msec", "Step size (ms)", 1000,
|
||||||
|
BUFFER_SIZE_MSEC, 50);
|
||||||
|
|
||||||
|
obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props,
|
||||||
|
obs_property_t *property,
|
||||||
|
obs_data_t *settings) {
|
||||||
|
UNUSED_PARAMETER(property);
|
||||||
|
// Show/Hide the step size input
|
||||||
|
obs_property_set_visible(obs_properties_get(props, "step_size_msec"),
|
||||||
|
obs_data_get_bool(settings, "step_by_step_processing"));
|
||||||
|
return true;
|
||||||
|
});
|
||||||
|
|
||||||
obs_property_t *subs_output =
|
obs_property_t *subs_output =
|
||||||
obs_properties_add_list(ppts, "subtitle_sources", "Subtitles Output",
|
obs_properties_add_list(ppts, "subtitle_sources", "Subtitles Output",
|
||||||
|
@ -148,7 +148,7 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
|
|||||||
to_timestamp(t1).c_str(), sentence_p, text_lower.c_str());
|
to_timestamp(t1).c_str(), sentence_p, text_lower.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (text_lower.empty()) {
|
if (text_lower.empty() || text_lower == ".") {
|
||||||
return {DETECTION_RESULT_SILENCE, ""};
|
return {DETECTION_RESULT_SILENCE, ""};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -160,68 +160,66 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
|
|||||||
{
|
{
|
||||||
uint32_t num_new_frames_from_infos = 0;
|
uint32_t num_new_frames_from_infos = 0;
|
||||||
uint64_t start_timestamp = 0;
|
uint64_t start_timestamp = 0;
|
||||||
|
bool last_step_in_segment = false;
|
||||||
|
|
||||||
{
|
{
|
||||||
// scoped lock the buffer mutex
|
// scoped lock the buffer mutex
|
||||||
std::lock_guard<std::mutex> lock(*gf->whisper_buf_mutex);
|
std::lock_guard<std::mutex> lock(*gf->whisper_buf_mutex);
|
||||||
|
|
||||||
// We need (gf->frames - gf->overlap_frames) new frames to run inference,
|
// We need (gf->frames - gf->last_num_frames) new frames for a full segment,
|
||||||
// except for the first segment, where we need the whole gf->frames frames
|
const size_t remaining_frames_to_full_segment = gf->frames - gf->last_num_frames;
|
||||||
size_t how_many_frames_needed = gf->frames - gf->overlap_frames;
|
|
||||||
if (gf->last_num_frames == 0) {
|
|
||||||
how_many_frames_needed = gf->frames;
|
|
||||||
}
|
|
||||||
|
|
||||||
// pop infos from the info buffer and mark the beginning timestamp from the first
|
// pop infos from the info buffer and mark the beginning timestamp from the first
|
||||||
// info as the beginning timestamp of the segment
|
// info as the beginning timestamp of the segment
|
||||||
struct transcription_filter_audio_info info_from_buf = {0};
|
struct transcription_filter_audio_info info_from_buf = {0};
|
||||||
while (gf->info_buffer.size >= sizeof(struct transcription_filter_audio_info)) {
|
const size_t size_of_audio_info = sizeof(struct transcription_filter_audio_info);
|
||||||
circlebuf_pop_front(&gf->info_buffer, &info_from_buf,
|
while (gf->info_buffer.size >= size_of_audio_info) {
|
||||||
sizeof(struct transcription_filter_audio_info));
|
circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
|
||||||
num_new_frames_from_infos += info_from_buf.frames;
|
num_new_frames_from_infos += info_from_buf.frames;
|
||||||
if (start_timestamp == 0) {
|
if (start_timestamp == 0) {
|
||||||
start_timestamp = info_from_buf.timestamp;
|
start_timestamp = info_from_buf.timestamp;
|
||||||
}
|
}
|
||||||
obs_log(gf->log_level, "popped %d frames from info buffer, %lu needed",
|
|
||||||
num_new_frames_from_infos, how_many_frames_needed);
|
|
||||||
// Check if we're within the needed segment length
|
// Check if we're within the needed segment length
|
||||||
if (num_new_frames_from_infos > how_many_frames_needed) {
|
if (num_new_frames_from_infos > remaining_frames_to_full_segment) {
|
||||||
// too big, push the last info into the buffer's front where it was
|
// too big, push the last info into the buffer's front where it was
|
||||||
num_new_frames_from_infos -= info_from_buf.frames;
|
num_new_frames_from_infos -= info_from_buf.frames;
|
||||||
circlebuf_push_front(
|
circlebuf_push_front(&gf->info_buffer, &info_from_buf,
|
||||||
&gf->info_buffer, &info_from_buf,
|
size_of_audio_info);
|
||||||
sizeof(struct transcription_filter_audio_info));
|
last_step_in_segment =
|
||||||
|
true; // this is the final step in the segment
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
obs_log(gf->log_level,
|
||||||
|
"with %lu remaining to full segment, popped %d info-frames, pushing into buffer at %lu",
|
||||||
|
remaining_frames_to_full_segment, num_new_frames_from_infos,
|
||||||
|
gf->last_num_frames);
|
||||||
|
|
||||||
/* Pop from input circlebuf */
|
/* Pop from input circlebuf */
|
||||||
for (size_t c = 0; c < gf->channels; c++) {
|
for (size_t c = 0; c < gf->channels; c++) {
|
||||||
if (gf->last_num_frames > 0) {
|
// Push the new data to the end of the existing buffer copy_buffers[c]
|
||||||
// move overlap frames from the end of the last copy_buffers to the beginning
|
circlebuf_pop_front(&gf->input_buffers[c],
|
||||||
memcpy(gf->copy_buffers[c],
|
gf->copy_buffers[c] + gf->last_num_frames,
|
||||||
gf->copy_buffers[c] + gf->last_num_frames -
|
num_new_frames_from_infos * sizeof(float));
|
||||||
gf->overlap_frames,
|
|
||||||
gf->overlap_frames * sizeof(float));
|
|
||||||
// copy new data to the end of copy_buffers[c]
|
|
||||||
circlebuf_pop_front(&gf->input_buffers[c],
|
|
||||||
gf->copy_buffers[c] + gf->overlap_frames,
|
|
||||||
num_new_frames_from_infos * sizeof(float));
|
|
||||||
} else {
|
|
||||||
// Very first time, just copy data to copy_buffers[c]
|
|
||||||
circlebuf_pop_front(&gf->input_buffers[c], gf->copy_buffers[c],
|
|
||||||
num_new_frames_from_infos * sizeof(float));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
obs_log(gf->log_level,
|
}
|
||||||
"popped %u frames from input buffer. input_buffer[0] size is %lu",
|
|
||||||
num_new_frames_from_infos, gf->input_buffers[0].size);
|
|
||||||
|
|
||||||
if (gf->last_num_frames > 0) {
|
if (gf->last_num_frames > 0) {
|
||||||
gf->last_num_frames = num_new_frames_from_infos + gf->overlap_frames;
|
gf->last_num_frames += num_new_frames_from_infos;
|
||||||
|
if (!last_step_in_segment) {
|
||||||
|
// Mid-segment process
|
||||||
|
obs_log(gf->log_level, "mid-segment, now %d frames left to full segment",
|
||||||
|
(int)(gf->frames - gf->last_num_frames));
|
||||||
} else {
|
} else {
|
||||||
gf->last_num_frames = num_new_frames_from_infos;
|
// Final step in segment
|
||||||
|
obs_log(gf->log_level, "full segment, %d frames to process",
|
||||||
|
(int)(gf->last_num_frames));
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
gf->last_num_frames = num_new_frames_from_infos;
|
||||||
|
obs_log(gf->log_level, "first segment, %d frames to process",
|
||||||
|
(int)(gf->last_num_frames));
|
||||||
}
|
}
|
||||||
|
|
||||||
obs_log(gf->log_level, "processing %d frames (%d ms), start timestamp %llu ",
|
obs_log(gf->log_level, "processing %d frames (%d ms), start timestamp %llu ",
|
||||||
@ -271,28 +269,21 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
|
|||||||
// end of timer
|
// end of timer
|
||||||
auto end = std::chrono::high_resolution_clock::now();
|
auto end = std::chrono::high_resolution_clock::now();
|
||||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
|
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
|
||||||
const uint32_t new_frames_from_infos_ms =
|
const uint64_t last_num_frames_ms = gf->last_num_frames * 1000 / gf->sample_rate;
|
||||||
num_new_frames_from_infos * 1000 /
|
obs_log(gf->log_level, "audio processing of %lu ms data took %d ms", last_num_frames_ms,
|
||||||
gf->sample_rate; // number of frames in this packet
|
(int)duration);
|
||||||
obs_log(gf->log_level, "audio processing of %u ms new data took %d ms",
|
|
||||||
new_frames_from_infos_ms, (int)duration);
|
|
||||||
|
|
||||||
if (duration > new_frames_from_infos_ms) {
|
if (last_step_in_segment) {
|
||||||
// try to decrease overlap down to minimum of 100 ms
|
for (size_t c = 0; c < gf->channels; c++) {
|
||||||
gf->overlap_ms = std::max((uint64_t)gf->overlap_ms - 10, (uint64_t)100);
|
// This is the last step in the segment - reset the copy buffer (include overlap frames)
|
||||||
gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000;
|
// move overlap frames from the end of the last copy_buffers to the beginning
|
||||||
obs_log(gf->log_level,
|
memcpy(gf->copy_buffers[c],
|
||||||
"audio processing took too long (%d ms), reducing overlap to %lu ms",
|
gf->copy_buffers[c] + gf->last_num_frames - gf->overlap_frames,
|
||||||
(int)duration, gf->overlap_ms);
|
gf->overlap_frames * sizeof(float));
|
||||||
} else if (!skipped_inference) {
|
// zero out the rest of the buffer, just in case
|
||||||
if (gf->overlap_ms < OVERLAP_SIZE_MSEC) {
|
memset(gf->copy_buffers[c] + gf->overlap_frames, 0,
|
||||||
// try to increase overlap up to OVERLAP_SIZE_MSEC
|
(gf->frames - gf->overlap_frames) * sizeof(float));
|
||||||
gf->overlap_ms = std::min((uint64_t)gf->overlap_ms + 10,
|
gf->last_num_frames = gf->overlap_frames;
|
||||||
(uint64_t)OVERLAP_SIZE_MSEC);
|
|
||||||
gf->overlap_frames = gf->overlap_ms * gf->sample_rate / 1000;
|
|
||||||
obs_log(gf->log_level,
|
|
||||||
"audio processing took %d ms, increasing overlap to %lu ms",
|
|
||||||
(int)duration, gf->overlap_ms);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -306,7 +297,6 @@ void whisper_loop(void *data)
|
|||||||
|
|
||||||
struct transcription_filter_data *gf =
|
struct transcription_filter_data *gf =
|
||||||
static_cast<struct transcription_filter_data *>(data);
|
static_cast<struct transcription_filter_data *>(data);
|
||||||
const size_t segment_size = gf->frames * sizeof(float);
|
|
||||||
|
|
||||||
obs_log(LOG_INFO, "starting whisper thread");
|
obs_log(LOG_INFO, "starting whisper thread");
|
||||||
|
|
||||||
@ -327,6 +317,8 @@ void whisper_loop(void *data)
|
|||||||
std::lock_guard<std::mutex> lock(*gf->whisper_buf_mutex);
|
std::lock_guard<std::mutex> lock(*gf->whisper_buf_mutex);
|
||||||
input_buf_size = gf->input_buffers[0].size;
|
input_buf_size = gf->input_buffers[0].size;
|
||||||
}
|
}
|
||||||
|
const size_t step_size_frames = gf->step_size_msec * gf->sample_rate / 1000;
|
||||||
|
const size_t segment_size = step_size_frames * sizeof(float);
|
||||||
|
|
||||||
if (input_buf_size >= segment_size) {
|
if (input_buf_size >= segment_size) {
|
||||||
obs_log(gf->log_level,
|
obs_log(gf->log_level,
|
||||||
|
@ -3,10 +3,10 @@
|
|||||||
|
|
||||||
// buffer size in msec
|
// buffer size in msec
|
||||||
#define BUFFER_SIZE_MSEC 3000
|
#define BUFFER_SIZE_MSEC 3000
|
||||||
// at 16Khz, 3000 msec is 48000 samples
|
// at 16Khz, BUFFER_SIZE_MSEC is WHISPER_FRAME_SIZE samples
|
||||||
#define WHISPER_FRAME_SIZE 48000
|
#define WHISPER_FRAME_SIZE 48000
|
||||||
// overlap in msec
|
// overlap in msec
|
||||||
#define OVERLAP_SIZE_MSEC 200
|
#define OVERLAP_SIZE_MSEC 100
|
||||||
|
|
||||||
void whisper_loop(void *data);
|
void whisper_loop(void *data);
|
||||||
struct whisper_context *init_whisper_context(const std::string &model_path);
|
struct whisper_context *init_whisper_context(const std::string &model_path);
|
||||||
|
Loading…
Reference in New Issue
Block a user