refactor: Update timestamp variable name in transcription-filter-data.h (#109)

This commit is contained in:
Roy Shilkrot 2024-06-11 12:15:49 -04:00 committed by GitHub
parent 845c1a813c
commit 91c2842009
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 61 additions and 45 deletions

View File

@ -465,9 +465,10 @@ int wmain(int argc, wchar_t *argv[])
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
// make a timestamp from the current position in the audio buffer
info.timestamp = start_time + (int64_t)(((float)frames_count /
(float)gf->sample_rate) *
1e9);
info.timestamp_offset_ns =
start_time +
(int64_t)(((float)frames_count / (float)gf->sample_rate) *
1e9);
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}
frames_count += frames;
@ -489,7 +490,7 @@ int wmain(int argc, wchar_t *argv[])
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
// make a timestamp from the current frame count
info.timestamp = frames_count * 1000 / gf->sample_rate;
info.timestamp_offset_ns = frames_count * 1000 / gf->sample_rate;
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}

View File

@ -127,7 +127,7 @@ struct transcription_filter_data {
// Audio packet info
struct transcription_filter_audio_info {
uint32_t frames;
uint64_t timestamp; // absolute (since epoch) timestamp in ns
uint64_t timestamp_offset_ns; // offset (since start of processing) timestamp in ns
};
// Callback sent when the transcription has a new result

View File

@ -102,8 +102,14 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct transcription_filter_audio_info info = {0};
info.frames = audio->frames; // number of frames in this packet
info.timestamp = audio->timestamp; // timestamp of this packet
info.frames = audio->frames; // number of frames in this packet
// check if the timestamp is a false "negative" value for uint64_t
if (audio->timestamp > (std::numeric_limits<uint64_t>::max() - 100000000)) {
// set the timestamp to the current time
info.timestamp_offset_ns = 0;
} else {
info.timestamp_offset_ns = audio->timestamp; // timestamp of this packet
}
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}

View File

@ -22,24 +22,10 @@
struct vad_state {
bool vad_on;
uint64_t start_timestamp;
uint64_t end_timestamp;
uint64_t start_ts_offest_ms;
uint64_t end_ts_offset_ms;
};
// Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
std::string to_timestamp(uint64_t t)
{
uint64_t sec = t / 1000;
uint64_t msec = t - sec * 1000;
uint64_t min = sec / 60;
sec = sec - min * 60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int)min, (int)sec, (int)msec);
return std::string(buf);
}
struct whisper_context *init_whisper_context(const std::string &model_path_in,
struct transcription_filter_data *gf)
{
@ -314,8 +300,8 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
{
uint32_t num_frames_from_infos = 0;
uint64_t start_timestamp = 0;
uint64_t end_timestamp = 0;
uint64_t start_timestamp_offset_ns = 0;
uint64_t end_timestamp_offset_ns = 0;
size_t overlap_size = 0;
for (size_t c = 0; c < gf->channels; c++) {
@ -342,8 +328,8 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
while (gf->info_buffer.size >= size_of_audio_info) {
circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
num_frames_from_infos += info_from_buf.frames;
if (start_timestamp == 0) {
start_timestamp = info_from_buf.timestamp;
if (start_timestamp_offset_ns == 0) {
start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
}
// Check if we're within the needed segment length
if (num_frames_from_infos > max_num_frames) {
@ -354,7 +340,7 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
break;
}
}
end_timestamp = info_from_buf.timestamp;
end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
/* Pop from input circlebuf */
for (size_t c = 0; c < gf->channels; c++) {
@ -386,10 +372,10 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
resampled_16khz[0] + resampled_16khz_frames);
gf->vad->process(vad_input, false);
const uint64_t start_offset_ms = start_timestamp / 1000000 - gf->start_timestamp_ms;
const uint64_t end_offset_ms = end_timestamp / 1000000 - gf->start_timestamp_ms;
const uint64_t start_ts_offset_ms = start_timestamp_offset_ns / 1000000;
const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000;
vad_state current_vad_state = {false, start_offset_ms, end_offset_ms};
vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms};
std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
if (stamps.size() == 0) {
@ -397,8 +383,9 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
resampled_16khz_frames);
if (last_vad_state.vad_on) {
obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
last_vad_state.end_timestamp, VAD_STATE_WAS_ON);
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
last_vad_state.end_ts_offset_ms,
VAD_STATE_WAS_ON);
}
if (gf->enable_audio_chunks_callback) {
@ -406,8 +393,8 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
VAD_STATE_IS_OFF,
{DETECTION_RESULT_SILENCE,
"[silence]",
current_vad_state.start_timestamp,
current_vad_state.end_timestamp,
current_vad_state.start_ts_offest_ms,
current_vad_state.end_ts_offset_ms,
{}});
}
} else {
@ -447,29 +434,30 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
obs_log(gf->log_level, "VAD segment end -> send to inference");
// find the end timestamp of the segment
const uint64_t segment_end_ts =
start_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
segment_end_ts,
last_vad_state.vad_on
? VAD_STATE_WAS_ON
: VAD_STATE_WAS_OFF);
current_vad_state.vad_on = false;
current_vad_state.start_timestamp = current_vad_state.end_timestamp;
current_vad_state.end_timestamp = 0;
current_vad_state.start_ts_offest_ms =
current_vad_state.end_ts_offset_ms;
current_vad_state.end_ts_offset_ms = 0;
} else {
current_vad_state.vad_on = true;
if (last_vad_state.vad_on) {
current_vad_state.start_timestamp =
last_vad_state.start_timestamp;
current_vad_state.start_ts_offest_ms =
last_vad_state.start_ts_offest_ms;
} else {
current_vad_state.start_timestamp =
start_offset_ms +
current_vad_state.start_ts_offest_ms =
start_ts_offset_ms +
start_frame * 1000 / WHISPER_SAMPLE_RATE;
}
obs_log(gf->log_level,
"end not reached. vad state: start ts: %llu, end ts: %llu",
current_vad_state.start_timestamp,
current_vad_state.end_timestamp);
current_vad_state.start_ts_offest_ms,
current_vad_state.end_ts_offset_ms);
}
last_vad_state = current_vad_state;
}

View File

@ -152,3 +152,16 @@ std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_to
return reconstructed;
}
std::string to_timestamp(uint64_t t_ms_offset)
{
uint64_t sec = t_ms_offset / 1000;
uint64_t msec = t_ms_offset - sec * 1000;
uint64_t min = sec / 60;
sec = sec - min * 60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int)min, (int)sec, (int)msec);
return std::string(buf);
}

View File

@ -14,4 +14,12 @@ std::pair<int, int> findStartOfOverlap(const std::vector<whisper_token_data> &se
std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_token_data> &seq1,
const std::vector<whisper_token_data> &seq2);
/**
* @brief Convert a timestamp in milliseconds to a string in the format "MM:SS.sss" .
* Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
* @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream)
* @return std::string Timestamp in the format "MM:SS.sss"
*/
std::string to_timestamp(uint64_t t_ms_offset);
#endif /* WHISPER_UTILS_H */