mirror of
https://github.com/occ-ai/obs-localvocal
synced 2024-11-07 18:57:14 +00:00
refactor: Update timestamp variable name in transcription-filter-data.h (#109)
This commit is contained in:
parent
845c1a813c
commit
91c2842009
@ -465,9 +465,10 @@ int wmain(int argc, wchar_t *argv[])
|
||||
struct transcription_filter_audio_info info = {0};
|
||||
info.frames = frames; // number of frames in this packet
|
||||
// make a timestamp from the current position in the audio buffer
|
||||
info.timestamp = start_time + (int64_t)(((float)frames_count /
|
||||
(float)gf->sample_rate) *
|
||||
1e9);
|
||||
info.timestamp_offset_ns =
|
||||
start_time +
|
||||
(int64_t)(((float)frames_count / (float)gf->sample_rate) *
|
||||
1e9);
|
||||
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
|
||||
}
|
||||
frames_count += frames;
|
||||
@ -489,7 +490,7 @@ int wmain(int argc, wchar_t *argv[])
|
||||
struct transcription_filter_audio_info info = {0};
|
||||
info.frames = frames; // number of frames in this packet
|
||||
// make a timestamp from the current frame count
|
||||
info.timestamp = frames_count * 1000 / gf->sample_rate;
|
||||
info.timestamp_offset_ns = frames_count * 1000 / gf->sample_rate;
|
||||
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
|
||||
}
|
||||
|
||||
|
@ -127,7 +127,7 @@ struct transcription_filter_data {
|
||||
// Audio packet info
|
||||
struct transcription_filter_audio_info {
|
||||
uint32_t frames;
|
||||
uint64_t timestamp; // absolute (since epoch) timestamp in ns
|
||||
uint64_t timestamp_offset_ns; // offset (since start of processing) timestamp in ns
|
||||
};
|
||||
|
||||
// Callback sent when the transcription has a new result
|
||||
|
@ -102,8 +102,14 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
|
||||
}
|
||||
// push audio packet info (timestamp/frame count) to info circlebuf
|
||||
struct transcription_filter_audio_info info = {0};
|
||||
info.frames = audio->frames; // number of frames in this packet
|
||||
info.timestamp = audio->timestamp; // timestamp of this packet
|
||||
info.frames = audio->frames; // number of frames in this packet
|
||||
// check if the timestamp is a false "negative" value for uint64_t
|
||||
if (audio->timestamp > (std::numeric_limits<uint64_t>::max() - 100000000)) {
|
||||
// the timestamp is not usable: fall back to an offset of 0
|
||||
info.timestamp_offset_ns = 0;
|
||||
} else {
|
||||
info.timestamp_offset_ns = audio->timestamp; // timestamp of this packet
|
||||
}
|
||||
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
|
||||
}
|
||||
|
||||
|
@ -22,24 +22,10 @@
|
||||
|
||||
struct vad_state {
|
||||
bool vad_on;
|
||||
uint64_t start_timestamp;
|
||||
uint64_t end_timestamp;
|
||||
uint64_t start_ts_offest_ms;
|
||||
uint64_t end_ts_offset_ms;
|
||||
};
|
||||
|
||||
// Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
|
||||
/**
 * @brief Format a millisecond count as "MM:SS.mmm".
 *
 * The minutes field is not wrapped at 60 and grows beyond two digits for long
 * durations.
 *
 * @param t Time in milliseconds
 * @return std::string The formatted timestamp
 */
std::string to_timestamp(uint64_t t)
{
	const uint64_t total_sec = t / 1000;
	const uint64_t msec = t % 1000;
	const uint64_t min = total_sec / 60;
	const uint64_t sec = total_sec % 60;

	// Format as unsigned 64-bit values: narrowing to int would wrap (or turn
	// negative) for very large inputs. The longest possible output is 22
	// characters, so 32 bytes is always enough.
	char buf[32];
	snprintf(buf, sizeof(buf), "%02llu:%02llu.%03llu", static_cast<unsigned long long>(min),
		 static_cast<unsigned long long>(sec), static_cast<unsigned long long>(msec));

	return std::string(buf);
}
|
||||
|
||||
struct whisper_context *init_whisper_context(const std::string &model_path_in,
|
||||
struct transcription_filter_data *gf)
|
||||
{
|
||||
@ -314,8 +300,8 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
|
||||
vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
|
||||
{
|
||||
uint32_t num_frames_from_infos = 0;
|
||||
uint64_t start_timestamp = 0;
|
||||
uint64_t end_timestamp = 0;
|
||||
uint64_t start_timestamp_offset_ns = 0;
|
||||
uint64_t end_timestamp_offset_ns = 0;
|
||||
size_t overlap_size = 0;
|
||||
|
||||
for (size_t c = 0; c < gf->channels; c++) {
|
||||
@ -342,8 +328,8 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
while (gf->info_buffer.size >= size_of_audio_info) {
|
||||
circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
|
||||
num_frames_from_infos += info_from_buf.frames;
|
||||
if (start_timestamp == 0) {
|
||||
start_timestamp = info_from_buf.timestamp;
|
||||
if (start_timestamp_offset_ns == 0) {
|
||||
start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
|
||||
}
|
||||
// Check if we're within the needed segment length
|
||||
if (num_frames_from_infos > max_num_frames) {
|
||||
@ -354,7 +340,7 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
break;
|
||||
}
|
||||
}
|
||||
end_timestamp = info_from_buf.timestamp;
|
||||
end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
|
||||
|
||||
/* Pop from input circlebuf */
|
||||
for (size_t c = 0; c < gf->channels; c++) {
|
||||
@ -386,10 +372,10 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
resampled_16khz[0] + resampled_16khz_frames);
|
||||
gf->vad->process(vad_input, false);
|
||||
|
||||
const uint64_t start_offset_ms = start_timestamp / 1000000 - gf->start_timestamp_ms;
|
||||
const uint64_t end_offset_ms = end_timestamp / 1000000 - gf->start_timestamp_ms;
|
||||
const uint64_t start_ts_offset_ms = start_timestamp_offset_ns / 1000000;
|
||||
const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000;
|
||||
|
||||
vad_state current_vad_state = {false, start_offset_ms, end_offset_ms};
|
||||
vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms};
|
||||
|
||||
std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
|
||||
if (stamps.size() == 0) {
|
||||
@ -397,8 +383,9 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
resampled_16khz_frames);
|
||||
if (last_vad_state.vad_on) {
|
||||
obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
|
||||
run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
|
||||
last_vad_state.end_timestamp, VAD_STATE_WAS_ON);
|
||||
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
|
||||
last_vad_state.end_ts_offset_ms,
|
||||
VAD_STATE_WAS_ON);
|
||||
}
|
||||
|
||||
if (gf->enable_audio_chunks_callback) {
|
||||
@ -406,8 +393,8 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
VAD_STATE_IS_OFF,
|
||||
{DETECTION_RESULT_SILENCE,
|
||||
"[silence]",
|
||||
current_vad_state.start_timestamp,
|
||||
current_vad_state.end_timestamp,
|
||||
current_vad_state.start_ts_offest_ms,
|
||||
current_vad_state.end_ts_offset_ms,
|
||||
{}});
|
||||
}
|
||||
} else {
|
||||
@ -447,29 +434,30 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
obs_log(gf->log_level, "VAD segment end -> send to inference");
|
||||
// find the end timestamp of the segment
|
||||
const uint64_t segment_end_ts =
|
||||
start_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
|
||||
run_inference_and_callbacks(gf, last_vad_state.start_timestamp,
|
||||
start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE;
|
||||
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
|
||||
segment_end_ts,
|
||||
last_vad_state.vad_on
|
||||
? VAD_STATE_WAS_ON
|
||||
: VAD_STATE_WAS_OFF);
|
||||
current_vad_state.vad_on = false;
|
||||
current_vad_state.start_timestamp = current_vad_state.end_timestamp;
|
||||
current_vad_state.end_timestamp = 0;
|
||||
current_vad_state.start_ts_offest_ms =
|
||||
current_vad_state.end_ts_offset_ms;
|
||||
current_vad_state.end_ts_offset_ms = 0;
|
||||
} else {
|
||||
current_vad_state.vad_on = true;
|
||||
if (last_vad_state.vad_on) {
|
||||
current_vad_state.start_timestamp =
|
||||
last_vad_state.start_timestamp;
|
||||
current_vad_state.start_ts_offest_ms =
|
||||
last_vad_state.start_ts_offest_ms;
|
||||
} else {
|
||||
current_vad_state.start_timestamp =
|
||||
start_offset_ms +
|
||||
current_vad_state.start_ts_offest_ms =
|
||||
start_ts_offset_ms +
|
||||
start_frame * 1000 / WHISPER_SAMPLE_RATE;
|
||||
}
|
||||
obs_log(gf->log_level,
|
||||
"end not reached. vad state: start ts: %llu, end ts: %llu",
|
||||
current_vad_state.start_timestamp,
|
||||
current_vad_state.end_timestamp);
|
||||
current_vad_state.start_ts_offest_ms,
|
||||
current_vad_state.end_ts_offset_ms);
|
||||
}
|
||||
last_vad_state = current_vad_state;
|
||||
}
|
||||
|
@ -152,3 +152,16 @@ std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_to
|
||||
|
||||
return reconstructed;
|
||||
}
|
||||
|
||||
/**
 * @brief Format a millisecond offset as "MM:SS.mmm".
 *
 * The minutes field is not wrapped at 60 and grows beyond two digits for long
 * offsets.
 *
 * @param t_ms_offset Offset in milliseconds
 * @return std::string The formatted timestamp
 */
std::string to_timestamp(uint64_t t_ms_offset)
{
	const uint64_t whole_seconds = t_ms_offset / 1000;
	const uint64_t millis = t_ms_offset % 1000;
	const uint64_t minutes = whole_seconds / 60;
	const uint64_t seconds = whole_seconds % 60;

	// Format as unsigned 64-bit values: narrowing to int would wrap (or turn
	// negative) for very large offsets. The longest possible output is 22
	// characters, so 32 bytes is always enough.
	char buf[32];
	snprintf(buf, sizeof(buf), "%02llu:%02llu.%03llu",
		 static_cast<unsigned long long>(minutes),
		 static_cast<unsigned long long>(seconds),
		 static_cast<unsigned long long>(millis));

	return std::string(buf);
}
|
||||
|
@ -14,4 +14,12 @@ std::pair<int, int> findStartOfOverlap(const std::vector<whisper_token_data> &se
|
||||
std::vector<whisper_token_data> reconstructSentence(const std::vector<whisper_token_data> &seq1,
|
||||
const std::vector<whisper_token_data> &seq2);
|
||||
|
||||
/**
|
||||
* @brief Convert a timestamp in milliseconds to a string in the format "MM:SS.sss".
|
||||
* Taken from https://github.com/ggerganov/whisper.cpp/blob/master/examples/stream/stream.cpp
|
||||
* @param t_ms_offset Timestamp in milliseconds (offset from the beginning of the stream)
|
||||
* @return std::string Timestamp in the format "MM:SS.sss"
|
||||
*/
|
||||
std::string to_timestamp(uint64_t t_ms_offset);
|
||||
|
||||
#endif /* WHISPER_UTILS_H */
|
||||
|
Loading…
Reference in New Issue
Block a user