https://github.com/occ-ai/obs-localvocal
Add new Whisper models to models_directory.json and adjust transcription filter properties
commit f478809f79
parent 3668195652
@@ -435,6 +435,116 @@
       }
     ]
   },
+  {
+    "friendly_name": "Whisper Base q8 (81Mb)",
+    "local_folder_name": "ggml-base-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-base-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Base English q8 (81Mb)",
+    "local_folder_name": "ggml-base-en-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-base.en-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Large v2 q8 (1.66Gb)",
+    "local_folder_name": "ggml-large-v2-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-large-v2-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Medium q8 (823Mb)",
+    "local_folder_name": "ggml-medium-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-medium-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Large v3 Turbo q8 (874Mb)",
+    "local_folder_name": "ggml-large-v3-turbo-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-large-v3-turbo-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Small q8 (264Mb)",
+    "local_folder_name": "ggml-small-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-small-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Medium English q8 (823Mb)",
+    "local_folder_name": "ggml-medium-en-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-medium.en-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Small English q8 (264Mb)",
+    "local_folder_name": "ggml-small-en-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-small.en-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Tiny q8 (43Mb)",
+    "local_folder_name": "ggml-tiny-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-tiny-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
+  {
+    "friendly_name": "Whisper Tiny English q8 (43Mb)",
+    "local_folder_name": "ggml-tiny-en-q8_0",
+    "type": "MODEL_TYPE_TRANSCRIPTION",
+    "files": [
+      {
+        "url": "https://huggingface.co/ggerganov/whisper.cpp/blob/main/ggml-tiny.en-q8_0.bin",
+        "sha256": ""
+      }
+    ]
+  },
   {
     "friendly_name": "Whisper Base German Awareai (Marksdo)",
     "local_folder_name": "ggml-base-awareai.de",
@@ -4916,4 +5026,4 @@
       }
     }
   ]
 }
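The new entries above leave their "sha256" fields empty. Below is a minimal standalone sketch, not part of this commit, for computing the checksum of a downloaded model file so those fields could be filled in; it assumes OpenSSL's libcrypto is available and is built separately (e.g. g++ sha256sum.cpp -lcrypto).

// sha256sum.cpp -- standalone helper (not from this repository): print the SHA-256
// of a downloaded ggml model file, e.g. ggml-base-q8_0.bin, as a lowercase hex string.
#include <openssl/evp.h>

#include <cstdio>
#include <fstream>
#include <iostream>
#include <vector>

int main(int argc, char *argv[])
{
	if (argc < 2) {
		std::cerr << "usage: " << argv[0] << " <model.bin>\n";
		return 1;
	}
	std::ifstream file(argv[1], std::ios::binary);
	if (!file) {
		std::cerr << "cannot open " << argv[1] << "\n";
		return 1;
	}
	EVP_MD_CTX *ctx = EVP_MD_CTX_new();
	EVP_DigestInit_ex(ctx, EVP_sha256(), nullptr);
	std::vector<char> buf(1 << 16);
	// Stream the file through the digest in 64 KiB chunks.
	while (file.read(buf.data(), buf.size()) || file.gcount() > 0)
		EVP_DigestUpdate(ctx, buf.data(), (size_t)file.gcount());
	unsigned char digest[EVP_MAX_MD_SIZE];
	unsigned int len = 0;
	EVP_DigestFinal_ex(ctx, digest, &len);
	EVP_MD_CTX_free(ctx);
	for (unsigned int i = 0; i < len; i++)
		std::printf("%02x", digest[i]);
	std::printf("\n");
	return 0;
}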
@@ -626,7 +626,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_bool(s, "split_on_word", true);
 	obs_data_set_default_int(s, "max_tokens", 50);
 	obs_data_set_default_bool(s, "suppress_blank", false);
-	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
+	obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
 	obs_data_set_default_double(s, "temperature", 0.1);
 	obs_data_set_default_double(s, "max_initial_ts", 1.0);
 	obs_data_set_default_double(s, "length_penalty", -1.0);
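The only behavioral change in this hunk is the default for "suppress_non_speech_tokens", which flips from true to false. Sources that set the key explicitly keep their stored value; sources that never set it pick up the new default. A minimal sketch, not from this commit, of turning suppression back on for one filter instance through the libobs data API (the helper name is illustrative):

#include <obs-module.h>

// Illustrative helper: re-enable non-speech-token suppression on a specific
// LocalVocal filter instance after the default changed to false.
static void enable_non_speech_suppression(obs_source_t *filter)
{
	obs_data_t *settings = obs_source_get_settings(filter); // returns a new reference
	obs_data_set_bool(settings, "suppress_non_speech_tokens", true);
	obs_source_update(filter, settings); // apply the changed settings to the filter
	obs_data_release(settings);
}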
@@ -24,9 +24,11 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 		return 1;
 	}
 
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 	obs_log(gf->log_level,
 		"segmentation: currently %lu bytes in the audio input buffer",
 		gf->input_buffers[0].size);
+#endif
 
 	// max number of frames is 10 seconds worth of audio
 	const size_t max_num_frames = gf->sample_rate * 10;
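This and the following hunks wrap debug-level obs_log() calls in LOCALVOCAL_EXTRA_VERBOSE guards so they compile out of ordinary builds. A self-contained illustration of that pattern, not from this commit (how the project actually defines the macro is not shown in this diff; a -DLOCALVOCAL_EXTRA_VERBOSE compiler flag would be one way):

#include <cstddef>
#include <cstdio>

#define LOCALVOCAL_EXTRA_VERBOSE // comment out to compile the verbose logging away

static void log_buffer_size(std::size_t bytes)
{
#ifdef LOCALVOCAL_EXTRA_VERBOSE
	// Stand-in for obs_log(gf->log_level, ...): present only in verbose builds.
	std::printf("segmentation: currently %zu bytes in the audio input buffer\n", bytes);
#else
	(void)bytes; // avoid an unused-parameter warning in non-verbose builds
#endif
}

int main()
{
	log_buffer_size(48000 * sizeof(float));
	return 0;
}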
@@ -76,8 +78,10 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 		}
 	}
 
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 	obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos);
 	gf->last_num_frames = num_frames_from_infos;
+#endif
 
 	{
 		// resample to 16kHz
@@ -95,11 +99,13 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 
 		circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0],
 				    resampled_16khz_frames * sizeof(float));
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 		obs_log(gf->log_level,
 			"resampled: %d channels, %d frames, %f ms, current size: %lu bytes",
 			(int)gf->channels, (int)resampled_16khz_frames,
 			(float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f,
 			gf->resampled_buffer.size);
+#endif
 	}
 
 	return 0;
@@ -129,8 +135,10 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
 	circlebuf_pop_front(&gf->resampled_buffer, vad_input.data(),
 			    vad_input.size() * sizeof(float));
 
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 	obs_log(gf->log_level, "sending %d frames to vad, %d windows, reset state? %s",
 		vad_input.size(), vad_num_windows, (!last_vad_state.vad_on) ? "yes" : "no");
+#endif
 	{
 		ProfileScope("vad->process");
 		gf->vad->process(vad_input, !last_vad_state.vad_on);
@@ -144,7 +152,9 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
 
 	std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
 	if (stamps.size() == 0) {
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 		obs_log(gf->log_level, "VAD detected no speech in %u frames", vad_input.size());
+#endif
 		if (last_vad_state.vad_on) {
 			obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
 			run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
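The branch at the end of this last hunk is the segment-closing path: when the VAD returns no speech timestamps but the previous chunk ended with speech active, the open segment is closed and handed to inference. A schematic sketch of that decision, not from this commit, with simplified stand-in types (the real code keeps this state in vad_state, tracks the segment start as start_ts_offest_ms, and calls run_inference_and_callbacks()):

#include <cstdint>
#include <vector>

struct SpeechStamp {
	int64_t start;
	int64_t end;
};

struct SegState {
	bool vad_on = false; // was speech still active when the previous chunk ended?
};

// Returns true when a speech segment just ended and should be sent to inference.
static bool segment_ended(const std::vector<SpeechStamp> &stamps, SegState &state)
{
	if (stamps.empty()) {
		if (state.vad_on) {
			state.vad_on = false; // silence after speech: close the open segment
			return true;          // caller runs inference on the finished segment
		}
		return false; // silence continuing, nothing to do
	}
	state.vad_on = true; // speech in this chunk: keep (or start) an open segment
	return false;
}

int main()
{
	SegState state;
	state.vad_on = true;             // previous chunk ended mid-speech
	std::vector<SpeechStamp> stamps; // current chunk: VAD found no speech
	return segment_ended(stamps, state) ? 0 : 1; // 0 here: segment closed, inference would run
}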