mirror of
https://github.com/occ-ai/obs-localvocal
synced 2024-11-07 02:40:12 +00:00
Add new Whisper models to models_directory.json and adjust transcription filter properties
This commit is contained in:
parent
3668195652
commit
f478809f79
@ -435,6 +435,116 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Base q8 (81Mb)",
|
||||
"local_folder_name": "ggml-base-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Base English q8 (81Mb)",
|
||||
"local_folder_name": "ggml-base-en-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Large v2 q8 (1.66Gb)",
|
||||
"local_folder_name": "ggml-large-v2-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Medium q8 (823Mb)",
|
||||
"local_folder_name": "ggml-medium-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Large v3 Turbo q8 (874Mb)",
|
||||
"local_folder_name": "ggml-large-v3-turbo-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Small q8 (264Mb)",
|
||||
"local_folder_name": "ggml-small-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Medium English q8 (823Mb)",
|
||||
"local_folder_name": "ggml-medium-en-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Small English q8 (264Mb)",
|
||||
"local_folder_name": "ggml-small-en-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Tiny q8 (43Mb)",
|
||||
"local_folder_name": "ggml-tiny-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Tiny English q8 (43Mb)",
|
||||
"local_folder_name": "ggml-tiny-en-q8_0",
|
||||
"type": "MODEL_TYPE_TRANSCRIPTION",
|
||||
"files": [
|
||||
{
|
||||
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q8_0.bin",
|
||||
"sha256": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"friendly_name": "Whisper Base German Awareai (Marksdo)",
|
||||
"local_folder_name": "ggml-base-awareai.de",
|
||||
|
@ -626,7 +626,7 @@ void transcription_filter_defaults(obs_data_t *s)
|
||||
obs_data_set_default_bool(s, "split_on_word", true);
|
||||
obs_data_set_default_int(s, "max_tokens", 50);
|
||||
obs_data_set_default_bool(s, "suppress_blank", false);
|
||||
obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
|
||||
obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
|
||||
obs_data_set_default_double(s, "temperature", 0.1);
|
||||
obs_data_set_default_double(s, "max_initial_ts", 1.0);
|
||||
obs_data_set_default_double(s, "length_penalty", -1.0);
|
||||
|
@ -24,9 +24,11 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef LOCALVOCAL_EXTRA_VERBOSE
|
||||
obs_log(gf->log_level,
|
||||
"segmentation: currently %lu bytes in the audio input buffer",
|
||||
gf->input_buffers[0].size);
|
||||
#endif
|
||||
|
||||
// max number of frames is 10 seconds worth of audio
|
||||
const size_t max_num_frames = gf->sample_rate * 10;
|
||||
@ -76,8 +78,10 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef LOCALVOCAL_EXTRA_VERBOSE
|
||||
obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos);
|
||||
gf->last_num_frames = num_frames_from_infos;
|
||||
#endif
|
||||
|
||||
{
|
||||
// resample to 16kHz
|
||||
@ -95,11 +99,13 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
|
||||
|
||||
circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0],
|
||||
resampled_16khz_frames * sizeof(float));
|
||||
#ifdef LOCALVOCAL_EXTRA_VERBOSE
|
||||
obs_log(gf->log_level,
|
||||
"resampled: %d channels, %d frames, %f ms, current size: %lu bytes",
|
||||
(int)gf->channels, (int)resampled_16khz_frames,
|
||||
(float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f,
|
||||
gf->resampled_buffer.size);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -129,8 +135,10 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
circlebuf_pop_front(&gf->resampled_buffer, vad_input.data(),
|
||||
vad_input.size() * sizeof(float));
|
||||
|
||||
#ifdef LOCALVOCAL_EXTRA_VERBOSE
|
||||
obs_log(gf->log_level, "sending %d frames to vad, %d windows, reset state? %s",
|
||||
vad_input.size(), vad_num_windows, (!last_vad_state.vad_on) ? "yes" : "no");
|
||||
#endif
|
||||
{
|
||||
ProfileScope("vad->process");
|
||||
gf->vad->process(vad_input, !last_vad_state.vad_on);
|
||||
@ -144,7 +152,9 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
|
||||
|
||||
std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
|
||||
if (stamps.size() == 0) {
|
||||
#ifdef LOCALVOCAL_EXTRA_VERBOSE
|
||||
obs_log(gf->log_level, "VAD detected no speech in %u frames", vad_input.size());
|
||||
#endif
|
||||
if (last_vad_state.vad_on) {
|
||||
obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
|
||||
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,
|
||||
|
Loading…
Reference in New Issue
Block a user