Add new Whisper models to models_directory.json and adjust transcription filter properties

This commit is contained in:
Roy Shilkrot 2024-10-30 15:48:32 -04:00
parent 3668195652
commit f478809f79
3 changed files with 122 additions and 2 deletions

View File

@ -435,6 +435,116 @@
}
]
},
{
"friendly_name": "Whisper Base q8 (81Mb)",
"local_folder_name": "ggml-base-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Base English q8 (81Mb)",
"local_folder_name": "ggml-base-en-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Large v2 q8 (1.66Gb)",
"local_folder_name": "ggml-large-v2-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Medium q8 (823Mb)",
"local_folder_name": "ggml-medium-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Large v3 Turbo q8 (874Mb)",
"local_folder_name": "ggml-large-v3-turbo-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Small q8 (264Mb)",
"local_folder_name": "ggml-small-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Medium English q8 (823Mb)",
"local_folder_name": "ggml-medium-en-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Small English q8 (264Mb)",
"local_folder_name": "ggml-small-en-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Tiny q8 (43Mb)",
"local_folder_name": "ggml-tiny-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Tiny English q8 (43Mb)",
"local_folder_name": "ggml-tiny-en-q8_0",
"type": "MODEL_TYPE_TRANSCRIPTION",
"files": [
{
"url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q8_0.bin",
"sha256": ""
}
]
},
{
"friendly_name": "Whisper Base German Awareai (Marksdo)",
"local_folder_name": "ggml-base-awareai.de",
@ -4916,4 +5026,4 @@
}
}
]
}
}

View File

@ -626,7 +626,7 @@ void transcription_filter_defaults(obs_data_t *s)
obs_data_set_default_bool(s, "split_on_word", true);
obs_data_set_default_int(s, "max_tokens", 50);
obs_data_set_default_bool(s, "suppress_blank", false);
obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
obs_data_set_default_double(s, "temperature", 0.1);
obs_data_set_default_double(s, "max_initial_ts", 1.0);
obs_data_set_default_double(s, "length_penalty", -1.0);

View File

@ -24,9 +24,11 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
return 1;
}
#ifdef LOCALVOCAL_EXTRA_VERBOSE
obs_log(gf->log_level,
"segmentation: currently %lu bytes in the audio input buffer",
gf->input_buffers[0].size);
#endif
// max number of frames is 10 seconds worth of audio
const size_t max_num_frames = gf->sample_rate * 10;
@ -76,8 +78,10 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
}
}
#ifdef LOCALVOCAL_EXTRA_VERBOSE
obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos);
gf->last_num_frames = num_frames_from_infos;
#endif
{
// resample to 16kHz
@ -95,11 +99,13 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0],
resampled_16khz_frames * sizeof(float));
#ifdef LOCALVOCAL_EXTRA_VERBOSE
obs_log(gf->log_level,
"resampled: %d channels, %d frames, %f ms, current size: %lu bytes",
(int)gf->channels, (int)resampled_16khz_frames,
(float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f,
gf->resampled_buffer.size);
#endif
}
return 0;
@ -129,8 +135,10 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
circlebuf_pop_front(&gf->resampled_buffer, vad_input.data(),
vad_input.size() * sizeof(float));
#ifdef LOCALVOCAL_EXTRA_VERBOSE
obs_log(gf->log_level, "sending %d frames to vad, %d windows, reset state? %s",
vad_input.size(), vad_num_windows, (!last_vad_state.vad_on) ? "yes" : "no");
#endif
{
ProfileScope("vad->process");
gf->vad->process(vad_input, !last_vad_state.vad_on);
@ -144,7 +152,9 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
if (stamps.size() == 0) {
#ifdef LOCALVOCAL_EXTRA_VERBOSE
obs_log(gf->log_level, "VAD detected no speech in %u frames", vad_input.size());
#endif
if (last_vad_state.vad_on) {
obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,