diff --git a/data/models/models_directory.json b/data/models/models_directory.json
index e249ce8..5b90905 100644
--- a/data/models/models_directory.json
+++ b/data/models/models_directory.json
@@ -435,6 +435,116 @@
         }
       ]
     },
+    {
+      "friendly_name": "Whisper Base q8 (81Mb)",
+      "local_folder_name": "ggml-base-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Base English q8 (81Mb)",
+      "local_folder_name": "ggml-base-en-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Large v2 q8 (1.66Gb)",
+      "local_folder_name": "ggml-large-v2-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Medium q8 (823Mb)",
+      "local_folder_name": "ggml-medium-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Large v3 Turbo q8 (874Mb)",
+      "local_folder_name": "ggml-large-v3-turbo-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Small q8 (264Mb)",
+      "local_folder_name": "ggml-small-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Medium English q8 (823Mb)",
+      "local_folder_name": "ggml-medium-en-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Small English q8 (264Mb)",
+      "local_folder_name": "ggml-small-en-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Tiny q8 (43Mb)",
+      "local_folder_name": "ggml-tiny-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
+    {
+      "friendly_name": "Whisper Tiny English q8 (43Mb)",
+      "local_folder_name": "ggml-tiny-en-q8_0",
+      "type": "MODEL_TYPE_TRANSCRIPTION",
+      "files": [
+        {
+          "url": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q8_0.bin",
+          "sha256": ""
+        }
+      ]
+    },
     {
       "friendly_name": "Whisper Base German Awareai (Marksdo)",
       "local_folder_name": "ggml-base-awareai.de",
@@ -4916,4 +5026,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index 0f96e30..0adda33 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -626,7 +626,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_bool(s, "split_on_word", true);
 	obs_data_set_default_int(s, "max_tokens", 50);
 	obs_data_set_default_bool(s, "suppress_blank", false);
-	obs_data_set_default_bool(s, "suppress_non_speech_tokens", true);
+	obs_data_set_default_bool(s, "suppress_non_speech_tokens", false);
 	obs_data_set_default_double(s, "temperature", 0.1);
 	obs_data_set_default_double(s, "max_initial_ts", 1.0);
 	obs_data_set_default_double(s, "length_penalty", -1.0);
diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp
index 0e9c744..39ad715 100644
--- a/src/whisper-utils/vad-processing.cpp
+++ b/src/whisper-utils/vad-processing.cpp
@@ -24,9 +24,11 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 			return 1;
 		}
 
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 		obs_log(gf->log_level,
 			"segmentation: currently %lu bytes in the audio input buffer",
 			gf->input_buffers[0].size);
+#endif
 
 		// max number of frames is 10 seconds worth of audio
 		const size_t max_num_frames = gf->sample_rate * 10;
@@ -76,8 +78,10 @@
 		}
 	}
 
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 	obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos);
+#endif
 	gf->last_num_frames = num_frames_from_infos;
 
 	{
 		// resample to 16kHz
@@ -95,11 +99,13 @@
 		circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0],
 				    resampled_16khz_frames * sizeof(float));
 
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 		obs_log(gf->log_level,
 			"resampled: %d channels, %d frames, %f ms, current size: %lu bytes",
 			(int)gf->channels, (int)resampled_16khz_frames,
 			(float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f,
 			gf->resampled_buffer.size);
+#endif
 	}
 
 	return 0;
@@ -129,8 +135,10 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v
 	circlebuf_pop_front(&gf->resampled_buffer, vad_input.data(),
 			    vad_input.size() * sizeof(float));
 
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 	obs_log(gf->log_level, "sending %d frames to vad, %d windows, reset state? %s",
 		vad_input.size(), vad_num_windows, (!last_vad_state.vad_on) ? "yes" : "no");
+#endif
 	{
 		ProfileScope("vad->process");
 		gf->vad->process(vad_input, !last_vad_state.vad_on);
@@ -144,7 +152,9 @@
 
 	std::vector<timestamp_t> stamps = gf->vad->get_speech_timestamps();
 	if (stamps.size() == 0) {
+#ifdef LOCALVOCAL_EXTRA_VERBOSE
 		obs_log(gf->log_level, "VAD detected no speech in %u frames", vad_input.size());
+#endif
 		if (last_vad_state.vad_on) {
 			obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference");
 			run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms,