diff --git a/se_extractor.py b/se_extractor.py
index 3ea2b9d..b916406 100644
--- a/se_extractor.py
+++ b/se_extractor.py
@@ -5,19 +5,21 @@ from glob import glob
 import numpy as np
 from pydub import AudioSegment
 from faster_whisper import WhisperModel
+import hashlib
+import base64
+import librosa
 from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, target_dir='processed'):
+def split_audio_whisper(audio_path, target_dir, audio_name):
     global model
     if model is None:
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
     audio = AudioSegment.from_file(audio_path)
     max_len = len(audio)
 
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
 
     segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
@@ -69,7 +71,7 @@ def split_audio_whisper(audio_path, target_dir='processed'):
     return wavs_folder
 
 
-def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
+def split_audio_vad(audio_path, target_dir, audio_name, split_seconds=10.0):
     SAMPLE_RATE = 16000
     audio_vad = get_audio_tensor(audio_path)
     segments = get_vad_segments(
@@ -90,7 +92,6 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
     audio_dur = audio_active.duration_seconds
     print(f'after vad: dur = {audio_dur}')
 
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
     wavs_folder = os.path.join(target_folder, 'wavs')
     os.makedirs(wavs_folder, exist_ok=True)
@@ -112,13 +113,18 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
     return wavs_folder
 
 
-
+def hash_numpy_array(array):
+    array_bytes = array.tobytes()
+    hash_object = hashlib.sha256(array_bytes)
+    hash_value = hash_object.digest()
+    base64_value = base64.b64encode(hash_value)
+    return base64_value.decode('utf-8')[:16].replace('/', '&')
 
 
 def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     device = vc_model.device
-
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
+    audio_hash = hash_numpy_array(librosa.load(audio_path, mono=True)[0])
+    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0] + '_' + audio_hash
     se_path = os.path.join(target_dir, audio_name, 'se.pth')
 
     if os.path.isfile(se_path):
@@ -127,9 +133,9 @@ def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     if os.path.isdir(audio_path):
         wavs_folder = audio_path
     elif vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir)
+        wavs_folder = split_audio_vad(audio_path, target_dir, audio_name)
     else:
-        wavs_folder = split_audio_whisper(audio_path, target_dir)
+        wavs_folder = split_audio_whisper(audio_path, target_dir, audio_name)
 
     audio_segs = glob(f'{wavs_folder}/*.wav')
     if len(audio_segs) == 0:
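
For context, here is a minimal standalone sketch of how the cache key is derived once this patch is applied. It only mirrors the `hash_numpy_array` helper and the `audio_name` construction from the diff above; `example.wav`, the hard-coded `'processed'` directory, and the printed path are placeholder assumptions, not part of the patch.

```python
# Sketch only: reproduces the cache-key derivation introduced in this patch.
import os
import hashlib
import base64
import librosa

def hash_numpy_array(array):
    # Same scheme as the patch: SHA-256 over the raw sample bytes,
    # base64-encoded, truncated to 16 characters, '/' made filesystem-safe.
    array_bytes = array.tobytes()
    hash_value = hashlib.sha256(array_bytes).digest()
    return base64.b64encode(hash_value).decode('utf-8')[:16].replace('/', '&')

audio_path = 'example.wav'  # placeholder input file
samples, _ = librosa.load(audio_path, mono=True)  # same load call as in get_se()
audio_name = os.path.basename(audio_path).rsplit('.', 1)[0] + '_' + hash_numpy_array(samples)
se_path = os.path.join('processed', audio_name, 'se.pth')  # default target_dir='processed'
print(se_path)  # processed/example_<16-char hash>/se.pth
```

The practical effect is that the cached `se.pth` folder is keyed by the audio content as well as the file name, so two different recordings that happen to share a basename no longer collide in the `processed/` cache.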