add audio hash to audio name to avoid confusion

Author: wl-zhao
Date:   2024-01-06 00:12:29 +08:00
Parent: 1445029808
Commit: 9d4342cafd


@@ -5,19 +5,21 @@ from glob import glob
 import numpy as np
 from pydub import AudioSegment
 from faster_whisper import WhisperModel
+import hashlib
+import base64
+import librosa
 from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, target_dir='processed'):
+def split_audio_whisper(audio_path, target_dir, audio_name):
     global model
     if model is None:
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
     audio = AudioSegment.from_file(audio_path)
     max_len = len(audio)
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
     segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
@@ -69,7 +71,7 @@ def split_audio_whisper(audio_path, target_dir='processed'):
     return wavs_folder
-def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
+def split_audio_vad(audio_path, target_dir, audio_name, split_seconds=10.0):
     SAMPLE_RATE = 16000
     audio_vad = get_audio_tensor(audio_path)
     segments = get_vad_segments(
@@ -90,7 +92,6 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
     audio_dur = audio_active.duration_seconds
     print(f'after vad: dur = {audio_dur}')
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
     target_folder = os.path.join(target_dir, audio_name)
     wavs_folder = os.path.join(target_folder, 'wavs')
     os.makedirs(wavs_folder, exist_ok=True)
@@ -112,13 +113,18 @@ def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
     return wavs_folder
+def hash_numpy_array(array):
+    array_bytes = array.tobytes()
+    hash_object = hashlib.sha256(array_bytes)
+    hash_value = hash_object.digest()
+    base64_value = base64.b64encode(hash_value)
+    return base64_value.decode('utf-8')[:16].replace('/', '&')
 def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     device = vc_model.device
-    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0]
+    audio_hash = hash_numpy_array(librosa.load(audio_path, mono=True)[0])
+    audio_name = os.path.basename(audio_path).rsplit('.', 1)[0] + '_' + audio_hash
     se_path = os.path.join(target_dir, audio_name, 'se.pth')
     if os.path.isfile(se_path):
@@ -127,9 +133,9 @@ def get_se(audio_path, vc_model, target_dir='processed', vad=True):
     if os.path.isdir(audio_path):
         wavs_folder = audio_path
     elif vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir)
+        wavs_folder = split_audio_vad(audio_path, target_dir, audio_name)
     else:
-        wavs_folder = split_audio_whisper(audio_path, target_dir)
+        wavs_folder = split_audio_whisper(audio_path, target_dir, audio_name)
     audio_segs = glob(f'{wavs_folder}/*.wav')
     if len(audio_segs) == 0:
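
For context on the commit message: appending a content hash to audio_name means that two different recordings which happen to share a basename no longer map to the same processed folder and cached se.pth. The sketch below is illustrative only, not part of the commit; it reuses the hash_numpy_array scheme from the diff, with synthetic numpy arrays standing in for the waveform that librosa.load would return (the file name 'demo.wav' and the arrays are made up for this example).

import base64
import hashlib
import os

import numpy as np

def hash_numpy_array(array):
    # Same scheme as in the diff: SHA-256 of the raw samples, base64-encoded,
    # truncated to 16 characters, with '/' replaced to stay filesystem-safe.
    array_bytes = array.tobytes()
    hash_value = hashlib.sha256(array_bytes).digest()
    base64_value = base64.b64encode(hash_value)
    return base64_value.decode('utf-8')[:16].replace('/', '&')

# Two different recordings that were both saved as "demo.wav" (made-up data).
waveform_a = np.linspace(-1.0, 1.0, 16000, dtype=np.float32)
waveform_b = np.zeros(16000, dtype=np.float32)

base_name = os.path.basename('speaker1/demo.wav').rsplit('.', 1)[0]
name_a = base_name + '_' + hash_numpy_array(waveform_a)
name_b = base_name + '_' + hash_numpy_array(waveform_b)

# Before this commit both would resolve to processed/demo/se.pth; now the
# target folders differ, so the cached speaker embeddings cannot collide.
print(os.path.join('processed', name_a, 'se.pth'))
print(os.path.join('processed', name_b, 'se.pth'))
print(name_a != name_b)  # True

The trade-off is that get_se now decodes the whole file with librosa once just to compute the hash, in exchange for cache entries that are keyed by audio content rather than by file name alone.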