From 974b9dcbbb34aa557c4e51bea3706c80cea7980c Mon Sep 17 00:00:00 2001
From: Shreeram
Date: Tue, 9 Jan 2024 23:17:36 +0530
Subject: [PATCH] minor-fix: removing mps notebook

---
 demo_mps.ipynb | 321 -------------------------------------------------
 1 file changed, 321 deletions(-)
 delete mode 100644 demo_mps.ipynb

diff --git a/demo_mps.ipynb b/demo_mps.ipynb
deleted file mode 100644
index cf1bdeb..0000000
--- a/demo_mps.ipynb
+++ /dev/null
@@ -1,321 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "b6ee1ede",
-   "metadata": {},
-   "source": [
-    "## Voice Style Control Demo"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "b7f043ee",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Importing the dtw module. When using in academic works please cite:\n",
-      "  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.\n",
-      "  J. Stat. Soft., doi:10.18637/jss.v031.i07.\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "import torch\n",
-    "import se_extractor\n",
-    "from api import BaseSpeakerTTS, ToneColorConverter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0edcf17e-49a4-4d24-8f48-87d3cf955395",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# To solve https://github.com/pytorch/pytorch/issues/77764:\n",
-    "# enable CPU fallback for operators missing on the MPS backend.\n",
-    "# Note: a `!set -gx ...` shell command only exports the variable in a\n",
-    "# throwaway subshell, so set it in this process instead (ideally before\n",
-    "# torch is imported).\n",
-    "os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "15116b59",
-   "metadata": {},
-   "source": [
-    "### Initialization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "aacad912",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
-      "missing/unexpected keys: [] []\n",
-      "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
-      "missing/unexpected keys: [] []\n"
-     ]
-    }
-   ],
-   "source": [
-    "ckpt_base = 'checkpoints/base_speakers/EN'\n",
-    "ckpt_converter = 'checkpoints/converter'\n",
-    "device = 'mps'\n",
-    "output_dir = 'outputs'\n",
-    "\n",
-    "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
-    "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
-    "\n",
-    "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
-    "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
-    "\n",
-    "os.makedirs(output_dir, exist_ok=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7f67740c",
-   "metadata": {},
-   "source": [
-    "### Obtain Tone Color Embedding"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f8add279",
-   "metadata": {},
-   "source": [
-    "The `source_se` is the tone color embedding of the base speaker. \n",
-    "It is an average over multiple sentences generated by the base speaker. We provide the precomputed result here, but\n",
-    "readers are free to extract `source_se` themselves."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "63ff6273",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
-   ]
-  },
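-  {
-   "cell_type": "markdown",
-   "id": "c0ffee01",
-   "metadata": {},
-   "source": [
-    "If you would rather extract `source_se` yourself, the next cell is a minimal sketch (not part of the original demo): it synthesizes a few sentences with the base speaker and averages their tone color embeddings via `se_extractor.get_se`, the same call used for the reference speaker later in this notebook. The sentence list and temporary file names are illustrative."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c0ffee02",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Sketch: rebuild source_se by averaging base-speaker embeddings.\n",
-    "# Assumes get_se accepts any audio path, as in the reference-speaker cell below.\n",
-    "sentences = [\n",
-    "    \"This audio is generated by OpenVoice.\",\n",
-    "    \"The quick brown fox jumps over the lazy dog.\",\n",
-    "]\n",
-    "ses = []\n",
-    "for i, sent in enumerate(sentences):\n",
-    "    wav_path = f'{output_dir}/se_tmp_{i}.wav'\n",
-    "    # Generate one sentence with the base speaker...\n",
-    "    base_speaker_tts.tts(sent, wav_path, speaker='default', language='English', speed=1.0)\n",
-    "    # ...and extract its tone color embedding.\n",
-    "    se, _ = se_extractor.get_se(wav_path, tone_color_converter, target_dir='processed', vad=True)\n",
-    "    ses.append(se)\n",
-    "# Average across sentences for a stable base-speaker embedding.\n",
-    "source_se = torch.mean(torch.stack(ses), dim=0)"
-   ]
-  },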
-  {
-   "cell_type": "markdown",
-   "id": "4f71fcc3",
-   "metadata": {},
-   "source": [
-    "The `reference_speaker.mp3` below points to a short audio clip of the reference speaker whose voice we want to clone. We provide an example here. \n",
-    "If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` saves the `targeted_se` under the filename of the audio and **will not automatically overwrite** an existing one."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "55105eae",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[(0.0, 14.5666875)]\n",
-      "after vad: dur = 14.566\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Applications/anaconda3/envs/voiceclone/lib/python3.9/site-packages/torch/functional.py:632: UserWarning: The operator 'aten::_fft_r2c' is not currently supported on the MPS backend and will fall back to run on the CPU. This may have performance implications. (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1670525849783/work/aten/src/ATen/mps/MPSFallback.mm:11.)\n",
-      "  return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]\n"
-     ]
-    }
-   ],
-   "source": [
-    "reference_speaker = 'resources/example_reference.mp3'\n",
-    "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a40284aa",
-   "metadata": {},
-   "source": [
-    "### Inference"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "73dc1259",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      " > Text splitted to sentences.\n",
-      "This audio is generated by OpenVoice.\n",
-      " > ===========================\n",
-      "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
-      " length:45\n",
-      " length:45\n"
-     ]
-    },
-    {
-     "ename": "IndexError",
-     "evalue": "Dimension out of range (expected to be in range of [-3, 2], but got 3)",
-     "output_type": "error",
-     "traceback": [
-      "---------------------------------------------------------------------------",
-      "IndexError                                Traceback (most recent call last)",
-      "Cell In[6], line 6",
-      "      4 text = \"This audio is generated by OpenVoice.\"",
-      "      5 src_path = f'{output_dir}/tmp.wav'",
-      "----> 6 base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)",
-      "File ~/ram/project/python/OpenVoice/api.py:90, in BaseSpeakerTTS.tts(self, text, output_path, speaker, language, speed)",
-      "---> 90 audio = self.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,",
-      "     91                          length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()",
-      "File ~/ram/project/python/OpenVoice/models.py:466, in SynthesizerTrn.infer(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w, sdp_ratio, max_len)",
-      "--> 466 x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)",
-      "File /Applications/anaconda3/envs/voiceclone/lib/python3.9/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)",
-      "-> 1194 return forward_call(*input, **kwargs)",
-      "File ~/ram/project/python/OpenVoice/models.py:53, in TextEncoder.forward(self, x, x_lengths)",
-      "---> 53 x = self.encoder(x * x_mask, x_mask)",
-      "File /Applications/anaconda3/envs/voiceclone/lib/python3.9/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)",
-      "-> 1194 return forward_call(*input, **kwargs)",
-      "File ~/ram/project/python/OpenVoice/attentions.py:113, in Encoder.forward(self, x, x_mask, g)",
-      "--> 113 y = self.attn_layers[i](x, x, attn_mask)",
-      "File /Applications/anaconda3/envs/voiceclone/lib/python3.9/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)",
-      "-> 1194 return forward_call(*input, **kwargs)",
-      "File ~/ram/project/python/OpenVoice/attentions.py:269, in MultiHeadAttention.forward(self, x, c, attn_mask)",
-      "--> 269 x, self.attn = self.attention(q, k, v, mask=attn_mask)",
-      "File ~/ram/project/python/OpenVoice/attentions.py:286, in MultiHeadAttention.attention(self, query, key, value, mask)",
-      "--> 286 key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)",
-      "File ~/ram/project/python/OpenVoice/attentions.py:350, in MultiHeadAttention._get_relative_embeddings(self, relative_embeddings, length)",
-      "--> 350 padded_relative_embeddings = F.pad(",
-      "    351     relative_embeddings,",
-      "    352     commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),",
-      "    353 )",
-      "IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)"
-     ]
-    }
-   ],
-   "source": [
-    "save_path = f'{output_dir}/output_en_default.wav'\n",
-    "\n",
-    "# Run the base speaker tts\n",
-    "text = \"This audio is generated by OpenVoice.\"\n",
-    "src_path = f'{output_dir}/tmp.wav'\n",
-    "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
-    "\n",
-    "# Run the tone color converter\n",
-    "encode_message = \"@MyShell\"\n",
-    "tone_color_converter.convert(\n",
-    "    audio_src_path=src_path,\n",
-    "    src_se=source_se,\n",
-    "    tgt_se=target_se,\n",
-    "    output_path=save_path,\n",
-    "    message=encode_message)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6e3ea28a",
-   "metadata": {},
-   "source": [
-    "**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding needs to be updated when the style changes. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fd022d38",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
-    "save_path = f'{output_dir}/output_whispering.wav'\n",
-    "\n",
-    "# Run the base speaker tts\n",
-    "text = \"This audio is generated by OpenVoice with a half-performance model.\"\n",
-    "src_path = f'{output_dir}/tmp.wav'\n",
-    "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
-    "\n",
-    "# Run the tone color converter\n",
-    "encode_message = \"@MyShell\"\n",
-    "tone_color_converter.convert(\n",
-    "    audio_src_path=src_path,\n",
-    "    src_se=source_se,\n",
-    "    tgt_se=target_se,\n",
-    "    output_path=save_path,\n",
-    "    message=encode_message)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5fcfc70b",
-   "metadata": {},
-   "source": [
-    "**Try with different languages.** OpenVoice achieves multilingual voice cloning by simply replacing the base speaker. We provide an example with a Chinese base speaker here, and we encourage readers to try `demo_part2.ipynb` for a more detailed demo."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a71d1387",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
-    "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
-    "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
-    "\n",
-    "source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
-    "save_path = f'{output_dir}/output_chinese.wav'\n",
-    "\n",
-    "# Run the base speaker tts\n",
-    "# (\"The weather is so nice today, let's go out for a meal together.\")\n",
-    "text = \"今天天气真好,我们一起出去吃饭吧。\"\n",
-    "src_path = f'{output_dir}/tmp.wav'\n",
-    "base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
-    "\n",
-    "# Run the tone color converter\n",
-    "encode_message = \"@MyShell\"\n",
-    "tone_color_converter.convert(\n",
-    "    audio_src_path=src_path,\n",
-    "    src_se=source_se,\n",
-    "    tgt_se=target_se,\n",
-    "    output_path=save_path,\n",
-    "    message=encode_message)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8e513094",
-   "metadata": {},
-   "source": [
-    "**Tech for good.** For people who deploy OpenVoice for public use: we offer the option to add a watermark to deter potential misuse. Please see the `ToneColorConverter` class. **MyShell reserves the ability to detect whether an audio clip was generated by OpenVoice**, whether or not the watermark is added."
-   ]
-  }
- ],
- "metadata": {
-  "interpreter": {
-   "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
-  },
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.18"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}