|
|
|
@ -5,7 +5,7 @@
|
|
|
|
|
"colab": {
|
|
|
|
|
"provenance": [],
|
|
|
|
|
"gpuType": "T4",
|
|
|
|
|
"authorship_tag": "ABX9TyMcevzeVyewWF1ZHKzBu3CB",
|
|
|
|
|
"authorship_tag": "ABX9TyNju0yzRK8wgAS+WgyeTEAl",
|
|
|
|
|
"include_colab_link": true
|
|
|
|
|
},
|
|
|
|
|
"kernelspec": {
|
|
|
|
@ -88,12 +88,13 @@
|
|
|
|
|
" playaudio(\"installing\")\n",
|
|
|
|
|
"!git clone -q https://github.com/rmcpantoja/piper\n",
|
|
|
|
|
"%cd /content/piper/src/python\n",
|
|
|
|
|
"!pip install -q -r requirements.txt\n",
|
|
|
|
|
"#!pip install -q -r requirements.txt\n",
|
|
|
|
|
"!pip install -q cython>=0.29.0 piper-phonemize==1.1.0 librosa>=0.9.2 numpy>=1.19.0 onnxruntime>=1.11.0 pytorch-lightning==1.7.0 torch==1.11.0\n",
|
|
|
|
|
"!pip install -q torchtext==0.12.0 torchvision==0.12.0\n",
|
|
|
|
|
"#!pip install -q torchtext==0.14.1 torchvision==0.14.1\n",
|
|
|
|
|
"# fixing recent compatibility issues:\n",
|
|
|
|
|
"!pip install -q torchaudio==0.11.0 torchmetrics==0.11.4\n",
|
|
|
|
|
"!bash build_monotonic_align.sh\n",
|
|
|
|
|
"!apt-get install -q espeak-ng\n",
|
|
|
|
|
"import os\n",
|
|
|
|
|
"if not os.path.exists(\"/content/piper/src/python/lng\"):\n",
|
|
|
|
|
" !cp -r \"/content/piper/notebooks/lng\" /content/piper/src/python/lng\n",
|
|
|
|
@ -190,6 +191,8 @@
|
|
|
|
|
"import logging\n",
|
|
|
|
|
"import sys\n",
|
|
|
|
|
"from pathlib import Path\n",
|
|
|
|
|
"from enum import Enum\n",
|
|
|
|
|
"from typing import Iterable, List, Optional, Union\n",
|
|
|
|
|
"import torch\n",
|
|
|
|
|
"from piper_train.vits.lightning import VitsModel\n",
|
|
|
|
|
"from piper_train.vits.utils import audio_float_to_int16\n",
|
|
|
|
@ -198,8 +201,7 @@
|
|
|
|
|
"import glob\n",
|
|
|
|
|
"import ipywidgets as widgets\n",
|
|
|
|
|
"from IPython.display import display, Audio, Markdown, clear_output\n",
|
|
|
|
|
"from espeak_phonemizer import Phonemizer\n",
|
|
|
|
|
"from piper_train import phonemize\n",
|
|
|
|
|
"from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"_LOGGER = logging.getLogger(\"piper_train.infer_onnx\")\n",
|
|
|
|
|
"\n",
|
|
|
|
@ -382,35 +384,69 @@
|
|
|
|
|
" config = json.load(file)\n",
|
|
|
|
|
" return config\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"PAD = \"_\" # padding (0)\n",
|
|
|
|
|
"BOS = \"^\" # beginning of sentence\n",
|
|
|
|
|
"EOS = \"$\" # end of sentence\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"class PhonemeType(str, Enum):\n",
|
|
|
|
|
" ESPEAK = \"espeak\"\n",
|
|
|
|
|
" TEXT = \"text\"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def phonemize(config, text: str) -> List[List[str]]:\n",
|
|
|
|
|
" \"\"\"Text to phonemes grouped by sentence.\"\"\"\n",
|
|
|
|
|
" if config[\"phoneme_type\"] == PhonemeType.ESPEAK:\n",
|
|
|
|
|
" if config[\"espeak\"][\"voice\"] == \"ar\":\n",
|
|
|
|
|
" # Arabic diacritization\n",
|
|
|
|
|
" # https://github.com/mush42/libtashkeel/\n",
|
|
|
|
|
" text = tashkeel_run(text)\n",
|
|
|
|
|
" return phonemize_espeak(text, config[\"espeak\"][\"voice\"])\n",
|
|
|
|
|
" if config[\"phoneme_type\"] == PhonemeType.TEXT:\n",
|
|
|
|
|
" return phonemize_codepoints(text)\n",
|
|
|
|
|
" raise ValueError(f\"Unexpected phoneme type: {self.config.phoneme_type}\")\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def phonemes_to_ids(config, phonemes: List[str]) -> List[int]:\n",
|
|
|
|
|
" \"\"\"Phonemes to ids.\"\"\"\n",
|
|
|
|
|
" id_map = config[\"phoneme_id_map\"]\n",
|
|
|
|
|
" ids: List[int] = list(id_map[BOS])\n",
|
|
|
|
|
" for phoneme in phonemes:\n",
|
|
|
|
|
" if phoneme not in id_map:\n",
|
|
|
|
|
" print(\"Missing phoneme from id map: %s\", phoneme)\n",
|
|
|
|
|
" continue\n",
|
|
|
|
|
" ids.extend(id_map[phoneme])\n",
|
|
|
|
|
" ids.extend(id_map[PAD])\n",
|
|
|
|
|
" ids.extend(id_map[EOS])\n",
|
|
|
|
|
" return ids\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def inferencing(model, config, sid, line, length_scale = 1, noise_scale = 0.667, noise_scale_w = 0.8, auto_play=True):\n",
|
|
|
|
|
" espeak_voice = config[\"espeak\"][\"voice\"]\n",
|
|
|
|
|
" phonemizer = Phonemizer(default_voice=espeak_voice)\n",
|
|
|
|
|
" phonemes = phonemize.phonemize(line, phonemizer)\n",
|
|
|
|
|
" ids = phonemize.phonemes_to_ids(phonemes)\n",
|
|
|
|
|
" phoneme_ids = ids\n",
|
|
|
|
|
" num_speakers = config[\"num_speakers\"]\n",
|
|
|
|
|
" if num_speakers == 1:\n",
|
|
|
|
|
" speaker_id = None # for now\n",
|
|
|
|
|
" else:\n",
|
|
|
|
|
" speaker_id = sid\n",
|
|
|
|
|
" text = torch.LongTensor(phoneme_ids).unsqueeze(0)\n",
|
|
|
|
|
" text_lengths = torch.LongTensor([len(phoneme_ids)])\n",
|
|
|
|
|
" scales = [\n",
|
|
|
|
|
" noise_scale,\n",
|
|
|
|
|
" length_scale,\n",
|
|
|
|
|
" noise_scale_w\n",
|
|
|
|
|
" ]\n",
|
|
|
|
|
" sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None\n",
|
|
|
|
|
" audio = model(\n",
|
|
|
|
|
" text,\n",
|
|
|
|
|
" text_lengths,\n",
|
|
|
|
|
" scales,\n",
|
|
|
|
|
" sid=sid\n",
|
|
|
|
|
" ).detach().numpy()\n",
|
|
|
|
|
" audio = audio_float_to_int16(audio.squeeze())\n",
|
|
|
|
|
" audios = []\n",
|
|
|
|
|
" text = phonemize(config, line)\n",
|
|
|
|
|
" for phonemes in text:\n",
|
|
|
|
|
" phoneme_ids = phonemes_to_ids(config, phonemes)\n",
|
|
|
|
|
" num_speakers = config[\"num_speakers\"]\n",
|
|
|
|
|
" if num_speakers == 1:\n",
|
|
|
|
|
" speaker_id = None # for now\n",
|
|
|
|
|
" else:\n",
|
|
|
|
|
" speaker_id = sid\n",
|
|
|
|
|
" text = torch.LongTensor(phoneme_ids).unsqueeze(0)\n",
|
|
|
|
|
" text_lengths = torch.LongTensor([len(phoneme_ids)])\n",
|
|
|
|
|
" scales = [\n",
|
|
|
|
|
" noise_scale,\n",
|
|
|
|
|
" length_scale,\n",
|
|
|
|
|
" noise_scale_w\n",
|
|
|
|
|
" ]\n",
|
|
|
|
|
" sid = torch.LongTensor([speaker_id]) if speaker_id is not None else None\n",
|
|
|
|
|
" audio = model(\n",
|
|
|
|
|
" text,\n",
|
|
|
|
|
" text_lengths,\n",
|
|
|
|
|
" scales,\n",
|
|
|
|
|
" sid=sid\n",
|
|
|
|
|
" ).detach().numpy()\n",
|
|
|
|
|
" audio = audio_float_to_int16(audio.squeeze())\n",
|
|
|
|
|
" audios.append(audio)\n",
|
|
|
|
|
" merged_audio = np.concatenate(audios)\n",
|
|
|
|
|
" sample_rate = config[\"audio\"][\"sample_rate\"]\n",
|
|
|
|
|
" display(Markdown(f\"{line}\"))\n",
|
|
|
|
|
" display(Audio(audio, rate=sample_rate, autoplay=auto_play))\n",
|
|
|
|
|
" display(Audio(merged_audio, rate=sample_rate, autoplay=auto_play))\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"def denoise(\n",
|
|
|
|
|
" audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float\n",
|
|
|
|
|