Starting on Python inference

1 year ago · b21d815298
parent 0ddfb082be
commit b21d815298
13 changed files with 346 additions and 0 deletions
--- a/src/python/scripts/setup.sh
+++ b/src/python/scripts/setup.sh
@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Re-creates the virtual environment and installs requirements.txt into it.
+set -eo pipefail
+
+# Directory of *this* script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Parent directory of scripts/ (holds requirements.txt and the default .venv)
+base_dir="$(realpath "${this_dir}/..")"
+
+# Path to virtual environment
+: "${venv:=${base_dir}/.venv}"
+
+# Python binary to use (an unset or empty PYTHON falls back to python3)
+: "${PYTHON:=python3}"
+
+# Quoted like the venv call below, so a path with spaces is not word-split
+python_version="$("${PYTHON}" --version)"
+
+# Create virtual environment
+echo "Creating virtual environment at ${venv} (${python_version})"
+rm -rf "${venv}"
+"${PYTHON}" -m venv "${venv}"
+source "${venv}/bin/activate"
+
+# Install Python dependencies
+echo 'Installing Python dependencies'
+pip3 install --upgrade pip
+pip3 install --upgrade wheel setuptools
+pip3 install -r "${base_dir}/requirements.txt"
+
+# -----------------------------------------------------------------------------
+echo "OK"
--- a/src/python_run/.isort.cfg
+++ b/src/python_run/.isort.cfg
@ -0,0 +1,6 @@
+[settings]
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
--- a/src/python_run/larynx/__init__.py
+++ b/src/python_run/larynx/__init__.py
@ -0,0 +1,134 @@
+import io
+import json
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Mapping, Optional, Sequence, Union
+
+import numpy as np
+import onnxruntime
+from espeak_phonemizer import Phonemizer
+
+_BOS = "^"
+_EOS = "$"
+_PAD = "_"
+
+
+@dataclass
+class LarynxConfig:
+    num_symbols: int  # "num_symbols" in the voice's JSON config
+    num_speakers: int  # "num_speakers" in the voice's JSON config
+    sample_rate: int  # "audio" -> "sample_rate"; used as the WAV frame rate
+    espeak_voice: str  # "espeak" -> "voice"; handed to the espeak Phonemizer
+    length_scale: float  # "inference" -> "length_scale" (1.0 if absent)
+    noise_scale: float  # "inference" -> "noise_scale" (0.667 if absent)
+    noise_w: float  # "inference" -> "noise_w" (0.8 if absent)
+    phoneme_id_map: Mapping[str, Sequence[int]]  # phoneme character -> model id(s)
+
+
+class Larynx:  # Text-to-speech voice: espeak phonemizer feeding an Onnx model
+    def __init__(
+        self,
+        model_path: Union[str, Path],
+        config_path: Optional[Union[str, Path]] = None,
+        use_cuda: bool = False,
+    ):
+        if config_path is None:
+            config_path = f"{model_path}.json"  # default: "<model_path>.json"
+
+        self.config = load_config(config_path)
+        self.phonemizer = Phonemizer(self.config.espeak_voice)
+        self.model = onnxruntime.InferenceSession(
+            str(model_path),
+            sess_options=onnxruntime.SessionOptions(),
+            providers=None if not use_cuda else ["CUDAExecutionProvider"],
+        )
+
+    def synthesize(
+        self,
+        text: str,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+    ) -> bytes:
+        """Synthesize WAV audio from text; None scales fall back to the config."""
+        if length_scale is None:
+            length_scale = self.config.length_scale
+
+        if noise_scale is None:
+            noise_scale = self.config.noise_scale
+
+        if noise_w is None:
+            noise_w = self.config.noise_w
+
+        phonemes_str = self.phonemizer.phonemize(text)
+        phonemes = [_BOS] + list(phonemes_str)  # BOS, then one symbol per character
+        phoneme_ids: List[int] = []
+
+        for phoneme in phonemes:  # a symbol missing from the map raises KeyError
+            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
+            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])  # pad after each one
+
+        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])  # EOS closes the sequence
+
+        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [noise_scale, length_scale, noise_w],  # order used for "scales"
+            dtype=np.float32,
+        )
+        sid = None  # stays None (and is fed as such) when no speaker_id is given
+
+        if speaker_id is not None:
+            sid = np.array([speaker_id], dtype=np.int64)
+
+        # Run the Onnx model on a batch of one: ids are (1, N), lengths are [N]
+        audio = self.model.run(
+            None,
+            {
+                "input": phoneme_ids_array,
+                "input_lengths": phoneme_ids_lengths,
+                "scales": scales,
+                "sid": sid,
+            },
+        )[0].squeeze((0, 1))  # first output, with axes 0 and 1 (each size 1) removed
+        audio = audio_float_to_int16(audio.squeeze())
+
+        # Convert to WAV (mono, 16-bit) entirely in memory
+        with io.BytesIO() as wav_io:
+            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
+            with wav_file:
+                wav_file.setframerate(self.config.sample_rate)
+                wav_file.setsampwidth(2)
+                wav_file.setnchannels(1)
+                wav_file.writeframes(audio.tobytes())
+
+            return wav_io.getvalue()
+
+
+def load_config(config_path: Union[str, Path]) -> LarynxConfig:
+    """Read a voice's JSON config file into a LarynxConfig."""
+    with open(config_path, "r", encoding="utf-8") as config_file:
+        raw = json.load(config_file)
+    scales = raw.get("inference", {})  # optional section; defaults apply below
+    return LarynxConfig(
+        num_symbols=raw["num_symbols"],
+        num_speakers=raw["num_speakers"],
+        sample_rate=raw["audio"]["sample_rate"],
+        espeak_voice=raw["espeak"]["voice"],
+        length_scale=scales.get("length_scale", 1.0),
+        noise_scale=scales.get("noise_scale", 0.667),
+        noise_w=scales.get("noise_w", 0.8),
+        phoneme_id_map=raw["phoneme_id_map"],
+    )
+
+
+def audio_float_to_int16(
+    audio: np.ndarray, max_wav_value: float = 32767.0
+) -> np.ndarray:
+    """Peak-normalize float audio and convert it to int16 (empty input allowed)."""
+    peak = np.max(np.abs(audio)) if audio.size else 0.0  # np.max rejects empty input
+    audio_norm = audio * (max_wav_value / max(0.01, peak))  # 0.01 floor: no div by 0
+    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+    return audio_norm.astype("int16")
--- a/src/python_run/larynx/__main__.py
+++ b/src/python_run/larynx/__main__.py
@ -0,0 +1,19 @@
+import argparse
+import sys
+
+from . import Larynx
+
+
+def main() -> None:
+    """Write WAV audio for the text on stdin to stdout (--model is required)."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
+    parser.add_argument("--cuda", action="store_true", help="Use GPU")
+    args = parser.parse_args()
+
+    voice = Larynx(args.model, use_cuda=args.cuda)
+    sys.stdout.buffer.write(voice.synthesize(sys.stdin.read()))
+
+
+if __name__ == "__main__":
+    main()
--- a/src/python_run/mypy.ini
+++ b/src/python_run/mypy.ini
@ -0,0 +1,4 @@
+[mypy]
+
+[mypy-onnxruntime.*]
+ignore_missing_imports = True
--- a/src/python_run/py.typed
+++ b/src/python_run/py.typed
--- a/src/python_run/pylintrc
+++ b/src/python_run/pylintrc
@ -0,0 +1,40 @@
+[MESSAGES CONTROL]
+disable=
+  format,
+  abstract-class-little-used,
+  abstract-method,
+  cyclic-import,
+  duplicate-code,
+  global-statement,
+  import-outside-toplevel,
+  inconsistent-return-statements,
+  locally-disabled,
+  not-context-manager,
+  redefined-variable-type,
+  too-few-public-methods,
+  too-many-arguments,
+  too-many-branches,
+  too-many-instance-attributes,
+  too-many-lines,
+  too-many-locals,
+  too-many-public-methods,
+  too-many-return-statements,
+  too-many-statements,
+  too-many-boolean-expressions,
+  unnecessary-pass,
+  unused-argument,
+  broad-except,
+  too-many-nested-blocks,
+  invalid-name,
+  unused-import,
+  no-self-use,
+  fixme,
+  useless-super-delegation,
+  missing-module-docstring,
+  missing-class-docstring,
+  missing-function-docstring,
+  import-error,
+  relative-beyond-top-level
+
+[FORMAT]
+expected-line-ending-format=LF
--- a/src/python_run/requirements.txt
+++ b/src/python_run/requirements.txt
@ -0,0 +1,2 @@
+espeak-phonemizer>=1.1.0,<2
+onnxruntime~=1.11.0
--- a/src/python_run/requirements_dev.txt
+++ b/src/python_run/requirements_dev.txt
@ -0,0 +1,7 @@
+black==22.3.0
+coverage==5.0.4
+flake8==3.7.9
+mypy==0.910
+pylint==2.10.2
+pytest==5.4.1
+pytest-cov==2.8.1
--- a/src/python_run/scripts/check.sh
+++ b/src/python_run/scripts/check.sh
@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Formats the larynx package in place, then lints and type-checks it.
+
+set -eo pipefail
+
+# Resolve paths relative to this script rather than the caller's directory
+script_dir="$(cd "$(dirname "$0")" && pwd)"
+root_dir="$(realpath "${script_dir}/..")"
+
+# Virtual environment location (override by setting $venv)
+: "${venv:=${root_dir}/.venv}"
+
+# Activate the virtual environment when it exists
+if [ -d "${venv}" ]; then
+    source "${venv}/bin/activate"
+fi
+
+targets=("${root_dir}/larynx")
+
+# Formatters rewrite files in place
+for formatter in black isort; do
+    "${formatter}" "${targets[@]}"
+done
+
+# Checkers only report; the first failure aborts the script (set -e)
+for checker in flake8 pylint mypy; do
+    "${checker}" "${targets[@]}"
+done
--- a/src/python_run/scripts/larynx
+++ b/src/python_run/scripts/larynx
@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Runs "python3 -m larynx" with this checkout's package and virtual environment.
+set -eo pipefail
+
+# Directory of *this* script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+base_dir="$(realpath "${this_dir}/..")"
+
+# Path to virtual environment
+: "${venv:=${base_dir}/.venv}"
+if [ -d "${venv}" ]; then
+    # Activate virtual environment if available
+    source "${venv}/bin/activate"
+fi
+
+# Put base_dir on the module path so the script works from any directory
+PYTHONPATH="${base_dir}${PYTHONPATH:+:${PYTHONPATH}}" python3 -m larynx "$@"
--- a/src/python_run/scripts/setup.sh
+++ b/src/python_run/scripts/setup.sh
@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Re-creates the virtual environment and installs requirements.txt into it.
+set -eo pipefail
+
+# Directory of *this* script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Parent directory of scripts/ (holds requirements.txt and the default .venv)
+base_dir="$(realpath "${this_dir}/..")"
+
+# Path to virtual environment
+: "${venv:=${base_dir}/.venv}"
+
+# Python binary to use (an unset or empty PYTHON falls back to python3)
+: "${PYTHON:=python3}"
+
+# Quoted like the venv call below, so a path with spaces is not word-split
+python_version="$("${PYTHON}" --version)"
+
+# Create virtual environment
+echo "Creating virtual environment at ${venv} (${python_version})"
+rm -rf "${venv}"
+"${PYTHON}" -m venv "${venv}"
+source "${venv}/bin/activate"
+
+# Install Python dependencies
+echo 'Installing Python dependencies'
+pip3 install --upgrade pip
+pip3 install --upgrade wheel setuptools
+pip3 install -r "${base_dir}/requirements.txt"
+
+# -----------------------------------------------------------------------------
+echo "OK"
--- a/src/python_run/setup.cfg
+++ b/src/python_run/setup.cfg
@ -0,0 +1,22 @@
+[flake8]
+# To work with Black
+max-line-length = 88
+# E501: line too long
+# W503: Line break occurred before a binary operator
+# E203: Whitespace before ':'
+# D202 No blank lines allowed after function docstring
+# W504 line break after binary operator
+ignore =
+    E501,
+    W503,
+    E203,
+    D202,
+    W504
+
+[isort]
+multi_line_output = 3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
+indent = "    "