|
|
|
@ -1,5 +1,6 @@
|
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
import argparse
|
|
|
|
|
import logging
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import statistics
|
|
|
|
@ -11,6 +12,8 @@ _NOISE_SCALE = 0.667
|
|
|
|
|
_LENGTH_SCALE = 1.0
|
|
|
|
|
_NOISE_W = 0.8
|
|
|
|
|
|
|
|
|
|
_LOGGER = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
|
|
|
|
|
parser = argparse.ArgumentParser()
|
|
|
|
@ -19,6 +22,7 @@ def main() -> None:
|
|
|
|
|
)
|
|
|
|
|
parser.add_argument("-c", "--config", help="Path to model config file (.json)")
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
logging.basicConfig(level=logging.DEBUG)
|
|
|
|
|
|
|
|
|
|
if not args.config:
|
|
|
|
|
args.config = f"{args.model}.json"
|
|
|
|
@ -81,10 +85,18 @@ def synthesize(model, phoneme_ids, speaker_id, sample_rate) -> float:
|
|
|
|
|
)
|
|
|
|
|
end_time = time.monotonic_ns()
|
|
|
|
|
|
|
|
|
|
audio_sec = (len(audio) / 2) / sample_rate
|
|
|
|
|
audio_sec = len(audio) / sample_rate
|
|
|
|
|
infer_sec = (end_time - start_time) / 1e9
|
|
|
|
|
rtf = infer_sec / audio_sec
|
|
|
|
|
|
|
|
|
|
_LOGGER.debug(
|
|
|
|
|
"Real-time factor: %s (infer=%s sec, audio=%s sec)",
|
|
|
|
|
rtf,
|
|
|
|
|
infer_sec,
|
|
|
|
|
audio_sec,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return infer_sec / audio_sec
|
|
|
|
|
return rtf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|