@@ -30,9 +30,7 @@ const float MAX_WAV_VALUE = 32767.0f;
 
 const std::string instanceName{"piper"};
 
-std::string getVersion() {
-  return VERSION;
-}
+std::string getVersion() { return VERSION; }
 
 // True if the string is a single UTF-8 codepoint
 bool isSingleCodepoint(std::string s) {
@@ -458,30 +456,90 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
                   sentencePhonemes.size(), phonemesStr);
     }
 
-    SynthesisResult sentenceResult;
+    std::vector<std::shared_ptr<std::vector<Phoneme>>> phrasePhonemes;
+    std::vector<SynthesisResult> phraseResults;
+    std::vector<size_t> phraseSilenceSamples;
 
     // Use phoneme/id map from config
     PhonemeIdConfig idConfig;
     idConfig.phonemeIdMap =
         std::make_shared<PhonemeIdMap>(voice.phonemizeConfig.phonemeIdMap);
 
-    // phonemes -> ids
-    phonemes_to_ids(sentencePhonemes, idConfig, phonemeIds, missingPhonemes);
-    if (spdlog::should_log(spdlog::level::debug)) {
-      // DEBUG log for phoneme ids
-      std::stringstream phonemeIdsStr;
-      for (auto phonemeId : phonemeIds) {
-        phonemeIdsStr << phonemeId << ", ";
-      }
-
-      spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
-                    sentencePhonemes.size(), phonemeIds.size(),
-                    phonemeIdsStr.str());
-    }
-
-    // ids -> audio
-    synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
-               sentenceResult);
+    if (voice.synthesisConfig.phonemeSilenceSeconds) {
+      // Split into phrases
+      std::map<Phoneme, float> &phonemeSilenceSeconds =
+          *voice.synthesisConfig.phonemeSilenceSeconds;
+
+      auto currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+      phrasePhonemes.push_back(currentPhrasePhonemes);
+
+      for (auto sentencePhonemesIter = sentencePhonemes.begin();
+           sentencePhonemesIter != sentencePhonemes.end();
+           sentencePhonemesIter++) {
+        Phoneme &currentPhoneme = *sentencePhonemesIter;
+        currentPhrasePhonemes->push_back(currentPhoneme);
+
+        if (phonemeSilenceSeconds.count(currentPhoneme) > 0) {
+          // Split at phrase boundary
+          phraseSilenceSamples.push_back(
+              (std::size_t)(phonemeSilenceSeconds[currentPhoneme] *
+                            voice.synthesisConfig.sampleRate *
+                            voice.synthesisConfig.channels));
+
+          currentPhrasePhonemes = std::make_shared<std::vector<Phoneme>>();
+          phrasePhonemes.push_back(currentPhrasePhonemes);
+        }
+      }
+    } else {
+      // Use all phonemes
+      phrasePhonemes.push_back(
+          std::make_shared<std::vector<Phoneme>>(sentencePhonemes));
+    }
+
+    // Ensure results/samples are the same size
+    while (phraseResults.size() < phrasePhonemes.size()) {
+      phraseResults.emplace_back();
+    }
+
+    while (phraseSilenceSamples.size() < phrasePhonemes.size()) {
+      phraseSilenceSamples.push_back(0);
+    }
+
+    // phonemes -> ids -> audio
+    for (size_t phraseIdx = 0; phraseIdx < phrasePhonemes.size(); phraseIdx++) {
+      if (phrasePhonemes[phraseIdx]->size() <= 0) {
+        continue;
+      }
+
+      // phonemes -> ids
+      phonemes_to_ids(*(phrasePhonemes[phraseIdx]), idConfig, phonemeIds,
+                      missingPhonemes);
+      if (spdlog::should_log(spdlog::level::debug)) {
+        // DEBUG log for phoneme ids
+        std::stringstream phonemeIdsStr;
+        for (auto phonemeId : phonemeIds) {
+          phonemeIdsStr << phonemeId << ", ";
+        }
+
+        spdlog::debug("Converted {} phoneme(s) to {} phoneme id(s): {}",
+                      phrasePhonemes[phraseIdx]->size(), phonemeIds.size(),
+                      phonemeIdsStr.str());
+      }
+
+      // ids -> audio
+      synthesize(phonemeIds, voice.synthesisConfig, voice.session, audioBuffer,
+                 phraseResults[phraseIdx]);
+
+      // Add end of phrase silence
+      for (std::size_t i = 0; i < phraseSilenceSamples[phraseIdx]; i++) {
+        audioBuffer.push_back(0);
+      }
+
+      result.audioSeconds += phraseResults[phraseIdx].audioSeconds;
+      result.inferSeconds += phraseResults[phraseIdx].inferSeconds;
+
+      phonemeIds.clear();
+    }
 
     // Add end of sentence silence
     if (sentenceSilenceSamples > 0) {
@@ -496,9 +554,6 @@ void textToAudio(PiperConfig &config, Voice &voice, std::string text,
       audioBuffer.clear();
     }
 
-    result.audioSeconds += sentenceResult.audioSeconds;
-    result.inferSeconds += sentenceResult.inferSeconds;
-
-    phonemeIds.clear();
   }
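
Note: a minimal standalone sketch (not part of the patch) of the phrase-splitting rule the second hunk implements. Phonemes are accumulated into the current phrase; whenever one appears in the silence map, the phrase is cut there and a silence length in samples is recorded for it. The Phoneme alias, the 22050 Hz mono format, and the toy sentence below are assumptions for illustration only.

// Sketch: split a sentence's phonemes into phrases at silence phonemes.
#include <cstddef>
#include <map>
#include <vector>

using Phoneme = char32_t; // assumption for this sketch

int main() {
  const int sampleRate = 22050; // assumed output format: 22.05 kHz mono
  const int channels = 1;

  // 0.25 s of silence after every comma phoneme.
  std::map<Phoneme, float> phonemeSilenceSeconds{{U',', 0.25f}};
  std::vector<Phoneme> sentencePhonemes{U'h', U'i', U',', U'y', U'o'};

  std::vector<std::vector<Phoneme>> phrasePhonemes(1);
  std::vector<std::size_t> phraseSilenceSamples;

  for (Phoneme p : sentencePhonemes) {
    phrasePhonemes.back().push_back(p);
    if (phonemeSilenceSeconds.count(p) > 0) {
      // 0.25 s * 22050 Hz * 1 channel = 5512 zero samples after this phrase
      phraseSilenceSamples.push_back(static_cast<std::size_t>(
          phonemeSilenceSeconds[p] * sampleRate * channels));
      phrasePhonemes.emplace_back(); // start the next phrase
    }
  }

  // phrasePhonemes: {"hi,", "yo"}; phraseSilenceSamples: {5512}
  return 0;
}

In the patch itself, phraseSilenceSamples is then padded with zeros until it matches phrasePhonemes in size, so phrases that do not end in a silence phoneme (like the trailing one here) get no extra silence; the patch also stores each phrase behind a shared_ptr rather than copying the vectors as this sketch does.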