From e8a51569ada3bfd85fc0c93911bd5061ce3b6017 Mon Sep 17 00:00:00 2001 From: Solomon Peachy Date: Fri, 19 Apr 2024 21:53:43 -0400 Subject: voice: Add support for the Piper TTS engine https://github.com/rhasspy/piper High quality, offline, neural-network-based, with good language coverage Note that you have to manually download the piper voice models, and set PIPER_MODEL_DIR appropriately. The configure script will let you choose from the available models and remember your choices. Change-Id: I8eba9fcf78b51b01b89491539aac3e423cc42f16 --- tools/configure | 52 ++++++++++++++++++-- tools/voice.pl | 146 +++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 147 insertions(+), 51 deletions(-) diff --git a/tools/configure b/tools/configure index adccc3e5e4..51a47971fd 100755 --- a/tools/configure +++ b/tools/configure @@ -1159,6 +1159,13 @@ voiceconfig () { DEFAULT_TTS_OPTS=$GTTS_OPTS DEFAULT_CHOICE="g" fi + if [ -n "`findtool piper`" ]; then + PIPER="(p)iper " + PIPER_OPTS="" + DEFAULT_TTS="piper" + DEFAULT_TTS_OPTS=$PIPER_OPTS + DEFAULT_CHOICE="p" + fi if [ -n "`findtool rbspeak`" ]; then RBSPEAK="(O)ther " RBSPEAK_OPTS="" @@ -1167,15 +1174,15 @@ voiceconfig () { DEFAULT_CHOICE="O" fi - if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$RBSPEAK" ] ; then - echo "You need Festival, eSpeak, Mimic, Flite, gtts, or rbspeak in your path, or SAPI available to build voice files" + if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$PIPER" ] && [ "$PIPER" = "$RBSPEAK" ] ; then + echo "You need Festival, eSpeak, Mimic, Flite, piper, gtts, or rbspeak in your path, or SAPI available to build voice files" exit 3 fi if [ "$ARG_TTS" ]; then option=$ARG_TTS else - echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?" + echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}${PIPER}(${DEFAULT_CHOICE})?" option=`input` if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi advopts="$advopts --tts=$option" @@ -1209,6 +1216,10 @@ voiceconfig () { TTS_ENGINE="gtts" TTS_OPTS=$GTTS_OPTS ;; + [Pp]|piper) + TTS_ENGINE="piper" + TTS_OPTS=$PIPER_OPTS + ;; [Oo]|rbspeak) TTS_ENGINE="rbspeak" TTS_OPTS=$RBSPEAK_OPTS @@ -1247,6 +1258,39 @@ voiceconfig () { advopts="$advopts --voice=$CHOICE" echo "Festival voice set to $TTS_FESTIVAL_VOICE" echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm + elif [ "$TTS_ENGINE" = "piper" ]; then + if [ -z "$PIPER_MODEL_DIR" ]; then + echo "Please set PIPER_MODEL_DIR!"; + exit 1 + fi + models=`(cd $PIPER_MODEL_DIR ; ls -1 *onnx)` + for model in $models; do + PIPER_MODEL="$model" # Default + break; + done + if [ "$ARG_VOICE" ]; then + CHOICE=$ARG_VOICE + else + i=1 + for model in $models; do + printf "%3d. %s\n" "$i" "$model" + i=`expr $i + 1` + done + printf "Please select which piper model to use (default is $PIPER_MODEL): " + CHOICE=`input` + fi + i=1 + for model in $models; do + if [ "$i" = "$CHOICE" -o "$model" = "$CHOICE" ]; then + PIPER_MODEL="$model" + break; + fi + i=`expr $i + 1` + done + + TTS_OPTS="$TTS_OPTS --model $PIPER_MODEL_DIR/$PIPER_MODEL" + advopts="$advopts --voice=$PIPER_MODEL" + echo "Piper model set to $PIPER_MODEL" elif [ "$TTS_ENGINE" = "mimic" ]; then voicelist=`mimic -lv | cut -d':' -f2` for voice in $voicelist; do @@ -1268,6 +1312,7 @@ voiceconfig () { for voice in $voicelist; do if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then TTS_MIMIC_VOICE="$voice" + break fi i=`expr $i + 1` done @@ -4756,6 +4801,7 @@ export ANDROID_NDK_PATH=${ANDROID_NDK_PATH} export ANDROID_SDK_PATH=${ANDROID_SDK_PATH} export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION} export TOOLSET=${toolset} +export PIPER_MODEL_DIR=${PIPER_MODEL_DIR} $CCACHE_ARG CONFIGURE_OPTIONS=${cmdline} diff --git a/tools/voice.pl b/tools/voice.pl index 6b3f807854..337407e187 100755 --- a/tools/voice.pl +++ b/tools/voice.pl @@ -42,7 +42,8 @@ Usage: voice.pl [options] [path to dir] Specify which target you want to build voicefile for. Must include any features that target supports. - -f= Use existing voiceids file + -f= + Use existing voiceids file -i= Numeric target id. Needed for voice building. @@ -64,7 +65,8 @@ Usage: voice.pl [options] [path to dir] Options to pass to the TTS engine. Enclose in double quotes if the options include spaces. - -F Force the file to be regenerated even if present + -F + Force the file to be regenerated even if present -v Be verbose @@ -73,57 +75,78 @@ USAGE } my %festival_lang_map = ( - 'english' => 'english', - 'english-us' => 'english', - 'espanol' => 'spanish', - #'finnish' => 'finnish' - #'italiano' => 'italian', - #'czech' => 'czech', - #'welsh' => 'welsh' + 'english' => 'english', + 'english-us' => 'english', + 'espanol' => 'spanish', + #'finnish' => 'finnish' + #'italiano' => 'italian', + #'czech' => 'czech', + #'welsh' => 'welsh' ); my %gtts_lang_map = ( 'english' => '-l en -t co.uk', # Always first, it's the golden master - 'czech' => '-l cs', # not supported - 'dansk' => '-l da', - 'deutsch' => '-l de', - 'english-us' => '-l en -t us', - 'espanol' => '-l es', - 'francais' => '-l fr', - 'greek' => '-l el', - 'magyar' => '-l hu', - 'italiano' => '-l it', - 'nederlands' => '-l nl', - 'norsk' => '-l no', - 'polski' => '-l pl', - 'russian' => '-l ru', - 'slovak' => '-l sk', - 'srpski' => '-l sr', - 'svenska' => '-l sv', - 'turkce' => '-l tr', + 'czech' => '-l cs', + 'dansk' => '-l da', + 'deutsch' => '-l de', + 'english-us' => '-l en -t us', + 'espanol' => '-l es', + 'francais' => '-l fr', + 'greek' => '-l el', + 'magyar' => '-l hu', + 'italiano' => '-l it', + 'nederlands' => '-l nl', + 'norsk' => '-l no', + 'polski' => '-l pl', + 'russian' => '-l ru', + 'slovak' => '-l sk', + 'srpski' => '-l sr', + 'svenska' => '-l sv', + 'turkce' => '-l tr', ); my %espeak_lang_map = ( - 'english' => 'en-gb', # Always first, it's the golden master - 'czech' => 'cs', - 'dansk' => 'da', - 'deutsch' => 'de', - 'english-us' => 'en-us', - 'espanol' => 'es', - 'francais' => 'fr-fr', - 'greek' => 'el', - 'nederlands' => 'nl', - 'magyar' => 'hu', - 'italiano' => 'it', - 'japanese' => 'ja', - 'nederlands' => 'nl', - 'norsk' => 'no', - 'polski' => 'pl', - 'russian' => 'ru', - 'slovak' => 'sk', - 'srpski' => 'sr', - 'svenska' => 'sv', - 'turkce' => 'tr', + 'english' => '-ven-gb -k 5', # Always first, it's the golden master + 'czech' => '-vcs', + 'dansk' => '-vda', + 'deutsch' => '-vde', + 'english-us' => '-ven-us -k 5', + 'espanol' => '-ves', + 'francais' => '-vfr-fr', + 'greek' => '-vel', + 'magyar' => '-vhu', + 'italiano' => '-vit', + 'japanese' => '-vja', + 'nederlands' => '-vnl', + 'norsk' => '-vno', + 'polski' => '-vpl', + 'russian' => '-vru', + 'slovak' => '-vsk', + 'srpski' => '-vsr', + 'svenska' => '-vsv', + 'turkce' => '-vtr', + ); + +my %piper_lang_map = ( + 'english' => 'en_GB-cori-high.onnx', # Always first, it's the golden master + 'czech' => 'cs_CZ-jirka-medium.onnx', + 'dansk' => 'da_DK-talesyntese-medium.onnx', + 'deutsch' => 'de_DE-thorsten-high.onnx', + 'english-us' => 'en_US-libritts-high.onnx', + 'espanol' => 'es_ES-sharvard-medium.onnx', + 'francais' => 'fr_FR-siwis-medium.onnx', + 'greek' => 'el_GR-rapunzelina-low.onnx', +# 'magyar' => '-vhu', + 'italiano' => 'it_IT-riccardo-x_low.onnx', +# 'japanese' => '-vja', + 'nederlands' => 'nl_NL-mls-medium.onnx', + 'norsk' => 'no_NO-talesyntese-medium.onnx', + 'polski' => 'pl_PL-gosia-medium.onnx', + 'russian' => 'ru_RU-irina-medium.onnx', + 'slovak' => 'sk_SK-lili-medium.onnx', + 'srpski' => 'sr_RS-serbski_institut-medium.onnx', + 'svenska' => 'sv_SE-nst-medium.onnx', + 'turkce' => 'tr_TR-fettah-medium.onnx', ); my $trim_thresh = 500; # Trim silence if over this, in ms @@ -141,6 +164,7 @@ sub init_tts { # Don't use given/when here - it's not compatible with old perl versions if ($tts_engine eq 'festival') { print("> festival $tts_engine_opts --server\n") if $verbose; + # Open command, and filehandles for STDIN, STDOUT, STDERR my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1"); my $dummy = *FESTIVAL_SERVER; #suppress warning $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); }; @@ -149,6 +173,21 @@ sub init_tts { if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) { $ret{"ttsoptions"} = "--language $festival_lang_map{$language} "; } + } elsif ($tts_engine eq 'piper') { + my $cmd = "piper $tts_engine_opts --json-input"; + print("> $cmd\n") if $verbose; + + my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd); + $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); }; + $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); }; + $ret{"pid"} = $pid; + binmode(*CMD_IN, ':encoding(utf8)'); + binmode(*CMD_OUT, ':encoding(utf8)'); + binmode(*CMD_ERR, ':encoding(utf8)'); + if (defined($piper_lang_map{$language}) && $tts_engine_opts !~ /--model/) { + die("Need PIPER_MODEL_DIR\n") if (!defined($ENV{'PIPER_MODEL_DIR'})); + $ret{"ttsoptions"} = "--model $ENV{PIPER_MODEL_DIR}/$piper_lang_map{$language} "; + } } elsif ($tts_engine eq 'sapi') { my $toolsdir = dirname($0); my $path = `cygpath $toolsdir -a -w`; @@ -176,7 +215,7 @@ sub init_tts { } } elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') { if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) { - $ret{"ttsoptions"} = "-v$espeak_lang_map{$language} "; + $ret{"ttsoptions"} = " $espeak_lang_map{$language} "; } } @@ -190,6 +229,10 @@ sub shutdown_tts { # Send SIGTERM to festival server kill TERM => $$tts_object{"pid"}; } + elsif ($$tts_object{'name'} eq 'piper') { + # Send SIGTERM to piper + kill TERM => $$tts_object{"pid"}; + } elsif ($$tts_object{'name'} eq 'sapi') { print({$$tts_object{"stdin"}} "QUIT\r\n"); close($$tts_object{"stdin"}); @@ -244,6 +287,13 @@ sub voicestring { close(CMD_OUT); close(CMD_ERR); } + elsif ($name eq 'piper') { + $cmd = "{ \"text\": \"$string\", \"output_file\": \"$output\" }"; + print(">> $cmd\n") if $verbose; + print(CMD_IN "$cmd\n"); + my $res = ; + $res = ; + } elsif ($name eq 'flite') { $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\""; print("> $cmd\n") if $verbose; @@ -469,7 +519,6 @@ sub generateclips { print("\n"); unlink($updfile) if (-f $updfile); - shutdown_tts($tts_object); } # Assemble the voicefile @@ -608,6 +657,7 @@ if ($V == 1) { defined($t) ? $t : "unknown", $l, $e, $E, $s, $S); generateclips($l, $t, $e, $E, $tts_object, $S, $f); + shutdown_tts($tts_object); createvoice($l, $i, $f); deleteencs(); } elsif ($C) { -- cgit v1.2.3