summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Solomon Peachy <pizza@shaftnet.org> 2024-04-19 21:53:43 -0400
committer: Solomon Peachy <pizza@shaftnet.org> 2024-04-21 18:08:47 -0400
commit: e8a51569ada3bfd85fc0c93911bd5061ce3b6017 (patch)
tree: 1d248f88fce315240f7df2e415b001b785cd3b6b
parent: 418a5acea08876cc02ccfec9ba51e2fede4e6eb6 (diff)
download: rockbox-e8a51569ada3bfd85fc0c93911bd5061ce3b6017.tar.gz
download: rockbox-e8a51569ada3bfd85fc0c93911bd5061ce3b6017.zip
voice: Add support for the Piper TTS engine
https://github.com/rhasspy/piper

High quality, offline, neural-network-based, with good language coverage.

Note that you have to manually download the piper voice models, and set PIPER_MODEL_DIR appropriately. The configure script will let you choose from the available models and remember your choices.

Change-Id: I8eba9fcf78b51b01b89491539aac3e423cc42f16
-rwxr-xr-x tools/configure 52
-rwxr-xr-x tools/voice.pl 146
2 files changed, 147 insertions, 51 deletions
diff --git a/tools/configure b/tools/configure
index adccc3e5e4..51a47971fd 100755
--- a/tools/configure
+++ b/tools/configure
@@ -1159,6 +1159,13 @@ voiceconfig () {
1159 DEFAULT_TTS_OPTS=$GTTS_OPTS 1159 DEFAULT_TTS_OPTS=$GTTS_OPTS
1160 DEFAULT_CHOICE="g" 1160 DEFAULT_CHOICE="g"
1161 fi 1161 fi
1162 if [ -n "`findtool piper`" ]; then
1163 PIPER="(p)iper "
1164 PIPER_OPTS=""
1165 DEFAULT_TTS="piper"
1166 DEFAULT_TTS_OPTS=$PIPER_OPTS
1167 DEFAULT_CHOICE="p"
1168 fi
1162 if [ -n "`findtool rbspeak`" ]; then 1169 if [ -n "`findtool rbspeak`" ]; then
1163 RBSPEAK="(O)ther " 1170 RBSPEAK="(O)ther "
1164 RBSPEAK_OPTS="" 1171 RBSPEAK_OPTS=""
@@ -1167,15 +1174,15 @@ voiceconfig () {
1167 DEFAULT_CHOICE="O" 1174 DEFAULT_CHOICE="O"
1168 fi 1175 fi
1169 1176
1170 if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$RBSPEAK" ] ; then 1177 if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$PIPER" ] && [ "$PIPER" = "$RBSPEAK" ] ; then
1171 echo "You need Festival, eSpeak, Mimic, Flite, gtts, or rbspeak in your path, or SAPI available to build voice files" 1178 echo "You need Festival, eSpeak, Mimic, Flite, piper, gtts, or rbspeak in your path, or SAPI available to build voice files"
1172 exit 3 1179 exit 3
1173 fi 1180 fi
1174 1181
1175 if [ "$ARG_TTS" ]; then 1182 if [ "$ARG_TTS" ]; then
1176 option=$ARG_TTS 1183 option=$ARG_TTS
1177 else 1184 else
1178 echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?" 1185 echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}${PIPER}(${DEFAULT_CHOICE})?"
1179 option=`input` 1186 option=`input`
1180 if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi 1187 if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
1181 advopts="$advopts --tts=$option" 1188 advopts="$advopts --tts=$option"
@@ -1209,6 +1216,10 @@ voiceconfig () {
1209 TTS_ENGINE="gtts" 1216 TTS_ENGINE="gtts"
1210 TTS_OPTS=$GTTS_OPTS 1217 TTS_OPTS=$GTTS_OPTS
1211 ;; 1218 ;;
1219 [Pp]|piper)
1220 TTS_ENGINE="piper"
1221 TTS_OPTS=$PIPER_OPTS
1222 ;;
1212 [Oo]|rbspeak) 1223 [Oo]|rbspeak)
1213 TTS_ENGINE="rbspeak" 1224 TTS_ENGINE="rbspeak"
1214 TTS_OPTS=$RBSPEAK_OPTS 1225 TTS_OPTS=$RBSPEAK_OPTS
@@ -1247,6 +1258,39 @@ voiceconfig () {
1247 advopts="$advopts --voice=$CHOICE" 1258 advopts="$advopts --voice=$CHOICE"
1248 echo "Festival voice set to $TTS_FESTIVAL_VOICE" 1259 echo "Festival voice set to $TTS_FESTIVAL_VOICE"
1249 echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm 1260 echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm
1261 elif [ "$TTS_ENGINE" = "piper" ]; then
1262 if [ -z "$PIPER_MODEL_DIR" ]; then
1263 echo "Please set PIPER_MODEL_DIR!";
1264 exit 1
1265 fi
1266 models=`(cd $PIPER_MODEL_DIR ; ls -1 *onnx)`
1267 for model in $models; do
1268 PIPER_MODEL="$model" # Default
1269 break;
1270 done
1271 if [ "$ARG_VOICE" ]; then
1272 CHOICE=$ARG_VOICE
1273 else
1274 i=1
1275 for model in $models; do
1276 printf "%3d. %s\n" "$i" "$model"
1277 i=`expr $i + 1`
1278 done
1279 printf "Please select which piper model to use (default is $PIPER_MODEL): "
1280 CHOICE=`input`
1281 fi
1282 i=1
1283 for model in $models; do
1284 if [ "$i" = "$CHOICE" -o "$model" = "$CHOICE" ]; then
1285 PIPER_MODEL="$model"
1286 break;
1287 fi
1288 i=`expr $i + 1`
1289 done
1290
1291 TTS_OPTS="$TTS_OPTS --model $PIPER_MODEL_DIR/$PIPER_MODEL"
1292 advopts="$advopts --voice=$PIPER_MODEL"
1293 echo "Piper model set to $PIPER_MODEL"
1250 elif [ "$TTS_ENGINE" = "mimic" ]; then 1294 elif [ "$TTS_ENGINE" = "mimic" ]; then
1251 voicelist=`mimic -lv | cut -d':' -f2` 1295 voicelist=`mimic -lv | cut -d':' -f2`
1252 for voice in $voicelist; do 1296 for voice in $voicelist; do
@@ -1268,6 +1312,7 @@ voiceconfig () {
1268 for voice in $voicelist; do 1312 for voice in $voicelist; do
1269 if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then 1313 if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then
1270 TTS_MIMIC_VOICE="$voice" 1314 TTS_MIMIC_VOICE="$voice"
1315 break
1271 fi 1316 fi
1272 i=`expr $i + 1` 1317 i=`expr $i + 1`
1273 done 1318 done
@@ -4756,6 +4801,7 @@ export ANDROID_NDK_PATH=${ANDROID_NDK_PATH}
4756export ANDROID_SDK_PATH=${ANDROID_SDK_PATH} 4801export ANDROID_SDK_PATH=${ANDROID_SDK_PATH}
4757export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION} 4802export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION}
4758export TOOLSET=${toolset} 4803export TOOLSET=${toolset}
4804export PIPER_MODEL_DIR=${PIPER_MODEL_DIR}
4759$CCACHE_ARG 4805$CCACHE_ARG
4760 4806
4761CONFIGURE_OPTIONS=${cmdline} 4807CONFIGURE_OPTIONS=${cmdline}
diff --git a/tools/voice.pl b/tools/voice.pl
index 6b3f807854..337407e187 100755
--- a/tools/voice.pl
+++ b/tools/voice.pl
@@ -42,7 +42,8 @@ Usage: voice.pl [options] [path to dir]
42 Specify which target you want to build voicefile for. Must include 42 Specify which target you want to build voicefile for. Must include
43 any features that target supports. 43 any features that target supports.
44 44
45 -f=<file> Use existing voiceids file 45 -f=<file>
46 Use existing voiceids file
46 47
47 -i=<target_id> 48 -i=<target_id>
48 Numeric target id. Needed for voice building. 49 Numeric target id. Needed for voice building.
@@ -64,7 +65,8 @@ Usage: voice.pl [options] [path to dir]
64 Options to pass to the TTS engine. Enclose in double quotes if the 65 Options to pass to the TTS engine. Enclose in double quotes if the
65 options include spaces. 66 options include spaces.
66 67
67 -F Force the file to be regenerated even if present 68 -F
69 Force the file to be regenerated even if present
68 70
69 -v 71 -v
70 Be verbose 72 Be verbose
@@ -73,57 +75,78 @@ USAGE
73} 75}
74 76
75my %festival_lang_map = ( 77my %festival_lang_map = (
76 'english' => 'english', 78 'english' => 'english',
77 'english-us' => 'english', 79 'english-us' => 'english',
78 'espanol' => 'spanish', 80 'espanol' => 'spanish',
79 #'finnish' => 'finnish' 81 #'finnish' => 'finnish'
80 #'italiano' => 'italian', 82 #'italiano' => 'italian',
81 #'czech' => 'czech', 83 #'czech' => 'czech',
82 #'welsh' => 'welsh' 84 #'welsh' => 'welsh'
83); 85);
84 86
85my %gtts_lang_map = ( 87my %gtts_lang_map = (
86 'english' => '-l en -t co.uk', # Always first, it's the golden master 88 'english' => '-l en -t co.uk', # Always first, it's the golden master
87 'czech' => '-l cs', # not supported 89 'czech' => '-l cs',
88 'dansk' => '-l da', 90 'dansk' => '-l da',
89 'deutsch' => '-l de', 91 'deutsch' => '-l de',
90 'english-us' => '-l en -t us', 92 'english-us' => '-l en -t us',
91 'espanol' => '-l es', 93 'espanol' => '-l es',
92 'francais' => '-l fr', 94 'francais' => '-l fr',
93 'greek' => '-l el', 95 'greek' => '-l el',
94 'magyar' => '-l hu', 96 'magyar' => '-l hu',
95 'italiano' => '-l it', 97 'italiano' => '-l it',
96 'nederlands' => '-l nl', 98 'nederlands' => '-l nl',
97 'norsk' => '-l no', 99 'norsk' => '-l no',
98 'polski' => '-l pl', 100 'polski' => '-l pl',
99 'russian' => '-l ru', 101 'russian' => '-l ru',
100 'slovak' => '-l sk', 102 'slovak' => '-l sk',
101 'srpski' => '-l sr', 103 'srpski' => '-l sr',
102 'svenska' => '-l sv', 104 'svenska' => '-l sv',
103 'turkce' => '-l tr', 105 'turkce' => '-l tr',
104); 106);
105 107
106my %espeak_lang_map = ( 108my %espeak_lang_map = (
107 'english' => 'en-gb', # Always first, it's the golden master 109 'english' => '-ven-gb -k 5', # Always first, it's the golden master
108 'czech' => 'cs', 110 'czech' => '-vcs',
109 'dansk' => 'da', 111 'dansk' => '-vda',
110 'deutsch' => 'de', 112 'deutsch' => '-vde',
111 'english-us' => 'en-us', 113 'english-us' => '-ven-us -k 5',
112 'espanol' => 'es', 114 'espanol' => '-ves',
113 'francais' => 'fr-fr', 115 'francais' => '-vfr-fr',
114 'greek' => 'el', 116 'greek' => '-vel',
115 'nederlands' => 'nl', 117 'magyar' => '-vhu',
116 'magyar' => 'hu', 118 'italiano' => '-vit',
117 'italiano' => 'it', 119 'japanese' => '-vja',
118 'japanese' => 'ja', 120 'nederlands' => '-vnl',
119 'nederlands' => 'nl', 121 'norsk' => '-vno',
120 'norsk' => 'no', 122 'polski' => '-vpl',
121 'polski' => 'pl', 123 'russian' => '-vru',
122 'russian' => 'ru', 124 'slovak' => '-vsk',
123 'slovak' => 'sk', 125 'srpski' => '-vsr',
124 'srpski' => 'sr', 126 'svenska' => '-vsv',
125 'svenska' => 'sv', 127 'turkce' => '-vtr',
126 'turkce' => 'tr', 128 );
129
130my %piper_lang_map = (
131 'english' => 'en_GB-cori-high.onnx', # Always first, it's the golden master
132 'czech' => 'cs_CZ-jirka-medium.onnx',
133 'dansk' => 'da_DK-talesyntese-medium.onnx',
134 'deutsch' => 'de_DE-thorsten-high.onnx',
135 'english-us' => 'en_US-libritts-high.onnx',
136 'espanol' => 'es_ES-sharvard-medium.onnx',
137 'francais' => 'fr_FR-siwis-medium.onnx',
138 'greek' => 'el_GR-rapunzelina-low.onnx',
139# 'magyar' => '-vhu',
140 'italiano' => 'it_IT-riccardo-x_low.onnx',
141# 'japanese' => '-vja',
142 'nederlands' => 'nl_NL-mls-medium.onnx',
143 'norsk' => 'no_NO-talesyntese-medium.onnx',
144 'polski' => 'pl_PL-gosia-medium.onnx',
145 'russian' => 'ru_RU-irina-medium.onnx',
146 'slovak' => 'sk_SK-lili-medium.onnx',
147 'srpski' => 'sr_RS-serbski_institut-medium.onnx',
148 'svenska' => 'sv_SE-nst-medium.onnx',
149 'turkce' => 'tr_TR-fettah-medium.onnx',
127); 150);
128 151
129my $trim_thresh = 500; # Trim silence if over this, in ms 152my $trim_thresh = 500; # Trim silence if over this, in ms
@@ -141,6 +164,7 @@ sub init_tts {
141 # Don't use given/when here - it's not compatible with old perl versions 164 # Don't use given/when here - it's not compatible with old perl versions
142 if ($tts_engine eq 'festival') { 165 if ($tts_engine eq 'festival') {
143 print("> festival $tts_engine_opts --server\n") if $verbose; 166 print("> festival $tts_engine_opts --server\n") if $verbose;
167 # Open command, and filehandles for STDIN, STDOUT, STDERR
144 my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1"); 168 my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1");
145 my $dummy = *FESTIVAL_SERVER; #suppress warning 169 my $dummy = *FESTIVAL_SERVER; #suppress warning
146 $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); }; 170 $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
@@ -149,6 +173,21 @@ sub init_tts {
149 if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) { 173 if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) {
150 $ret{"ttsoptions"} = "--language $festival_lang_map{$language} "; 174 $ret{"ttsoptions"} = "--language $festival_lang_map{$language} ";
151 } 175 }
176 } elsif ($tts_engine eq 'piper') {
177 my $cmd = "piper $tts_engine_opts --json-input";
178 print("> $cmd\n") if $verbose;
179
180 my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd);
181 $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
182 $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
183 $ret{"pid"} = $pid;
184 binmode(*CMD_IN, ':encoding(utf8)');
185 binmode(*CMD_OUT, ':encoding(utf8)');
186 binmode(*CMD_ERR, ':encoding(utf8)');
187 if (defined($piper_lang_map{$language}) && $tts_engine_opts !~ /--model/) {
188 die("Need PIPER_MODEL_DIR\n") if (!defined($ENV{'PIPER_MODEL_DIR'}));
189 $ret{"ttsoptions"} = "--model $ENV{PIPER_MODEL_DIR}/$piper_lang_map{$language} ";
190 }
152 } elsif ($tts_engine eq 'sapi') { 191 } elsif ($tts_engine eq 'sapi') {
153 my $toolsdir = dirname($0); 192 my $toolsdir = dirname($0);
154 my $path = `cygpath $toolsdir -a -w`; 193 my $path = `cygpath $toolsdir -a -w`;
@@ -176,7 +215,7 @@ sub init_tts {
176 } 215 }
177 } elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') { 216 } elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') {
178 if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) { 217 if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) {
179 $ret{"ttsoptions"} = "-v$espeak_lang_map{$language} "; 218 $ret{"ttsoptions"} = " $espeak_lang_map{$language} ";
180 } 219 }
181 } 220 }
182 221
@@ -190,6 +229,10 @@ sub shutdown_tts {
190 # Send SIGTERM to festival server 229 # Send SIGTERM to festival server
191 kill TERM => $$tts_object{"pid"}; 230 kill TERM => $$tts_object{"pid"};
192 } 231 }
232 elsif ($$tts_object{'name'} eq 'piper') {
233 # Send SIGTERM to piper
234 kill TERM => $$tts_object{"pid"};
235 }
193 elsif ($$tts_object{'name'} eq 'sapi') { 236 elsif ($$tts_object{'name'} eq 'sapi') {
194 print({$$tts_object{"stdin"}} "QUIT\r\n"); 237 print({$$tts_object{"stdin"}} "QUIT\r\n");
195 close($$tts_object{"stdin"}); 238 close($$tts_object{"stdin"});
@@ -244,6 +287,13 @@ sub voicestring {
244 close(CMD_OUT); 287 close(CMD_OUT);
245 close(CMD_ERR); 288 close(CMD_ERR);
246 } 289 }
290 elsif ($name eq 'piper') {
291 $cmd = "{ \"text\": \"$string\", \"output_file\": \"$output\" }";
292 print(">> $cmd\n") if $verbose;
293 print(CMD_IN "$cmd\n");
294 my $res = <CMD_OUT>;
295 $res = <CMD_ERR>;
296 }
247 elsif ($name eq 'flite') { 297 elsif ($name eq 'flite') {
248 $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\""; 298 $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
249 print("> $cmd\n") if $verbose; 299 print("> $cmd\n") if $verbose;
@@ -469,7 +519,6 @@ sub generateclips {
469 print("\n"); 519 print("\n");
470 520
471 unlink($updfile) if (-f $updfile); 521 unlink($updfile) if (-f $updfile);
472 shutdown_tts($tts_object);
473} 522}
474 523
475# Assemble the voicefile 524# Assemble the voicefile
@@ -608,6 +657,7 @@ if ($V == 1) {
608 defined($t) ? $t : "unknown", 657 defined($t) ? $t : "unknown",
609 $l, $e, $E, $s, $S); 658 $l, $e, $E, $s, $S);
610 generateclips($l, $t, $e, $E, $tts_object, $S, $f); 659 generateclips($l, $t, $e, $E, $tts_object, $S, $f);
660 shutdown_tts($tts_object);
611 createvoice($l, $i, $f); 661 createvoice($l, $i, $f);
612 deleteencs(); 662 deleteencs();
613} elsif ($C) { 663} elsif ($C) {