diff options
author | Solomon Peachy <pizza@shaftnet.org> | 2024-04-19 21:53:43 -0400 |
---|---|---|
committer | Solomon Peachy <pizza@shaftnet.org> | 2024-04-21 18:08:47 -0400 |
commit | e8a51569ada3bfd85fc0c93911bd5061ce3b6017 (patch) | |
tree | 1d248f88fce315240f7df2e415b001b785cd3b6b /tools | |
parent | 418a5acea08876cc02ccfec9ba51e2fede4e6eb6 (diff) | |
download | rockbox-e8a51569ada3bfd85fc0c93911bd5061ce3b6017.tar.gz rockbox-e8a51569ada3bfd85fc0c93911bd5061ce3b6017.zip |
voice: Add support for the Piper TTS engine
https://github.com/rhasspy/piper
Piper is a high-quality, offline, neural-network-based TTS engine with good language coverage.
Note that you have to manually download the Piper voice models and set
PIPER_MODEL_DIR to the directory that contains them (the *.onnx files). The
configure script will let you choose from the available models and remember your choices.
Change-Id: I8eba9fcf78b51b01b89491539aac3e423cc42f16
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/configure | 52 | ||||
-rwxr-xr-x | tools/voice.pl | 146 |
2 files changed, 147 insertions, 51 deletions
diff --git a/tools/configure b/tools/configure index adccc3e5e4..51a47971fd 100755 --- a/tools/configure +++ b/tools/configure | |||
@@ -1159,6 +1159,13 @@ voiceconfig () { | |||
1159 | DEFAULT_TTS_OPTS=$GTTS_OPTS | 1159 | DEFAULT_TTS_OPTS=$GTTS_OPTS |
1160 | DEFAULT_CHOICE="g" | 1160 | DEFAULT_CHOICE="g" |
1161 | fi | 1161 | fi |
1162 | if [ -n "`findtool piper`" ]; then | ||
1163 | PIPER="(p)iper " | ||
1164 | PIPER_OPTS="" | ||
1165 | DEFAULT_TTS="piper" | ||
1166 | DEFAULT_TTS_OPTS=$PIPER_OPTS | ||
1167 | DEFAULT_CHOICE="p" | ||
1168 | fi | ||
1162 | if [ -n "`findtool rbspeak`" ]; then | 1169 | if [ -n "`findtool rbspeak`" ]; then |
1163 | RBSPEAK="(O)ther " | 1170 | RBSPEAK="(O)ther " |
1164 | RBSPEAK_OPTS="" | 1171 | RBSPEAK_OPTS="" |
@@ -1167,15 +1174,15 @@ voiceconfig () { | |||
1167 | DEFAULT_CHOICE="O" | 1174 | DEFAULT_CHOICE="O" |
1168 | fi | 1175 | fi |
1169 | 1176 | ||
1170 | if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$RBSPEAK" ] ; then | 1177 | if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$GTTS" ] && [ "$GTTS" = "$PIPER" ] && [ "$PIPER" = "$RBSPEAK" ] ; then |
1171 | echo "You need Festival, eSpeak, Mimic, Flite, gtts, or rbspeak in your path, or SAPI available to build voice files" | 1178 | echo "You need Festival, eSpeak, Mimic, Flite, piper, gtts, or rbspeak in your path, or SAPI available to build voice files" |
1172 | exit 3 | 1179 | exit 3 |
1173 | fi | 1180 | fi |
1174 | 1181 | ||
1175 | if [ "$ARG_TTS" ]; then | 1182 | if [ "$ARG_TTS" ]; then |
1176 | option=$ARG_TTS | 1183 | option=$ARG_TTS |
1177 | else | 1184 | else |
1178 | echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?" | 1185 | echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}${PIPER}(${DEFAULT_CHOICE})?" |
1179 | option=`input` | 1186 | option=`input` |
1180 | if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi | 1187 | if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi |
1181 | advopts="$advopts --tts=$option" | 1188 | advopts="$advopts --tts=$option" |
@@ -1209,6 +1216,10 @@ voiceconfig () { | |||
1209 | TTS_ENGINE="gtts" | 1216 | TTS_ENGINE="gtts" |
1210 | TTS_OPTS=$GTTS_OPTS | 1217 | TTS_OPTS=$GTTS_OPTS |
1211 | ;; | 1218 | ;; |
1219 | [Pp]|piper) | ||
1220 | TTS_ENGINE="piper" | ||
1221 | TTS_OPTS=$PIPER_OPTS | ||
1222 | ;; | ||
1212 | [Oo]|rbspeak) | 1223 | [Oo]|rbspeak) |
1213 | TTS_ENGINE="rbspeak" | 1224 | TTS_ENGINE="rbspeak" |
1214 | TTS_OPTS=$RBSPEAK_OPTS | 1225 | TTS_OPTS=$RBSPEAK_OPTS |
@@ -1247,6 +1258,39 @@ voiceconfig () { | |||
1247 | advopts="$advopts --voice=$CHOICE" | 1258 | advopts="$advopts --voice=$CHOICE" |
1248 | echo "Festival voice set to $TTS_FESTIVAL_VOICE" | 1259 | echo "Festival voice set to $TTS_FESTIVAL_VOICE" |
1249 | echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm | 1260 | echo "(voice_$TTS_FESTIVAL_VOICE)" > festival-prolog.scm |
1261 | elif [ "$TTS_ENGINE" = "piper" ]; then | ||
1262 | if [ -z "$PIPER_MODEL_DIR" ]; then | ||
1263 | echo "Please set PIPER_MODEL_DIR!"; | ||
1264 | exit 1 | ||
1265 | fi | ||
1266 | models=`(cd $PIPER_MODEL_DIR ; ls -1 *onnx)` | ||
1267 | for model in $models; do | ||
1268 | PIPER_MODEL="$model" # Default | ||
1269 | break; | ||
1270 | done | ||
1271 | if [ "$ARG_VOICE" ]; then | ||
1272 | CHOICE=$ARG_VOICE | ||
1273 | else | ||
1274 | i=1 | ||
1275 | for model in $models; do | ||
1276 | printf "%3d. %s\n" "$i" "$model" | ||
1277 | i=`expr $i + 1` | ||
1278 | done | ||
1279 | printf "Please select which piper model to use (default is $PIPER_MODEL): " | ||
1280 | CHOICE=`input` | ||
1281 | fi | ||
1282 | i=1 | ||
1283 | for model in $models; do | ||
1284 | if [ "$i" = "$CHOICE" -o "$model" = "$CHOICE" ]; then | ||
1285 | PIPER_MODEL="$model" | ||
1286 | break; | ||
1287 | fi | ||
1288 | i=`expr $i + 1` | ||
1289 | done | ||
1290 | |||
1291 | TTS_OPTS="$TTS_OPTS --model $PIPER_MODEL_DIR/$PIPER_MODEL" | ||
1292 | advopts="$advopts --voice=$PIPER_MODEL" | ||
1293 | echo "Piper model set to $PIPER_MODEL" | ||
1250 | elif [ "$TTS_ENGINE" = "mimic" ]; then | 1294 | elif [ "$TTS_ENGINE" = "mimic" ]; then |
1251 | voicelist=`mimic -lv | cut -d':' -f2` | 1295 | voicelist=`mimic -lv | cut -d':' -f2` |
1252 | for voice in $voicelist; do | 1296 | for voice in $voicelist; do |
@@ -1268,6 +1312,7 @@ voiceconfig () { | |||
1268 | for voice in $voicelist; do | 1312 | for voice in $voicelist; do |
1269 | if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then | 1313 | if [ "$i" = "$CHOICE" -o "$voice" = "$CHOICE" ]; then |
1270 | TTS_MIMIC_VOICE="$voice" | 1314 | TTS_MIMIC_VOICE="$voice" |
1315 | break | ||
1271 | fi | 1316 | fi |
1272 | i=`expr $i + 1` | 1317 | i=`expr $i + 1` |
1273 | done | 1318 | done |
@@ -4756,6 +4801,7 @@ export ANDROID_NDK_PATH=${ANDROID_NDK_PATH} | |||
4756 | export ANDROID_SDK_PATH=${ANDROID_SDK_PATH} | 4801 | export ANDROID_SDK_PATH=${ANDROID_SDK_PATH} |
4757 | export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION} | 4802 | export ANDROID_PLATFORM_VERSION=${ANDROID_PLATFORM_VERSION} |
4758 | export TOOLSET=${toolset} | 4803 | export TOOLSET=${toolset} |
4804 | export PIPER_MODEL_DIR=${PIPER_MODEL_DIR} | ||
4759 | $CCACHE_ARG | 4805 | $CCACHE_ARG |
4760 | 4806 | ||
4761 | CONFIGURE_OPTIONS=${cmdline} | 4807 | CONFIGURE_OPTIONS=${cmdline} |
diff --git a/tools/voice.pl b/tools/voice.pl index 6b3f807854..337407e187 100755 --- a/tools/voice.pl +++ b/tools/voice.pl | |||
@@ -42,7 +42,8 @@ Usage: voice.pl [options] [path to dir] | |||
42 | Specify which target you want to build voicefile for. Must include | 42 | Specify which target you want to build voicefile for. Must include |
43 | any features that target supports. | 43 | any features that target supports. |
44 | 44 | ||
45 | -f=<file> Use existing voiceids file | 45 | -f=<file> |
46 | Use existing voiceids file | ||
46 | 47 | ||
47 | -i=<target_id> | 48 | -i=<target_id> |
48 | Numeric target id. Needed for voice building. | 49 | Numeric target id. Needed for voice building. |
@@ -64,7 +65,8 @@ Usage: voice.pl [options] [path to dir] | |||
64 | Options to pass to the TTS engine. Enclose in double quotes if the | 65 | Options to pass to the TTS engine. Enclose in double quotes if the |
65 | options include spaces. | 66 | options include spaces. |
66 | 67 | ||
67 | -F Force the file to be regenerated even if present | 68 | -F |
69 | Force the file to be regenerated even if present | ||
68 | 70 | ||
69 | -v | 71 | -v |
70 | Be verbose | 72 | Be verbose |
@@ -73,57 +75,78 @@ USAGE | |||
73 | } | 75 | } |
74 | 76 | ||
75 | my %festival_lang_map = ( | 77 | my %festival_lang_map = ( |
76 | 'english' => 'english', | 78 | 'english' => 'english', |
77 | 'english-us' => 'english', | 79 | 'english-us' => 'english', |
78 | 'espanol' => 'spanish', | 80 | 'espanol' => 'spanish', |
79 | #'finnish' => 'finnish' | 81 | #'finnish' => 'finnish' |
80 | #'italiano' => 'italian', | 82 | #'italiano' => 'italian', |
81 | #'czech' => 'czech', | 83 | #'czech' => 'czech', |
82 | #'welsh' => 'welsh' | 84 | #'welsh' => 'welsh' |
83 | ); | 85 | ); |
84 | 86 | ||
85 | my %gtts_lang_map = ( | 87 | my %gtts_lang_map = ( |
86 | 'english' => '-l en -t co.uk', # Always first, it's the golden master | 88 | 'english' => '-l en -t co.uk', # Always first, it's the golden master |
87 | 'czech' => '-l cs', # not supported | 89 | 'czech' => '-l cs', |
88 | 'dansk' => '-l da', | 90 | 'dansk' => '-l da', |
89 | 'deutsch' => '-l de', | 91 | 'deutsch' => '-l de', |
90 | 'english-us' => '-l en -t us', | 92 | 'english-us' => '-l en -t us', |
91 | 'espanol' => '-l es', | 93 | 'espanol' => '-l es', |
92 | 'francais' => '-l fr', | 94 | 'francais' => '-l fr', |
93 | 'greek' => '-l el', | 95 | 'greek' => '-l el', |
94 | 'magyar' => '-l hu', | 96 | 'magyar' => '-l hu', |
95 | 'italiano' => '-l it', | 97 | 'italiano' => '-l it', |
96 | 'nederlands' => '-l nl', | 98 | 'nederlands' => '-l nl', |
97 | 'norsk' => '-l no', | 99 | 'norsk' => '-l no', |
98 | 'polski' => '-l pl', | 100 | 'polski' => '-l pl', |
99 | 'russian' => '-l ru', | 101 | 'russian' => '-l ru', |
100 | 'slovak' => '-l sk', | 102 | 'slovak' => '-l sk', |
101 | 'srpski' => '-l sr', | 103 | 'srpski' => '-l sr', |
102 | 'svenska' => '-l sv', | 104 | 'svenska' => '-l sv', |
103 | 'turkce' => '-l tr', | 105 | 'turkce' => '-l tr', |
104 | ); | 106 | ); |
105 | 107 | ||
106 | my %espeak_lang_map = ( | 108 | my %espeak_lang_map = ( |
107 | 'english' => 'en-gb', # Always first, it's the golden master | 109 | 'english' => '-ven-gb -k 5', # Always first, it's the golden master |
108 | 'czech' => 'cs', | 110 | 'czech' => '-vcs', |
109 | 'dansk' => 'da', | 111 | 'dansk' => '-vda', |
110 | 'deutsch' => 'de', | 112 | 'deutsch' => '-vde', |
111 | 'english-us' => 'en-us', | 113 | 'english-us' => '-ven-us -k 5', |
112 | 'espanol' => 'es', | 114 | 'espanol' => '-ves', |
113 | 'francais' => 'fr-fr', | 115 | 'francais' => '-vfr-fr', |
114 | 'greek' => 'el', | 116 | 'greek' => '-vel', |
115 | 'nederlands' => 'nl', | 117 | 'magyar' => '-vhu', |
116 | 'magyar' => 'hu', | 118 | 'italiano' => '-vit', |
117 | 'italiano' => 'it', | 119 | 'japanese' => '-vja', |
118 | 'japanese' => 'ja', | 120 | 'nederlands' => '-vnl', |
119 | 'nederlands' => 'nl', | 121 | 'norsk' => '-vno', |
120 | 'norsk' => 'no', | 122 | 'polski' => '-vpl', |
121 | 'polski' => 'pl', | 123 | 'russian' => '-vru', |
122 | 'russian' => 'ru', | 124 | 'slovak' => '-vsk', |
123 | 'slovak' => 'sk', | 125 | 'srpski' => '-vsr', |
124 | 'srpski' => 'sr', | 126 | 'svenska' => '-vsv', |
125 | 'svenska' => 'sv', | 127 | 'turkce' => '-vtr', |
126 | 'turkce' => 'tr', | 128 | ); |
129 | |||
130 | my %piper_lang_map = ( | ||
131 | 'english' => 'en_GB-cori-high.onnx', # Always first, it's the golden master | ||
132 | 'czech' => 'cs_CZ-jirka-medium.onnx', | ||
133 | 'dansk' => 'da_DK-talesyntese-medium.onnx', | ||
134 | 'deutsch' => 'de_DE-thorsten-high.onnx', | ||
135 | 'english-us' => 'en_US-libritts-high.onnx', | ||
136 | 'espanol' => 'es_ES-sharvard-medium.onnx', | ||
137 | 'francais' => 'fr_FR-siwis-medium.onnx', | ||
138 | 'greek' => 'el_GR-rapunzelina-low.onnx', | ||
139 | # 'magyar' => '-vhu', | ||
140 | 'italiano' => 'it_IT-riccardo-x_low.onnx', | ||
141 | # 'japanese' => '-vja', | ||
142 | 'nederlands' => 'nl_NL-mls-medium.onnx', | ||
143 | 'norsk' => 'no_NO-talesyntese-medium.onnx', | ||
144 | 'polski' => 'pl_PL-gosia-medium.onnx', | ||
145 | 'russian' => 'ru_RU-irina-medium.onnx', | ||
146 | 'slovak' => 'sk_SK-lili-medium.onnx', | ||
147 | 'srpski' => 'sr_RS-serbski_institut-medium.onnx', | ||
148 | 'svenska' => 'sv_SE-nst-medium.onnx', | ||
149 | 'turkce' => 'tr_TR-fettah-medium.onnx', | ||
127 | ); | 150 | ); |
128 | 151 | ||
129 | my $trim_thresh = 500; # Trim silence if over this, in ms | 152 | my $trim_thresh = 500; # Trim silence if over this, in ms |
@@ -141,6 +164,7 @@ sub init_tts { | |||
141 | # Don't use given/when here - it's not compatible with old perl versions | 164 | # Don't use given/when here - it's not compatible with old perl versions |
142 | if ($tts_engine eq 'festival') { | 165 | if ($tts_engine eq 'festival') { |
143 | print("> festival $tts_engine_opts --server\n") if $verbose; | 166 | print("> festival $tts_engine_opts --server\n") if $verbose; |
167 | # Open command, and filehandles for STDIN, STDOUT, STDERR | ||
144 | my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1"); | 168 | my $pid = open(FESTIVAL_SERVER, "| festival $tts_engine_opts --server > /dev/null 2>&1"); |
145 | my $dummy = *FESTIVAL_SERVER; #suppress warning | 169 | my $dummy = *FESTIVAL_SERVER; #suppress warning |
146 | $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); }; | 170 | $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); }; |
@@ -149,6 +173,21 @@ sub init_tts { | |||
149 | if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) { | 173 | if (defined($festival_lang_map{$language}) && $tts_engine_opts !~ /--language/) { |
150 | $ret{"ttsoptions"} = "--language $festival_lang_map{$language} "; | 174 | $ret{"ttsoptions"} = "--language $festival_lang_map{$language} "; |
151 | } | 175 | } |
176 | } elsif ($tts_engine eq 'piper') { | ||
177 | my $cmd = "piper $tts_engine_opts --json-input"; | ||
178 | print("> $cmd\n") if $verbose; | ||
179 | |||
180 | my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd); | ||
181 | $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); }; | ||
182 | $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); }; | ||
183 | $ret{"pid"} = $pid; | ||
184 | binmode(*CMD_IN, ':encoding(utf8)'); | ||
185 | binmode(*CMD_OUT, ':encoding(utf8)'); | ||
186 | binmode(*CMD_ERR, ':encoding(utf8)'); | ||
187 | if (defined($piper_lang_map{$language}) && $tts_engine_opts !~ /--model/) { | ||
188 | die("Need PIPER_MODEL_DIR\n") if (!defined($ENV{'PIPER_MODEL_DIR'})); | ||
189 | $ret{"ttsoptions"} = "--model $ENV{PIPER_MODEL_DIR}/$piper_lang_map{$language} "; | ||
190 | } | ||
152 | } elsif ($tts_engine eq 'sapi') { | 191 | } elsif ($tts_engine eq 'sapi') { |
153 | my $toolsdir = dirname($0); | 192 | my $toolsdir = dirname($0); |
154 | my $path = `cygpath $toolsdir -a -w`; | 193 | my $path = `cygpath $toolsdir -a -w`; |
@@ -176,7 +215,7 @@ sub init_tts { | |||
176 | } | 215 | } |
177 | } elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') { | 216 | } elsif ($tts_engine eq 'espeak' || $tts_engine eq 'espeak-ng') { |
178 | if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) { | 217 | if (defined($espeak_lang_map{$language}) && $tts_engine_opts !~ /-v/) { |
179 | $ret{"ttsoptions"} = "-v$espeak_lang_map{$language} "; | 218 | $ret{"ttsoptions"} = " $espeak_lang_map{$language} "; |
180 | } | 219 | } |
181 | } | 220 | } |
182 | 221 | ||
@@ -190,6 +229,10 @@ sub shutdown_tts { | |||
190 | # Send SIGTERM to festival server | 229 | # Send SIGTERM to festival server |
191 | kill TERM => $$tts_object{"pid"}; | 230 | kill TERM => $$tts_object{"pid"}; |
192 | } | 231 | } |
232 | elsif ($$tts_object{'name'} eq 'piper') { | ||
233 | # Send SIGTERM to piper | ||
234 | kill TERM => $$tts_object{"pid"}; | ||
235 | } | ||
193 | elsif ($$tts_object{'name'} eq 'sapi') { | 236 | elsif ($$tts_object{'name'} eq 'sapi') { |
194 | print({$$tts_object{"stdin"}} "QUIT\r\n"); | 237 | print({$$tts_object{"stdin"}} "QUIT\r\n"); |
195 | close($$tts_object{"stdin"}); | 238 | close($$tts_object{"stdin"}); |
@@ -244,6 +287,13 @@ sub voicestring { | |||
244 | close(CMD_OUT); | 287 | close(CMD_OUT); |
245 | close(CMD_ERR); | 288 | close(CMD_ERR); |
246 | } | 289 | } |
290 | elsif ($name eq 'piper') { | ||
291 | $cmd = "{ \"text\": \"$string\", \"output_file\": \"$output\" }"; | ||
292 | print(">> $cmd\n") if $verbose; | ||
293 | print(CMD_IN "$cmd\n"); | ||
294 | my $res = <CMD_OUT>; | ||
295 | $res = <CMD_ERR>; | ||
296 | } | ||
247 | elsif ($name eq 'flite') { | 297 | elsif ($name eq 'flite') { |
248 | $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\""; | 298 | $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\""; |
249 | print("> $cmd\n") if $verbose; | 299 | print("> $cmd\n") if $verbose; |
@@ -469,7 +519,6 @@ sub generateclips { | |||
469 | print("\n"); | 519 | print("\n"); |
470 | 520 | ||
471 | unlink($updfile) if (-f $updfile); | 521 | unlink($updfile) if (-f $updfile); |
472 | shutdown_tts($tts_object); | ||
473 | } | 522 | } |
474 | 523 | ||
475 | # Assemble the voicefile | 524 | # Assemble the voicefile |
@@ -608,6 +657,7 @@ if ($V == 1) { | |||
608 | defined($t) ? $t : "unknown", | 657 | defined($t) ? $t : "unknown", |
609 | $l, $e, $E, $s, $S); | 658 | $l, $e, $E, $s, $S); |
610 | generateclips($l, $t, $e, $E, $tts_object, $S, $f); | 659 | generateclips($l, $t, $e, $E, $tts_object, $S, $f); |
660 | shutdown_tts($tts_object); | ||
611 | createvoice($l, $i, $f); | 661 | createvoice($l, $i, $f); |
612 | deleteencs(); | 662 | deleteencs(); |
613 | } elsif ($C) { | 663 | } elsif ($C) { |