From 2c3399537cfb9d481a1e31a4b625d2abb97664e2 Mon Sep 17 00:00:00 2001
From: Solomon Peachy <pizza@shaftnet.org>
Date: Wed, 8 Jul 2020 19:05:09 -0400
Subject: voice:  Add support for Google Translate's speech synthesizer

Uses the 'gtts-cli' command line client.  Supports a wide variety of
languages, including all "Complete" and "Good" Rockbox translations.

Additional changes:

 * voice synth script can accept pre-encoded mp3 files
 * Move language->synth options mapping into the voice script
 * Additional cleanups

Change-Id: I9523e2bca87cbcee2d8c4111f9892e8e458c7419
---
 tools/configure | 53 ++++++++++++++++-----------------
 tools/voice.pl  | 91 ++++++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 93 insertions(+), 51 deletions(-)

(limited to 'tools')

diff --git a/tools/configure b/tools/configure
index 585eadd357..aa8b190250 100755
--- a/tools/configure
+++ b/tools/configure
@@ -1111,23 +1111,7 @@ voiceconfig () {
     fi
     if [ -n "`findtool festival`" ]; then
         FESTIVAL="(F)estival "
-        case "$thislang" in
-            "italiano")
-            FESTIVAL_OPTS="--language italian"
-            ;;
-            "espanol")
-            FESTIVAL_OPTS="--language spanish"
-            ;;
-            "finnish")
-            FESTIVAL_OPTS="--language finnish"
-            ;;
-            "czech")
-            FESTIVAL_OPTS="--language czech"
-            ;;
-            *)
-            FESTIVAL_OPTS=""
-            ;;
-        esac
+        FESTIVAL_OPTS=""
         DEFAULT_TTS="festival"
         DEFAULT_TTS_OPTS=$FESTIVAL_OPTS
         DEFAULT_NOISEFLOOR="500"
@@ -1149,14 +1133,6 @@ voiceconfig () {
         DEFAULT_NOISEFLOOR="500"
         DEFAULT_CHOICE="w"
     fi
-    if [ -n "`findtool rbspeak`" ]; then
-        RBSPEAK="(O)ther "
-        RBSPEAK_OPTS=""
-        DEFAULT_TTS="rbspeak"
-        DEFAULT_TTS_OPTS=$RBSPEAK_OPTS
-        DEFAULT_NOISEFLOOR="500"
-        DEFAULT_CHOICE="O"
-    fi
     # Allow SAPI if Windows is in use
     if [ -n "`findtool winver`" ]; then
         SAPI="(S)API "
@@ -1164,10 +1140,26 @@ voiceconfig () {
         DEFAULT_TTS="sapi"
         DEFAULT_TTS_OPTS=$SAPI_OPTS
         DEFAULT_NOISEFLOOR="500"
-        DEFAULT_CHOICE="s"
+        DEFAULT_CHOICE="S"
+    fi
+    if [ -n "`findtool gtts-cli`" ]; then
+        GTTS="(g)tts "
+        GTTS_OPTS=""
+        DEFAULT_TTS="gtts"
+        DEFAULT_TTS_OPTS=$GTTS_OPTS
+        DEFAULT_NOISEFLOOR="500"
+        DEFAULT_CHOICE="g"
+    fi
+    if [ -n "`findtool rbspeak`" ]; then
+        RBSPEAK="(O)ther "
+        RBSPEAK_OPTS=""
+        DEFAULT_TTS="rbspeak"
+        DEFAULT_TTS_OPTS=$RBSPEAK_OPTS
+        DEFAULT_NOISEFLOOR="500"
+        DEFAULT_CHOICE="O"
     fi
 
-    if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC"] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$RBSPEAK" ]; then
+    if [ "$FESTIVAL" = "$FLITE" ] && [ "$FLITE" = "$ESPEAK" ] && [ "$ESPEAK" = "$SAPI" ] && [ "$SAPI" = "$MIMIC" ] && [ "$MIMIC" = "$SWIFT" ] && [ "$SWIFT" = "$RBSPEAK" ] && [ "$RBSPEAK" = "$GTTS" ] ; then
         echo "You need Festival, eSpeak, Mimic, Flite, or rbspeak in your path, or SAPI available to build voice files"
         exit 3
     fi
@@ -1175,7 +1167,7 @@ voiceconfig () {
     if [ "$ARG_TTS" ]; then
         option=$ARG_TTS
     else
-        echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${RBSPEAK}(${DEFAULT_CHOICE})?"
+        echo "TTS engine to use: ${FLITE}${FESTIVAL}${ESPEAK}${MIMIC}${SAPI}${SWIFT}${GTTS}${RBSPEAK}(${DEFAULT_CHOICE})?"
         option=`input`
         if [ -z "$option" ]; then option=${DEFAULT_CHOICE}; fi
         advopts="$advopts --tts=$option"
@@ -1211,6 +1203,11 @@ voiceconfig () {
         NOISEFLOOR="500"
         TTS_OPTS=$SWIFT_OPTS
 	;;
+	[Gg])
+        TTS_ENGINE="gtts"
+        NOISEFLOOR="500"
+        TTS_OPTS=$GTTS_OPTS
+	;;
 	[Oo])
         TTS_ENGINE="rbspeak"
         NOISEFLOOR="500"
diff --git a/tools/voice.pl b/tools/voice.pl
index 8198501777..56195d9e12 100755
--- a/tools/voice.pl
+++ b/tools/voice.pl
@@ -5,7 +5,7 @@
 #   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 #   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 #                     \/            \/     \/    \/            \/
-# $Id$ 
+# $Id$
 #
 # Copyright (C) 2007 Jonas Häggqvist
 #
@@ -33,46 +33,73 @@ sub printusage {
 Usage: voice.pl [options] [path to dir]
  -V
     Create voice file. You must also specify -t and -l.
- 
+
  -C
     Create .talk clips.
 
  -t=<target>
     Specify which target you want to build voicefile for. Must include
     any features that target supports.
- 
+
  -i=<target_id>
     Numeric target id. Needed for voice building.
- 
+
  -l=<language>
     Specify which language you want to build. Without .lang extension.
- 
+
  -e=<encoder>
     Which encoder to use for voice strings
 
  -E=<encoder options>
     Which encoder options to use when compressing voice strings. Enclose
     in double quotes if the options include spaces.
- 
+
  -s=<TTS engine>
     Which TTS engine to use.
- 
+
  -S=<TTS engine options>
     Options to pass to the TTS engine. Enclose in double quotes if the
     options include spaces.
- 
+
  -v
     Be verbose
 USAGE
 ;
 }
 
+my %festival_lang_map = (  # Rockbox language name => festival language name
+    'english' => 'english',
+    'english-us' => 'english',
+    'espanol' => 'spanish',
+    #'finnish' => 'finnish',
+    #'italiano' => 'italian',
+    #'czech' => 'czech',
+    #'welsh' => 'welsh',
+);
+
+my %gtts_lang_map = (  # Rockbox language name => language code passed to gtts-cli via -l
+    'english' => 'en-gb',  # Always first, it's the golden master
+    'deutsch' => 'de',
+    'english-us' => 'en-us',
+    'francais' => 'fr-fr',
+    'greek' => 'el',  # ISO 639-1 code for Greek is 'el' ('gr' is the country code)
+    'italiano' => 'it',
+    'norsk' => 'no',
+    'polski' => 'pl',
+    'russian' => 'ru',
+    'slovak' => 'sk',
+    'srpski' => 'sr',
+);
+
 # Initialize TTS engine. May return an object or value which will be passed
 # to voicestring and shutdown_tts
 sub init_tts {
     our $verbose;
     my ($tts_engine, $tts_engine_opts, $language) = @_;
     my %ret = ("name" => $tts_engine);
+    $ret{"format"} = 'wav';
+    $ret{"ttsoptions"} = "";
+
     # Don't use given/when here - it's not compatible with old perl versions
     if ($tts_engine eq 'festival') {
         print("> festival $tts_engine_opts --server\n") if $verbose;
@@ -81,8 +108,10 @@ sub init_tts {
         $SIG{INT} = sub { kill TERM => $pid; print("foo"); panic_cleanup(); };
         $SIG{KILL} = sub { kill TERM => $pid; print("boo"); panic_cleanup(); };
         $ret{"pid"} = $pid;
-    }
-    elsif ($tts_engine eq 'sapi') {
+        if (defined($festival_lang_map{$language})) {
+            $ret{"ttsoptions"} = "-l $festival_lang_map{$language} ";
+        }
+    } elsif ($tts_engine eq 'sapi') {
         my $toolsdir = dirname($0);
         my $path = `cygpath $toolsdir -a -w`;
         chomp($path);
@@ -102,6 +131,11 @@ sub init_tts {
                 "stdin" => *CMD_IN,
                 "stdout" => *CMD_OUT,
                 "vendor" => $vendor);
+    } elsif ($tts_engine eq 'gtts') {
+        $ret{"format"} = 'mp3';
+        if (defined($gtts_lang_map{$language})) {
+            $ret{"ttsoptions"} = "-l $gtts_lang_map{$language} ";
+        }
     }
     return \%ret;
 }
@@ -143,6 +177,9 @@ sub voicestring {
     my ($string, $output, $tts_engine_opts, $tts_object) = @_;
     my $cmd;
     my $name = $$tts_object{'name'};
+
+    $tts_engine_opts .= $$tts_object{"ttsoptions"};
+
     printf("Generate \"%s\" with %s in file %s\n", $string, $name, $output) if $verbose;
     if ($name eq 'festival') {
         # festival_client lies to us, so we have to do awful soul-eating
@@ -167,7 +204,7 @@ sub voicestring {
     elsif ($name eq 'flite') {
         $cmd = "flite $tts_engine_opts -t \"$string\" \"$output\"";
         print("> $cmd\n") if $verbose;
-        `$cmd`;
+        system($cmd);
     }
     elsif ($name eq 'espeak') {
         $cmd = "espeak $tts_engine_opts -w \"$output\"";
@@ -193,11 +230,14 @@ sub voicestring {
         close(RBSPEAK);
     }
     elsif ($name eq 'mimic') {
-	$cmd = "mimic $tts_engine_opts -o $output";
-	print("> $cmd\n") if $verbose;
-	open (MIMIC, "| $cmd");
-	print MIMIC $string . "\n";
-	close(MIMIC);
+        $cmd = "mimic $tts_engine_opts -o \"$output\" -t \"$string\" ";  # quote the path like flite/espeak do
+        print("> $cmd\n") if $verbose;
+        system($cmd);
+    }
+    elsif ($name eq 'gtts') {
+        $cmd = "gtts-cli $tts_engine_opts -o \"$output\" \"$string\"";  # quoted so paths with spaces survive the shell
+        print("> $cmd\n") if $verbose;
+        system($cmd);
     }
 }
 
@@ -326,17 +366,22 @@ sub generateclips {
                     if ($id eq "VOICE_PAUSE") {
                         print("Use distributed $wav\n") if $verbose;
                         copy(dirname($0)."/VOICE_PAUSE.wav", $wav);
+                    } else {
+			voicestring($voice, $wav, $tts_engine_opts, $tts_object);
+			if ($tts_object->{'format'} eq "wav") {
+			    wavtrim($wav, 500, $tts_object);
+			    # 500 seems to be a reasonable default for now
+			}
                     }
-                    else {
-                        voicestring($voice, $wav, $tts_engine_opts, $tts_object);
-                        wavtrim($wav, 500, $tts_object);
-                        # 500 seems to be a reasonable default for now
-                    }
+		    if ($tts_object->{'format'} eq "wav" || $id eq "VOICE_PAUSE") {
+			encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
+		    } else {
+			copy($wav, $mp3);
+		    }
 
-                    encodewav($wav, $mp3, $encoder, $encoder_opts, $tts_object);
                     synchronize($tts_object);
                     if (defined($ENV{'POOL'})) {
-                        copy($mp3, $pool_file);
+			copy($mp3, $pool_file);
                     }
                     unlink($wav);
                 }
-- 
cgit v1.2.3