From b99182c8d4424cb1f77bbc36628af432525ef7ee Mon Sep 17 00:00:00 2001 From: Chakib Benziane Date: Sat, 1 Jun 2024 20:26:27 +0200 Subject: [PATCH] TTS API improvements (#2308) * update doc on COQUI_LANGUAGE env variable Signed-off-by: blob42 * return errors from tts gRPC backend Signed-off-by: blob42 * handle speaker_id and language in coqui TTS backend Signed-off-by: blob42 * TTS endpoint: add optional language parameter Signed-off-by: blob42 * tts fix: empty language string breaks non-multilingual models Signed-off-by: blob42 * allow tts param definition in config file - consolidate TTS options under `tts` config entry Signed-off-by: blob42 * tts: update doc Signed-off-by: blob42 --------- Signed-off-by: blob42 Co-authored-by: Ettore Di Giacinto --- Makefile | 2 +- backend/backend.proto | 1 + backend/python/coqui/backend.py | 16 ++- core/backend/tts.go | 17 ++- core/cli/tts.go | 3 +- core/config/backend_config.go | 13 ++- core/http/endpoints/elevenlabs/tts.go | 2 +- core/http/endpoints/localai/tts.go | 22 +++- core/schema/localai.go | 120 ++++++++++---------- docs/content/docs/features/text-to-audio.md | 48 ++++++-- 10 files changed, 166 insertions(+), 78 deletions(-) diff --git a/Makefile b/Makefile index 20a5f2e0001..71ce394f430 100644 --- a/Makefile +++ b/Makefile @@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean .PHONY: protogen-go protogen-go: mkdir -p pkg/grpc/proto - protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \ + protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \ backend/backend.proto .PHONY: protogen-go-clean diff --git a/backend/backend.proto b/backend/backend.proto index cb87fe02d46..aec0c00e74e 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -266,6 +266,7 @@ message TTSRequest { string 
model = 2; string dst = 3; string voice = 4; + optional string language = 5; } message TokenizationResponse { diff --git a/backend/python/coqui/backend.py b/backend/python/coqui/backend.py index c6432208f5e..02ab56f4a58 100644 --- a/backend/python/coqui/backend.py +++ b/backend/python/coqui/backend.py @@ -66,7 +66,21 @@ def LoadModel(self, request, context): def TTS(self, request, context): try: - self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst) + # if model is multilingual add language from request or env as fallback + lang = request.language or COQUI_LANGUAGE + if lang == "": + lang = None + if self.tts.is_multi_lingual and lang is None: + return backend_pb2.Result(success=False, message=f"Model is multi-lingual, but no language was provided") + + # if model is multi-speaker, use speaker_wav or the speaker_id from request.voice + if self.tts.is_multi_speaker and self.AudioPath is None and request.voice is None: + return backend_pb2.Result(success=False, message=f"Model is multi-speaker, but no speaker was provided") + + if self.tts.is_multi_speaker and request.voice is not None: + self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst) + else: + self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst) except Exception as err: return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") return backend_pb2.Result(success=True) diff --git a/core/backend/tts.go b/core/backend/tts.go index 4532cf00adb..b1c23ebb3e5 100644 --- a/core/backend/tts.go +++ b/core/backend/tts.go @@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string { } } -func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) { +func ModelTTS( + backend, + text, 
modelFile, + voice , + language string, + loader *model.ModelLoader, + appConfig *config.ApplicationConfig, + backendConfig config.BackendConfig, +) (string, *proto.Result, error) { bb := backend if bb == "" { bb = model.PiperBackend @@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, Model: modelPath, Voice: voice, Dst: filePath, + Language: &language, }) + // return RPC error if any + if !res.Success { + return "", nil, fmt.Errorf(res.Message) + } + return filePath, res, err } diff --git a/core/cli/tts.go b/core/cli/tts.go index 8b54ed28111..cbba0fc5fb7 100644 --- a/core/cli/tts.go +++ b/core/cli/tts.go @@ -20,6 +20,7 @@ type TTSCMD struct { Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"` Model string `short:"m" required:"" help:"Model name to run the TTS"` Voice string `short:"v" help:"Voice name to run the TTS"` + Language string `short:"l" help:"Language to use with the TTS"` OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"` ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"` BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"` @@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error { options := config.BackendConfig{} options.SetDefaults() - filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options) + filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options) if err != nil { return err } diff --git a/core/config/backend_config.go b/core/config/backend_config.go index eda66360305..1ca11716550 100644 --- a/core/config/backend_config.go +++ 
b/core/config/backend_config.go @@ -15,6 +15,15 @@ const ( RAND_SEED = -1 ) +type TTSConfig struct { + + // Voice wav path or id + Voice string `yaml:"voice"` + + // Vall-e-x + VallE VallE `yaml:"vall-e"` +} + type BackendConfig struct { schema.PredictionOptions `yaml:"parameters"` Name string `yaml:"name"` @@ -49,8 +58,8 @@ type BackendConfig struct { // GRPC Options GRPC GRPC `yaml:"grpc"` - // Vall-e-x - VallE VallE `yaml:"vall-e"` + // TTS specifics + TTSConfig `yaml:"tts"` // CUDA // Explicitly enable CUDA or not (some backends might need it) diff --git a/core/http/endpoints/elevenlabs/tts.go b/core/http/endpoints/elevenlabs/tts.go index 841f9b5f784..e7bfe0f7bbf 100644 --- a/core/http/endpoints/elevenlabs/tts.go +++ b/core/http/endpoints/elevenlabs/tts.go @@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi } log.Debug().Msgf("Request for model: %s", modelFile) - filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg) + filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/http/endpoints/localai/tts.go b/core/http/endpoints/localai/tts.go index 7822e0242c2..4e5a1b5b16d 100644 --- a/core/http/endpoints/localai/tts.go +++ b/core/http/endpoints/localai/tts.go @@ -12,10 +12,13 @@ import ( ) // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech -// @Summary Generates audio from the input text. -// @Param request body schema.TTSRequest true "query params" -// @Success 200 {string} binary "Response" -// @Router /v1/audio/speech [post] +// @Summary Generates audio from the input text. 
+// @Accept json +// @Produce audio/x-wav +// @Param request body schema.TTSRequest true "query params" +// @Success 200 {string} binary "generated audio/wav file" +// @Router /v1/audio/speech [post] +// @Router /tts [post] func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { @@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi ) if err != nil { + log.Err(err) modelFile = input.Model log.Warn().Msgf("Model not found in context: %s", input.Model) } else { @@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi cfg.Backend = input.Backend } - filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg) + if input.Language != "" { + cfg.Language = input.Language + } + + if input.Voice != "" { + cfg.Voice = input.Voice + } + + filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/schema/localai.go b/core/schema/localai.go index e9b61cf3d50..9bbfe28b5e4 100644 --- a/core/schema/localai.go +++ b/core/schema/localai.go @@ -1,59 +1,61 @@ -package schema - -import ( - gopsutil "github.com/shirou/gopsutil/v3/process" -) - -type BackendMonitorRequest struct { - Model string `json:"model" yaml:"model"` -} - -type BackendMonitorResponse struct { - MemoryInfo *gopsutil.MemoryInfoStat - MemoryPercent float32 - CPUPercent float64 -} - -type TTSRequest struct { - Model string `json:"model" yaml:"model"` - Input string `json:"input" yaml:"input"` - Voice string `json:"voice" yaml:"voice"` - Backend string `json:"backend" yaml:"backend"` -} - -type StoresSet struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Keys [][]float32 `json:"keys" yaml:"keys"` - Values []string `json:"values" yaml:"values"` 
-} - -type StoresDelete struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Keys [][]float32 `json:"keys"` -} - -type StoresGet struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Keys [][]float32 `json:"keys" yaml:"keys"` -} - -type StoresGetResponse struct { - Keys [][]float32 `json:"keys" yaml:"keys"` - Values []string `json:"values" yaml:"values"` -} - -type StoresFind struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Key []float32 `json:"key" yaml:"key"` - Topk int `json:"topk" yaml:"topk"` -} - -type StoresFindResponse struct { - Keys [][]float32 `json:"keys" yaml:"keys"` - Values []string `json:"values" yaml:"values"` - Similarities []float32 `json:"similarities" yaml:"similarities"` -} +package schema + +import ( + gopsutil "github.com/shirou/gopsutil/v3/process" +) + +type BackendMonitorRequest struct { + Model string `json:"model" yaml:"model"` +} + +type BackendMonitorResponse struct { + MemoryInfo *gopsutil.MemoryInfoStat + MemoryPercent float32 + CPUPercent float64 +} + +// @Description TTS request body +type TTSRequest struct { + Model string `json:"model" yaml:"model"` // model name or full path + Input string `json:"input" yaml:"input"` // text input + Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id + Backend string `json:"backend" yaml:"backend"` + Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model +} + +type StoresSet struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Keys [][]float32 `json:"keys" yaml:"keys"` + Values []string `json:"values" yaml:"values"` +} + +type StoresDelete struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Keys [][]float32 `json:"keys"` +} + +type StoresGet struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Keys [][]float32 `json:"keys" yaml:"keys"` +} + +type StoresGetResponse struct { 
+ Keys [][]float32 `json:"keys" yaml:"keys"` + Values []string `json:"values" yaml:"values"` +} + +type StoresFind struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Key []float32 `json:"key" yaml:"key"` + Topk int `json:"topk" yaml:"topk"` +} + +type StoresFindResponse struct { + Keys [][]float32 `json:"keys" yaml:"keys"` + Values []string `json:"values" yaml:"values"` + Similarities []float32 `json:"similarities" yaml:"similarities"` +} diff --git a/docs/content/docs/features/text-to-audio.md b/docs/content/docs/features/text-to-audio.md index ebfdda1d0e9..0e82f7f07ba 100644 --- a/docs/content/docs/features/text-to-audio.md +++ b/docs/content/docs/features/text-to-audio.md @@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur }' ``` +You can use the env variable COQUI_LANGUAGE to set the language used by the coqui backend. + +You can also use config files to configure tts models (see section below on how to use config files). + ### Bark [Bark](https://github.com/suno-ai/bark) allows to generate audio from text prompts. @@ -148,11 +152,12 @@ name: cloned-voice backend: vall-e-x parameters: model: "cloned-voice" -vall-e: - # The path to the audio file to be cloned - # relative to the models directory - # Max 15s - audio_path: "audio-sample.wav" +tts: + vall-e: + # The path to the audio file to be cloned + # relative to the models directory + # Max 15s + audio_path: "audio-sample.wav" ``` Then you can specify the model name in the requests: @@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ }' | aplay ``` -## Parler-tts +### Parler-tts + +`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts + + +## Using config files + +You can also use a `config-file` to specify TTS models and their parameters. 
+ +In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language. + +```yaml + +name: xtts_v2 +backend: coqui +parameters: + language: fr + model: tts_models/multilingual/multi-dataset/xtts_v2 + +tts: + voice: Ana Florence +``` -`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts \ No newline at end of file +With this config, you can now use the following curl command to generate a text-to-speech audio file: +```bash +curl -L http://localhost:8080/tts \ + -H "Content-Type: application/json" \ + -d '{ +"model": "xtts_v2", +"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?" +}' | aplay +```