TTS endpoint: add optional language paramter

blob42 · blob42 · commit 628502f5887d · 2024-04-29T18:19:21.000+02:00
Signed-off-by: blob42 &lt;contact@blob42.xyz&gt;
diff --git a/Makefile b/Makefile
@@ -428,7 +428,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
 .PHONY: protogen-go
 protogen-go:
 	mkdir -p pkg/grpc/proto
-	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
+	protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
     backend/backend.proto
 
 .PHONY: protogen-go-clean
diff --git a/backend/backend.proto b/backend/backend.proto
@@ -263,6 +263,7 @@ message TTSRequest {
   string model = 2;
   string dst = 3;
   string voice = 4;
+  optional string language = 5;
 }
 
 message TokenizationResponse {
diff --git a/backend/python/coqui/coqui_server.py b/backend/python/coqui/coqui_server.py
@@ -67,7 +67,7 @@ def LoadModel(self, request, context):
     def TTS(self, request, context):
         try:
             # if model is multilangual add language from request or env as fallback
-            lang = request.Lang or COQUI_LANGUAGE
+            lang = request.language or COQUI_LANGUAGE
             if self.tts.is_multi_lingual and lang is None:
                return backend_pb2.Result(success=False, message=f"Model is multi-lingual, but no language was provided")
 
diff --git a/core/backend/tts.go b/core/backend/tts.go
@@ -29,7 +29,7 @@ func generateUniqueFileName(dir, baseName, ext string) string {
 	}
 }
 
-func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
+func ModelTTS(backend, text, modelFile, voice string, language string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
 	bb := backend
 	if bb == "" {
 		bb = model.PiperBackend
@@ -83,6 +83,7 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
 		Model: modelPath,
 		Voice: voice,
 		Dst:   filePath,
+		Language: &language,
 	})
 
 	// return RPC error if any
@@ -92,3 +93,9 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
 
 	return filePath, res, err
 }
+
+func ModelTTSInfo(backend, text, modelFile, voice string, language string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
+
+
+	return "", nil, nil
+}
diff --git a/core/cli/tts.go b/core/cli/tts.go
@@ -18,6 +18,7 @@ type TTSCMD struct {
 	Backend           string `short:"b" default:"piper" help:"Backend to run the TTS model"`
 	Model             string `short:"m" required:"" help:"Model name to run the TTS"`
 	Voice             string `short:"v" help:"Voice name to run the TTS"`
+	Language          string `short:"l" help:"Language to use with the TTS"`
 	OutputFile        string `short:"o" type:"path" help:"The path to write the output wav file"`
 	ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
@@ -45,7 +46,7 @@ func (t *TTSCMD) Run(ctx *Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()
 
-	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
+	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
 	if err != nil {
 		return err
 	}
diff --git a/core/http/endpoints/elevenlabs/tts.go b/core/http/endpoints/elevenlabs/tts.go
@@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 
-		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
diff --git a/core/http/endpoints/localai/tts.go b/core/http/endpoints/localai/tts.go
@@ -12,11 +12,13 @@ import (
 )
 
 // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
-// @Summary Generates audio from the input text.
-// @Param request body schema.TTSRequest true "query params"
-// @Success 200 {string} binary	 "Response"
-// @Router /v1/audio/speech [post]
-// @Router /tts [post]
+//	@Summary	Generates audio from the input text.
+//  @Accept json
+//  @Produce audio/x-wav
+//	@Param		request	body		schema.TTSRequest	true	"query params"
+//	@Success	200		{string}	binary				"generated audio/wav file"
+//	@Router		/v1/audio/speech [post]
+//	@Router		/tts [post]
 func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 
@@ -52,7 +54,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 			cfg.Backend = input.Backend
 		}
 
-		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, input.Language, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
diff --git a/core/schema/localai.go b/core/schema/localai.go
@@ -1,59 +1,61 @@
-package schema
-
-import (
-	gopsutil "github.com/shirou/gopsutil/v3/process"
-)
-
-type BackendMonitorRequest struct {
-	Model string `json:"model" yaml:"model"`
-}
-
-type BackendMonitorResponse struct {
-	MemoryInfo    *gopsutil.MemoryInfoStat
-	MemoryPercent float32
-	CPUPercent    float64
-}
-
-type TTSRequest struct {
-	Model   string `json:"model" yaml:"model"`
-	Input   string `json:"input" yaml:"input"`
-	Voice   string `json:"voice" yaml:"voice"`
-	Backend string `json:"backend" yaml:"backend"`
-}
-
-type StoresSet struct {
-	Store string `json:"store,omitempty" yaml:"store,omitempty"`
-
-	Keys   [][]float32 `json:"keys" yaml:"keys"`
-	Values []string    `json:"values" yaml:"values"`
-}
-
-type StoresDelete struct {
-	Store string `json:"store,omitempty" yaml:"store,omitempty"`
-
-	Keys [][]float32 `json:"keys"`
-}
-
-type StoresGet struct {
-	Store string `json:"store,omitempty" yaml:"store,omitempty"`
-
-	Keys [][]float32 `json:"keys" yaml:"keys"`
-}
-
-type StoresGetResponse struct {
-	Keys   [][]float32 `json:"keys" yaml:"keys"`
-	Values []string    `json:"values" yaml:"values"`
-}
-
-type StoresFind struct {
-	Store string `json:"store,omitempty" yaml:"store,omitempty"`
-
-	Key  []float32 `json:"key" yaml:"key"`
-	Topk int       `json:"topk" yaml:"topk"`
-}
-
-type StoresFindResponse struct {
-	Keys         [][]float32 `json:"keys" yaml:"keys"`
-	Values       []string    `json:"values" yaml:"values"`
-	Similarities []float32   `json:"similarities" yaml:"similarities"`
-}
+package schema
+
+import (
+	gopsutil "github.com/shirou/gopsutil/v3/process"
+)
+
+type BackendMonitorRequest struct {
+	Model string `json:"model" yaml:"model"`
+}
+
+type BackendMonitorResponse struct {
+	MemoryInfo    *gopsutil.MemoryInfoStat
+	MemoryPercent float32
+	CPUPercent    float64
+}
+
+// @Descsription TTS request body
+type TTSRequest struct {
+	Model    string `json:"model" yaml:"model"` // model name or full path
+	Input    string `json:"input" yaml:"input"` // text input
+	Voice    string `json:"voice" yaml:"voice"` // voice audio file or speaker id
+	Backend  string `json:"backend" yaml:"backend"`
+	Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
+}
+
+type StoresSet struct {
+	Store string `json:"store,omitempty" yaml:"store,omitempty"`
+
+	Keys   [][]float32 `json:"keys" yaml:"keys"`
+	Values []string    `json:"values" yaml:"values"`
+}
+
+type StoresDelete struct {
+	Store string `json:"store,omitempty" yaml:"store,omitempty"`
+
+	Keys [][]float32 `json:"keys"`
+}
+
+type StoresGet struct {
+	Store string `json:"store,omitempty" yaml:"store,omitempty"`
+
+	Keys [][]float32 `json:"keys" yaml:"keys"`
+}
+
+type StoresGetResponse struct {
+	Keys   [][]float32 `json:"keys" yaml:"keys"`
+	Values []string    `json:"values" yaml:"values"`
+}
+
+type StoresFind struct {
+	Store string `json:"store,omitempty" yaml:"store,omitempty"`
+
+	Key  []float32 `json:"key" yaml:"key"`
+	Topk int       `json:"topk" yaml:"topk"`
+}
+
+type StoresFindResponse struct {
+	Keys         [][]float32 `json:"keys" yaml:"keys"`
+	Values       []string    `json:"values" yaml:"values"`
+	Similarities []float32   `json:"similarities" yaml:"similarities"`
+}

Original file line number	Diff line number	Diff line change
`@@ -263,6 +263,7 @@ message TTSRequest {`
`263`	`263`	`string model = 2;`
`264`	`264`	`string dst = 3;`
`265`	`265`	`string voice = 4;`
	`266`	`+ optional string language = 5;`
`266`	`267`	`}`
`267`	`268`
`268`	`269`	`message TokenizationResponse {`
Original file line number	Diff line number	Diff line change
`@@ -52,7 +52,7 @@ func TTSEndpoint(cl config.BackendConfigLoader, ml model.ModelLoader, appConfi`
`52`	`52`	`}`
`53`	`53`	`log.Debug().Msgf("Request for model: %s", modelFile)`
`54`	`54`
`55`		`- filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)`
	`55`	`+ filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)`
`56`	`56`	`if err != nil {`
`57`	`57`	`return err`
`58`	`58`	`}`