TTS: configurable auth, Health check, Phoenix options; .env.example; Gitea CI workflow

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-10 16:54:10 -08:00
parent b4753cef7e
commit 9839401d1d
8 changed files with 259 additions and 30 deletions
--- a/backend/main.go
+++ b/backend/main.go
@@ -55,9 +55,9 @@ func main() {
 	// Initialize services
 	sessionManager := session.NewManager(db, redisClient)

-	// Initialize ASR/TTS (using mocks for now)
+	// Initialize ASR/TTS
 	asrService := asr.NewMockASRService()
-	ttsService := tts.NewMockTTSService()
+	ttsService := newTTSService()

 	// Initialize LLM (using mock for now)
 	llmGateway := llm.NewMockLLMGateway()
@@ -128,6 +128,28 @@ func main() {
 	log.Println("Server exited")
 }

+// newTTSService builds the TTS service from env. The real client is returned when a voice ID
+// (TTS_VOICE_ID or ELEVENLABS_VOICE_ID) and auth (TTS_API_KEY, ELEVENLABS_API_KEY, or
+// TTS_AUTH_HEADER_VALUE) are set; otherwise the mock. Optional: TTS_BASE_URL, TTS_AUTH_HEADER_NAME.
+// USE_PHOENIX_TTS=true|1 without TTS_BASE_URL uses PHOENIX_TTS_BASE_URL, else a placeholder URL.
+func newTTSService() tts.Service {
+	apiKey := getEnv("TTS_API_KEY", os.Getenv("ELEVENLABS_API_KEY"))
+	voiceID := getEnv("TTS_VOICE_ID", os.Getenv("ELEVENLABS_VOICE_ID"))
+	baseURL := getEnv("TTS_BASE_URL", "")
+	authName := getEnv("TTS_AUTH_HEADER_NAME", "")
+	authValue := getEnv("TTS_AUTH_HEADER_VALUE", "")
+	usePhoenix := getEnv("USE_PHOENIX_TTS", "") == "true" || getEnv("USE_PHOENIX_TTS", "") == "1" // flag accepts "true" or "1"
+	if usePhoenix && baseURL == "" {
+		baseURL = getEnv("PHOENIX_TTS_BASE_URL", "https://phoenix.example.com/tts/v1") // NOTE(review): default is an example.com placeholder, not a real endpoint
+	}
+	hasAuth := apiKey != "" || authValue != "" // either an API key or an explicit header value counts as auth
+	if hasAuth && voiceID != "" {
+		opts := tts.TTSOptions{BaseURL: baseURL, AuthHeaderName: authName, AuthHeaderValue: authValue}
+		return tts.NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts)
+	}
+	return tts.NewMockTTSService() // missing auth or voice ID: fall back to the mock
+}
+
 func getEnv(key, defaultValue string) string {
 	if value := os.Getenv(key); value != "" {
 		return value
--- a/backend/observability/tracing.go
+++ b/backend/observability/tracing.go
@@ -2,7 +2,6 @@ package observability

 import (
 	"context"
-	"fmt"
 )

 // Tracer provides distributed tracing
--- a/backend/tts/README.md
+++ b/backend/tts/README.md
@@ -0,0 +1,86 @@
+# TTS package — ElevenLabs-compatible, Phoenix endpoint swap
+
+This package provides a **text-to-speech client** that matches the [ElevenLabs TTS API](https://elevenlabs.io/docs/api-reference/text-to-speech) contract. You can point it at **ElevenLabs** or at a **Phoenix-hosted** TTS service that implements the same API shape; switching is a config change (base URL), no code change.
+
+**Note:** The repo [eleven-labs/api-service](https://github.com/eleven-labs/api-service) on GitHub is a PHP OpenAPI consumer library, not the voice TTS API. This client targets the **REST TTS API** at `api.elevenlabs.io` (and compatible backends).
+
+---
+
+## Parity with ElevenLabs TTS API
+
+| Feature | ElevenLabs API | This client |
+|--------|----------------|-------------|
+| **Sync** `POST /v1/text-to-speech/:voice_id` | ✅ | ✅ `Synthesize` |
+| **Stream** `POST /v1/text-to-speech/:voice_id/stream` | ✅ | ✅ `SynthesizeStream` |
+| **Voice settings** (stability, similarity_boost, style, speaker_boost) | ✅ | ✅ `VoiceConfig` |
+| **Model** (`model_id`) | ✅ | ✅ `SetModelID` / default `eleven_multilingual_v2` |
+| **Auth** `xi-api-key` header | ✅ | ✅ |
+| **Output** `Accept: audio/mpeg` (mp3) | ✅ | ✅ |
+| **Retries** (5xx, backoff) | — | ✅ on sync |
+| **Visemes** (lip sync) | ❌ (no phoneme API) | ✅ client-side approximation |
+
+Optional ElevenLabs features not used here: `output_format` query, `optimize_streaming_latency`, WebSocket streaming. For “just change endpoint” to Phoenix, the host only needs to implement the same **sync + stream** JSON body and return **audio/mpeg**.
+
+---
+
+## Which TTS backend? (decision table)
+
+| Env / condition | Backend used |
+|----------------|--------------|
+| `TTS_VOICE_ID` unset (or no auth) | **Mock** (no real synthesis) |
+| Voice ID (`TTS_VOICE_ID` or `ELEVENLABS_VOICE_ID`) and API key (`TTS_API_KEY` or `ELEVENLABS_API_KEY`) set, `TTS_BASE_URL` unset | **ElevenLabs** (api.elevenlabs.io) |
+| `TTS_BASE_URL` set (e.g. Phoenix) + auth + voice | **Phoenix** (or other compatible host) |
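+| `USE_PHOENIX_TTS=true` (or `1`) | Prefer Phoenix; uses `TTS_BASE_URL`, else `PHOENIX_TTS_BASE_URL`, else the placeholder `https://phoenix.example.com/tts/v1` (auth + voice are still required, otherwise Mock) |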
+
+Auth: default header is `xi-api-key` (ElevenLabs). For Phoenix with Bearer token set `TTS_AUTH_HEADER_NAME=Authorization` and `TTS_AUTH_HEADER_VALUE=Bearer <token>`.
+
+---
+
+## Using with Phoenix (swap endpoint)
+
+1. **Phoenix TTS service** must expose the same contract:
+   - `POST /v1/text-to-speech/:voice_id` — body: `{"text","model_id","voice_settings"}` → response: raw mp3
+   - `POST /v1/text-to-speech/:voice_id/stream` — same body → response: streaming mp3
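+   - **Health:** `GET {baseURL}/../health` — i.e. the base URL with its last path segment replaced by `health` (for `https://phoenix.example.com/tts/v1` that is `https://phoenix.example.com/tts/health`) — returning 2xx so `tts.Service.Health(ctx)` can be used for readiness.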
+
+2. **Configure the app** with the Phoenix base URL (and optional auth):
+
+   ```bash
+   export TTS_BASE_URL="https://phoenix.example.com/tts/v1"
+   export TTS_VOICE_ID="default-voice-id"
+   # Optional: Phoenix uses Bearer token
+   export TTS_AUTH_HEADER_NAME="Authorization"
+   export TTS_AUTH_HEADER_VALUE="Bearer your-token"
+   # Or feature flag to force Phoenix
+   export USE_PHOENIX_TTS=true
+   export PHOENIX_TTS_BASE_URL="https://phoenix.example.com/tts/v1"
+   ```
+
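+3. **Health check:** The client’s `Health(ctx)` calls `GET {baseURL}/../health` (e.g. `/tts/health` for a base path of `/tts/v1`) when the base URL is not the default ElevenLabs URL, and returns an error on any non-2xx response. Wire this into your readiness probe or a `/ready` endpoint if you need TTS to be up before accepting traffic.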
+
+4. **In code** (e.g. for reuse in another project):
+
+   ```go
+   opts := tts.TTSOptions{
+       BaseURL:         "https://phoenix.example.com/tts/v1",
+       AuthHeaderName:  "Authorization",
+       AuthHeaderValue: "Bearer token",
+   }
+   svc := tts.NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts)
+   if err := svc.Health(ctx); err != nil { /* not ready */ }
+   audio, err := svc.Synthesize(ctx, "Hello world")
+   ```
+
+No code change beyond config: same interface, different base URL and optional auth header.
+
+---
+
+## Reuse across projects
+
+This package lives in **virtual-banker** and can be depended on as a Go module path (e.g. `github.com/your-org/virtual-banker/backend/tts` or via a shared repo). Any project that needs TTS can:
+
+- Depend on this package.
+- Use `tts.Service` and either `NewMockTTSService()` or `NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, baseURL)` / `NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, opts)` for custom auth.
+- Set `baseURL` to ElevenLabs (`""` or `https://api.elevenlabs.io/v1`) or to the Phoenix TTS base URL.
+
+The **interface** (`Synthesize`, `SynthesizeStream`, `GetVisemes`, `Health`) stays the same regardless of backend.
--- a/backend/tts/elevenlabs-adapter.go
+++ b/backend/tts/elevenlabs-adapter.go
@@ -7,20 +7,31 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"net/url"
+	"path"
 	"strings"
 	"time"
 )

-// ElevenLabsTTSService integrates with ElevenLabs TTS API
+// ElevenLabsTTSService integrates with ElevenLabs TTS API or a Phoenix-compatible endpoint
 type ElevenLabsTTSService struct {
-	apiKey      string
-	voiceID     string
-	modelID     string
-	baseURL     string
-	httpClient  *http.Client
+	apiKey             string
+	voiceID            string
+	modelID            string
+	baseURL            string
+	authHeaderName     string // default "xi-api-key" when empty
+	authHeaderValue    string
+	httpClient         *http.Client
 	defaultVoiceConfig *VoiceConfig
 }

+// TTSOptions holds optional overrides for NewElevenLabsTTSServiceWithOptionsFull (e.g. Phoenix auth).
+type TTSOptions struct {
+	BaseURL         string // e.g. "https://phoenix.example.com/tts/v1"; empty = DefaultElevenLabsBaseURL
+	AuthHeaderName  string // e.g. "Authorization"; empty = "xi-api-key"
+	AuthHeaderValue string // e.g. "Bearer token"; empty = apiKey
+}
+
 // VoiceConfig holds ElevenLabs voice configuration
 type VoiceConfig struct {
 	Stability       float64 `json:"stability"`
@@ -36,13 +47,45 @@ type ElevenLabsRequest struct {
 	VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
 }

-// NewElevenLabsTTSService creates a new ElevenLabs TTS service
+// DefaultElevenLabsBaseURL is the ElevenLabs API base URL, used when no base URL override is given.
+const DefaultElevenLabsBaseURL = "https://api.elevenlabs.io/v1"
+
+// NewElevenLabsTTSService creates a TTS service that talks to the default ElevenLabs base URL
+// (it passes an empty base URL to NewElevenLabsTTSServiceWithOptions). To target a Phoenix-hosted
+// ElevenLabs-compatible API instead, call NewElevenLabsTTSServiceWithOptions with that base URL.
 func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
+	return NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, "")
+}
+
+// NewElevenLabsTTSServiceWithOptions creates a TTS service with a configurable base URL.
+// baseURL: if empty, DefaultElevenLabsBaseURL (ElevenLabs) is used. For Phoenix, pass e.g.
+// "https://phoenix.example.com/tts/v1"; only BaseURL is set, so auth header options keep their defaults.
+func NewElevenLabsTTSServiceWithOptions(apiKey, voiceID, baseURL string) *ElevenLabsTTSService {
+	return NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID, TTSOptions{BaseURL: baseURL})
+}
+
+// NewElevenLabsTTSServiceWithOptionsFull creates a TTS service with full options (base URL, auth header).
+// Use for Phoenix when auth differs from ElevenLabs (e.g. Authorization: Bearer <token>).
+func NewElevenLabsTTSServiceWithOptionsFull(apiKey, voiceID string, opts TTSOptions) *ElevenLabsTTSService {
+	baseURL := strings.TrimSuffix(opts.BaseURL, "/")
+	if baseURL == "" {
+		baseURL = DefaultElevenLabsBaseURL
+	}
+	authName := opts.AuthHeaderName
+	if authName == "" {
+		authName = "xi-api-key"
+	}
+	authVal := opts.AuthHeaderValue
+	if authVal == "" {
+		authVal = apiKey
+	}
 	return &ElevenLabsTTSService{
-		apiKey:  apiKey,
-		voiceID: voiceID,
-		modelID: "eleven_multilingual_v2", // Default model
-		baseURL: "https://api.elevenlabs.io/v1",
+		apiKey:          apiKey,
+		voiceID:         voiceID,
+		modelID:         "eleven_multilingual_v2",
+		baseURL:         baseURL,
+		authHeaderName:  authName,
+		authHeaderValue: authVal,
 		httpClient: &http.Client{
 			Timeout: 30 * time.Second,
 		},
@@ -71,8 +114,8 @@ func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]b

 // SynthesizeWithConfig synthesizes text to audio with custom voice configuration
 func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
-	if s.apiKey == "" {
-		return nil, fmt.Errorf("ElevenLabs API key not configured")
+	if s.authHeaderValue == "" && s.apiKey == "" {
+		return nil, fmt.Errorf("TTS API key or auth not configured")
 	}
 	if s.voiceID == "" {
 		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
@@ -109,8 +152,9 @@ func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text st

 	req.Header.Set("Accept", "audio/mpeg")
 	req.Header.Set("Content-Type", "application/json")
-	req.Header.Set("xi-api-key", s.apiKey)
-
+	if s.authHeaderValue != "" {
+		req.Header.Set(s.authHeaderName, s.authHeaderValue)
+	}
 	// Execute request with retry logic
 	var resp *http.Response
 	maxRetries := 3
@@ -131,12 +175,8 @@ func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text st
 		}

 		if resp.StatusCode != http.StatusOK {
+			bodyBytes, _ := io.ReadAll(resp.Body)
 			resp.Body.Close()
-			bodyBytes, _ := io.ReadAll(bytes.NewReader([]byte{}))
-			if resp.Body != nil {
-				bodyBytes, _ = io.ReadAll(resp.Body)
-			}
-			
 			// Retry on 5xx errors
 			if resp.StatusCode >= 500 && i < maxRetries-1 {
 				backoff := time.Duration(i+1) * time.Second
@@ -165,8 +205,8 @@ func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string

 // SynthesizeStreamWithConfig synthesizes text to audio stream with custom voice configuration
 func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
-	if s.apiKey == "" {
-		return nil, fmt.Errorf("ElevenLabs API key not configured")
+	if s.authHeaderValue == "" && s.apiKey == "" {
+		return nil, fmt.Errorf("TTS API key or auth not configured")
 	}
 	if s.voiceID == "" {
 		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
@@ -203,8 +243,9 @@ func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, t

 	req.Header.Set("Accept", "audio/mpeg")
 	req.Header.Set("Content-Type", "application/json")
-	req.Header.Set("xi-api-key", s.apiKey)
-
+	if s.authHeaderValue != "" {
+		req.Header.Set(s.authHeaderName, s.authHeaderValue)
+	}
 	// Execute request
 	resp, err := s.httpClient.Do(req)
 	if err != nil {
@@ -212,15 +253,44 @@ func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, t
 	}

 	if resp.StatusCode != http.StatusOK {
-		resp.Body.Close()
 		bodyBytes, _ := io.ReadAll(resp.Body)
-		return nil, fmt.Errorf("ElevenLabs streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
+		resp.Body.Close()
+		return nil, fmt.Errorf("TTS streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
 	}

 	// Return stream reader (caller is responsible for closing)
 	return resp.Body, nil
 }

+// Health sends GET to the base URL with its last path segment replaced by "health" (e.g. /tts/v1 -> /tts/health); a transport error or non-2xx status is returned as an error.
+// When the base URL equals DefaultElevenLabsBaseURL no request is made and nil is returned.
+func (s *ElevenLabsTTSService) Health(ctx context.Context) error {
+	if s.baseURL == DefaultElevenLabsBaseURL {
+		return nil // ElevenLabs has no public health; skip to avoid unnecessary calls
+	}
+	u, err := url.Parse(s.baseURL)
+	if err != nil {
+		return fmt.Errorf("TTS base URL invalid: %w", err)
+	}
+	u.Path = path.Join(path.Dir(u.Path), "health") // swap the last path segment for "health"
+	req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
+	if err != nil {
+		return err
+	}
+	if s.authHeaderValue != "" { // send the same auth header the synthesis requests use
+		req.Header.Set(s.authHeaderName, s.authHeaderValue)
+	}
+	resp, err := s.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("TTS health check failed: %w", err)
+	}
+	resp.Body.Close() // body is not inspected; only the status code is checked below
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return fmt.Errorf("TTS health returned status %d", resp.StatusCode)
+	}
+	return nil
+}
+
 // GetVisemes returns viseme events for lip sync
 // ElevenLabs doesn't provide viseme data directly, so we use phoneme-to-viseme mapping
 func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
--- a/backend/tts/service.go
+++ b/backend/tts/service.go
@@ -2,7 +2,6 @@ package tts

 import (
 	"context"
-	"fmt"
 	"io"
 )

@@ -11,6 +10,8 @@ type Service interface {
 	SynthesizeStream(ctx context.Context, text string) (io.Reader, error)
 	Synthesize(ctx context.Context, text string) ([]byte, error)
 	GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error)
+	// Health checks connectivity to the TTS backend (e.g. Phoenix /health). No-op for mocks.
+	Health(ctx context.Context) error
 }

 // VisemeEvent represents a viseme (lip shape) event for lip sync
@@ -52,6 +53,9 @@ func (s *MockTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeE
 	}, nil
 }

+// Health always returns nil; the mock has no backend to probe.
+func (s *MockTTSService) Health(ctx context.Context) error { return nil }
+
 // ElevenLabsTTSService integrates with ElevenLabs (implementation in elevenlabs-adapter.go)
 // This interface definition is kept for backwards compatibility
 // The actual implementation is in elevenlabs-adapter.go