package tts
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// ElevenLabsTTSService integrates with the ElevenLabs text-to-speech REST API.
// Construct it with NewElevenLabsTTSService; the zero value is not usable
// (Synthesize* methods reject an empty apiKey/voiceID).
type ElevenLabsTTSService struct {
	apiKey             string       // "xi-api-key" header value; required for all API calls
	voiceID            string       // ElevenLabs voice identifier, interpolated into the request URL
	modelID            string       // synthesis model, e.g. "eleven_multilingual_v2" (see SetModelID)
	baseURL            string       // API root, normally "https://api.elevenlabs.io/v1"
	httpClient         *http.Client // shared client; configured with a request timeout by the constructor
	defaultVoiceConfig *VoiceConfig // voice settings used when a call passes a nil config
}
|
|
|
|
// VoiceConfig holds ElevenLabs voice configuration. It is serialized as the
// "voice_settings" object of a synthesis request.
type VoiceConfig struct {
	// Stability controls consistency of the generated voice.
	Stability float64 `json:"stability"`
	// SimilarityBoost controls adherence to the original voice.
	SimilarityBoost float64 `json:"similarity_boost"`
	// Style is the style exaggeration setting; omitted from JSON when zero.
	Style float64 `json:"style,omitempty"`
	// UseSpeakerBoost toggles speaker boost; omitted from JSON when false.
	UseSpeakerBoost bool `json:"use_speaker_boost,omitempty"`
}
|
|
|
|
// ElevenLabsRequest represents the JSON request body for the ElevenLabs
// text-to-speech endpoints (/text-to-speech/{voice_id} and .../stream).
type ElevenLabsRequest struct {
	// Text is the text to synthesize.
	Text string `json:"text"`
	// ModelID selects the synthesis model; omitted when empty.
	ModelID string `json:"model_id,omitempty"`
	// VoiceSettings carries the per-request voice tuning.
	// NOTE(review): omitempty on a struct value has no effect — the object is
	// always serialized; harmless, but the tag is misleading.
	VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
}
|
|
|
|
// NewElevenLabsTTSService creates a new ElevenLabs TTS service
|
|
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
|
|
return &ElevenLabsTTSService{
|
|
apiKey: apiKey,
|
|
voiceID: voiceID,
|
|
modelID: "eleven_multilingual_v2", // Default model
|
|
baseURL: "https://api.elevenlabs.io/v1",
|
|
httpClient: &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
},
|
|
defaultVoiceConfig: &VoiceConfig{
|
|
Stability: 0.5,
|
|
SimilarityBoost: 0.75,
|
|
UseSpeakerBoost: true,
|
|
},
|
|
}
|
|
}
|
|
|
|
// SetModelID sets the model ID for synthesis
|
|
func (s *ElevenLabsTTSService) SetModelID(modelID string) {
|
|
s.modelID = modelID
|
|
}
|
|
|
|
// SetVoiceConfig sets the default voice configuration
|
|
func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
|
|
s.defaultVoiceConfig = config
|
|
}
|
|
|
|
// Synthesize synthesizes text to audio using ElevenLabs REST API
|
|
func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
|
|
return s.SynthesizeWithConfig(ctx, text, s.defaultVoiceConfig)
|
|
}
|
|
|
|
// SynthesizeWithConfig synthesizes text to audio with custom voice configuration
|
|
func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
|
|
if s.apiKey == "" {
|
|
return nil, fmt.Errorf("ElevenLabs API key not configured")
|
|
}
|
|
if s.voiceID == "" {
|
|
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
|
}
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use default config if none provided
|
|
if config == nil {
|
|
config = s.defaultVoiceConfig
|
|
}
|
|
|
|
// Prepare request body
|
|
reqBody := ElevenLabsRequest{
|
|
Text: text,
|
|
ModelID: s.modelID,
|
|
VoiceSettings: *config,
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
// Build request URL
|
|
url := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)
|
|
|
|
// Create HTTP request
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Accept", "audio/mpeg")
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("xi-api-key", s.apiKey)
|
|
|
|
// Execute request with retry logic
|
|
var resp *http.Response
|
|
maxRetries := 3
|
|
for i := 0; i < maxRetries; i++ {
|
|
resp, err = s.httpClient.Do(req)
|
|
if err == nil && resp.StatusCode == http.StatusOK {
|
|
break
|
|
}
|
|
|
|
if err != nil {
|
|
if i < maxRetries-1 {
|
|
// Exponential backoff
|
|
backoff := time.Duration(i+1) * time.Second
|
|
time.Sleep(backoff)
|
|
continue
|
|
}
|
|
return nil, fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
resp.Body.Close()
|
|
bodyBytes, _ := io.ReadAll(bytes.NewReader([]byte{}))
|
|
if resp.Body != nil {
|
|
bodyBytes, _ = io.ReadAll(resp.Body)
|
|
}
|
|
|
|
// Retry on 5xx errors
|
|
if resp.StatusCode >= 500 && i < maxRetries-1 {
|
|
backoff := time.Duration(i+1) * time.Second
|
|
time.Sleep(backoff)
|
|
continue
|
|
}
|
|
|
|
return nil, fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
|
}
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Read audio data
|
|
audioData, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read audio data: %w", err)
|
|
}
|
|
|
|
return audioData, nil
|
|
}
|
|
|
|
// SynthesizeStream synthesizes text to audio using ElevenLabs streaming API
|
|
func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
|
|
return s.SynthesizeStreamWithConfig(ctx, text, s.defaultVoiceConfig)
|
|
}
|
|
|
|
// SynthesizeStreamWithConfig synthesizes text to audio stream with custom voice configuration
|
|
func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
|
|
if s.apiKey == "" {
|
|
return nil, fmt.Errorf("ElevenLabs API key not configured")
|
|
}
|
|
if s.voiceID == "" {
|
|
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
|
}
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use default config if none provided
|
|
if config == nil {
|
|
config = s.defaultVoiceConfig
|
|
}
|
|
|
|
// Prepare request body
|
|
reqBody := ElevenLabsRequest{
|
|
Text: text,
|
|
ModelID: s.modelID,
|
|
VoiceSettings: *config,
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
// Build request URL for streaming
|
|
url := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)
|
|
|
|
// Create HTTP request
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Accept", "audio/mpeg")
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("xi-api-key", s.apiKey)
|
|
|
|
// Execute request
|
|
resp, err := s.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
resp.Body.Close()
|
|
bodyBytes, _ := io.ReadAll(resp.Body)
|
|
return nil, fmt.Errorf("ElevenLabs streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
|
}
|
|
|
|
// Return stream reader (caller is responsible for closing)
|
|
return resp.Body, nil
|
|
}
|
|
|
|
// GetVisemes returns viseme events for lip sync
|
|
// ElevenLabs doesn't provide viseme data directly, so we use phoneme-to-viseme mapping
|
|
func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use phoneme-to-viseme mapping to generate viseme events
|
|
// This is a simplified implementation - in production, you might want to use
|
|
// a more sophisticated phoneme-to-viseme mapping service or library
|
|
visemes := s.generateVisemesFromText(text)
|
|
|
|
return visemes, nil
|
|
}
|
|
|
|
// generateVisemesFromText generates viseme events from text using basic phoneme-to-viseme mapping
|
|
// This is a simplified implementation. For production, consider using:
|
|
// - A dedicated phoneme-to-viseme mapping service
|
|
// - A TTS provider that provides phoneme timing data (e.g., Azure TTS with SSML)
|
|
// - Integration with a speech analysis library
|
|
func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
|
|
// Basic phoneme-to-viseme mapping
|
|
phonemeToViseme := map[string]string{
|
|
// Vowels
|
|
"aa": "aa", "ae": "aa", "ah": "aa", "ao": "aa", "aw": "aa",
|
|
"ay": "aa", "eh": "ee", "er": "er", "ey": "ee", "ih": "ee",
|
|
"iy": "ee", "ow": "oh", "oy": "oh", "uh": "ou", "uw": "ou",
|
|
// Consonants
|
|
"b": "aa", "p": "aa", "m": "aa",
|
|
"f": "ee", "v": "ee",
|
|
"th": "ee",
|
|
"d": "aa", "t": "aa", "n": "aa", "l": "aa",
|
|
"k": "aa", "g": "aa", "ng": "aa",
|
|
"s": "ee", "z": "ee",
|
|
"sh": "ee", "zh": "ee", "ch": "ee", "jh": "ee",
|
|
"y": "ee",
|
|
"w": "ou",
|
|
"r": "er",
|
|
"h": "sil",
|
|
"sil": "sil", "sp": "sil",
|
|
}
|
|
|
|
// Simple word-to-phoneme approximation
|
|
// In production, use a proper TTS API that provides phoneme timing or a phoneme-to-viseme service
|
|
words := strings.Fields(strings.ToLower(text))
|
|
visemes := []VisemeEvent{}
|
|
currentTime := 0.0
|
|
durationPerWord := 0.3 // Approximate duration per word in seconds
|
|
initialPause := 0.1
|
|
|
|
// Initial silence
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + initialPause,
|
|
Phoneme: "sil",
|
|
})
|
|
currentTime += initialPause
|
|
|
|
// Generate visemes for each word
|
|
for _, word := range words {
|
|
// Simple approximation: map first phoneme to viseme
|
|
viseme := "aa" // default
|
|
if len(word) > 0 {
|
|
firstChar := string(word[0])
|
|
if mapped, ok := phonemeToViseme[firstChar]; ok {
|
|
viseme = mapped
|
|
} else {
|
|
// Map common starting consonants
|
|
switch firstChar {
|
|
case "a", "e", "i", "o", "u":
|
|
viseme = "aa"
|
|
default:
|
|
viseme = "aa"
|
|
}
|
|
}
|
|
}
|
|
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: viseme,
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + durationPerWord,
|
|
Phoneme: word,
|
|
})
|
|
currentTime += durationPerWord
|
|
|
|
// Small pause between words
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + 0.05,
|
|
Phoneme: "sil",
|
|
})
|
|
currentTime += 0.05
|
|
}
|
|
|
|
// Final silence
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + 0.1,
|
|
Phoneme: "sil",
|
|
})
|
|
|
|
return visemes
|
|
}
|