// File: explorer-monorepo/virtual-banker/backend/tts/elevenlabs-adapter.go
// (The lines above the package clause were a file-browser copy/paste artifact
// — "Files", path, "330 lines", "9.2 KiB", "Go" — converted to this comment
// so the file is valid Go.)
package tts
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// ElevenLabsTTSService integrates with the ElevenLabs TTS REST API.
// Construct it with NewElevenLabsTTSService; the zero value is not usable
// (the Synthesize* methods reject an empty apiKey/voiceID).
type ElevenLabsTTSService struct {
apiKey string // ElevenLabs API key, sent as the xi-api-key header
voiceID string // voice identifier, embedded in the request URL path
modelID string // TTS model ID, e.g. "eleven_multilingual_v2"
baseURL string // API base URL, e.g. "https://api.elevenlabs.io/v1"
httpClient *http.Client // shared HTTP client (configured with a timeout)
defaultVoiceConfig *VoiceConfig // fallback voice settings when callers pass nil
}
// VoiceConfig holds ElevenLabs voice configuration, serialized as the
// voice_settings object of a text-to-speech request.
// Values are presumably in the 0.0–1.0 range per the ElevenLabs API; they
// are not validated here — TODO confirm against the API reference.
type VoiceConfig struct {
Stability float64 `json:"stability"`
SimilarityBoost float64 `json:"similarity_boost"`
Style float64 `json:"style,omitempty"`
UseSpeakerBoost bool `json:"use_speaker_boost,omitempty"`
}
// ElevenLabsRequest represents the JSON request body for the ElevenLabs
// text-to-speech endpoints.
// NOTE(review): `omitempty` on VoiceSettings has no effect — encoding/json
// ignores omitempty for struct-typed fields, so voice_settings is always
// emitted. Harmless here because callers always populate it.
type ElevenLabsRequest struct {
Text string `json:"text"`
ModelID string `json:"model_id,omitempty"`
VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
}
// NewElevenLabsTTSService creates a new ElevenLabs TTS service
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
return &ElevenLabsTTSService{
apiKey: apiKey,
voiceID: voiceID,
modelID: "eleven_multilingual_v2", // Default model
baseURL: "https://api.elevenlabs.io/v1",
httpClient: &http.Client{
Timeout: 30 * time.Second,
},
defaultVoiceConfig: &VoiceConfig{
Stability: 0.5,
SimilarityBoost: 0.75,
UseSpeakerBoost: true,
},
}
}
// SetModelID sets the model ID used by subsequent synthesis requests.
// There is no synchronization here; do not call it concurrently with
// in-flight Synthesize* calls.
func (s *ElevenLabsTTSService) SetModelID(modelID string) {
s.modelID = modelID
}
// SetVoiceConfig sets the default voice configuration applied when a caller
// passes a nil config to the Synthesize* methods.
// NOTE(review): a nil argument here would later be dereferenced by
// SynthesizeWithConfig — callers should pass a non-nil config.
func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
s.defaultVoiceConfig = config
}
// Synthesize synthesizes text to audio using the ElevenLabs REST API with
// the service's default voice configuration. It returns the raw audio bytes
// (audio/mpeg per the request's Accept header) or an error.
func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
return s.SynthesizeWithConfig(ctx, text, s.defaultVoiceConfig)
}
// SynthesizeWithConfig synthesizes text to audio with a custom voice
// configuration, returning the raw audio bytes (audio/mpeg per the Accept
// header). A nil config falls back to the service's default voice settings.
//
// Transport errors and 5xx responses are retried up to 3 attempts with
// linear backoff (1s, 2s); 4xx responses fail immediately. The backoff
// respects ctx cancellation.
func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
	if s.apiKey == "" {
		return nil, fmt.Errorf("ElevenLabs API key not configured")
	}
	if s.voiceID == "" {
		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
	}
	if text == "" {
		return nil, fmt.Errorf("text cannot be empty")
	}
	// Use the default config if none provided.
	if config == nil {
		config = s.defaultVoiceConfig
	}
	reqBody := ElevenLabsRequest{
		Text:          text,
		ModelID:       s.modelID,
		VoiceSettings: *config,
	}
	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}
	url := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)

	// ctx-aware backoff; returns ctx.Err() if cancelled while waiting.
	wait := func(d time.Duration) error {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(d):
			return nil
		}
	}

	const maxRetries = 3
	var lastErr error
	for i := 0; i < maxRetries; i++ {
		// Build a fresh request for every attempt: the body reader is
		// consumed by httpClient.Do, so reusing a single request would
		// retry with an empty body (this was a bug in the original).
		req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(jsonBody))
		if err != nil {
			return nil, fmt.Errorf("failed to create request: %w", err)
		}
		req.Header.Set("Accept", "audio/mpeg")
		req.Header.Set("Content-Type", "application/json")
		req.Header.Set("xi-api-key", s.apiKey)

		resp, err := s.httpClient.Do(req)
		if err != nil {
			lastErr = err
			if i < maxRetries-1 {
				if werr := wait(time.Duration(i+1) * time.Second); werr != nil {
					return nil, werr
				}
				continue
			}
			return nil, fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, lastErr)
		}

		if resp.StatusCode != http.StatusOK {
			// Read the error payload BEFORE closing the body (the original
			// closed first, so the reported body was always empty).
			bodyBytes, _ := io.ReadAll(resp.Body)
			resp.Body.Close()
			// Retry on 5xx errors only; 4xx will not get better.
			if resp.StatusCode >= 500 && i < maxRetries-1 {
				if werr := wait(time.Duration(i+1) * time.Second); werr != nil {
					return nil, werr
				}
				continue
			}
			return nil, fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
		}

		// Success: read the audio payload and release the connection.
		audioData, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			return nil, fmt.Errorf("failed to read audio data: %w", err)
		}
		return audioData, nil
	}
	// Unreachable in practice (every loop path returns or continues), but
	// keeps the compiler and future edits honest.
	return nil, fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, lastErr)
}
// SynthesizeStream synthesizes text to audio using the ElevenLabs streaming
// API with the service's default voice configuration. The returned reader is
// the HTTP response body; the caller is responsible for closing it.
func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
return s.SynthesizeStreamWithConfig(ctx, text, s.defaultVoiceConfig)
}
// SynthesizeStreamWithConfig synthesizes text to an audio stream with a
// custom voice configuration. A nil config falls back to the service's
// default voice settings.
//
// On success it returns the HTTP response body as an io.Reader; the caller
// is responsible for closing it (it is an io.ReadCloser). Unlike
// SynthesizeWithConfig, this path performs no retries.
func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
	if s.apiKey == "" {
		return nil, fmt.Errorf("ElevenLabs API key not configured")
	}
	if s.voiceID == "" {
		return nil, fmt.Errorf("ElevenLabs voice ID not configured")
	}
	if text == "" {
		return nil, fmt.Errorf("text cannot be empty")
	}
	// Use the default config if none provided.
	if config == nil {
		config = s.defaultVoiceConfig
	}
	reqBody := ElevenLabsRequest{
		Text:          text,
		ModelID:       s.modelID,
		VoiceSettings: *config,
	}
	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}
	// Streaming endpoint: same payload as the blocking call, "/stream" suffix.
	url := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(jsonBody))
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Accept", "audio/mpeg")
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("xi-api-key", s.apiKey)

	resp, err := s.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		// Read the error payload BEFORE closing the body (the original
		// closed first, so the reported body was always empty).
		bodyBytes, _ := io.ReadAll(resp.Body)
		resp.Body.Close()
		return nil, fmt.Errorf("ElevenLabs streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
	}
	// Hand the open stream to the caller, who must close it.
	return resp.Body, nil
}
// GetVisemes returns viseme events for lip sync.
// ElevenLabs does not expose viseme data directly, so the events are
// approximated locally via a simple phoneme-to-viseme mapping; see
// generateVisemesFromText for the heuristic and its limitations.
func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
	// Guard clause: an empty script has no mouth shapes to produce.
	if text == "" {
		return nil, fmt.Errorf("text cannot be empty")
	}
	return s.generateVisemesFromText(text), nil
}
// generateVisemesFromText generates viseme events from text using a basic
// first-character-to-viseme approximation: leading/trailing silence, one
// fixed-duration viseme per word chosen from the word's first letter, and a
// short silence between words. Timing is synthetic (0.3s per word), not
// derived from actual audio.
//
// This is a simplified implementation. For production, consider:
//   - a dedicated phoneme-to-viseme mapping service
//   - a TTS provider that exposes phoneme timing (e.g. Azure TTS with SSML)
//   - integration with a speech analysis library
func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
	// Basic phoneme-to-viseme mapping.
	// NOTE(review): only single-character keys can ever match, because the
	// lookup below uses the word's first character; the multi-character keys
	// ("aa", "th", "sh", ...) are retained to document the intended phoneme
	// inventory but are currently unreachable.
	phonemeToViseme := map[string]string{
		// Vowels
		"aa": "aa", "ae": "aa", "ah": "aa", "ao": "aa", "aw": "aa",
		"ay": "aa", "eh": "ee", "er": "er", "ey": "ee", "ih": "ee",
		"iy": "ee", "ow": "oh", "oy": "oh", "uh": "ou", "uw": "ou",
		// Consonants
		"b": "aa", "p": "aa", "m": "aa",
		"f": "ee", "v": "ee",
		"th": "ee",
		"d": "aa", "t": "aa", "n": "aa", "l": "aa",
		"k": "aa", "g": "aa", "ng": "aa",
		"s": "ee", "z": "ee",
		"sh": "ee", "zh": "ee", "ch": "ee", "jh": "ee",
		"y": "ee",
		"w": "ou",
		"r": "er",
		"h": "sil",
		"sil": "sil", "sp": "sil",
	}

	const (
		durationPerWord = 0.3  // approximate duration per word, seconds
		interWordPause  = 0.05 // short silence between words
		initialPause    = 0.1  // leading silence
		finalPause      = 0.1  // trailing silence
	)

	words := strings.Fields(strings.ToLower(text))
	// Two events per word (viseme + pause) plus leading and trailing silence.
	visemes := make([]VisemeEvent, 0, 2*len(words)+2)
	currentTime := 0.0

	// Initial silence.
	visemes = append(visemes, VisemeEvent{
		Viseme:    "sil",
		StartTime: currentTime,
		EndTime:   currentTime + initialPause,
		Phoneme:   "sil",
	})
	currentTime += initialPause

	for _, word := range words {
		// Approximate the whole word by its first character's viseme,
		// defaulting to the open-mouth "aa" shape when unmapped. (The
		// original had an if/else switch here whose every branch also
		// produced "aa" — dead code, removed.)
		viseme := "aa"
		if len(word) > 0 {
			if mapped, ok := phonemeToViseme[string(word[0])]; ok {
				viseme = mapped
			}
		}
		visemes = append(visemes, VisemeEvent{
			Viseme:    viseme,
			StartTime: currentTime,
			EndTime:   currentTime + durationPerWord,
			Phoneme:   word,
		})
		currentTime += durationPerWord

		// Small pause between words.
		visemes = append(visemes, VisemeEvent{
			Viseme:    "sil",
			StartTime: currentTime,
			EndTime:   currentTime + interWordPause,
			Phoneme:   "sil",
		})
		currentTime += interWordPause
	}

	// Final silence.
	visemes = append(visemes, VisemeEvent{
		Viseme:    "sil",
		StartTime: currentTime,
		EndTime:   currentTime + finalPause,
		Phoneme:   "sil",
	})
	return visemes
}