package tts
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// ElevenLabsTTSService integrates with the ElevenLabs text-to-speech REST API.
// Construct it with NewElevenLabsTTSService; the zero value is not usable
// (Synthesize* methods reject an empty apiKey/voiceID).
type ElevenLabsTTSService struct {
	apiKey             string       // "xi-api-key" header value; required for all API calls
	voiceID            string       // ElevenLabs voice identifier, interpolated into the request URL
	modelID            string       // synthesis model, e.g. "eleven_multilingual_v2" (see SetModelID)
	baseURL            string       // API root, normally "https://api.elevenlabs.io/v1"
	httpClient         *http.Client // shared client; configured with a request timeout by the constructor
	defaultVoiceConfig *VoiceConfig // voice settings used when a call passes a nil config
}
|
|
|
|
// VoiceConfig holds ElevenLabs voice configuration. It is serialized as the
// "voice_settings" object of a synthesis request.
type VoiceConfig struct {
	// Stability controls consistency of the generated voice.
	Stability float64 `json:"stability"`
	// SimilarityBoost controls adherence to the original voice.
	SimilarityBoost float64 `json:"similarity_boost"`
	// Style is the style exaggeration setting; omitted from JSON when zero.
	Style float64 `json:"style,omitempty"`
	// UseSpeakerBoost toggles speaker boost; omitted from JSON when false.
	UseSpeakerBoost bool `json:"use_speaker_boost,omitempty"`
}
|
|
|
|
// ElevenLabsRequest represents the JSON request body for the ElevenLabs
// text-to-speech endpoints (/text-to-speech/{voice_id} and .../stream).
type ElevenLabsRequest struct {
	// Text is the text to synthesize.
	Text string `json:"text"`
	// ModelID selects the synthesis model; omitted when empty.
	ModelID string `json:"model_id,omitempty"`
	// VoiceSettings carries the per-request voice tuning.
	// NOTE(review): omitempty on a struct value has no effect — the object is
	// always serialized; harmless, but the tag is misleading.
	VoiceSettings VoiceConfig `json:"voice_settings,omitempty"`
}
|
|
|
|
// NewElevenLabsTTSService creates a new ElevenLabs TTS service
|
|
func NewElevenLabsTTSService(apiKey, voiceID string) *ElevenLabsTTSService {
|
|
return &ElevenLabsTTSService{
|
|
apiKey: apiKey,
|
|
voiceID: voiceID,
|
|
modelID: "eleven_multilingual_v2", // Default model
|
|
baseURL: "https://api.elevenlabs.io/v1",
|
|
httpClient: &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
},
|
|
defaultVoiceConfig: &VoiceConfig{
|
|
Stability: 0.5,
|
|
SimilarityBoost: 0.75,
|
|
UseSpeakerBoost: true,
|
|
},
|
|
}
|
|
}
|
|
|
|
// SetModelID sets the model ID for synthesis
|
|
func (s *ElevenLabsTTSService) SetModelID(modelID string) {
|
|
s.modelID = modelID
|
|
}
|
|
|
|
// SetVoiceConfig sets the default voice configuration
|
|
func (s *ElevenLabsTTSService) SetVoiceConfig(config *VoiceConfig) {
|
|
s.defaultVoiceConfig = config
|
|
}
|
|
|
|
// Synthesize synthesizes text to audio using ElevenLabs REST API
|
|
func (s *ElevenLabsTTSService) Synthesize(ctx context.Context, text string) ([]byte, error) {
|
|
return s.SynthesizeWithConfig(ctx, text, s.defaultVoiceConfig)
|
|
}
|
|
|
|
// SynthesizeWithConfig synthesizes text to audio with custom voice configuration
|
|
func (s *ElevenLabsTTSService) SynthesizeWithConfig(ctx context.Context, text string, config *VoiceConfig) ([]byte, error) {
|
|
if s.apiKey == "" {
|
|
return nil, fmt.Errorf("ElevenLabs API key not configured")
|
|
}
|
|
if s.voiceID == "" {
|
|
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
|
}
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use default config if none provided
|
|
if config == nil {
|
|
config = s.defaultVoiceConfig
|
|
}
|
|
|
|
// Prepare request body
|
|
reqBody := ElevenLabsRequest{
|
|
Text: text,
|
|
ModelID: s.modelID,
|
|
VoiceSettings: *config,
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
// Build request URL
|
|
url := fmt.Sprintf("%s/text-to-speech/%s", s.baseURL, s.voiceID)
|
|
|
|
// Create HTTP request
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Accept", "audio/mpeg")
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("xi-api-key", s.apiKey)
|
|
|
|
// Execute request with retry logic
|
|
var resp *http.Response
|
|
maxRetries := 3
|
|
for i := 0; i < maxRetries; i++ {
|
|
resp, err = s.httpClient.Do(req)
|
|
if err == nil && resp.StatusCode == http.StatusOK {
|
|
break
|
|
}
|
|
|
|
if err != nil {
|
|
if i < maxRetries-1 {
|
|
// Exponential backoff
|
|
backoff := time.Duration(i+1) * time.Second
|
|
time.Sleep(backoff)
|
|
continue
|
|
}
|
|
return nil, fmt.Errorf("failed to call ElevenLabs API after %d retries: %w", maxRetries, err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
resp.Body.Close()
|
|
bodyBytes, _ := io.ReadAll(bytes.NewReader([]byte{}))
|
|
if resp.Body != nil {
|
|
bodyBytes, _ = io.ReadAll(resp.Body)
|
|
}
|
|
|
|
// Retry on 5xx errors
|
|
if resp.StatusCode >= 500 && i < maxRetries-1 {
|
|
backoff := time.Duration(i+1) * time.Second
|
|
time.Sleep(backoff)
|
|
continue
|
|
}
|
|
|
|
return nil, fmt.Errorf("ElevenLabs API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
|
}
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Read audio data
|
|
audioData, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read audio data: %w", err)
|
|
}
|
|
|
|
return audioData, nil
|
|
}
|
|
|
|
// SynthesizeStream synthesizes text to audio using ElevenLabs streaming API
|
|
func (s *ElevenLabsTTSService) SynthesizeStream(ctx context.Context, text string) (io.Reader, error) {
|
|
return s.SynthesizeStreamWithConfig(ctx, text, s.defaultVoiceConfig)
|
|
}
|
|
|
|
// SynthesizeStreamWithConfig synthesizes text to audio stream with custom voice configuration
|
|
func (s *ElevenLabsTTSService) SynthesizeStreamWithConfig(ctx context.Context, text string, config *VoiceConfig) (io.Reader, error) {
|
|
if s.apiKey == "" {
|
|
return nil, fmt.Errorf("ElevenLabs API key not configured")
|
|
}
|
|
if s.voiceID == "" {
|
|
return nil, fmt.Errorf("ElevenLabs voice ID not configured")
|
|
}
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use default config if none provided
|
|
if config == nil {
|
|
config = s.defaultVoiceConfig
|
|
}
|
|
|
|
// Prepare request body
|
|
reqBody := ElevenLabsRequest{
|
|
Text: text,
|
|
ModelID: s.modelID,
|
|
VoiceSettings: *config,
|
|
}
|
|
|
|
jsonBody, err := json.Marshal(reqBody)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
|
}
|
|
|
|
// Build request URL for streaming
|
|
url := fmt.Sprintf("%s/text-to-speech/%s/stream", s.baseURL, s.voiceID)
|
|
|
|
// Create HTTP request
|
|
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonBody))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
req.Header.Set("Accept", "audio/mpeg")
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("xi-api-key", s.apiKey)
|
|
|
|
// Execute request
|
|
resp, err := s.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to call ElevenLabs streaming API: %w", err)
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
resp.Body.Close()
|
|
bodyBytes, _ := io.ReadAll(resp.Body)
|
|
return nil, fmt.Errorf("ElevenLabs streaming API error: status %d, body: %s", resp.StatusCode, string(bodyBytes))
|
|
}
|
|
|
|
// Return stream reader (caller is responsible for closing)
|
|
return resp.Body, nil
|
|
}
|
|
|
|
// GetVisemes returns viseme events for lip sync
|
|
// ElevenLabs doesn't provide viseme data directly, so we use phoneme-to-viseme mapping
|
|
func (s *ElevenLabsTTSService) GetVisemes(ctx context.Context, text string) ([]VisemeEvent, error) {
|
|
if text == "" {
|
|
return nil, fmt.Errorf("text cannot be empty")
|
|
}
|
|
|
|
// Use phoneme-to-viseme mapping to generate viseme events
|
|
// This is a simplified implementation - in production, you might want to use
|
|
// a more sophisticated phoneme-to-viseme mapping service or library
|
|
visemes := s.generateVisemesFromText(text)
|
|
|
|
return visemes, nil
|
|
}
|
|
|
|
// generateVisemesFromText generates viseme events from text using basic phoneme-to-viseme mapping
|
|
// This is a simplified implementation. For production, consider using:
|
|
// - A dedicated phoneme-to-viseme mapping service
|
|
// - A TTS provider that provides phoneme timing data (e.g., Azure TTS with SSML)
|
|
// - Integration with a speech analysis library
|
|
func (s *ElevenLabsTTSService) generateVisemesFromText(text string) []VisemeEvent {
|
|
// Basic phoneme-to-viseme mapping
|
|
phonemeToViseme := map[string]string{
|
|
// Vowels
|
|
"aa": "aa", "ae": "aa", "ah": "aa", "ao": "aa", "aw": "aa",
|
|
"ay": "aa", "eh": "ee", "er": "er", "ey": "ee", "ih": "ee",
|
|
"iy": "ee", "ow": "oh", "oy": "oh", "uh": "ou", "uw": "ou",
|
|
// Consonants
|
|
"b": "aa", "p": "aa", "m": "aa",
|
|
"f": "ee", "v": "ee",
|
|
"th": "ee",
|
|
"d": "aa", "t": "aa", "n": "aa", "l": "aa",
|
|
"k": "aa", "g": "aa", "ng": "aa",
|
|
"s": "ee", "z": "ee",
|
|
"sh": "ee", "zh": "ee", "ch": "ee", "jh": "ee",
|
|
"y": "ee",
|
|
"w": "ou",
|
|
"r": "er",
|
|
"h": "sil",
|
|
"sil": "sil", "sp": "sil",
|
|
}
|
|
|
|
// Simple word-to-phoneme approximation
|
|
// In production, use a proper TTS API that provides phoneme timing or a phoneme-to-viseme service
|
|
words := strings.Fields(strings.ToLower(text))
|
|
visemes := []VisemeEvent{}
|
|
currentTime := 0.0
|
|
durationPerWord := 0.3 // Approximate duration per word in seconds
|
|
initialPause := 0.1
|
|
|
|
// Initial silence
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + initialPause,
|
|
Phoneme: "sil",
|
|
})
|
|
currentTime += initialPause
|
|
|
|
// Generate visemes for each word
|
|
for _, word := range words {
|
|
// Simple approximation: map first phoneme to viseme
|
|
viseme := "aa" // default
|
|
if len(word) > 0 {
|
|
firstChar := string(word[0])
|
|
if mapped, ok := phonemeToViseme[firstChar]; ok {
|
|
viseme = mapped
|
|
} else {
|
|
// Map common starting consonants
|
|
switch firstChar {
|
|
case "a", "e", "i", "o", "u":
|
|
viseme = "aa"
|
|
default:
|
|
viseme = "aa"
|
|
}
|
|
}
|
|
}
|
|
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: viseme,
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + durationPerWord,
|
|
Phoneme: word,
|
|
})
|
|
currentTime += durationPerWord
|
|
|
|
// Small pause between words
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + 0.05,
|
|
Phoneme: "sil",
|
|
})
|
|
currentTime += 0.05
|
|
}
|
|
|
|
// Final silence
|
|
visemes = append(visemes, VisemeEvent{
|
|
Viseme: "sil",
|
|
StartTime: currentTime,
|
|
EndTime: currentTime + 0.1,
|
|
Phoneme: "sil",
|
|
})
|
|
|
|
return visemes
|
|
}
|