""" Voice Handler Module Provides Speech-to-Text (STT) and Text-to-Speech (TTS) capabilities with multiple provider options for different cost/quality tiers. """ import os import tempfile from abc import ABC, abstractmethod from pathlib import Path from typing import Optional, List, Dict # Import voice processing libraries from openai import OpenAI import whisper from gtts import gTTS # ============================================================================ # Configuration and Cost Tiers # ============================================================================ class VoiceConfig: """Configuration for voice providers and their characteristics.""" # STT Provider definitions STT_PROVIDERS = { "OpenAI Whisper API": { "id": "openai_whisper", "cost_tier": "medium", "cost_per_minute": 0.006, "requires_api_key": True, }, "Local Whisper (Tiny)": { "id": "local_whisper_tiny", "cost_tier": "free", "cost_per_minute": 0.0, "requires_api_key": False, }, "Local Whisper (Base)": { "id": "local_whisper_base", "cost_tier": "free", "cost_per_minute": 0.0, "requires_api_key": False, }, } # TTS Provider definitions TTS_PROVIDERS = { "OpenAI TTS": { "id": "openai_tts", "cost_tier": "medium", "cost_per_1k_chars": 0.015, "requires_api_key": True, "voices": ["alloy", "echo", "fable", "onyx", "nova", "shimmer"] }, "gTTS (Free)": { "id": "gtts", "cost_tier": "free", "cost_per_1k_chars": 0.0, "requires_api_key": False, "voices": ["default"] }, } # Default selections DEFAULT_STT = "OpenAI Whisper API" DEFAULT_TTS = "OpenAI TTS" DEFAULT_TTS_VOICE = "nova" # ============================================================================ # Abstract Base Classes # ============================================================================ class STTProvider(ABC): """Abstract base class for Speech-to-Text providers.""" @abstractmethod def transcribe(self, audio_path: str) -> str: """ Transcribe audio file to text. Args: audio_path: Path to audio file Returns: Transcribed text """ pass class TTSProvider(ABC): """Abstract base class for Text-to-Speech providers.""" @abstractmethod def synthesize(self, text: str, output_path: Optional[str] = None) -> str: """ Synthesize text to speech. Args: text: Text to convert to speech output_path: Optional path to save audio file Returns: Path to generated audio file """ pass @abstractmethod def get_available_voices(self) -> List[str]: """Get list of available voices for this provider.""" pass # ============================================================================ # STT Provider Implementations # ============================================================================ class OpenAIWhisperSTT(STTProvider): """OpenAI Whisper API implementation.""" def __init__(self, api_key: Optional[str] = None): self.api_key = api_key or os.getenv("OPENAI_API_KEY") if not self.api_key: raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.") self.client = OpenAI(api_key=self.api_key) def transcribe(self, audio_path: str) -> str: """Transcribe audio using OpenAI Whisper API.""" try: with open(audio_path, "rb") as audio_file: transcript = self.client.audio.transcriptions.create( model="whisper-1", file=audio_file ) return transcript.text except Exception as e: raise Exception(f"OpenAI Whisper transcription failed: {str(e)}") class LocalWhisperSTT(STTProvider): """Local Whisper model implementation.""" def __init__(self, model_size: str = "base"): """ Initialize local Whisper model. Args: model_size: Model size (tiny, base, small, medium, large) """ self.model_size = model_size self.model = None def _load_model(self): """Lazy load the model.""" if self.model is None: self.model = whisper.load_model(self.model_size) def transcribe(self, audio_path: str) -> str: """Transcribe audio using local Whisper model.""" self._load_model() try: result = self.model.transcribe(audio_path) return result["text"] except Exception as e: raise Exception(f"Local Whisper transcription failed: {str(e)}") # ============================================================================ # TTS Provider Implementations # ============================================================================ class OpenAITTSProvider(TTSProvider): """OpenAI TTS implementation.""" def __init__(self, voice: str = "nova", api_key: Optional[str] = None): self.voice = voice self.api_key = api_key or os.getenv("OPENAI_API_KEY") if not self.api_key: raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.") self.client = OpenAI(api_key=self.api_key) def synthesize(self, text: str, output_path: Optional[str] = None) -> str: """Synthesize speech using OpenAI TTS.""" if output_path is None: output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3") try: response = self.client.audio.speech.create( model="tts-1", voice=self.voice, input=text ) # Write response content manually to avoid file descriptor issues # This is more compatible with containerized environments like HuggingFace Spaces with open(output_path, 'wb') as f: f.write(response.content) return output_path except Exception as e: raise Exception(f"OpenAI TTS synthesis failed: {str(e)}") def get_available_voices(self) -> List[str]: """Get available OpenAI TTS voices.""" return VoiceConfig.TTS_PROVIDERS["OpenAI TTS"]["voices"] class GTTSProvider(TTSProvider): """Google TTS implementation (free, basic quality).""" def __init__(self, voice: str = "default"): self.voice = voice def synthesize(self, text: str, output_path: Optional[str] = None) -> str: """Synthesize speech using gTTS.""" if output_path is None: output_path = os.path.join(tempfile.gettempdir(), f"tts_{os.getpid()}.mp3") try: tts = gTTS(text=text, lang='en') tts.save(output_path) return output_path except Exception as e: raise Exception(f"gTTS synthesis failed: {str(e)}") def get_available_voices(self) -> List[str]: """Get available gTTS voices.""" return VoiceConfig.TTS_PROVIDERS["gTTS (Free)"]["voices"] # ============================================================================ # Factory Functions # ============================================================================ def create_stt_provider(provider_name: str) -> STTProvider: """ Create an STT provider instance. Args: provider_name: Name of the provider (from VoiceConfig.STT_PROVIDERS) Returns: STTProvider instance """ provider_id = VoiceConfig.STT_PROVIDERS[provider_name]["id"] if provider_id == "openai_whisper": return OpenAIWhisperSTT() elif provider_id == "local_whisper_tiny": return LocalWhisperSTT(model_size="tiny") elif provider_id == "local_whisper_base": return LocalWhisperSTT(model_size="base") else: raise ValueError(f"Unknown STT provider: {provider_name}") def create_tts_provider(provider_name: str, voice: Optional[str] = None) -> TTSProvider: """ Create a TTS provider instance. Args: provider_name: Name of the provider (from VoiceConfig.TTS_PROVIDERS) voice: Optional voice name Returns: TTSProvider instance """ provider_id = VoiceConfig.TTS_PROVIDERS[provider_name]["id"] provider_info = VoiceConfig.TTS_PROVIDERS[provider_name] # Use default voice if not specified if voice is None: voice = provider_info["voices"][0] if provider_id == "openai_tts": return OpenAITTSProvider(voice=voice) elif provider_id == "gtts": return GTTSProvider(voice=voice) else: raise ValueError(f"Unknown TTS provider: {provider_name}") def get_available_stt_providers() -> List[str]: """Get list of available STT provider names.""" return list(VoiceConfig.STT_PROVIDERS.keys()) def get_available_tts_providers() -> List[str]: """Get list of available TTS provider names.""" return list(VoiceConfig.TTS_PROVIDERS.keys()) def get_voices_for_provider(provider_name: str) -> List[str]: """Get available voices for a TTS provider.""" if provider_name not in VoiceConfig.TTS_PROVIDERS: return [] return VoiceConfig.TTS_PROVIDERS[provider_name]["voices"] def get_provider_info(provider_name: str, provider_type: str = "tts") -> Dict: """ Get information about a provider. Args: provider_name: Name of the provider provider_type: "stt" or "tts" Returns: Provider information dictionary """ if provider_type == "tts": return VoiceConfig.TTS_PROVIDERS.get(provider_name, {}) else: return VoiceConfig.STT_PROVIDERS.get(provider_name, {})