| """ | |
| Enhanced Voice Service with Groq ASR for superior transcription accuracy | |
| Based on friend's proven implementation that achieves much better transcription quality | |
| """ | |
| import asyncio | |
| import logging | |
| import tempfile | |
| import os | |
| import aiohttp | |
| import base64 | |
| from typing import Optional, Dict, Any | |
| from pathlib import Path | |
| from groq import Groq | |
| from config import ( | |
| ENABLE_VOICE_FEATURES, TTS_PROVIDER, ASR_PROVIDER, | |
| VOICE_LANGUAGE, DEFAULT_VOICE_SPEED, GROQ_API_KEY | |
| ) | |
| logger = logging.getLogger("voicebot") | |
class GroqVoiceService:
    def __init__(self):
        self.voice_enabled = ENABLE_VOICE_FEATURES
        self.tts_provider = TTS_PROVIDER
        self.asr_provider = "groq"  # Force Groq ASR for better accuracy
        self.language = VOICE_LANGUAGE
        self.voice_speed = DEFAULT_VOICE_SPEED

        # Default availability flags so direct method calls are safe even
        # when voice features are disabled and the _init_* methods never run
        self.tts_available = False
        self.asr_available = False

        # Initialize Groq client
        if GROQ_API_KEY:
            self.groq_client = Groq(api_key=GROQ_API_KEY)
            logger.info("✅ Groq ASR client initialized")
        else:
            logger.error("❌ GROQ_API_KEY not found - ASR will not work")
            self.groq_client = None

        # Initialize services if voice is enabled
        if self.voice_enabled:
            self._init_tts_service()
            self._init_asr_service()
            logger.info(f"🎤 Enhanced Voice Service initialized - TTS: {self.tts_provider}, ASR: Groq")
        else:
            logger.info("🔇 Voice features disabled")
    def _init_tts_service(self):
        """Initialize the Text-to-Speech service"""
        try:
            if self.tts_provider == "edge-tts":
                import edge_tts  # availability check only
                self.tts_available = True
                logger.info("✅ Edge TTS initialized")
            elif self.tts_provider == "murf":
                self.tts_available = True
                logger.info("✅ Murf AI TTS initialized")
            else:
                self.tts_available = False
                logger.warning(f"⚠️ Unknown TTS provider: {self.tts_provider}")
        except ImportError as e:
            self.tts_available = False
            logger.warning(f"⚠️ TTS dependencies not available: {e}")
    def _init_asr_service(self):
        """Initialize the Groq ASR service"""
        if self.groq_client:
            self.asr_available = True
            logger.info("✅ Groq ASR initialized - superior transcription quality")
        else:
            self.asr_available = False
            logger.error("❌ Groq ASR not available - API key missing")
    def _get_default_voice(self) -> str:
        """Get the default voice for the configured language"""
        language_voices = {
            'hi-IN': 'hi-IN-SwaraNeural',    # Hindi (India) female voice
            'en-IN': 'en-IN-NeerjaNeural',   # English (India) female voice
            'en-US': 'en-US-AriaNeural',     # English (US) female voice
            'es-ES': 'es-ES-ElviraNeural',   # Spanish (Spain) female voice
            'fr-FR': 'fr-FR-DeniseNeural',   # French (France) female voice
            'de-DE': 'de-DE-KatjaNeural',    # German (Germany) female voice
            'ja-JP': 'ja-JP-NanamiNeural',   # Japanese female voice
            'ko-KR': 'ko-KR-SunHiNeural',    # Korean female voice
            'zh-CN': 'zh-CN-XiaoxiaoNeural'  # Chinese (Simplified) female voice
        }
        return language_voices.get(self.language, 'en-US-AriaNeural')
    async def _edge_tts_collect(self, text: str, voice: str, rate: str = None) -> bytes:
        """Stream Edge TTS output for `text` and collect the audio chunks into bytes."""
        import edge_tts
        if rate is None:
            communicate = edge_tts.Communicate(text, voice)
        else:
            communicate = edge_tts.Communicate(text, voice, rate=rate)
        audio_data = b""
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_data += chunk["data"]
        return audio_data

    async def text_to_speech(self, text: str, voice: str = None) -> Optional[bytes]:
        """
        Convert text to speech audio.
        Returns audio bytes, or None if TTS is not available.
        """
        if not self.voice_enabled or not self.tts_available:
            return None

        # Use the default voice for the configured language if none was specified
        if voice is None:
            voice = self._get_default_voice()

        logger.info(f"🔊 Generating TTS with voice: {voice}, language: {self.language}")
        # Edge TTS expects the rate as a signed percentage, e.g. speed 1.25 -> "+25%"
        rate = f"{int((self.voice_speed - 1) * 100):+d}%"
        try:
            if self.tts_provider == "edge-tts":
                audio_data = await self._edge_tts_collect(text, voice, rate)

                # Validate that audio was received; retry once with a fallback voice
                if not audio_data:
                    logger.warning(f"⚠️ No audio generated from TTS for voice: {voice}")
                    fallback_voice = "en-US-AriaNeural"
                    logger.info(f"🔄 Retrying with fallback voice: {fallback_voice}")
                    audio_data = await self._edge_tts_collect(text, fallback_voice, rate)
                    if not audio_data:
                        logger.error("❌ Fallback TTS also failed")
                        return None
                return audio_data
            elif self.tts_provider == "murf":
                return await self._murf_tts(text, voice)
            return None
        except Exception as e:
            logger.error(f"❌ TTS Error: {e}")
            # One last attempt with the basic US English voice
            try:
                logger.info("🔄 Attempting emergency fallback TTS")
                audio_data = await self._edge_tts_collect(text, "en-US-AriaNeural")
                return audio_data if audio_data else None
            except Exception as fallback_error:
                logger.error(f"❌ All TTS attempts failed: {fallback_error}")
                return None
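
    # Usage sketch (hypothetical caller; assumes voice features are enabled
    # and a running event loop - Edge TTS yields MP3 audio here):
    #
    #     audio = await groq_voice_service.text_to_speech("Hello there!")
    #     if audio:
    #         with open("reply.mp3", "wb") as f:
    #             f.write(audio)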
    async def _murf_tts(self, text: str, voice: str = None) -> Optional[bytes]:
        """
        Call the Murf AI TTS API to convert text to speech.
        Returns audio bytes or None.
        """
        # Read the key from the environment only; never hard-code credentials
        murf_api_key = os.environ.get("MURF_API_KEY")
        if not murf_api_key:
            logger.error("❌ MURF_API_KEY not set - Murf TTS unavailable")
            return None

        murf_url = "https://api.murf.ai/v1/speech/generate"
        payload = {
            "text": text,
            "voice": voice or "en-US-1",  # Default Murf voice
            "format": "mp3"
        }
        headers = {
            "Authorization": f"Bearer {murf_api_key}",
            "Content-Type": "application/json"
        }
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(murf_url, json=payload, headers=headers) as resp:
                    if resp.status == 200:
                        result = await resp.json()
                        audio_url = result.get("audio_url")
                        if audio_url:
                            # Murf returns a URL to the rendered audio; fetch it
                            async with session.get(audio_url) as audio_resp:
                                if audio_resp.status == 200:
                                    return await audio_resp.read()
                        logger.error(f"❌ Murf TTS: No audio_url in response: {result}")
                    else:
                        logger.error(f"❌ Murf TTS API error: {resp.status} {await resp.text()}")
        except Exception as e:
            logger.error(f"❌ Murf TTS Exception: {e}")
        return None
    async def groq_asr_bytes(self, audio_bytes: bytes, user_language: str = None) -> Optional[str]:
        """
        Transcribe raw audio bytes directly with Groq ASR.

        Args:
            audio_bytes: Raw audio data in bytes
            user_language: User's preferred language
        Returns:
            Transcribed text (noticeably more accurate than a local Whisper
            setup), or None on failure
        """
        if not self.groq_client or not self.asr_available:
            logger.error("❌ Groq ASR not available")
            return None
        try:
            # The Groq API expects a file object, so stage the bytes in a temp file
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_file.write(audio_bytes)
                temp_file_path = temp_file.name
            try:
                # Use Groq's hosted whisper-large-v3 model for superior accuracy
                with open(temp_file_path, "rb") as audio_file:
                    transcription = self.groq_client.audio.transcriptions.create(
                        file=audio_file,
                        model="whisper-large-v3",  # Best available model
                        language=self._get_groq_language_code(user_language),
                        temperature=0.0,  # Deterministic output
                        response_format="json"
                    )
                transcribed_text = transcription.text.strip()
                logger.info(f"🎤 Groq ASR result: {transcribed_text}")

                # Log quality metrics when the response exposes them
                if hasattr(transcription, 'confidence'):
                    logger.info(f"🎤 Groq confidence: {transcription.confidence:.2f}")
                return transcribed_text
            finally:
                # Clean up the temporary file
                try:
                    os.unlink(temp_file_path)
                except Exception as cleanup_error:
                    logger.warning(f"⚠️ Failed to cleanup temp file: {cleanup_error}")
        except Exception as e:
            logger.error(f"❌ Groq ASR Error: {e}")
            return None
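
    # Usage sketch (hypothetical bot handler that already holds a voice note
    # as bytes; download_voice_note and reply are assumed helpers, not part
    # of this module):
    #
    #     voice_bytes = await download_voice_note(message)
    #     text = await groq_voice_service.groq_asr_bytes(voice_bytes, "hinglish")
    #     if text:
    #         await reply(message, text)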
    def _get_groq_language_code(self, user_language: str = None) -> str:
        """
        Convert a user language preference to a Groq language code.

        Args:
            user_language: User's language preference ('english', 'hindi', 'hi-IN', etc.)
        Returns:
            Language code for Groq (e.g., 'en', 'hi')
        """
        if not user_language:
            # Fall back to the configured default language
            return self.language.split('-')[0] if self.language else 'en'

        user_lang_lower = user_language.lower()

        # Map common language names to codes
        language_mapping = {
            'english': 'en',
            'hindi': 'hi',
            'hinglish': 'hi',  # Treat Hinglish as Hindi for better results
            'en': 'en',
            'hi': 'hi',
            'en-in': 'en',
            'hi-in': 'hi',
            'en-us': 'en'
        }

        # Extract the base language from a locale code (e.g., 'hi-IN' -> 'hi')
        if '-' in user_lang_lower:
            base_lang = user_lang_lower.split('-')[0]
            return language_mapping.get(base_lang, 'en')
        return language_mapping.get(user_lang_lower, 'en')
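
    # Examples of the mapping above (illustrative):
    #     _get_groq_language_code('Hinglish')  -> 'hi'
    #     _get_groq_language_code('en-IN')     -> 'en'
    #     _get_groq_language_code(None)        -> base of VOICE_LANGUAGE, else 'en'
    #     _get_groq_language_code('swahili')   -> 'en' (unmapped names default to 'en')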
    async def speech_to_text(self, audio_file_path: str, user_language: str = None) -> Optional[str]:
        """
        Convert speech audio to text using Groq ASR.

        Args:
            audio_file_path: Path to the audio file
            user_language: User's preferred language
        """
        if not self.voice_enabled or not self.asr_available:
            logger.warning("🔇 Voice features or Groq ASR not available")
            return None
        try:
            # Read the audio file and process it with Groq ASR
            with open(audio_file_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            return await self.groq_asr_bytes(audio_bytes, user_language)
        except Exception as e:
            logger.error(f"❌ Groq ASR Error: {e}")
            return None
    def get_available_voices(self) -> Dict[str, Any]:
        """Get the list of available TTS voices"""
        if not self.voice_enabled or self.tts_provider != "edge-tts":
            return {}

        # Common Edge TTS voices
        voices = {
            "english": {
                "female": ["en-US-AriaNeural", "en-US-JennyNeural", "en-GB-SoniaNeural"],
                "male": ["en-US-GuyNeural", "en-US-DavisNeural", "en-GB-RyanNeural"]
            },
            "multilingual": {
                "spanish": ["es-ES-ElviraNeural", "es-MX-DaliaNeural"],
                "french": ["fr-FR-DeniseNeural", "fr-CA-SylvieNeural"],
                "german": ["de-DE-KatjaNeural", "de-AT-IngridNeural"],
                "italian": ["it-IT-ElsaNeural", "it-IT-IsabellaNeural"],
                "hindi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"]
            }
        }
        return voices
    def is_voice_enabled(self) -> bool:
        """Check whether voice features are enabled"""
        return self.voice_enabled

    def get_voice_status(self) -> Dict[str, Any]:
        """Get the current voice service status"""
        return {
            "voice_enabled": self.voice_enabled,
            "tts_available": getattr(self, 'tts_available', False),
            "asr_available": getattr(self, 'asr_available', False),
            "tts_provider": self.tts_provider,
            "asr_provider": "groq",  # Always Groq for superior quality
            "language": self.language,
            "voice_speed": self.voice_speed,
            "groq_available": self.groq_client is not None
        }

# Global instance shared by the rest of the app
groq_voice_service = GroqVoiceService()
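

if __name__ == "__main__":
    # Minimal smoke test - a sketch, not part of the service. It assumes
    # GROQ_API_KEY is exported and that a "sample.wav" file sits next to
    # this script (both are assumptions; adjust to your setup).
    async def _demo():
        print(groq_voice_service.get_voice_status())

        audio = await groq_voice_service.text_to_speech("Voice service is up.")
        if audio:
            with open("demo_tts.mp3", "wb") as f:
                f.write(audio)
            print(f"Wrote demo_tts.mp3 ({len(audio)} bytes)")

        if os.path.exists("sample.wav"):
            text = await groq_voice_service.speech_to_text("sample.wav", "english")
            print(f"Transcription: {text}")

    asyncio.run(_demo())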