"""
Enhanced Voice Service with Groq ASR for superior transcription accuracy
Based on friend's proven implementation that achieves much better transcription quality
"""

import logging
import tempfile
import os
import aiohttp
from typing import Optional, Dict, Any
from groq import Groq

from config import (
    ENABLE_VOICE_FEATURES, TTS_PROVIDER, ASR_PROVIDER, 
    VOICE_LANGUAGE, DEFAULT_VOICE_SPEED, GROQ_API_KEY
)

logger = logging.getLogger("voicebot")

class GroqVoiceService:
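    """Voice service combining configurable TTS (edge-tts or Murf AI) with
    Groq-hosted Whisper ASR. Exposed as the module-level singleton
    ``groq_voice_service``."""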
    def __init__(self):
        self.voice_enabled = ENABLE_VOICE_FEATURES
        self.tts_provider = TTS_PROVIDER
        self.asr_provider = "groq"  # Force Groq ASR for better accuracy
        self.language = VOICE_LANGUAGE
        self.voice_speed = DEFAULT_VOICE_SPEED
        
        # Initialize Groq client
        if GROQ_API_KEY:
            self.groq_client = Groq(api_key=GROQ_API_KEY)
            logger.info("βœ… Groq ASR client initialized")
        else:
            logger.error("❌ GROQ_API_KEY not found - ASR will not work")
            self.groq_client = None
        
        # Initialize services if voice is enabled
        if self.voice_enabled:
            self._init_tts_service()
            self._init_asr_service()
            logger.info(f"🎀 Enhanced Voice Service initialized - TTS: {self.tts_provider}, ASR: Groq")
        else:
            logger.info("πŸ”‡ Voice features disabled")

    def _init_tts_service(self):
        """Initialize Text-to-Speech service"""
        try:
            if self.tts_provider == "edge-tts":
                import edge_tts
                self.tts_available = True
                logger.info("βœ… Edge TTS initialized")
            elif self.tts_provider == "murf":
                self.tts_available = True
                logger.info("βœ… Murf AI TTS initialized")
            else:
                self.tts_available = False
                logger.warning(f"⚠️ Unknown TTS provider: {self.tts_provider}")
        except ImportError as e:
            self.tts_available = False
            logger.warning(f"⚠️ TTS dependencies not available: {e}")

    def _init_asr_service(self):
        """Initialize Groq ASR service"""
        if self.groq_client:
            self.asr_available = True
            logger.info("βœ… Groq ASR initialized - superior transcription quality")
        else:
            self.asr_available = False
            logger.error("❌ Groq ASR not available - API key missing")

    def _get_default_voice(self) -> str:
        """Get default voice based on language setting"""
        language_voices = {
            'hi-IN': 'hi-IN-SwaraNeural',  # Hindi (India) female voice
            'en-IN': 'en-IN-NeerjaNeural',  # English (India) female voice
            'en-US': 'en-US-AriaNeural',   # English (US) female voice
            'es-ES': 'es-ES-ElviraNeural', # Spanish (Spain) female voice
            'fr-FR': 'fr-FR-DeniseNeural', # French (France) female voice
            'de-DE': 'de-DE-KatjaNeural',  # German (Germany) female voice
            'ja-JP': 'ja-JP-NanamiNeural', # Japanese female voice
            'ko-KR': 'ko-KR-SunHiNeural',  # Korean female voice
            'zh-CN': 'zh-CN-XiaoxiaoNeural' # Chinese (Simplified) female voice
        }
        return language_voices.get(self.language, 'en-US-AriaNeural')

    async def text_to_speech(self, text: str, voice: str = None) -> Optional[bytes]:
        """
        Convert text to speech audio
        Returns audio bytes or None if TTS not available
        """
        if not self.voice_enabled or not self.tts_available:
            return None

        # Use default voice for the configured language if no voice specified
        if voice is None:
            voice = self._get_default_voice()
        
        logger.info(f"πŸ”Š Generating TTS with voice: {voice}, language: {self.language}")

        try:
            if self.tts_provider == "edge-tts":
                import edge_tts
                # Create TTS communication
                communicate = edge_tts.Communicate(text, voice, rate=f"{int((self.voice_speed - 1) * 100):+d}%")
                audio_data = b""
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_data += chunk["data"]
                
                # Validate audio data was received
                if not audio_data:
                    logger.warning(f"⚠️ No audio generated from TTS for voice: {voice}")
                    # Try fallback voice
                    fallback_voice = "en-US-AriaNeural"
                    logger.info(f"πŸ”„ Retrying with fallback voice: {fallback_voice}")
                    communicate = edge_tts.Communicate(text, fallback_voice, rate=f"{int((self.voice_speed - 1) * 100):+d}%")
                    audio_data = b""
                    async for chunk in communicate.stream():
                        if chunk["type"] == "audio":
                            audio_data += chunk["data"]
                    
                    if not audio_data:
                        logger.error("❌ Fallback TTS also failed")
                        return None
                    
                return audio_data
            elif self.tts_provider == "murf":
                audio_data = await self._murf_tts(text, voice)
                return audio_data
        except Exception as e:
            logger.error(f"❌ TTS Error: {e}")
            # Try one last fallback with basic US English voice
            try:
                import edge_tts
                logger.info("πŸ”„ Attempting emergency fallback TTS")
                communicate = edge_tts.Communicate(text, "en-US-AriaNeural")
                audio_data = b""
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_data += chunk["data"]
                return audio_data if audio_data else None
            except Exception as fallback_error:
                logger.error(f"❌ All TTS attempts failed: {fallback_error}")
                return None

    async def _murf_tts(self, text: str, voice: str = None) -> Optional[bytes]:
        """
        Call Murf AI TTS API to convert text to speech
        Returns audio bytes or None
        """
        murf_api_key = os.environ.get("MURF_API_KEY")
        if not murf_api_key:
            logger.error("❌ Murf TTS: MURF_API_KEY is not set")
            return None
        murf_url = "https://api.murf.ai/v1/speech/generate"
        payload = {
            "text": text,
            "voice": voice or "en-US-1",  # Default Murf voice
            "format": "mp3"
        }
        headers = {
            "Authorization": f"Bearer {murf_api_key}",
            "Content-Type": "application/json"
        }
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(murf_url, json=payload, headers=headers) as resp:
                    if resp.status == 200:
                        result = await resp.json()
                        audio_url = result.get("audio_url")
                        if audio_url:
                            async with session.get(audio_url) as audio_resp:
                                if audio_resp.status == 200:
                                    return await audio_resp.read()
                        logger.error(f"❌ Murf TTS: No audio_url in response: {result}")
                    else:
                        logger.error(f"❌ Murf TTS API error: {resp.status} {await resp.text()}")
        except Exception as e:
            logger.error(f"❌ Murf TTS Exception: {e}")
        return None

    async def groq_asr_bytes(self, audio_bytes: bytes, user_language: str = None) -> Optional[str]:
        """
        Enhanced Groq ASR function that processes audio bytes directly
        Based on friend's proven implementation for superior accuracy
        
        Args:
            audio_bytes: Raw audio data in bytes
            user_language: User's preferred language
        
        Returns:
            Transcribed text with much better accuracy than Whisper
        """
        if not self.groq_client or not self.asr_available:
            logger.error("❌ Groq ASR not available")
            return None

        try:
            # Create temporary file for Groq API
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                temp_file.write(audio_bytes)
                temp_file_path = temp_file.name

            try:
                # Use Groq's whisper-large-v3 model for superior accuracy
                with open(temp_file_path, "rb") as audio_file:
                    transcription = self.groq_client.audio.transcriptions.create(
                        file=audio_file,
                        model="whisper-large-v3",  # Best available model
                        language=self._get_groq_language_code(user_language),
                        temperature=0.0,  # Deterministic output
                        response_format="json"
                    )
                
                transcribed_text = transcription.text.strip()
                logger.info(f"🎀 Groq ASR result: {transcribed_text}")
                
                # Log quality metrics
                if hasattr(transcription, 'confidence'):
                    logger.info(f"🎀 Groq confidence: {transcription.confidence:.2f}")
                
                return transcribed_text
                
            finally:
                # Clean up temporary file
                try:
                    os.unlink(temp_file_path)
                except Exception as cleanup_error:
                    logger.warning(f"⚠️ Failed to cleanup temp file: {cleanup_error}")
                
        except Exception as e:
            logger.error(f"❌ Groq ASR Error: {e}")
            return None

    def _get_groq_language_code(self, user_language: str = None) -> str:
        """
        Convert user language preference to Groq language code
        
        Args:
            user_language: User's language preference ('english', 'hindi', 'hi-IN', etc.)
            
        Returns:
            Language code for Groq (e.g., 'en', 'hi')
        """
        if not user_language:
            # Fallback to default config language
            return self.language.split('-')[0] if self.language else 'en'
        
        # Handle different language format inputs
        user_lang_lower = user_language.lower()
        
        # Map common language names to codes
        language_mapping = {
            'english': 'en',
            'hindi': 'hi',
            'hinglish': 'hi',  # Treat Hinglish as Hindi for better results
            'en': 'en',
            'hi': 'hi',
            'en-in': 'en',
            'hi-in': 'hi',
            'en-us': 'en'
        }
        
        # Extract base language if it's a locale code (e.g., 'hi-IN' -> 'hi')
        if '-' in user_lang_lower:
            base_lang = user_lang_lower.split('-')[0]
            return language_mapping.get(base_lang, 'en')
        
        return language_mapping.get(user_lang_lower, 'en')

    async def speech_to_text(self, audio_file_path: str, user_language: str = None) -> Optional[str]:
        """
        Convert speech audio to text using Groq ASR for superior accuracy
        
        Args:
            audio_file_path: Path to the audio file
            user_language: User's preferred language
        """
        if not self.voice_enabled or not self.asr_available:
            logger.warning("πŸ”‡ Voice features or Groq ASR not available")
            return None

        try:
            # Read audio file and process with Groq ASR
            with open(audio_file_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            
            return await self.groq_asr_bytes(audio_bytes, user_language)
                
        except Exception as e:
            logger.error(f"❌ Groq ASR Error: {e}")
            return None

    def get_available_voices(self) -> Dict[str, Any]:
        """Get list of available TTS voices"""
        if not self.voice_enabled or self.tts_provider != "edge-tts":
            return {}
            
        # Common Edge TTS voices
        voices = {
            "english": {
                "female": ["en-US-AriaNeural", "en-US-JennyNeural", "en-GB-SoniaNeural"],
                "male": ["en-US-GuyNeural", "en-US-DavisNeural", "en-GB-RyanNeural"]
            },
            "multilingual": {
                "spanish": ["es-ES-ElviraNeural", "es-MX-DaliaNeural"],
                "french": ["fr-FR-DeniseNeural", "fr-CA-SylvieNeural"],
                "german": ["de-DE-KatjaNeural", "de-AT-IngridNeural"],
                "italian": ["it-IT-ElsaNeural", "it-IT-IsabellaNeural"],
                "hindi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"]
            }
        }
        return voices

    def is_voice_enabled(self) -> bool:
        """Check if voice features are enabled"""
        return self.voice_enabled

    def get_voice_status(self) -> Dict[str, Any]:
        """Get current voice service status"""
        return {
            "voice_enabled": self.voice_enabled,
            "tts_available": getattr(self, 'tts_available', False),
            "asr_available": getattr(self, 'asr_available', False),
            "tts_provider": self.tts_provider,
            "asr_provider": "groq",  # Always Groq for superior quality
            "language": self.language,
            "voice_speed": self.voice_speed,
            "groq_available": self.groq_client is not None
        }

# Global instance
groq_voice_service = GroqVoiceService()
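
# --- Illustrative usage sketch (not part of the service itself) ---
# A minimal demo assuming GROQ_API_KEY is configured and voice features are
# enabled in config; "sample.wav" and "reply.mp3" are hypothetical file names.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # Report current capability flags before attempting any audio work.
        print(groq_voice_service.get_voice_status())
        if not groq_voice_service.is_voice_enabled():
            return
        # Transcribe a local recording, then synthesize a spoken reply.
        text = await groq_voice_service.speech_to_text("sample.wav", "english")
        if text:
            audio = await groq_voice_service.text_to_speech(text)
            if audio:
                with open("reply.mp3", "wb") as f:
                    f.write(audio)

    asyncio.run(_demo())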