# PensionBot / groq_voice_service.py
# ChAbhishek28's picture
# changes
# 504b344
"""
Enhanced Voice Service with Groq ASR for superior transcription accuracy
Based on friend's proven implementation that achieves much better transcription quality
"""
import asyncio
import logging
import tempfile
import os
import aiohttp
import base64
from typing import Optional, Dict, Any
from pathlib import Path
from groq import Groq
from config import (
ENABLE_VOICE_FEATURES, TTS_PROVIDER, ASR_PROVIDER,
VOICE_LANGUAGE, DEFAULT_VOICE_SPEED, GROQ_API_KEY
)
# Module-level logger shared by all voice-service code in this file.
logger = logging.getLogger("voicebot")
class GroqVoiceService:
    """Voice service pairing a configurable TTS backend with Groq ASR.

    ASR always goes through Groq's hosted Whisper model; the TTS provider
    comes from the project config (``edge-tts`` or ``murf``).
    """

    def __init__(self):
        # Settings are read from the project-level config module at import time.
        self.voice_enabled = ENABLE_VOICE_FEATURES
        self.tts_provider = TTS_PROVIDER
        self.asr_provider = "groq"  # Force Groq ASR for better accuracy
        self.language = VOICE_LANGUAGE
        self.voice_speed = DEFAULT_VOICE_SPEED
        # Initialize Groq client (ASR is unusable without an API key).
        if GROQ_API_KEY:
            self.groq_client = Groq(api_key=GROQ_API_KEY)
            logger.info("βœ… Groq ASR client initialized")
        else:
            logger.error("❌ GROQ_API_KEY not found - ASR will not work")
            self.groq_client = None
        # Probe TTS/ASR availability only when voice features are enabled.
        if self.voice_enabled:
            self._init_tts_service()
            self._init_asr_service()
            logger.info(f"🎀 Enhanced Voice Service initialized - TTS: {self.tts_provider}, ASR: Groq")
        else:
            logger.info("πŸ”‡ Voice features disabled")
def _init_tts_service(self):
"""Initialize Text-to-Speech service"""
try:
if self.tts_provider == "edge-tts":
import edge_tts
self.tts_available = True
logger.info("βœ… Edge TTS initialized")
elif self.tts_provider == "murf":
self.tts_available = True
logger.info("βœ… Murf AI TTS initialized")
else:
self.tts_available = False
logger.warning(f"⚠️ Unknown TTS provider: {self.tts_provider}")
except ImportError as e:
self.tts_available = False
logger.warning(f"⚠️ TTS dependencies not available: {e}")
def _init_asr_service(self):
"""Initialize Groq ASR service"""
if self.groq_client:
self.asr_available = True
logger.info("βœ… Groq ASR initialized - superior transcription quality")
else:
self.asr_available = False
logger.error("❌ Groq ASR not available - API key missing")
def _get_default_voice(self) -> str:
"""Get default voice based on language setting"""
language_voices = {
'hi-IN': 'hi-IN-SwaraNeural', # Hindi (India) female voice
'en-IN': 'en-IN-NeerjaNeural', # English (India) female voice
'en-US': 'en-US-AriaNeural', # English (US) female voice
'es-ES': 'es-ES-ElviraNeural', # Spanish (Spain) female voice
'fr-FR': 'fr-FR-DeniseNeural', # French (France) female voice
'de-DE': 'de-DE-KatjaNeural', # German (Germany) female voice
'ja-JP': 'ja-JP-NanamiNeural', # Japanese female voice
'ko-KR': 'ko-KR-SunHiNeural', # Korean female voice
'zh-CN': 'zh-CN-XiaoxiaoNeural' # Chinese (Simplified) female voice
}
return language_voices.get(self.language, 'en-US-AriaNeural')
async def text_to_speech(self, text: str, voice: str = None) -> Optional[bytes]:
"""
Convert text to speech audio
Returns audio bytes or None if TTS not available
"""
if not self.voice_enabled or not self.tts_available:
return None
# Use default voice for the configured language if no voice specified
if voice is None:
voice = self._get_default_voice()
logger.info(f"πŸ”Š Generating TTS with voice: {voice}, language: {self.language}")
try:
if self.tts_provider == "edge-tts":
import edge_tts
# Create TTS communication
communicate = edge_tts.Communicate(text, voice, rate=f"{int((self.voice_speed - 1) * 100):+d}%")
audio_data = b""
async for chunk in communicate.stream():
if chunk["type"] == "audio":
audio_data += chunk["data"]
# Validate audio data was received
if not audio_data:
logger.warning(f"⚠️ No audio generated from TTS for voice: {voice}")
# Try fallback voice
fallback_voice = "en-US-AriaNeural"
logger.info(f"πŸ”„ Retrying with fallback voice: {fallback_voice}")
communicate = edge_tts.Communicate(text, fallback_voice, rate=f"{int((self.voice_speed - 1) * 100):+d}%")
audio_data = b""
async for chunk in communicate.stream():
if chunk["type"] == "audio":
audio_data += chunk["data"]
if not audio_data:
logger.error("❌ Fallback TTS also failed")
return None
return audio_data
elif self.tts_provider == "murf":
audio_data = await self._murf_tts(text, voice)
return audio_data
except Exception as e:
logger.error(f"❌ TTS Error: {e}")
# Try one last fallback with basic US English voice
try:
import edge_tts
logger.info("πŸ”„ Attempting emergency fallback TTS")
communicate = edge_tts.Communicate(text, "en-US-AriaNeural")
audio_data = b""
async for chunk in communicate.stream():
if chunk["type"] == "audio":
audio_data += chunk["data"]
return audio_data if audio_data else None
except:
logger.error("❌ All TTS attempts failed")
return None
async def _murf_tts(self, text: str, voice: str = None) -> Optional[bytes]:
"""
Call Murf AI TTS API to convert text to speech
Returns audio bytes or None
"""
murf_api_key = os.environ.get("MURF_API_KEY", "ap2_947765d6-b958-4493-a681-d05f89a63276")
murf_url = "https://api.murf.ai/v1/speech/generate"
payload = {
"text": text,
"voice": voice or "en-US-1", # Default Murf voice
"format": "mp3"
}
headers = {
"Authorization": f"Bearer {murf_api_key}",
"Content-Type": "application/json"
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(murf_url, json=payload, headers=headers) as resp:
if resp.status == 200:
result = await resp.json()
audio_url = result.get("audio_url")
if audio_url:
async with session.get(audio_url) as audio_resp:
if audio_resp.status == 200:
return await audio_resp.read()
logger.error(f"❌ Murf TTS: No audio_url in response: {result}")
else:
logger.error(f"❌ Murf TTS API error: {resp.status} {await resp.text()}")
except Exception as e:
logger.error(f"❌ Murf TTS Exception: {e}")
return None
async def groq_asr_bytes(self, audio_bytes: bytes, user_language: str = None) -> Optional[str]:
"""
Enhanced Groq ASR function that processes audio bytes directly
Based on friend's proven implementation for superior accuracy
Args:
audio_bytes: Raw audio data in bytes
user_language: User's preferred language
Returns:
Transcribed text with much better accuracy than Whisper
"""
if not self.groq_client or not self.asr_available:
logger.error("❌ Groq ASR not available")
return None
try:
# Create temporary file for Groq API
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
temp_file.write(audio_bytes)
temp_file_path = temp_file.name
try:
# Use Groq's whisper-large-v3 model for superior accuracy
with open(temp_file_path, "rb") as audio_file:
transcription = self.groq_client.audio.transcriptions.create(
file=audio_file,
model="whisper-large-v3", # Best available model
language=self._get_groq_language_code(user_language),
temperature=0.0, # Deterministic output
response_format="json"
)
transcribed_text = transcription.text.strip()
logger.info(f"🎀 Groq ASR result: {transcribed_text}")
# Log quality metrics
if hasattr(transcription, 'confidence'):
logger.info(f"🎀 Groq confidence: {transcription.confidence:.2f}")
return transcribed_text
finally:
# Clean up temporary file
try:
os.unlink(temp_file_path)
except Exception as cleanup_error:
logger.warning(f"⚠️ Failed to cleanup temp file: {cleanup_error}")
except Exception as e:
logger.error(f"❌ Groq ASR Error: {e}")
return None
def _get_groq_language_code(self, user_language: str = None) -> str:
"""
Convert user language preference to Groq language code
Args:
user_language: User's language preference ('english', 'hindi', 'hi-IN', etc.)
Returns:
Language code for Groq (e.g., 'en', 'hi')
"""
if not user_language:
# Fallback to default config language
return self.language.split('-')[0] if self.language else 'en'
# Handle different language format inputs
user_lang_lower = user_language.lower()
# Map common language names to codes
language_mapping = {
'english': 'en',
'hindi': 'hi',
'hinglish': 'hi', # Treat Hinglish as Hindi for better results
'en': 'en',
'hi': 'hi',
'en-in': 'en',
'hi-in': 'hi',
'en-us': 'en'
}
# Extract base language if it's a locale code (e.g., 'hi-IN' -> 'hi')
if '-' in user_lang_lower:
base_lang = user_lang_lower.split('-')[0]
return language_mapping.get(base_lang, 'en')
return language_mapping.get(user_lang_lower, 'en')
async def speech_to_text(self, audio_file_path: str, user_language: str = None) -> Optional[str]:
"""
Convert speech audio to text using Groq ASR for superior accuracy
Args:
audio_file_path: Path to the audio file
user_language: User's preferred language
"""
if not self.voice_enabled or not self.asr_available:
logger.warning("πŸ”‡ Voice features or Groq ASR not available")
return None
try:
# Read audio file and process with Groq ASR
with open(audio_file_path, 'rb') as audio_file:
audio_bytes = audio_file.read()
return await self.groq_asr_bytes(audio_bytes, user_language)
except Exception as e:
logger.error(f"❌ Groq ASR Error: {e}")
return None
def get_available_voices(self) -> Dict[str, Any]:
"""Get list of available TTS voices"""
if not self.voice_enabled or self.tts_provider != "edge-tts":
return {}
# Common Edge TTS voices
voices = {
"english": {
"female": ["en-US-AriaNeural", "en-US-JennyNeural", "en-GB-SoniaNeural"],
"male": ["en-US-GuyNeural", "en-US-DavisNeural", "en-GB-RyanNeural"]
},
"multilingual": {
"spanish": ["es-ES-ElviraNeural", "es-MX-DaliaNeural"],
"french": ["fr-FR-DeniseNeural", "fr-CA-SylvieNeural"],
"german": ["de-DE-KatjaNeural", "de-AT-IngridNeural"],
"italian": ["it-IT-ElsaNeural", "it-IT-IsabellaNeural"],
"hindi": ["hi-IN-SwaraNeural", "hi-IN-MadhurNeural"]
}
}
return voices
def is_voice_enabled(self) -> bool:
"""Check if voice features are enabled"""
return self.voice_enabled
def get_voice_status(self) -> Dict[str, Any]:
"""Get current voice service status"""
return {
"voice_enabled": self.voice_enabled,
"tts_available": getattr(self, 'tts_available', False),
"asr_available": getattr(self, 'asr_available', False),
"tts_provider": self.tts_provider,
"asr_provider": "groq", # Always Groq for superior quality
"language": self.language,
"voice_speed": self.voice_speed,
"groq_available": self.groq_client is not None
}
# Global instance
# Module-level singleton: importing this module constructs the service once,
# reading config and (if configured) creating the Groq client as a side effect.
groq_voice_service = GroqVoiceService()