Spaces:

ChAbhishek28
/

PensionBot

Sleeping

File size: 6,422 Bytes
"""
Enhanced Audio Services for Conversational Voice Bot
Based on friend's implementation with Murf TTS and Groq ASR
"""
from groq import AsyncGroq
from config import GROQ_API_KEY, MURF_API_KEY
import asyncio
import logging
import re

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shared async Groq client, built once at import time from the configured API key.
groq_client = AsyncGroq(api_key=GROQ_API_KEY)

# Initialize Murf client. The Murf SDK is treated as an optional dependency:
# when it cannot be imported, MURF_AVAILABLE is False and murf_client is None.
try:
    from murf import AsyncMurf
    murf_client = AsyncMurf(api_key=MURF_API_KEY)
    MURF_AVAILABLE = True
except ImportError:
    logger.warning("Murf package not available. Install with: pip install murf")
    # Keep the name bound so importing or referencing murf_client never raises
    # NameError; callers must still check MURF_AVAILABLE before using it.
    murf_client = None
    MURF_AVAILABLE = False

def clean_markdown_for_tts(text: str) -> str:
    """
    Clean markdown and formatting from text to make it suitable for TTS.

    Args:
        text: Raw (possibly markdown/HTML formatted) text. Falsy input
            (``""`` or ``None``) yields ``""``.

    Returns:
        A single-line string with markup removed, paragraph breaks turned
        into sentence breaks, and whitespace collapsed.

    Notes:
        * Fenced code blocks are dropped entirely; inline code keeps its text.
        * List markers are only stripped when followed by whitespace, so
          content such as ``3.5 percent`` or ``+91 ...`` at the start of a
          line is preserved.
        * Only real HTML tags/comments are stripped, so comparisons such as
          ``age < 60 and income > 5000`` survive.
    """
    if not text:
        return ""

    # Drop fenced code blocks first so nothing inside them is interpreted as
    # markdown. Non-greedy + DOTALL lets the block contain single backticks.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Inline code `code` -> code
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Remove markdown links [text](url) -> text
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove horizontal rules (---, ***, ___) that sit on their own line.
    # Must run before list-marker stripping so "- - -" is not read as a bullet.
    text = re.sub(r'^[ \t]*([-*_])(?:[ \t]*\1){2,}[ \t]*$', '', text, flags=re.MULTILINE)

    # Strip bullet / numbered list markers. Whitespace after the marker is
    # required (as in markdown) and leading whitespace is limited to the same
    # line, so decimals, signs and preceding blank lines are left alone.
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # Remove markdown emphasis **bold** and *italic* -> text
    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)
    text = re.sub(r'\*([^\*]+)\*', r'\1', text)

    # Remove markdown header markers (### Title -> Title)
    text = re.sub(r'^#+[ \t]*', '', text, flags=re.MULTILINE)

    # Remove HTML comments and tags. A tag must start with a letter (optionally
    # preceded by "/"), which keeps "<" / ">" used as comparison operators.
    text = re.sub(r'<!--.*?-->|</?[A-Za-z][^<>]*>', '', text, flags=re.DOTALL)

    # Leading blank lines would otherwise become a stray ". " prefix below.
    text = text.lstrip()

    # Turn paragraph breaks into sentence breaks, reusing punctuation that is
    # already there so "Hello.\n\nWorld" does not become "Hello.. World".
    text = re.sub(
        r'([.!?:;,])?[ \t]*\n\s*\n',
        lambda m: (m.group(1) or '.') + ' ',
        text,
    )
    text = re.sub(r'\s+', ' ', text)

    # Remove leftover special characters that TTS might struggle with
    text = re.sub(r'[#\*_~`]', '', text)

    return text.strip()

async def groq_asr_bytes(audio_bytes: bytes, model: str = "whisper-large-v3", language: str = "en") -> str:
    """
    Transcribe audio using Groq Whisper ASR.

    Args:
        audio_bytes: Raw audio payload. It is uploaded under the file name
            ``audio.wav`` with the ``audio/wav`` content type.
        model: Groq transcription model name.
        language: Language code forwarded to the transcription request.

    Returns:
        The transcription with surrounding whitespace stripped, or ``""`` when
        the input is empty, the service returns nothing, or the request fails.
        Failures are logged and never raised to the caller.
    """
    if not audio_bytes:
        # Same early-exit style as murf_tts: an empty payload cannot be
        # transcribed, so skip the network round trip.
        logger.warning("⚠️ Empty audio provided to ASR")
        return ""

    try:
        logger.info(f"🎤 Transcribing audio with Groq ASR (model: {model}, language: {language})")

        # Groq client is async, so we can await it directly
        response = await groq_client.audio.transcriptions.create(
            model=model,
            file=("audio.wav", audio_bytes, "audio/wav"),
            response_format="text",
            language=language,
            temperature=0.0  # For more consistent results
        )

        # response_format="text" is expected to yield a plain string; tolerate
        # a response object exposing `.text` instead of failing on `.strip()`.
        raw_text = response if isinstance(response, str) else getattr(response, "text", None)
        transcription = raw_text.strip() if raw_text else ""
        logger.info(f"🎯 Transcription result: {transcription}")

        return transcription

    except Exception as e:
        # Best-effort contract: callers get "" rather than an exception.
        logger.error(f"❌ Groq ASR failed: {e}")
        return ""

async def murf_tts(text: str, voice_id: str = "en-IN-isha", format: str = "MP3") -> bytes:
    """
    Synthesize speech for ``text`` with Murf's streaming TTS.

    Args:
        text: Text to speak; markdown is cleaned before it is sent.
        voice_id: Murf voice identifier.
        format: Audio format name forwarded to Murf.

    Returns:
        The concatenated audio bytes, or ``b""`` when Murf is unavailable,
        there is nothing to speak, or the request fails (logged, not raised).
    """
    if not MURF_AVAILABLE:
        logger.error("❌ Murf TTS not available")
        return b""

    if not (text and text.strip()):
        logger.warning("⚠️ Empty text provided to TTS")
        return b""

    try:
        # Strip markup so it is not read out loud
        speakable = clean_markdown_for_tts(text)
        if not speakable:
            logger.warning("⚠️ Text became empty after cleaning")
            return b""

        logger.info(f"🔊 Generating speech with Murf TTS (voice: {voice_id})")
        logger.debug(f"TTS text: {speakable}")

        # Request the audio as an async stream of chunks
        audio_stream = murf_client.text_to_speech.stream(
            text=speakable,
            voice_id=voice_id,
            format=format,
            sample_rate=44100.0,
        )

        # Drain the stream, then stitch the pieces together
        pieces = []
        async for piece in audio_stream:
            pieces.append(piece)
        full_audio = b"".join(pieces)

        logger.info(f"✅ Generated {len(full_audio)} bytes of audio")
        return full_audio

    except Exception as err:
        logger.error(f"❌ Murf TTS failed: {err}")
        return b""

async def edge_tts_fallback(text: str, voice: str = "en-IN-NeerjaNeural") -> bytes:
    """
    Fallback TTS using edge-tts if Murf is not available.

    Args:
        text: Text to speak; markdown is cleaned before synthesis.
        voice: Edge TTS voice. Short names of the form
            ``<lang>-<REGION>-<Name>`` that lack the ``Neural`` suffix
            (e.g. ``en-IN-Neerja``) are completed to ``...Neural``.

    Returns:
        The concatenated audio bytes, or ``b""`` when edge-tts is not
        installed, the cleaned text is empty, or synthesis fails (logged,
        not raised).
    """
    try:
        import edge_tts

        clean_text = clean_markdown_for_tts(text)
        if not clean_text:
            return b""

        # edge-tts short voice names end in "Neural" (see `edge-tts
        # --list-voices`); a name without the suffix is rejected by
        # Communicate, which would make this fallback silently return b"".
        # Only inputs that would otherwise be rejected are rewritten.
        if re.fullmatch(r"[a-z]{2,}-[A-Z]{2,}-.+", voice) and not voice.endswith("Neural"):
            voice = f"{voice}Neural"

        logger.info(f"🔊 Using Edge TTS fallback (voice: {voice})")

        communicate = edge_tts.Communicate(clean_text, voice)

        # The stream interleaves audio with metadata events; keep audio only.
        audio_chunks = [
            chunk["data"]
            async for chunk in communicate.stream()
            if chunk["type"] == "audio"
        ]

        audio_data = b"".join(audio_chunks)
        logger.info(f"✅ Generated {len(audio_data)} bytes of audio with Edge TTS")
        return audio_data

    except ImportError:
        logger.error("❌ Edge TTS not available. Install with: pip install edge-tts")
        return b""
    except Exception as e:
        logger.error(f"❌ Edge TTS failed: {e}")
        return b""

class ConversationalAudioService:
    """
    Main audio service class for conversational voice bot.

    Bundles the module's helpers behind one object: Groq ASR for
    transcription, Murf TTS for synthesis, and Edge TTS as the synthesis
    fallback.
    """

    def __init__(self):
        self.groq_client = groq_client
        self.murf_available = MURF_AVAILABLE
        self.default_voice = "en-IN-isha"  # Murf voice id (Indian English)
        # Edge TTS has its own voice catalogue, so the Murf voice id cannot be
        # reused there. Edge short names end in "Neural"; "en-IN-Neerja" alone
        # is not a valid voice.
        self.fallback_voice = "en-IN-NeerjaNeural"

    async def transcribe_audio(self, audio_bytes: bytes, language: str = "en") -> str:
        """Transcribe audio to text using Groq ASR; returns "" on failure."""
        return await groq_asr_bytes(audio_bytes, language=language)

    async def synthesize_speech(self, text: str, voice_id: str = None) -> bytes:
        """
        Convert text to speech using the best available TTS.

        Murf is tried first (when available) with ``voice_id`` or the default
        voice. If it yields no audio, Edge TTS is used with ``fallback_voice``.
        Returns ``b""`` when neither backend produces audio.
        """
        voice = voice_id or self.default_voice

        if self.murf_available:
            # Try Murf TTS first
            audio = await murf_tts(text, voice_id=voice)
            if audio:
                return audio

        # Fallback to Edge TTS, which needs an Edge voice name rather than
        # the Murf voice id selected above.
        return await edge_tts_fallback(text, voice=self.fallback_voice)

    def set_default_voice(self, voice_id: str):
        """Set the default (Murf) voice used when no voice_id is passed."""
        self.default_voice = voice_id
        logger.info(f"🎵 Default voice set to: {voice_id}")

# Global audio service instance, constructed once when this module is imported.
conversational_audio_service = ConversationalAudioService()