Spaces:
Sleeping
Sleeping
| """ | |
| Enhanced Audio Services for Conversational Voice Bot | |
| Based on friend's implementation with Murf TTS and Groq ASR | |
| """ | |
| from groq import AsyncGroq | |
| from config import GROQ_API_KEY, MURF_API_KEY | |
| import asyncio | |
| import logging | |
| import re | |
| logger = logging.getLogger(__name__) | |
| # Initialize Groq client | |
| groq_client = AsyncGroq(api_key=GROQ_API_KEY) | |
| # Initialize Murf client | |
| try: | |
| from murf import AsyncMurf | |
| murf_client = AsyncMurf(api_key=MURF_API_KEY) | |
| MURF_AVAILABLE = True | |
| except ImportError: | |
| logger.warning("Murf package not available. Install with: pip install murf") | |
| MURF_AVAILABLE = False | |
| def clean_markdown_for_tts(text: str) -> str: | |
| """ | |
| Clean markdown and formatting from text to make it suitable for TTS | |
| """ | |
| if not text: | |
| return "" | |
| # Remove markdown links [text](url) -> text | |
| text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) | |
| # Remove markdown emphasis **bold** and *italic* -> text | |
| text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text) | |
| text = re.sub(r'\*([^\*]+)\*', r'\1', text) | |
| # Remove markdown headers ### -> | |
| text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE) | |
| # Remove code blocks ```code``` -> code | |
| text = re.sub(r'```[^`]*```', '', text) | |
| text = re.sub(r'`([^`]+)`', r'\1', text) | |
| # Remove HTML tags | |
| text = re.sub(r'<[^>]+>', '', text) | |
| # Clean up bullet points and lists | |
| text = re.sub(r'^\s*[-\*\+]\s*', '', text, flags=re.MULTILINE) | |
| text = re.sub(r'^\s*\d+\.\s*', '', text, flags=re.MULTILINE) | |
| # Remove extra whitespace and line breaks | |
| text = re.sub(r'\n\s*\n', '. ', text) | |
| text = re.sub(r'\s+', ' ', text) | |
| # Remove special characters that TTS might struggle with | |
| text = re.sub(r'[#\*_~`]', '', text) | |
| return text.strip() | |
| async def groq_asr_bytes(audio_bytes: bytes, model: str = "whisper-large-v3", language: str = "en") -> str: | |
| """ | |
| Transcribe audio using Groq Whisper ASR | |
| Enhanced version similar to friend's implementation | |
| """ | |
| try: | |
| logger.info(f"π€ Transcribing audio with Groq ASR (model: {model}, language: {language})") | |
| # Groq client is async, so we can use it directly | |
| response = await groq_client.audio.transcriptions.create( | |
| model=model, | |
| file=("audio.wav", audio_bytes, "audio/wav"), | |
| response_format="text", | |
| language=language, | |
| temperature=0.0 # For more consistent results | |
| ) | |
| transcription = response.strip() if response else "" | |
| logger.info(f"π― Transcription result: {transcription}") | |
| return transcription | |
| except Exception as e: | |
| logger.error(f"β Groq ASR failed: {e}") | |
| return "" | |
| async def murf_tts(text: str, voice_id: str = "en-IN-isha", format: str = "MP3") -> bytes: | |
| """ | |
| Convert text to speech using Murf TTS | |
| Enhanced version similar to friend's implementation | |
| """ | |
| if not MURF_AVAILABLE: | |
| logger.error("β Murf TTS not available") | |
| return b"" | |
| if not text or not text.strip(): | |
| logger.warning("β οΈ Empty text provided to TTS") | |
| return b"" | |
| try: | |
| # Clean text for TTS | |
| clean_text = clean_markdown_for_tts(text) | |
| if not clean_text: | |
| logger.warning("β οΈ Text became empty after cleaning") | |
| return b"" | |
| logger.info(f"π Generating speech with Murf TTS (voice: {voice_id})") | |
| logger.debug(f"TTS text: {clean_text}") | |
| # Generate speech using async streaming | |
| response = murf_client.text_to_speech.stream( | |
| text=clean_text, | |
| voice_id=voice_id, | |
| format=format, | |
| sample_rate=44100.0 | |
| ) | |
| # Collect all chunks | |
| chunks = [chunk async for chunk in response] | |
| full_audio = b''.join(chunks) | |
| logger.info(f"β Generated {len(full_audio)} bytes of audio") | |
| return full_audio | |
| except Exception as e: | |
| logger.error(f"β Murf TTS failed: {e}") | |
| return b"" | |
| async def edge_tts_fallback(text: str, voice: str = "en-IN-Neerja") -> bytes: | |
| """ | |
| Fallback TTS using edge-tts if Murf is not available | |
| """ | |
| try: | |
| import edge_tts | |
| clean_text = clean_markdown_for_tts(text) | |
| if not clean_text: | |
| return b"" | |
| logger.info(f"π Using Edge TTS fallback (voice: {voice})") | |
| communicate = edge_tts.Communicate(clean_text, voice) | |
| audio_chunks = [] | |
| async for chunk in communicate.stream(): | |
| if chunk["type"] == "audio": | |
| audio_chunks.append(chunk["data"]) | |
| audio_data = b"".join(audio_chunks) | |
| logger.info(f"β Generated {len(audio_data)} bytes of audio with Edge TTS") | |
| return audio_data | |
| except ImportError: | |
| logger.error("β Edge TTS not available. Install with: pip install edge-tts") | |
| return b"" | |
| except Exception as e: | |
| logger.error(f"β Edge TTS failed: {e}") | |
| return b"" | |
| class ConversationalAudioService: | |
| """ | |
| Main audio service class for conversational voice bot | |
| """ | |
| def __init__(self): | |
| self.groq_client = groq_client | |
| self.murf_available = MURF_AVAILABLE | |
| self.default_voice = "en-IN-isha" # Indian English voice | |
| async def transcribe_audio(self, audio_bytes: bytes, language: str = "en") -> str: | |
| """Transcribe audio to text using Groq ASR""" | |
| return await groq_asr_bytes(audio_bytes, language=language) | |
| async def synthesize_speech(self, text: str, voice_id: str = None) -> bytes: | |
| """Convert text to speech using best available TTS""" | |
| voice = voice_id or self.default_voice | |
| if self.murf_available: | |
| # Try Murf TTS first | |
| audio = await murf_tts(text, voice_id=voice) | |
| if audio: | |
| return audio | |
| # Fallback to Edge TTS | |
| return await edge_tts_fallback(text, voice="en-IN-Neerja") | |
| def set_default_voice(self, voice_id: str): | |
| """Set default voice for TTS""" | |
| self.default_voice = voice_id | |
| logger.info(f"π΅ Default voice set to: {voice_id}") | |
| # Global audio service instance | |
| conversational_audio_service = ConversationalAudioService() |