# voice_processing_service.py
import logging
import whisper
import os
import tempfile
from typing import Dict, Any, Optional
from pathlib import Path

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class VoiceProcessor:
    """
    Service for processing voice queries with speech-to-text and translation.

    Features:
    - Speech-to-text using OpenAI Whisper
    - Automatic language detection
    - Arabic-to-English translation
    - Supports 99+ languages
    - Works offline

    Whisper Model Sizes:
    - tiny: 39M params, ~1GB RAM, fast but less accurate
    - base: 74M params, ~1GB RAM, balanced (RECOMMENDED for quick start)
    - small: 244M params, ~2GB RAM, good accuracy
    - medium: 769M params, ~5GB RAM, better accuracy
    - large: 1550M params, ~10GB RAM, best accuracy
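
    Example (illustrative sketch; "query.wav" is a hypothetical path)::

        processor = VoiceProcessor(model_size="base")
        result = processor.process_voice_query("query.wav")
        print(result["query"])  # English text, translated if needed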
    """

    def __init__(self, model_size: str = "base"):
        """
        Initialize the voice processing service.

        Args:
            model_size: Whisper model to use. Options:
                - "tiny" (39M) - Fast, less accurate
                - "base" (74M) - Balanced, recommended for development
                - "small" (244M) - Good accuracy
                - "medium" (769M) - Better accuracy
                - "large" (1550M) - Best accuracy, slowest
        """
        logger.info(f"Loading Whisper model: {model_size}")
        logger.info("This may take a few minutes on first run (downloading model)...")

        # Load Whisper model
        # This downloads the model on first run
        self.model = whisper.load_model(model_size)

        self.model_size = model_size
        logger.info(f"✓ Whisper model '{model_size}' loaded successfully")
        logger.info("Supported languages: 99+ (auto-detected)")

    def transcribe_audio(
        self,
        audio_path: str,
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Transcribe audio file in its original language.

        Args:
            audio_path: Path to audio file (mp3, wav, m4a, etc.)
            language: Optional language code (e.g., "en", "ar"). If None, auto-detect.

        Returns:
            Dictionary with transcription results:
            {
                "text": "transcribed text",
                "language": "en",
                "language_name": "English",
                "confidence": 0.95
            }
        """
        logger.info(f"Transcribing audio: {audio_path}")

        # Transcribe with Whisper
        result = self.model.transcribe(
            audio_path,
            language=language,
            fp16=False  # Use fp32 for better compatibility
        )

        transcription = {
            "text": result["text"].strip(),
            "language": result["language"],
            "language_name": self._get_language_name(result["language"]),
            "confidence": self._calculate_confidence(result)
        }

        logger.info(f"✓ Transcribed: '{transcription['text'][:100]}...'")
        logger.info(f"  Language: {transcription['language_name']} ({transcription['language']})")
        logger.info(f"  Confidence: {transcription['confidence']:.2f}")

        return transcription

    def translate_to_english(self, audio_path: str) -> Dict[str, Any]:
        """
        Transcribe audio and translate to English (if not already English).

        This is optimized for the use case where you always want English output,
        regardless of the input language.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with translation results:
            {
                "original_text": "ุงู„ู†ุต ุงู„ุฃุตู„ูŠ",
                "english_text": "translated text",
                "original_language": "ar",
                "original_language_name": "Arabic",
                "was_translated": true
            }
        """
        logger.info(f"Processing audio for English output: {audio_path}")

        # First, transcribe in original language to detect it
        original = self.model.transcribe(audio_path, fp16=False)
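        # (Note: this method runs Whisper twice, once per task, so it roughly
        # doubles processing time compared to a single transcription.)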

        # Then translate to English
        translated = self.model.transcribe(
            audio_path,
            task="translate",  # This translates to English
            fp16=False
        )

        result = {
            "original_text": original["text"].strip(),
            "english_text": translated["text"].strip(),
            "original_language": original["language"],
            "original_language_name": self._get_language_name(original["language"]),
            "was_translated": original["language"] != "en"
        }

        if result["was_translated"]:
            logger.info(f"✓ Detected {result['original_language_name']}, translated to English")
            logger.info(f"  Original: '{result['original_text'][:100]}...'")
            logger.info(f"  English: '{result['english_text'][:100]}...'")
        else:
            logger.info("✓ Already in English, no translation needed")

        return result

    def process_voice_query(self, audio_path: str) -> Dict[str, Any]:
        """
        Complete pipeline: transcribe, translate if needed, return query text.

        This is the main method for the voice assistant use case.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary ready for division extraction:
            {
                "query": "english text for processing",
                "original_text": "original text if different",
                "language": "ar",
                "language_name": "Arabic",
                "was_translated": true,
                "audio_duration": 5.2
            }
        """
        logger.info(f"Processing voice query: {audio_path}")

        # Get audio duration
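        # (whisper.load_audio resamples to 16 kHz mono, so duration in seconds
        # is the sample count divided by whisper.audio.SAMPLE_RATE, i.e. 16000)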
        audio_info = whisper.load_audio(audio_path)
        duration = len(audio_info) / whisper.audio.SAMPLE_RATE

        # Translate to English (works for all languages)
        result = self.translate_to_english(audio_path)

        return {
            "query": result["english_text"],  # Always English for processing
            "original_text": result["original_text"],
            "language": result["original_language"],
            "language_name": result["original_language_name"],
            "was_translated": result["was_translated"],
            "audio_duration": round(duration, 2)
        }

    def _get_language_name(self, lang_code: str) -> str:
        """Get full language name from code."""
        language_names = {
            "en": "English",
            "ar": "Arabic",
            "es": "Spanish",
            "fr": "French",
            "de": "German",
            "zh": "Chinese",
            "ja": "Japanese",
            "ko": "Korean",
            "ru": "Russian",
            "pt": "Portuguese",
            "it": "Italian",
            "nl": "Dutch",
            "tr": "Turkish",
            "pl": "Polish",
            "uk": "Ukrainian",
            "vi": "Vietnamese",
            "th": "Thai",
            "hi": "Hindi",
            "ur": "Urdu",
            # Add more as needed
        }
        return language_names.get(lang_code, lang_code.upper())

    def _calculate_confidence(self, whisper_result: Dict) -> float:
        """
        Calculate confidence score from Whisper result.

        Whisper doesn't directly provide confidence, so we estimate it
        based on available metrics.
        """
        # If segments are available, average their probabilities
        if "segments" in whisper_result and whisper_result["segments"]:
            avg_logprob = sum(s.get("avg_logprob", -1.0) for s in whisper_result["segments"])
            avg_logprob /= len(whisper_result["segments"])
            # Convert log probability to approximate confidence (0-1)
            # logprob ranges from -inf to 0, typically -2 to 0 for good transcriptions
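            # Worked example: avg_logprob = -0.5 -> (-0.5 + 2.0) / 2.0 = 0.75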
            confidence = max(0.0, min(1.0, (avg_logprob + 2.0) / 2.0))
            return round(confidence, 2)

        # Default confidence
        return 0.85

    def save_uploaded_audio(self, audio_bytes: bytes, filename: str) -> str:
        """
        Save uploaded audio file to temporary location.

        Args:
            audio_bytes: Audio file bytes
            filename: Original filename

        Returns:
            Path to saved file
        """
        # Create temp directory if it doesn't exist
        temp_dir = Path(tempfile.gettempdir()) / "voice_assistant_uploads"
        temp_dir.mkdir(parents=True, exist_ok=True)

        # Save file
        file_extension = Path(filename).suffix
        temp_file = temp_dir / f"upload_{os.urandom(8).hex()}{file_extension}"

        temp_file.write_bytes(audio_bytes)
        logger.info(f"Saved uploaded audio to: {temp_file}")

        return str(temp_file)

    def save_audio_array(self, audio_data, sample_rate: int) -> str:
        """
        Save audio numpy array to temporary WAV file (for Gradio integration).

        Args:
            audio_data: Audio data as numpy array
            sample_rate: Sample rate of the audio

        Returns:
            Path to saved WAV file
        """
        import numpy as np
        import scipy.io.wavfile as wavfile

        # Create temp directory if it doesn't exist
        temp_dir = Path(tempfile.gettempdir()) / "voice_assistant_uploads"
        temp_dir.mkdir(parents=True, exist_ok=True)

        # Save as WAV file
        temp_file = temp_dir / f"gradio_{os.urandom(8).hex()}.wav"

        # Ensure audio_data is in the correct format
        if isinstance(audio_data, np.ndarray):
            # Convert float audio (assumed in [-1, 1]) to int16, clipping to
            # avoid integer overflow on out-of-range samples
            if audio_data.dtype in (np.float32, np.float64):
                audio_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

        wavfile.write(str(temp_file), sample_rate, audio_data)
        logger.info(f"Saved Gradio audio to: {temp_file}")

        return str(temp_file)

    def cleanup_temp_file(self, file_path: str):
        """Delete temporary audio file."""
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.info(f"Cleaned up temp file: {file_path}")
        except Exception as e:
            logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
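

# Minimal usage sketch. Assumes ffmpeg is installed (Whisper requires it for
# audio decoding) and that an audio file exists; "sample.wav" below is a
# hypothetical path, not shipped with this module.
if __name__ == "__main__":
    processor = VoiceProcessor(model_size="base")
    result = processor.process_voice_query("sample.wav")
    print(f"Language: {result['language_name']} ({result['language']})")
    print(f"Query (English): {result['query']}")
    if result["was_translated"]:
        print(f"Original text: {result['original_text']}")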