Spaces:

cuhgrel
/

nemo-tts-api

Running

App Files Community

cuhgrel committed on Oct 1

Commit

a07ebd0

verified ·

1 Parent(s): f8eb5eb

update

Browse files

Files changed (1) hide show

app.py +35 -29

app.py CHANGED Viewed

@@ -74,53 +74,62 @@ def synthesize_speech(request: TTSRequest):
             detail="Models are not loaded yet. Please try again in a moment."
         )
-    # Validate the requested language
     if request.language not in ['en', 'bikol']:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail="Invalid language specified. Use 'en' or 'bikol'."
-        )
-    # Check if requested model is available
     if request.language not in models:
         available = [k for k in models.keys() if k != 'hifigan']
-        raise HTTPException(
-            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-            detail=f"The '{request.language}' model is not available. Available languages: {', '.join(available)}"
-        )
     try:
-        # Select the correct FastPitch model
         spectrogram_generator = models[request.language]
         vocoder = models['hifigan']
-        logger.info(f"Generating speech for text: '{request.text}' in language: {request.language}")
-        # --- THE FIX STARTS HERE: Wrap inference in torch.no_grad() ---
         with torch.no_grad():
-            # Parse text into token IDs
             parsed = spectrogram_generator.parse(request.text)
-            # Generate spectrogram
             spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
-            # Convert spectrogram to audio waveform
-            audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
-        # --- THE FIX ENDS HERE ---
         # --- Prepare and return audio file ---
         audio_numpy = audio.to('cpu').detach().numpy()
-        # Ensure audio is 1D
         if len(audio_numpy.shape) > 1:
             audio_numpy = audio_numpy.squeeze()
-        # Use an in-memory buffer to avoid writing to disk
         buffer = io.BytesIO()
         sf.write(buffer, audio_numpy, samplerate=22050, format='WAV')
         buffer.seek(0)
-        # Return the audio data as a streaming response
         from fastapi.responses import StreamingResponse
         return StreamingResponse(buffer, media_type="audio/wav")
@@ -128,10 +137,7 @@ def synthesize_speech(request: TTSRequest):
         logger.error(f"Error during synthesis: {e}")
         import traceback
         traceback.print_exc()
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"An error occurred during audio synthesis: {str(e)}"
-        )
 # --- 5. Add a Root Endpoint for Health Check ---
 @app.get("/")

             detail="Models are not loaded yet. Please try again in a moment."
         )
+    # ... (the validation code remains the same) ...
     if request.language not in ['en', 'bikol']:
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid language specified. Use 'en' or 'bikol'.")
     if request.language not in models:
         available = [k for k in models.keys() if k != 'hifigan']
+        raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"The '{request.language}' model is not available. Available languages: {', '.join(available)}")
     try:
         spectrogram_generator = models[request.language]
         vocoder = models['hifigan']
+        logger.info(f"--- STARTING SYNTHESIS FOR '{request.text}' ---")
+        audio = None # Define audio here to ensure it exists
         with torch.no_grad():
+            # --- DEBUG STEP 1: Check the parsed tokens ---
             parsed = spectrogram_generator.parse(request.text)
+            logger.info(f"1. Parsed tokens shape: {parsed.shape}")
+            logger.info(f"   Parsed tokens content: {parsed}")
+            # --- DEBUG STEP 2: Check the generated spectrogram ---
             spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
+            if spectrogram is not None:
+                logger.info(f"2. Spectrogram generated with shape: {spectrogram.shape}")
+                logger.info(f"   Spectrogram stats: min={spectrogram.min()}, max={spectrogram.max()}, mean={spectrogram.mean()}")
+            else:
+                logger.error("2. FAILED: Spectrogram is None!")
+            # --- DEBUG STEP 3: Check the generated audio waveform ---
+            if spectrogram is not None:
+                audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
+                if audio is not None:
+                    logger.info(f"3. Audio generated with shape: {audio.shape}")
+                    logger.info(f"   Audio stats: min={audio.min()}, max={audio.max()}, mean={audio.mean()}")
+                else:
+                    logger.error("3. FAILED: Audio is None!")
+        # If audio generation failed, we can't proceed
+        if audio is None:
+            logger.error("Synthesis failed, audio tensor is None.")
+            raise HTTPException(status_code=500, detail="Audio generation failed internally, resulting in None.")
         # --- Prepare and return audio file ---
         audio_numpy = audio.to('cpu').detach().numpy()
+        logger.info(f"4. Successfully converted to NumPy array.")
         if len(audio_numpy.shape) > 1:
             audio_numpy = audio_numpy.squeeze()
         buffer = io.BytesIO()
         sf.write(buffer, audio_numpy, samplerate=22050, format='WAV')
         buffer.seek(0)
+        logger.info(f"--- SYNTHESIS COMPLETE ---")
         from fastapi.responses import StreamingResponse
         return StreamingResponse(buffer, media_type="audio/wav")
         logger.error(f"Error during synthesis: {e}")
         import traceback
         traceback.print_exc()
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during audio synthesis: {str(e)}")
 # --- 5. Add a Root Endpoint for Health Check ---
 @app.get("/")