Spaces:

cuhgrel
/

nemo-tts-api

Running

App Files Community

cuhgrel committed on Sep 29

Commit

df5718a

1 Parent(s): b404ee5

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -20

app.py CHANGED Viewed

@@ -1,9 +1,15 @@
 import torch
 import soundfile as sf
 import io
 from fastapi import FastAPI, HTTPException, status
 from pydantic import BaseModel
 from nemo.collections.tts.models import FastPitchModel, HifiGanModel
 # --- 1. Initialize FastAPI App ---
 app = FastAPI(
@@ -12,43 +18,91 @@ app = FastAPI(
 )
 # --- 2. Load Models on Startup ---
-# This dictionary will hold our loaded models to avoid reloading on every request.
 models = {}
 @app.on_event("startup")
 def load_models():
     """Load all NeMo models into memory when the application starts."""
-    print("Loading models...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    # Load the shared HiFi-GAN Vocoder
     try:
         models['hifigan'] = HifiGanModel.restore_from("models/hifigan_en.nemo").to(device)
         models['hifigan'].eval()
         # Load the English Spectrogram Generator
         models['en'] = FastPitchModel.restore_from("models/fastpitch_en.nemo").to(device)
         models['en'].eval()
-        # Load the Bikol Spectrogram Generator with strict=False
-        # This allows loading even if there are size mismatches in embedding layers
-        models['bikol'] = FastPitchModel.restore_from(
-            "models/fastpitch_bikol_repacked.nemo",
-            strict=False  # Critical: allows loading with different vocabulary size
-        ).to(device)
-        models['bikol'].eval()
-        print("All models loaded successfully.")
     except Exception as e:
-        print(f"FATAL: Could not load models. Error: {e}")
         import traceback
         traceback.print_exc()
-        # In a real app, you might want the app to fail fast if models can't load.
 # --- 3. Define API Request and Response Models ---
 class TTSRequest(BaseModel):
     text: str
-    language: str # Should be 'en' or 'bikol'
 # --- 4. Define the TTS API Endpoint ---
 @app.post("/synthesize/")
@@ -61,30 +115,46 @@ def synthesize_speech(request: TTSRequest):
             status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
             detail="Models are not loaded yet. Please try again in a moment."
         )
     # Validate the requested language
     if request.language not in ['en', 'bikol']:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Invalid language specified. Use 'en' or 'bikol'."
         )
     try:
         # Select the correct FastPitch model
         spectrogram_generator = models[request.language]
         vocoder = models['hifigan']
         # --- Generate Audio ---
         # Parse text into token IDs
         parsed = spectrogram_generator.parse(request.text)
         # Generate spectrogram
         spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
         # Convert spectrogram to audio waveform
         audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
         # --- Prepare and return audio file ---
         audio_numpy = audio.to('cpu').numpy()
         # Use an in-memory buffer to avoid writing to disk
         buffer = io.BytesIO()
         sf.write(buffer, audio_numpy, samplerate=22050, format='WAV')
@@ -93,9 +163,9 @@ def synthesize_speech(request: TTSRequest):
         # Return the audio data as a streaming response
         from fastapi.responses import StreamingResponse
         return StreamingResponse(buffer, media_type="audio/wav")
     except Exception as e:
-        print(f"Error during synthesis: {e}")
         import traceback
         traceback.print_exc()
         raise HTTPException(
@@ -106,4 +176,20 @@ def synthesize_speech(request: TTSRequest):
 # --- 5. Add a Root Endpoint for Health Check ---
 @app.get("/")
 def read_root():
-    return {"status": "Nemo TTS Backend is running."}

 import torch
 import soundfile as sf
 import io
+import logging
 from fastapi import FastAPI, HTTPException, status
 from pydantic import BaseModel
 from nemo.collections.tts.models import FastPitchModel, HifiGanModel
+from omegaconf import OmegaConf, open_dict
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # --- 1. Initialize FastAPI App ---
 app = FastAPI(
 )
 # --- 2. Load Models on Startup ---
 models = {}
 def _load_bikol_model(device):
     """Restore the Bikol FastPitch model, trying three strategies in order.

     Args:
         device: Target device name, "cuda" or "cpu".

     Returns:
         The restored model in eval mode, or None when every strategy fails.
         Failures are logged instead of raised so the app can still start
         with English only.
     """
     checkpoint = "models/fastpitch_bikol_repacked.nemo"

     # First attempt: plain restore, tolerating state-dict mismatches.
     try:
         model = FastPitchModel.restore_from(checkpoint, strict=False).to(device)
         model.eval()
     except Exception as e:
         logger.warning("First attempt failed: %s", e)
     else:
         logger.info("Bikol model loaded successfully")
         return model

     # Second attempt: replace the text_tokenizer section to drop the g2p
     # parameter. NeMo treats override_config_path as the complete model
     # config, so start from the config stored in the checkpoint and swap only
     # the tokenizer section instead of passing a two-key fragment.
     logger.info("Attempting to load Bikol model with config override...")
     try:
         config = FastPitchModel.restore_from(checkpoint, return_config=True)
         with open_dict(config):
             # NOTE(review): confirm this tokenizer needs no further arguments
             # (e.g. an explicit character set) for the Bikol checkpoint.
             config.text_tokenizer = OmegaConf.create({
                 '_target_': 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.BaseCharsTokenizer',
                 'pad_with_space': True,
             })
         model = FastPitchModel.restore_from(
             checkpoint,
             override_config_path=config,
             strict=False,
         ).to(device)
         model.eval()
     except Exception as e2:
         logger.error("Failed to load Bikol model with override: %s", e2)
     else:
         logger.info("Bikol model loaded successfully with config override")
         return model

     # Third attempt: let NeMo place the weights itself via map_location
     # instead of moving the model afterwards.
     logger.info("Attempting alternative loading method...")
     try:
         model = FastPitchModel.restore_from(
             checkpoint,
             map_location=torch.device(device),
             strict=False,
         )
         model.eval()
     except Exception as e3:
         logger.error("All attempts to load Bikol model failed: %s", e3)
         logger.error("Bikol language will not be available")
         return None
     logger.info("Bikol model loaded with map_location")
     return model


 @app.on_event("startup")
 def load_models():
     """Load all NeMo models into memory when the application starts.

     Fills the module-level ``models`` dict with the 'hifigan' vocoder, the
     'en' FastPitch model and, when it can be restored, the 'bikol' FastPitch
     model. A model is registered only after it has been moved to the target
     device and switched to eval mode, so ``models`` never holds a
     half-initialised entry. Errors are logged and swallowed so the app still
     starts and can report which models are available.
     """
     logger.info("Loading models...")
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
         # Load the shared HiFi-GAN Vocoder
         logger.info("Loading HiFi-GAN vocoder...")
         vocoder = HifiGanModel.restore_from("models/hifigan_en.nemo").to(device)
         vocoder.eval()
         models['hifigan'] = vocoder
         logger.info("HiFi-GAN loaded successfully")

         # Load the English Spectrogram Generator
         logger.info("Loading English FastPitch model...")
         english = FastPitchModel.restore_from("models/fastpitch_en.nemo").to(device)
         english.eval()
         models['en'] = english
         logger.info("English model loaded successfully")

         # Load the Bikol Spectrogram Generator; it is optional, so a failure
         # here only removes that language.
         logger.info("Loading Bikol FastPitch model...")
         bikol = _load_bikol_model(device)
         if bikol is not None:
             models['bikol'] = bikol

         logger.info("Model loading complete. Available models: %s", ", ".join(models))
     except Exception as e:
         # logger.exception records the traceback through the configured
         # logger instead of printing it separately.
         # Allow app to start even if models fail - better for debugging
         logger.exception("FATAL: Could not load models. Error: %s", e)
 # --- 3. Define API Request and Response Models ---
 # Request body accepted by the synthesis endpoint. Notes are kept as comments
 # rather than a class docstring so the generated schema is unchanged.
 class TTSRequest(BaseModel):
     # Text to synthesize; the endpoint passes it to the FastPitch model's parse().
     text: str
+    language: str  # Should be 'en' or 'bikol'
 # --- 4. Define the TTS API Endpoint ---
 @app.post("/synthesize/")
             status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
             detail="Models are not loaded yet. Please try again in a moment."
         )
     # Validate the requested language
     if request.language not in ['en', 'bikol']:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Invalid language specified. Use 'en' or 'bikol'."
         )
+    # Check if requested model is available
+    if request.language not in models:
+        available = [k for k in models.keys() if k != 'hifigan']
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail=f"The '{request.language}' model is not available. Available languages: {', '.join(available)}"
+        )
     try:
         # Select the correct FastPitch model
         spectrogram_generator = models[request.language]
         vocoder = models['hifigan']
         # --- Generate Audio ---
+        logger.info(f"Generating speech for text: '{request.text}' in language: {request.language}")
         # Parse text into token IDs
         parsed = spectrogram_generator.parse(request.text)
         # Generate spectrogram
         spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
         # Convert spectrogram to audio waveform
         audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
         # --- Prepare and return audio file ---
         audio_numpy = audio.to('cpu').numpy()
+        # Ensure audio is 1D
+        if len(audio_numpy.shape) > 1:
+            audio_numpy = audio_numpy.squeeze()
         # Use an in-memory buffer to avoid writing to disk
         buffer = io.BytesIO()
         sf.write(buffer, audio_numpy, samplerate=22050, format='WAV')
         # Return the audio data as a streaming response
         from fastapi.responses import StreamingResponse
         return StreamingResponse(buffer, media_type="audio/wav")
     except Exception as e:
+        logger.error(f"Error during synthesis: {e}")
         import traceback
         traceback.print_exc()
         raise HTTPException(
 # --- 5. Add a Root Endpoint for Health Check ---
 @app.get("/")
 def read_root():
     # Health-check payload: a fixed status string, the loaded language models
     # (every key of `models` except the shared 'hifigan' vocoder) and the
     # device torch currently reports. Documented with comments on purpose: a
     # docstring on a route handler would show up in the generated API docs.
     if torch.cuda.is_available():
         device_name = "cuda"
     else:
         device_name = "cpu"
     language_names = [name for name in models if name != 'hifigan']
     return {
         "status": "NeMo TTS Backend is running",
         "available_languages": language_names,
         "device": device_name,
     }
+# --- 6. Add Model Status Endpoint ---
+@app.get("/status")
+def get_status():
+    """Get the status of all loaded models."""
+    return {
+        "models_loaded": list(models.keys()),
+        "device": "cuda" if torch.cuda.is_available() else "cpu",
+        "english_available": 'en' in models,
+        "bikol_available": 'bikol' in models
+    }