Spaces:

cuhgrel
/

nemo-tts-api

Running

App Files Files Community

cuhgrel committed 17 days ago

Commit

13f5b32

verified ·

1 Parent(s): c542137

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -22

app.py CHANGED Viewed

@@ -8,8 +8,8 @@ from fastapi.responses import StreamingResponse
 # --- Library Imports ---
 from nemo.collections.tts.models import FastPitchModel, HifiGanModel
-from nemo.collections.tts.torch.tts_tokenizers import BaseCharsTokenizer
-# CORRECTED: Import VitsModel instead of AutoModelForTextToWaveform
 from transformers import VitsModel, AutoTokenizer
 # Configure logging
@@ -31,8 +31,8 @@ def load_models():
     """Load all models into memory when the application starts."""
     logger.info("Loading models...")
     try:
-        # --- NeMo Models ---
-        logger.info("Loading HiFi-GAN vocoder...")
         models['hifigan'] = HifiGanModel.restore_from("models/hifigan_en.nemo").to(device)
         models['hifigan'].eval()
@@ -40,19 +40,14 @@ def load_models():
         models['en'] = FastPitchModel.restore_from("models/fastpitch_en.nemo").to(device)
         models['en'].eval()
-        logger.info("Loading Bikol FastPitch model...")
-        models['bikol'] = FastPitchModel.restore_from("models/fastpitch_bikol_corrected.nemo").to(device)
-        logger.info("Overriding Bikol model tokenizer...")
-        BIKOL_CHARS = [
-            ' ', '!', ',', '-', '.', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
-            'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
-            'y', 'z', 'à', 'á', 'â', 'é', 'ì', 'í', 'î', 'ñ', 'ò', 'ó', 'ô', 'ú', '’'
-        ]
-        models['bikol'].tokenizer = BaseCharsTokenizer(chars=BIKOL_CHARS)
-        models['bikol'].eval()
-        # --- Transformers MMS-TTS Model ---
         logger.info("Loading Tagalog (tgl) MMS-TTS model from Hub...")
         models['tgl_tokenizer'] = AutoTokenizer.from_pretrained("facebook/mms-tts-tgl")
         models['tgl_model'] = VitsModel.from_pretrained("facebook/mms-tts-tgl").to(device)
@@ -84,9 +79,10 @@ def synthesize_speech(request: TTSRequest):
     try:
         logger.info(f"--- STARTING SYNTHESIS for '{lang}' ---")
-        if lang in ['en', 'bikol']:
             sample_rate = 22050
-            spectrogram_generator = models[lang]
             vocoder = models['hifigan']
             with torch.no_grad():
                 parsed = spectrogram_generator.parse(request.text)
@@ -94,10 +90,14 @@ def synthesize_speech(request: TTSRequest):
                 audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
             audio_numpy = audio.to('cpu').detach().numpy().squeeze()
-        elif lang == 'tgl':
-            sample_rate = 16000
-            tokenizer = models['tgl_tokenizer']
-            model = models['tgl_model']
             with torch.no_grad():
                 inputs = tokenizer(request.text, return_tensors="pt").to(device)
                 output = model(**inputs).waveform
@@ -126,4 +126,10 @@ def read_root():
 @app.get("/status")
 def get_status():
     """Get the status of all loaded models."""
-    return {"models_loaded": list(models.keys()), "device": device, "english_available": 'en' in models, "bikol_available": 'bikol' in models, "tagalog_available": 'tgl_model' in models}

 # --- Library Imports ---
 from nemo.collections.tts.models import FastPitchModel, HifiGanModel
+# BaseCharsTokenizer is no longer needed since we aren't using the old NeMo Bikol model
+# from nemo.collections.tts.torch.tts_tokenizers import BaseCharsTokenizer
 from transformers import VitsModel, AutoTokenizer
 # Configure logging
     """Load all models into memory when the application starts."""
     logger.info("Loading models...")
     try:
+        # --- NeMo Models (English Only now) ---
+        logger.info("Loading HiFi-GAN vocoder (for English)...")
         models['hifigan'] = HifiGanModel.restore_from("models/hifigan_en.nemo").to(device)
         models['hifigan'].eval()
         models['en'] = FastPitchModel.restore_from("models/fastpitch_en.nemo").to(device)
         models['en'].eval()
+        # --- Transformers MMS-TTS Models (Bikol & Tagalog) ---
+        # 1. NEW BIKOL MODEL (Fine-tuned VITS)
+        logger.info("Loading Bikol (bcl) MMS-TTS model from Hub...")
+        models['bikol_tokenizer'] = AutoTokenizer.from_pretrained("cuhgrel/bikol-mms-finetuned")
+        models['bikol_model'] = VitsModel.from_pretrained("cuhgrel/bikol-mms-finetuned").to(device)
+        # 2. TAGALOG MODEL (Base VITS)
         logger.info("Loading Tagalog (tgl) MMS-TTS model from Hub...")
         models['tgl_tokenizer'] = AutoTokenizer.from_pretrained("facebook/mms-tts-tgl")
         models['tgl_model'] = VitsModel.from_pretrained("facebook/mms-tts-tgl").to(device)
     try:
         logger.info(f"--- STARTING SYNTHESIS for '{lang}' ---")
+        # --- CASE A: English (Uses NeMo) ---
+        if lang == 'en':
             sample_rate = 22050
+            spectrogram_generator = models['en']
             vocoder = models['hifigan']
             with torch.no_grad():
                 parsed = spectrogram_generator.parse(request.text)
                 audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
             audio_numpy = audio.to('cpu').detach().numpy().squeeze()
+        # --- CASE B: Bikol & Tagalog (Uses Hugging Face VITS) ---
+        elif lang in ['tgl', 'bikol']:
+            sample_rate = 16000 # MMS models are usually 16kHz
+            # Dynamically select the correct tokenizer and model from the dictionary
+            tokenizer = models[f'{lang}_tokenizer']
+            model = models[f'{lang}_model']
             with torch.no_grad():
                 inputs = tokenizer(request.text, return_tensors="pt").to(device)
                 output = model(**inputs).waveform
 @app.get("/status")
 def get_status():
     """Get the status of all loaded models."""
+    return {
+        "models_loaded": list(models.keys()),
+        "device": device,
+        "english_available": 'en' in models,
+        "bikol_available": 'bikol_model' in models, # Updated check
+        "tagalog_available": 'tgl_model' in models
+    }