cuhgrel committed on
Commit
a07ebd0
·
verified ·
1 Parent(s): f8eb5eb
Files changed (1) hide show
  1. app.py +35 -29
app.py CHANGED
@@ -74,53 +74,62 @@ def synthesize_speech(request: TTSRequest):
74
  detail="Models are not loaded yet. Please try again in a moment."
75
  )
76
 
77
- # Validate the requested language
78
  if request.language not in ['en', 'bikol']:
79
- raise HTTPException(
80
- status_code=status.HTTP_400_BAD_REQUEST,
81
- detail="Invalid language specified. Use 'en' or 'bikol'."
82
- )
83
-
84
- # Check if requested model is available
85
  if request.language not in models:
86
  available = [k for k in models.keys() if k != 'hifigan']
87
- raise HTTPException(
88
- status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
89
- detail=f"The '{request.language}' model is not available. Available languages: {', '.join(available)}"
90
- )
91
 
92
  try:
93
- # Select the correct FastPitch model
94
  spectrogram_generator = models[request.language]
95
  vocoder = models['hifigan']
96
 
97
- logger.info(f"Generating speech for text: '{request.text}' in language: {request.language}")
98
 
99
- # --- THE FIX STARTS HERE: Wrap inference in torch.no_grad() ---
100
  with torch.no_grad():
101
- # Parse text into token IDs
102
  parsed = spectrogram_generator.parse(request.text)
103
-
104
- # Generate spectrogram
 
 
105
  spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
106
-
107
- # Convert spectrogram to audio waveform
108
- audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
109
- # --- THE FIX ENDS HERE ---
110
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # --- Prepare and return audio file ---
112
  audio_numpy = audio.to('cpu').detach().numpy()
113
 
114
- # Ensure audio is 1D
 
115
  if len(audio_numpy.shape) > 1:
116
  audio_numpy = audio_numpy.squeeze()
117
 
118
- # Use an in-memory buffer to avoid writing to disk
119
  buffer = io.BytesIO()
120
  sf.write(buffer, audio_numpy, samplerate=22050, format='WAV')
121
  buffer.seek(0)
122
 
123
- # Return the audio data as a streaming response
 
124
  from fastapi.responses import StreamingResponse
125
  return StreamingResponse(buffer, media_type="audio/wav")
126
 
@@ -128,10 +137,7 @@ def synthesize_speech(request: TTSRequest):
128
  logger.error(f"Error during synthesis: {e}")
129
  import traceback
130
  traceback.print_exc()
131
- raise HTTPException(
132
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
133
- detail=f"An error occurred during audio synthesis: {str(e)}"
134
- )
135
 
136
  # --- 5. Add a Root Endpoint for Health Check ---
137
  @app.get("/")
 
74
  detail="Models are not loaded yet. Please try again in a moment."
75
  )
76
 
77
+ # ... (the validation code remains the same) ...
78
  if request.language not in ['en', 'bikol']:
79
+ raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid language specified. Use 'en' or 'bikol'.")
 
 
 
 
 
80
  if request.language not in models:
81
  available = [k for k in models.keys() if k != 'hifigan']
82
+ raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"The '{request.language}' model is not available. Available languages: {', '.join(available)}")
 
 
 
83
 
84
  try:
 
85
  spectrogram_generator = models[request.language]
86
  vocoder = models['hifigan']
87
 
88
+ logger.info(f"--- STARTING SYNTHESIS FOR '{request.text}' ---")
89
 
90
+ audio = None # Define audio here to ensure it exists
91
  with torch.no_grad():
92
+ # --- DEBUG STEP 1: Check the parsed tokens ---
93
  parsed = spectrogram_generator.parse(request.text)
94
+ logger.info(f"1. Parsed tokens shape: {parsed.shape}")
95
+ logger.info(f" Parsed tokens content: {parsed}")
96
+
97
+ # --- DEBUG STEP 2: Check the generated spectrogram ---
98
  spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)
99
+ if spectrogram is not None:
100
+ logger.info(f"2. Spectrogram generated with shape: {spectrogram.shape}")
101
+ logger.info(f" Spectrogram stats: min={spectrogram.min()}, max={spectrogram.max()}, mean={spectrogram.mean()}")
102
+ else:
103
+ logger.error("2. FAILED: Spectrogram is None!")
104
+
105
+ # --- DEBUG STEP 3: Check the generated audio waveform ---
106
+ if spectrogram is not None:
107
+ audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)
108
+ if audio is not None:
109
+ logger.info(f"3. Audio generated with shape: {audio.shape}")
110
+ logger.info(f" Audio stats: min={audio.min()}, max={audio.max()}, mean={audio.mean()}")
111
+ else:
112
+ logger.error("3. FAILED: Audio is None!")
113
+
114
+ # If audio generation failed, we can't proceed
115
+ if audio is None:
116
+ logger.error("Synthesis failed, audio tensor is None.")
117
+ raise HTTPException(status_code=500, detail="Audio generation failed internally, resulting in None.")
118
+
119
  # --- Prepare and return audio file ---
120
  audio_numpy = audio.to('cpu').detach().numpy()
121
 
122
+ logger.info(f"4. Successfully converted to NumPy array.")
123
+
124
  if len(audio_numpy.shape) > 1:
125
  audio_numpy = audio_numpy.squeeze()
126
 
 
127
  buffer = io.BytesIO()
128
  sf.write(buffer, audio_numpy, samplerate=22050, format='WAV')
129
  buffer.seek(0)
130
 
131
+ logger.info(f"--- SYNTHESIS COMPLETE ---")
132
+
133
  from fastapi.responses import StreamingResponse
134
  return StreamingResponse(buffer, media_type="audio/wav")
135
 
 
137
  logger.error(f"Error during synthesis: {e}")
138
  import traceback
139
  traceback.print_exc()
140
+ raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during audio synthesis: {str(e)}")
 
 
 
141
 
142
  # --- 5. Add a Root Endpoint for Health Check ---
143
  @app.get("/")