from create_env import setup_dependencies
setup_dependencies()
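# NOTE: on ZeroGPU Spaces, `spaces` should be imported before CUDA-using
# libraries such as torch so the @spaces.GPU decorator can manage CUDA setup.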
import spaces
import gradio as gr
from util import NemoAudioPlayer, InitModels, load_config, Examples
import numpy as np
import torch
import os
# Get HuggingFace token
token_ = os.getenv('HF_TOKEN')
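# Read from the Space secrets; required if the model checkpoints are gated or private.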
config = load_config("./model_config.yaml")
models_configs = config.models
nemo_player_cfg = config.nemo_player
examples_cfg = load_config("./examples.yaml")
examples_maker = Examples(examples_cfg)
examples = examples_maker()
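# Example rows (text, model, speaker, sampling params) consumed by gr.Examples below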
# Global variables for lazy loading
player = None
models = {}
models_loaded = False
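# On Zero GPU the device is only attached while a @spaces.GPU function runs,
# so the models are built lazily on the first request rather than at import time.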
@spaces.GPU(duration=120)  # Increase duration for longer sessions
def get_models():
    """Lazy load models only when needed (inside GPU context)"""
    global player, models, models_loaded
    if not models_loaded:
        player = NemoAudioPlayer(nemo_player_cfg)
        init_models = InitModels(models_configs, player, token_)
        models = init_models()
        models_loaded = True
    return models
@spaces.GPU(duration=60)  # 60 s GPU window per generation call
def generate_speech_gpu(text, model_choice, speaker_display: str, t, top_p, rp, max_tok):
    """
    Generate speech from text using the selected model on GPU.
    """
    if not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")
        # Load models inside the GPU context
        models = get_models()
        selected_model = models[model_choice]
        # Map the displayed speaker name back to the model's internal speaker id
        cfg = models_configs.get(model_choice)
        speaker_map = cfg.get('speaker_id', {}) if cfg is not None else {}
        if speaker_display and speaker_map:
            speaker_id = speaker_map.get(speaker_display)
        else:
            speaker_id = None
        print(f"Generating speech with {model_choice}...")
        # torch.inference_mode() skips autograd bookkeeping for faster generation
        with torch.inference_mode():
            audio, _, time_report = selected_model.run_model(text, speaker_id, t, top_p, rp, max_tok)
        sample_rate = 22050
        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="French with Haitian accent TTS - Text to Speech", theme=gr.themes.Ocean()) as demo:
    gr.Markdown("Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")
    gr.Markdown("⚠️ **Note**: First generation may take 30-60s to load models. Subsequent generations will be faster.")
    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(models_configs.keys()),
                value=list(models_configs.keys())[0],
                label="Selected Model",
                info="Finetuned from KANIS-TTS"
            )
            # Speaker selector: collect every speaker name defined across the model configs
            all_speakers = []
            for _cfg in models_configs.values():
                if _cfg and _cfg.get('speaker_id'):
                    all_speakers.extend(list(_cfg.speaker_id.keys()))
            all_speakers = sorted(set(all_speakers))
            speaker_dropdown = gr.Dropdown(
                choices=all_speakers,
                value=None,
                label="Speaker",
                visible=True,
                allow_custom_value=False
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter your text ...",
                lines=3,
                max_lines=10
            )
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(
                    minimum=0.1, maximum=1.5, value=0.7, step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top P",
                )
                rp = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.95, step=0.05,
                    label="Repetition Penalty",
                )
                max_tok = gr.Slider(
                    minimum=100, maximum=2000, value=1000, step=100,
                    label="Max Tokens",
                )
            generate_btn = gr.Button("Run", variant="primary", size="lg")
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )
    # Update speakers when model changes
    def update_speakers(model_choice):
        cfg = models_configs.get(model_choice)
        speakers = list(cfg.speaker_id.keys()) if (cfg and cfg.get('speaker_id')) else []
        if speakers:
            return gr.update(choices=speakers, value=speakers[0], visible=True)
        else:
            return gr.update(choices=[], value=None, visible=False)

    model_dropdown.change(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )
    # Populate speakers on initial page load
    demo.load(
        fn=update_speakers,
        inputs=[model_dropdown],
        outputs=[speaker_dropdown]
    )
    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output]
    )
    # Disable example caching on Zero GPU to avoid cold starts
    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, speaker_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=False,  # caching would run the GPU function for every example at startup
        )
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )