| """ | |
| Cantonese TTS Demo - Powered by GPT-SoVITS | |
| Final Version: All models downloaded from HuggingFace | |
| """ | |
| import os | |
| import sys | |
| import torch | |
| import numpy as np | |
| import gradio as gr | |
| import soundfile as sf | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download, snapshot_download | |
| import zipfile | |
| import shutil | |
| # Add this for Zero GPU spaces | |
| import spaces | |
| # Set up paths | |
| ROOT_DIR = Path(__file__).parent | |
| sys.path.append(str(ROOT_DIR)) | |
| # Configure environment | |
| os.environ["version"] = "v2ProPlus" | |
| os.environ["is_half"] = "True" | |
| os.environ["is_share"] = "False" | |
| # Model repositories | |
| YOUR_MODEL_REPO = "laubonghaudoi/zoengjyutgaai_tts" # Your fine-tuned models | |
| PRETRAINED_REPO = "XXXXRT/GPT-SoVITS-Pretrained" # Official pretrained models | |
| # Global variables | |
| tts_instance = None | |
| models_ready = False | |
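# Model downloads and TTS initialization happen lazily inside generate_tts(),
# so the Space serves the UI before any weights are fetched or loaded.
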
def download_and_extract_pretrained():
    """Download and extract pretrained models from HuggingFace"""
    pretrained_dir = ROOT_DIR / "GPT_SoVITS" / "pretrained_models"
    pretrained_dir.mkdir(parents=True, exist_ok=True)

    # Check if already downloaded
    if (pretrained_dir / "chinese-hubert-base").exists() and \
       (pretrained_dir / "chinese-roberta-wwm-ext-large").exists():
        print("✓ Pretrained models already exist")
        return True

    try:
        print("📥 Downloading pretrained models from HuggingFace...")
        # Download the pretrained models zip. Interrupted downloads resume
        # automatically; the old resume_download flag is deprecated.
        zip_path = hf_hub_download(
            repo_id=PRETRAINED_REPO,
            filename="pretrained_models.zip",
            cache_dir="./cache",
        )
        print("📦 Extracting pretrained models...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            # Extract to the GPT_SoVITS directory
            zip_ref.extractall(ROOT_DIR / "GPT_SoVITS")
        print("✓ Pretrained models ready")
        return True
    except Exception as e:
        print(f"❌ Error downloading pretrained models: {e}")
        return False
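
# NOTE (assumption): pretrained_models.zip is expected to unpack into
# GPT_SoVITS/pretrained_models/ containing chinese-hubert-base and
# chinese-roberta-wwm-ext-large — the same layout the existence check in
# download_and_extract_pretrained() tests for.
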
def download_g2pw_model():
    """Download G2PW model for Chinese text processing"""
    g2pw_dir = ROOT_DIR / "GPT_SoVITS" / "text" / "G2PWModel"
    if g2pw_dir.exists():
        print("✓ G2PW model already exists")
        return True

    try:
        print("📥 Downloading G2PW model...")
        # Download the G2PW model zip
        zip_path = hf_hub_download(
            repo_id=PRETRAINED_REPO,
            filename="G2PWModel.zip",
            cache_dir="./cache",
        )
        print("📦 Extracting G2PW model...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(ROOT_DIR / "GPT_SoVITS" / "text")
        print("✓ G2PW model ready")
        return True
    except Exception as e:
        print(f"❌ Error downloading G2PW model: {e}")
        return False
def download_finetuned_models():
    """Download the fine-tuned GPT and SoVITS models"""
    try:
        print(f"📥 Downloading fine-tuned models from {YOUR_MODEL_REPO}...")
        # Create a stable directory for both models
        model_dir = ROOT_DIR / "GPT_SoVITS" / "pretrained_models" / "fine_tuned"
        model_dir.mkdir(parents=True, exist_ok=True)

        # Download the GPT model
        gpt_cache_path = hf_hub_download(
            repo_id=YOUR_MODEL_REPO,
            filename="gpt/dpo1-e1000.ckpt",
            cache_dir="./models",
        )
        # Copy the GPT model to the expected location
        gpt_path = model_dir / "dpo1-e1000.ckpt"
        if not gpt_path.exists():
            shutil.copy2(gpt_cache_path, gpt_path)
        print(f"✓ GPT model downloaded: {gpt_path}")

        # Download the known working SoVITS model
        sovits_file = "sovits/188hr_e50_s5950.pth"
        model_name = Path(sovits_file).name
        print(f"📥 Downloading SoVITS model {model_name}...")
        sovits_cache_path = hf_hub_download(
            repo_id=YOUR_MODEL_REPO,
            filename=sovits_file,
            cache_dir="./models",
        )
        # Copy to the expected location
        sovits_path = model_dir / model_name
        if not sovits_path.exists():
            shutil.copy2(sovits_cache_path, sovits_path)
        file_size = sovits_path.stat().st_size / (1024 * 1024)
        print(f"✓ SoVITS model downloaded: {model_name} ({file_size:.1f}MB)")

        return str(gpt_path), str(sovits_path)
    except Exception as e:
        print(f"❌ Error downloading fine-tuned models: {e}")
        raise
def ensure_all_models():
    """Ensure all required models are downloaded"""
    global models_ready
    if models_ready:
        return True

    print("🔄 Checking and downloading required models...")
    # Download pretrained models
    if not download_and_extract_pretrained():
        return False
    # Download G2PW model
    if not download_g2pw_model():
        return False

    # Download NLTK data if needed (for text processing)
    try:
        import nltk
        nltk.download('averaged_perceptron_tagger', quiet=True)
        nltk.download('cmudict', quiet=True)
    except Exception:
        pass

    models_ready = True
    print("✅ All models ready!")
    return True
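
# On ZeroGPU Spaces a GPU is attached only while a function decorated with
# @spaces.GPU is running; without the decorator, torch.cuda.is_available()
# stays False. The duration value below is an assumption — tune it to your
# typical synthesis time.
@spaces.GPU(duration=120)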
def generate_tts(
    text,
    ref_audio,
    ref_text,
    top_k=15,
    top_p=1.0,
    temperature=1.0,
    speed=1.0,
):
    """Generate TTS with GPU acceleration"""
    global tts_instance
    try:
        # Ensure models are downloaded
        if not ensure_all_models():
            return None, "❌ 模型下載失敗 | Model download failed"

        # Initialize the TTS instance if needed
        if tts_instance is None:
            # Import here, after the models are downloaded
            sys.path.append(str(ROOT_DIR / "GPT_SoVITS"))
            from TTS_infer_pack.TTS import TTS

            # Get model paths
            gpt_path, sovits_path = download_finetuned_models()
            print("Using fine-tuned models:")
            print(f"  GPT model: {gpt_path}")
            print(f"  SoVITS model: {sovits_path}")

            device = "cuda" if torch.cuda.is_available() else "cpu"
            # TTS_Config looks for a "custom" key in the config dict and
            # falls back to version defaults if it is missing, so the
            # config must be wrapped in a "custom" key.
            config_dict = {
                "custom": {
                    "device": device,
                    "is_half": torch.cuda.is_available(),
                    "bert_base_path": str(ROOT_DIR / "GPT_SoVITS" / "pretrained_models" / "chinese-roberta-wwm-ext-large"),
                    "cnhuhbert_base_path": str(ROOT_DIR / "GPT_SoVITS" / "pretrained_models" / "chinese-hubert-base"),
                    "t2s_weights_path": gpt_path,  # Fine-tuned GPT model
                    "vits_weights_path": sovits_path,  # Fine-tuned SoVITS model
                    "version": "v2ProPlus",  # Match the environment variable
                }
            }
            # Initialize TTS with the config dictionary
            tts_instance = TTS(config_dict)
            print("✓ TTS instance initialized")

        # Validate inputs
        text = text.strip()
        if not text:
            return None, "請輸入要合成嘅文本"
        if ref_audio is None:
            return None, "請上傳參考音頻"
        if not ref_text or ref_text.strip() == "":
            return None, "請輸入參考音頻文本"

        # Generate audio
        print(f"🎙️ Generating speech for: {text[:50]}...")
        params = {
            "text": text,
            "text_lang": "yue",
            "ref_audio_path": ref_audio,  # ref_audio is already a string path
            "prompt_text": ref_text.strip(),
            "prompt_lang": "yue",
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            "speed_factor": speed,
        }

        # Call TTS: the run method returns a generator that yields
        # (sample_rate, audio_data) tuples
        with torch.no_grad():
            generator = tts_instance.run(params)
            sr = None
            audio_data = None
            for chunk_sr, chunk_audio in generator:
                sr = chunk_sr
                audio_data = chunk_audio
                # There is usually only one chunk in non-streaming mode
                break

        # Handle an empty result
        if audio_data is None or sr is None:
            return None, "❌ 生成失敗:返回空結果 | Generation failed: empty result"

        # audio_data should already be a numpy array from the generator;
        # ensure it is float32 for soundfile
        if audio_data.dtype != np.float32:
            audio_data = audio_data.astype(np.float32)

        # Normalize to the [-1, 1] range if needed
        audio_max = np.abs(audio_data).max()
        if audio_max > 1.0:
            audio_data = audio_data / audio_max

        # Save output
        output_path = "output.wav"
        sf.write(output_path, audio_data, sr)
        return output_path, "✅ 合成成功!| Synthesis successful!"
    except Exception as e:
        import traceback
        print(f"Error details:\n{traceback.format_exc()}")
        return None, f"❌ 生成失敗 | Generation failed: {str(e)}"
# Gradio interface
def create_interface():
    with gr.Blocks(
        title="粵語 TTS 演示 | Cantonese TTS Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            font-family: 'Microsoft YaHei', 'PingFang SC', -apple-system, BlinkMacSystemFont, sans-serif;
        }
        #ref_audio {
            min-height: 100px;
        }
        """,
    ) as demo:
        gr.Markdown("""
        # 張悦楷講古語音合成器 Zoeng Jyut Gaai TTS

        模型信息見 [laubonghaudoi/zoengjyutgaai_tts](https://huggingface.co/laubonghaudoi/zoengjyutgaai_tts)

        數據採用張悦楷講古語音數據集 [CanCLID/zoengjyutgaai](https://huggingface.co/datasets/CanCLID/zoengjyutgaai)

        ---
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("""
                ## 使用步驟

                1. 上傳一段 3 - 10 秒嘅粵語音頻作為參考音頻,然後輸入埋佢嘅對應文本。
                2. 輸入想要合成嘅粵語文本,可以揀下面示例文本其中一句嚟試下效果。
                3. (可選)喺高級設定度調校語速、Top K、Top P、溫度。
                4. 撳生成掣。

                ### 參考音頻係咩?

                上傳嘅參考音頻主要用嚟控制生成音頻嘅語氣同情感。例如參考音頻係朗讀詩詞,噉生成嘅音頻就會好似朗讀詩詞噉講嘢;如果參考音頻係疑問,噉生成嘅音頻都會有疑問語氣。

                如果你冇參考音頻或者懶得揾,都可以直接撳「使用預設參考音頻」入面嘅選項。

                ## 已知問題

                1. 模型有時會有幻覺,生成啲同文本完全無關嘅亂噏。呢個一般係參考音頻嘅問題,換一條參考音頻同文本重試就得。
                2. 因為個基礎模型係用簡體字訓練嘅,所以可能會出現「只隻」不分、「松鬆」不分嘅問題,例如「一隻狗」會讀成「一 zi2 狗」。要解決只能用同音字代替,例如寫成「一脊狗」。
                3. 輸入文本唔可以太長,否則後面嗰啲會自動切晒。
                """)
            with gr.Column(scale=2):
                # Reference audio section
                with gr.Group():
                    gr.Markdown("### 參考音頻")
                    with gr.Row():
                        with gr.Column():
                            ref_audio_input = gr.Audio(
                                label="上傳參考音頻 (3 - 10 秒)",
                                type="filepath",
                                elem_id="ref_audio",
                            )
                        with gr.Column():
                            ref_text_input = gr.Textbox(
                                label="參考音頻文本",
                                placeholder="參考音頻對應嘅粵文轉寫",
                                lines=3,
                            )

                # Default reference section
                with gr.Accordion("使用預設參考音頻", open=True):
                    with gr.Row():
                        default_ref_btn = gr.Button(
                            "張悦楷《三國演義》開場白",
                            variant="secondary",
                            size="sm",
                        )
                        gr.Markdown(
                            "*各位朋友,喺講《三國演義》之前啊,我唸一首詞畀大家聽下吓。*",
                            elem_id="ref_desc",
                        )

                # Text to synthesize
                text_input = gr.Textbox(
                    label="輸入文本",
                    placeholder="例:從前有個住喺海邊嘅阿婆",
                    lines=5,
                )

                # Examples section
                gr.Markdown("### 示例文本")
                gr.Examples(
                    examples=[
                        ["廣州商團事變,廣東革命政府叫廣州商團叛亂。廣州商團叫廣州屠城事件、西關屠城血案或者西關慘案,係一九二四年十月十號喺廣州爆發嘅一場武裝衝突。"],
                        ["紅線女,原名鄺健廉,粵劇表演藝術家、粵劇紅派表演藝術創始人。她曾被周恩來譽為「南國紅豆」。"],
                        ["二十日,葉舉又與粵軍諸將致電孫文,要求恢復陳炯明廣東省長、粵軍總司令之職,遭孫文拒絕。"],
                        ["但係呢,三個月之後,上海失咗,南京失咗。共產黨喺武漢呢,即刻變咗口嘞,話,凡親有主張話蘇聯參戰嘅呢,嗰個就係國賊漢奸噉。"],
                    ],
                    inputs=text_input,
                    label="揀一個嚟生成試下效果",
                )
                # Advanced settings
                with gr.Accordion("⚙️ 高級設定", open=False):
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=50, value=15, step=1,
                            label="Top K",
                            info="控制採樣範圍,越高隨機性越大,太低可能會變成亂噏",
                        )
                        top_p_slider = gr.Slider(
                            minimum=0.0, maximum=1.0, value=1.0, step=0.1,
                            label="Top P",
                            info="核採樣閾值",
                        )
                    with gr.Row():
                        temperature_slider = gr.Slider(
                            minimum=0.1, maximum=2.0, value=1.0, step=0.1,
                            label="Temperature",
                            info="溫度,越高越有創造力但越難預測",
                        )
                        speed_slider = gr.Slider(
                            minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                            label="語速",
                            info="1.0 = 正常",
                        )
                # Generate button
                generate_btn = gr.Button(
                    "生成",
                    variant="primary",
                    size="lg",
                )

                # Output
                with gr.Group():
                    audio_output = gr.Audio(
                        label="生成結果",
                        type="filepath",
                    )
                    status_output = gr.Textbox(
                        label="狀態",
                        interactive=False,
                        max_lines=3,
                    )
        # Event handlers
        # Default reference audio button
        def use_default_reference():
            ref_audio_path = ROOT_DIR / "ref" / "001_001.opus"
            # Check that the bundled file exists
            if ref_audio_path.exists():
                ref_text = "各位朋友,喺講《三國演義》之前啊,我唸一首詞畀大家聽下吓。"
                return str(ref_audio_path), ref_text
            else:
                print(f"Warning: Default reference audio not found at {ref_audio_path}")
                return None, ""

        default_ref_btn.click(
            fn=use_default_reference,
            outputs=[ref_audio_input, ref_text_input],
        )

        # Generate button
        generate_btn.click(
            fn=generate_tts,
            inputs=[
                text_input,
                ref_audio_input,
                ref_text_input,
                top_k_slider,
                top_p_slider,
                temperature_slider,
                speed_slider,
            ],
            outputs=[audio_output, status_output],
        )

    return demo
# Launch the app
if __name__ == "__main__":
    print("🎤 Initializing Cantonese TTS Demo...")
    print("=" * 50)
    print("This Space downloads all models from HuggingFace Hub:")
    print(f"- Your models: {YOUR_MODEL_REPO}")
    print(f"- Pretrained models: {PRETRAINED_REPO}")
    print("=" * 50)

    # Create necessary directories
    (ROOT_DIR / "GPT_SoVITS").mkdir(exist_ok=True)
    (ROOT_DIR / "models").mkdir(exist_ok=True)
    (ROOT_DIR / "cache").mkdir(exist_ok=True)
    (ROOT_DIR / "ref").mkdir(exist_ok=True)  # For reference audio files

    # Create and launch the interface
    demo = create_interface()
    demo.queue(max_size=10)
    demo.launch(
        share=False,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )