# laubonghaudoi's picture
# 改用 e1000 模型 (switch to the e1000 model)
# 82948a8
"""
Cantonese TTS Demo - Powered by GPT-SoVITS
Final Version: All models downloaded from HuggingFace
"""
import os
import sys
import torch
import numpy as np
import gradio as gr
import soundfile as sf
from pathlib import Path
from huggingface_hub import hf_hub_download, snapshot_download
import zipfile
import shutil
# Add this for Zero GPU spaces
import spaces
# Directory containing this script; the model and asset paths below are built from it.
ROOT_DIR = Path(__file__).parent
# Make modules that live next to this script importable.
sys.path.append(str(ROOT_DIR))

# Runtime settings exported as environment variables
# (presumably read by the GPT-SoVITS code -- confirm).
os.environ.update(
    {
        "version": "v2ProPlus",
        "is_half": "True",
        "is_share": "False",
    }
)

# Hugging Face Hub repositories the models are fetched from.
YOUR_MODEL_REPO = "laubonghaudoi/zoengjyutgaai_tts"  # fine-tuned GPT / SoVITS weights
PRETRAINED_REPO = "XXXXRT/GPT-SoVITS-Pretrained"  # base pretrained models

# Lazily populated module state.
tts_instance = None  # TTS pipeline, created on the first synthesis request
models_ready = False  # set to True once the required models are on disk
def download_and_extract_pretrained():
    """Download and unpack the base pretrained models from the Hugging Face Hub.

    ``pretrained_models.zip`` from ``PRETRAINED_REPO`` is extracted into
    ``ROOT_DIR / "GPT_SoVITS"``. The download is skipped when both the
    ``chinese-hubert-base`` and ``chinese-roberta-wwm-ext-large`` directories
    already exist under ``GPT_SoVITS/pretrained_models``.

    Returns:
        bool: ``True`` when both model directories exist afterwards; ``False``
        when the download or extraction failed, or the archive did not
        produce them. Errors are printed, not raised.
    """
    gpt_sovits_dir = ROOT_DIR / "GPT_SoVITS"
    pretrained_dir = gpt_sovits_dir / "pretrained_models"
    pretrained_dir.mkdir(parents=True, exist_ok=True)
    required_dirs = (
        pretrained_dir / "chinese-hubert-base",
        pretrained_dir / "chinese-roberta-wwm-ext-large",
    )

    # Check if already downloaded
    if all(path.exists() for path in required_dirs):
        print("✓ Pretrained models already exist")
        return True

    try:
        print("📥 Downloading pretrained models from HuggingFace...")
        # The cache is anchored to ROOT_DIR (not the current working directory)
        # so it lands in the same "cache" folder the launcher creates.
        # ``resume_download`` is intentionally not passed: it is deprecated in
        # huggingface_hub, which resumes interrupted downloads on its own.
        zip_path = hf_hub_download(
            repo_id=PRETRAINED_REPO,
            filename="pretrained_models.zip",
            cache_dir=str(ROOT_DIR / "cache"),
        )
        print("📦 Extracting pretrained models...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            # Extract to GPT_SoVITS directory
            zip_ref.extractall(gpt_sovits_dir)
    except Exception as e:
        print(f"❌ Error downloading pretrained models: {e}")
        return False

    # Do not report success unless the directories the TTS config points at
    # were actually produced by the archive.
    missing = [path.name for path in required_dirs if not path.exists()]
    if missing:
        print(
            "❌ Error downloading pretrained models: "
            f"archive did not contain {', '.join(missing)}"
        )
        return False

    print("✓ Pretrained models ready")
    return True
def download_g2pw_model():
    """Download and unpack the G2PW model used for Chinese text processing.

    ``G2PWModel.zip`` from ``PRETRAINED_REPO`` is extracted into
    ``ROOT_DIR / "GPT_SoVITS" / "text"``. The step is skipped when the
    ``G2PWModel`` directory already exists there.

    Returns:
        bool: ``True`` when the model was already present or the archive was
        downloaded and extracted; ``False`` when the download or extraction
        raised. Errors are printed, not raised.
    """
    text_dir = ROOT_DIR / "GPT_SoVITS" / "text"
    g2pw_dir = text_dir / "G2PWModel"
    if g2pw_dir.exists():
        print("✓ G2PW model already exists")
        return True

    try:
        print("📥 Downloading G2PW model...")
        # The cache is anchored to ROOT_DIR (not the current working directory)
        # so it lands in the same "cache" folder the launcher creates.
        # ``resume_download`` is intentionally not passed: it is deprecated in
        # huggingface_hub, which resumes interrupted downloads on its own.
        zip_path = hf_hub_download(
            repo_id=PRETRAINED_REPO,
            filename="G2PWModel.zip",
            cache_dir=str(ROOT_DIR / "cache"),
        )
        print("📦 Extracting G2PW model...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(text_dir)
    except Exception as e:
        print(f"❌ Error downloading G2PW model: {e}")
        return False

    if not g2pw_dir.exists():
        # Surface an unexpected archive layout, but keep the original
        # best-effort result so a usable setup is not blocked.
        print(f"⚠️ G2PW archive extracted, but {g2pw_dir} was not created")

    print("✓ G2PW model ready")
    return True
def _fetch_finetuned_file(repo_filename, target_dir):
    """Download one file from YOUR_MODEL_REPO and mirror it into target_dir; return its Path."""
    # The cache is anchored to ROOT_DIR (not the current working directory) so
    # it lands in the same "models" folder the launcher creates.
    # ``resume_download`` is intentionally not passed: it is deprecated in
    # huggingface_hub, which resumes interrupted downloads on its own.
    cached_path = Path(
        hf_hub_download(
            repo_id=YOUR_MODEL_REPO,
            filename=repo_filename,
            cache_dir=str(ROOT_DIR / "models"),
        )
    )
    target_path = target_dir / Path(repo_filename).name
    # Copy when the target is missing, and also when its size differs from the
    # cached file (e.g. an earlier copy was interrupted and left it truncated).
    if (
        not target_path.exists()
        or target_path.stat().st_size != cached_path.stat().st_size
    ):
        shutil.copy2(cached_path, target_path)
    return target_path


def download_finetuned_models():
    """Download the fine-tuned GPT and SoVITS checkpoints.

    Both files are fetched from ``YOUR_MODEL_REPO`` and copied into
    ``ROOT_DIR / "GPT_SoVITS" / "pretrained_models" / "fine_tuned"``.

    Returns:
        tuple[str, str]: ``(gpt_path, sovits_path)`` as local file paths.

    Raises:
        Exception: any download or copy error is printed and then re-raised.
    """
    try:
        print(f"📥 Downloading fine-tuned models from {YOUR_MODEL_REPO}...")
        # Create directories for the models
        gpt_dir = ROOT_DIR / "GPT_SoVITS" / "pretrained_models" / "fine_tuned"
        gpt_dir.mkdir(parents=True, exist_ok=True)

        # GPT model
        gpt_path = _fetch_finetuned_file("gpt/dpo1-e1000.ckpt", gpt_dir)
        print(f"✓ GPT model downloaded: {gpt_path}")

        # SoVITS model (the checkpoint the original author marked as known working)
        sovits_file = "sovits/188hr_e50_s5950.pth"
        model_name = Path(sovits_file).name
        print(f"📥 Downloading SoVITS model {model_name}...")
        sovits_path = _fetch_finetuned_file(sovits_file, gpt_dir)
        file_size = sovits_path.stat().st_size / (1024 * 1024)
        print(f"✓ SoVITS model downloaded: {model_name} ({file_size:.1f}MB)")

        return str(gpt_path), str(sovits_path)
    except Exception as e:
        print(f"❌ Error downloading fine-tuned models: {e}")
        raise
def ensure_all_models():
    """Make sure the pretrained and G2PW models are on disk.

    The result is cached in the module-level ``models_ready`` flag, so the
    checks run at most once per process after the first success.

    Returns:
        bool: ``True`` when both downloads succeeded (now or on an earlier
        call), ``False`` as soon as one of them reports failure.
    """
    global models_ready
    if models_ready:
        return True

    print("🔄 Checking and downloading required models...")
    # Download pretrained models
    if not download_and_extract_pretrained():
        return False
    # Download G2PW model
    if not download_g2pw_model():
        return False

    # NLTK data (for text processing) is fetched on a best-effort basis: a
    # failure here must not block synthesis, but it is reported rather than
    # silently swallowed, and KeyboardInterrupt/SystemExit are not intercepted.
    try:
        import nltk

        for resource in ("averaged_perceptron_tagger", "cmudict"):
            nltk.download(resource, quiet=True)
    except Exception as e:
        print(f"⚠️ Skipping NLTK data download: {e}")

    models_ready = True
    print("✅ All models ready!")
    return True
def _load_tts_pipeline():
    """Create the GPT-SoVITS ``TTS`` pipeline configured with the fine-tuned weights."""
    gpt_sovits_dir = ROOT_DIR / "GPT_SoVITS"
    # ``TTS_infer_pack`` is imported as a top-level package, so its parent
    # directory must be on sys.path; do not add it again after a failed attempt.
    if str(gpt_sovits_dir) not in sys.path:
        sys.path.append(str(gpt_sovits_dir))
    # Imported lazily, after the caller has run ensure_all_models().
    from TTS_infer_pack.TTS import TTS

    gpt_path, sovits_path = download_finetuned_models()
    print("Using fine-tuned models:")
    print(f" GPT model: {gpt_path}")
    print(f" SoVITS model: {sovits_path}")

    cuda_available = torch.cuda.is_available()
    pretrained_dir = gpt_sovits_dir / "pretrained_models"
    # Per the original author's note: TTS_Config looks for a "custom" key in
    # the config dict and otherwise falls back to the version defaults, so the
    # settings are wrapped in "custom".
    config_dict = {
        "custom": {
            "device": "cuda" if cuda_available else "cpu",
            "is_half": cuda_available,
            "bert_base_path": str(pretrained_dir / "chinese-roberta-wwm-ext-large"),
            "cnhuhbert_base_path": str(pretrained_dir / "chinese-hubert-base"),
            "t2s_weights_path": gpt_path,  # fine-tuned GPT model
            "vits_weights_path": sovits_path,  # fine-tuned SoVITS model
            "version": "v2ProPlus",  # matches the "version" environment variable
        }
    }
    return TTS(config_dict)


@spaces.GPU(duration=60)
def generate_tts(
    text,
    ref_audio,
    ref_text,
    top_k=15,
    top_p=1.0,
    temperature=1.0,
    speed=1.0
):
    """Synthesize Cantonese speech for ``text`` in the style of a reference clip.

    Args:
        text: Text to synthesize.
        ref_audio: File path of the reference audio clip.
        ref_text: Transcript of the reference audio.
        top_k: Top-k sampling parameter passed to the TTS pipeline.
        top_p: Top-p sampling parameter passed to the TTS pipeline.
        temperature: Sampling temperature passed to the TTS pipeline.
        speed: Speed factor passed to the TTS pipeline as ``speed_factor``.

    Returns:
        tuple: ``(wav_path, status_message)`` on success, or
        ``(None, error_message)`` on invalid input or failure. Exceptions are
        caught, logged with a traceback and reported through the message.
    """
    global tts_instance
    try:
        # Validate inputs first so a bad request fails fast, before any model
        # download or pipeline initialisation. ``or ""`` tolerates None.
        text = (text or "").strip()
        if not text:
            return None, "輸入要合成嘅文本"
        if not ref_audio:
            return None, "請上傳參考音頻"
        ref_text = (ref_text or "").strip()
        if not ref_text:
            return None, "請輸入參考音頻文本"

        # Ensure models are downloaded
        if not ensure_all_models():
            return None, "❌ 模型下载失败 | Model download failed"

        # Initialize TTS instance if needed
        if tts_instance is None:
            tts_instance = _load_tts_pipeline()
            print("✓ TTS instance initialized")

        print(f"🎙️ Generating speech for: {text[:50]}...")
        params = {
            "text": text,
            "text_lang": "yue",
            "ref_audio_path": ref_audio,  # already a file path string
            "prompt_text": ref_text,
            "prompt_lang": "yue",
            "top_k": int(top_k),  # the slider may deliver a float
            "top_p": top_p,
            "temperature": temperature,
            "speed_factor": speed,
        }

        # run() returns a generator of (sample_rate, audio) pairs; as before,
        # only the first pair is consumed.
        with torch.no_grad():
            sr, audio_data = next(iter(tts_instance.run(params)), (None, None))

        # Handle empty result (including a zero-length array, which would
        # otherwise make the peak computation below raise).
        if audio_data is None or sr is None or np.size(audio_data) == 0:
            return None, "❌ 生成失败:返回空结果 | Generation failed: empty result"

        # Ensure float32 for soundfile
        audio_data = np.asarray(audio_data, dtype=np.float32)
        # Scale down to the [-1, 1] range when the peak exceeds it
        audio_max = np.abs(audio_data).max()
        if audio_max > 1.0:
            audio_data = audio_data / audio_max

        # Write to a unique temporary file: a fixed "output.wav" in the working
        # directory would be shared (and overwritten) by every request.
        import tempfile

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name
        sf.write(output_path, audio_data, sr)
        return output_path, "✅ 合成成功!| Synthesis successful!"
    except Exception as e:
        import traceback

        error_details = traceback.format_exc()
        print(f"Error details:\n{error_details}")
        return None, f"❌ 生成失败 | Generation failed: {str(e)}"
# Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the TTS demo and return it (not launched).

    The page has a usage-notes column and a working column with the reference
    audio/transcript inputs, a button that loads a bundled default reference,
    the text box with example sentences, sampling/speed sliders, the generate
    button and the audio/status outputs. The generate button is wired to
    ``generate_tts``.
    """
    # NOTE(review): the nesting of the layout containers below was reconstructed
    # from a whitespace-stripped listing -- confirm against the rendered page.
    with gr.Blocks(
        title="粤语 TTS 演示 | Cantonese TTS Demo",
        theme=gr.themes.Soft(),
        css="""
.gradio-container {
font-family: 'Microsoft YaHei', 'PingFang SC', -apple-system, BlinkMacSystemFont, sans-serif;
}
#ref_audio {
min-height: 100px;
}
"""
    ) as demo:
        # Page header: title plus links to the model card and the dataset
        gr.Markdown("""
# 張悦楷講古語音合成器 Zoeng Jyut Gaai TTS
模型信息見 [laubonghaudoi/zoengjyutgaai_tts](https://huggingface.co/laubonghaudoi/zoengjyutgaai_tts)
數據採用張悦楷講古語音數據集 [CanCLID/zoengjyutgaai](https://huggingface.co/datasets/CanCLID/zoengjyutgaai)
---
""")
        with gr.Row():
            # Left column: usage steps, what the reference audio is for, known issues
            with gr.Column(scale=1):
                gr.Markdown("""
## 使用步驟
1. 上傳一段 3 - 10 秒嘅粵語音頻作為參考音頻,然後輸入埋佢嘅對應文本。
2. 輸入音頻對應嘅粵語文本,可以揀下面示例文本其中一句嚟試下效果
3. (可選)喺高級設定度揀語速、Top K、Top P、溫度
4. 撳生成掣
### 參考音頻係咩?
上傳嘅參考音頻主要用嚟控制生成音頻嘅語氣同情感。例如參考音頻係朗讀詩詞,噉生成嘅音頻就會好似朗讀詩詞噉講嘢;如果參考音頻係疑問,噉生成嘅音頻都會有疑問語氣。
如果你冇參考音頻或者懶得揾,都可以直接撳「使用預設參考音頻」入面嘅選項。
## 已知問題
1. 模型有時會有幻覺,生成啲同文本完全無關嘅亂噏。呢個一般係參考音頻嘅問題,換一條參考音頻同文本重試就得。
1. 因為個基礎模型係用簡體字訓練嘅,所以可能會出現「只隻」不分、「松鬆」不分嘅問題,例如「一隻狗」會讀成「一 zi2 狗」。要解決只能用同音字代替,例如寫成「一脊狗」。
1. 輸入文本唔可以太長,否則後面嗰啲會自動切晒。
""")
            # Right column: all inputs, controls and outputs
            with gr.Column(scale=2):
                # Reference audio section
                with gr.Group():
                    gr.Markdown("### 参考音频")
                    with gr.Row():
                        with gr.Column():
                            # type="filepath": the handler receives the clip as a path string
                            ref_audio_input = gr.Audio(
                                label="上傳參考音頻 (3 - 10秒)",
                                type="filepath",
                                elem_id="ref_audio"
                            )
                        with gr.Column():
                            ref_text_input = gr.Textbox(
                                label="參考音頻文本",
                                placeholder="參考音頻對應嘅粵文轉寫",
                                lines=3
                            )
                    # Default reference section
                    with gr.Accordion("用預設參考音頻", open=True):
                        with gr.Row():
                            default_ref_btn = gr.Button(
                                "張悦楷《三國演義》開場白",
                                variant="secondary",
                                size="sm"
                            )
                            # Shows the transcript of the bundled default clip
                            gr.Markdown("*各位朋友,喺講《三國演義》之前啊,我唸一首詞畀大家聽下吓。*", elem_id="ref_desc")
                # Text to synthesize
                text_input = gr.Textbox(
                    label="輸入文本",
                    placeholder="例:從前有個住喺海邊嘅阿婆",
                    lines=5
                )
                # Example sentences; selecting one fills text_input
                gr.Markdown("### 示例文本")
                gr.Examples(
                    examples=[
                        ["廣州商團事變,廣東革命政府叫廣州商團叛亂。廣州商團叫廣州屠城事件、西關屠城血案或者西關慘案,係一九二四年十月十號喺廣州爆發嘅一場武裝衝突。"],
                        ["紅線女,原名鄺健廉,粵劇表演藝術家、粵劇紅派表演藝術創始人。她曾被周恩來譽為「南國紅豆」。"],
                        ["二十日,葉舉又與粵軍諸將致電孫文,要求恢復陳炯明廣東省長、粵軍總司令之職,遭孫文拒絕。"],
                        ["但係呢,三個月之後,上海失咗,南京失咗。共產黨喺武漢呢,即刻變咗口嘞,話,凡親有主張話蘇聯參戰嘅呢,嗰個就係國賊漢奸噉。"],
                    ],
                    inputs=text_input,
                    label="揀一個嚟生成試下效果"
                )
                # Advanced settings: slider defaults mirror generate_tts' parameter defaults
                with gr.Accordion("⚙️ 高级設定", open=False):
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=50, value=15, step=1,
                            label="Top K",
                            info="控制採樣,越高隨機性越大,太低可能會變成亂噏"
                        )
                        top_p_slider = gr.Slider(
                            minimum=0.0, maximum=1.0, value=1.0, step=0.1,
                            label="Top P",
                            info="核采样"
                        )
                    with gr.Row():
                        temperature_slider = gr.Slider(
                            minimum=0.1, maximum=2.0, value=1.0, step=0.1,
                            label="Temperature",
                            info="温度,越高越有創造力但不可預測"
                        )
                        speed_slider = gr.Slider(
                            minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                            label="语速",
                            info="1.0 = 正常"
                        )
                # Generate button
                generate_btn = gr.Button(
                    "生成",
                    variant="primary",
                    size="lg"
                )
                # Output: synthesized audio plus a read-only status line
                with gr.Group():
                    audio_output = gr.Audio(
                        label="成果",
                        type="filepath"
                    )
                    status_output = gr.Textbox(
                        label="状态",
                        interactive=False,
                        max_lines=3
                    )
        # Event handlers
        # Default reference audio button
        def use_default_reference():
            """Return the bundled reference clip path and its transcript.

            Returns ``(None, "")`` and prints a warning when
            ``ROOT_DIR / "ref" / "001_001.opus"`` does not exist.
            """
            ref_audio_path = ROOT_DIR / "ref" / "001_001.opus"
            # Check if file exists
            if ref_audio_path.exists():
                ref_text = "各位朋友,喺講《三國演義》之前啊,我唸一首詞畀大家聽下吓。"
                return str(ref_audio_path), ref_text
            else:
                print(f"Warning: Default reference audio not found at {ref_audio_path}")
                return None, ""
        # Clicking the button fills the reference audio and transcript inputs
        default_ref_btn.click(
            fn=use_default_reference,
            outputs=[ref_audio_input, ref_text_input]
        )
        # Generate button: input order matches generate_tts' positional parameters
        generate_btn.click(
            fn=generate_tts,
            inputs=[
                text_input,
                ref_audio_input,
                ref_text_input,
                top_k_slider,
                top_p_slider,
                temperature_slider,
                speed_slider
            ],
            outputs=[audio_output, status_output]
        )
    return demo
# Script entry point: print a banner, prepare working directories, start the UI.
if __name__ == "__main__":
    banner_rule = "=" * 50
    print("🎤 Initializing Cantonese TTS Demo...")
    print(banner_rule)
    print("This Space downloads all models from HuggingFace Hub:")
    print(f"- Your models: {YOUR_MODEL_REPO}")
    print(f"- Pretrained models: {PRETRAINED_REPO}")
    print(banner_rule)

    # Working directories: GPT-SoVITS tree, download caches and the folder
    # holding reference audio files.
    for dir_name in ("GPT_SoVITS", "models", "cache", "ref"):
        (ROOT_DIR / dir_name).mkdir(exist_ok=True)

    # Build the interface, enable the request queue, then serve it.
    launch_options = {
        "share": False,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo = create_interface()
    demo.queue(max_size=10)
    demo.launch(**launch_options)