Spaces:
Runtime error
Runtime error
Commit
·
8fa13bc
1
Parent(s):
112bea7
use meloTTS and suno bark
Browse files
app.py
CHANGED
|
@@ -37,6 +37,15 @@ LANGUAGE_MAPPING = {
|
|
| 37 |
"Turkish": "tr"
|
| 38 |
}
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
class DialogueItem(BaseModel):
|
| 41 |
"""A single dialogue item."""
|
| 42 |
|
|
@@ -67,19 +76,14 @@ def generate_podcast(
|
|
| 67 |
tone: Optional[str],
|
| 68 |
length: Optional[str],
|
| 69 |
language: str,
|
|
|
|
| 70 |
) -> Tuple[str, str]:
|
| 71 |
"""Generate the audio and transcript from the PDFs and/or URL."""
|
| 72 |
text = ""
|
| 73 |
|
| 74 |
-
#
|
| 75 |
-
|
| 76 |
-
"
|
| 77 |
-
"Spanish": "ES",
|
| 78 |
-
"French": "FR",
|
| 79 |
-
"Chinese": "ZH",
|
| 80 |
-
"Japanese": "JP",
|
| 81 |
-
"Korean": "KR",
|
| 82 |
-
}
|
| 83 |
|
| 84 |
# Check if at least one input is provided
|
| 85 |
if not files and not url:
|
|
@@ -154,7 +158,7 @@ def generate_podcast(
|
|
| 154 |
|
| 155 |
# Get audio file path
|
| 156 |
audio_file_path = generate_podcast_audio(
|
| 157 |
-
line.text, line.speaker, LANGUAGE_MAPPING[language]
|
| 158 |
)
|
| 159 |
# Read the audio file into an AudioSegment
|
| 160 |
audio_segment = AudioSegment.from_file(audio_file_path)
|
|
@@ -191,7 +195,7 @@ demo = gr.Interface(
|
|
| 191 |
<table style="border-collapse: collapse; border: none; padding: 20px;">
|
| 192 |
<tr style="border: none;">
|
| 193 |
<td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
|
| 194 |
-
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/
|
| 195 |
</td>
|
| 196 |
<td style="border: none; vertical-align: top; padding: 10px;">
|
| 197 |
<p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
|
|
@@ -225,6 +229,10 @@ demo = gr.Interface(
|
|
| 225 |
value="English",
|
| 226 |
label="6. 🌐 Choose the language"
|
| 227 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
],
|
| 229 |
outputs=[
|
| 230 |
gr.Audio(label="Podcast", format="mp3"),
|
|
@@ -242,23 +250,26 @@ demo = gr.Interface(
|
|
| 242 |
"Fun",
|
| 243 |
"Short (1-2 min)",
|
| 244 |
"English",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
],
|
| 246 |
-
# [
|
| 247 |
-
# [],
|
| 248 |
-
# "https://en.wikipedia.org/wiki/Hugging_Face",
|
| 249 |
-
# "How did Hugging Face become so successful?",
|
| 250 |
-
# "Fun",
|
| 251 |
-
# "Short (1-2 min)",
|
| 252 |
-
# "English",
|
| 253 |
-
# ],
|
| 254 |
-
# [
|
| 255 |
-
# [],
|
| 256 |
-
# "https://simple.wikipedia.org/wiki/Taylor_Swift",
|
| 257 |
-
# "Why is Taylor Swift so popular?",
|
| 258 |
-
# "Fun",
|
| 259 |
-
# "Short (1-2 min)",
|
| 260 |
-
# "English",
|
| 261 |
-
# ],
|
| 262 |
],
|
| 263 |
cache_examples=True,
|
| 264 |
)
|
|
|
|
| 37 |
"Turkish": "tr"
|
| 38 |
}
|
| 39 |
|
| 40 |
+
# Languages that can be synthesized with MeloTTS, keyed by the language name
# the user selects. generate_podcast() checks membership in this mapping to
# reject unsupported languages when advanced audio generation is disabled.
# Values are presumably the MeloTTS language codes -- verify against the
# MeloTTS Space API before relying on them.
MELO_TTS_LANGUAGE_MAPPING = {
    "English": "EN",
    "Spanish": "ES",
    "French": "FR",
    "Chinese": "ZH",  # MeloTTS identifies Chinese as "ZH" (not "ZJ")
    "Japanese": "JP",
    "Korean": "KR",
}
|
| 48 |
+
|
| 49 |
class DialogueItem(BaseModel):
|
| 50 |
"""A single dialogue item."""
|
| 51 |
|
|
|
|
| 76 |
tone: Optional[str],
|
| 77 |
length: Optional[str],
|
| 78 |
language: str,
|
| 79 |
+
use_advanced_audio: bool,
|
| 80 |
) -> Tuple[str, str]:
|
| 81 |
"""Generate the audio and transcript from the PDFs and/or URL."""
|
| 82 |
text = ""
|
| 83 |
|
| 84 |
+
# Check if the selected language is supported by MeloTTS when not using advanced audio
|
| 85 |
+
if not use_advanced_audio and language not in MELO_TTS_LANGUAGE_MAPPING:
|
| 86 |
+
raise gr.Error(f"The selected language '{language}' is not supported without advanced audio generation. Please enable advanced audio generation or choose a supported language.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
# Check if at least one input is provided
|
| 89 |
if not files and not url:
|
|
|
|
| 158 |
|
| 159 |
# Get audio file path
|
| 160 |
audio_file_path = generate_podcast_audio(
|
| 161 |
+
line.text, line.speaker, LANGUAGE_MAPPING[language], use_advanced_audio
|
| 162 |
)
|
| 163 |
# Read the audio file into an AudioSegment
|
| 164 |
audio_segment = AudioSegment.from_file(audio_file_path)
|
|
|
|
| 195 |
<table style="border-collapse: collapse; border: none; padding: 20px;">
|
| 196 |
<tr style="border: none;">
|
| 197 |
<td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
|
| 198 |
+
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_includes/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
|
| 199 |
</td>
|
| 200 |
<td style="border: none; vertical-align: top; padding: 10px;">
|
| 201 |
<p style="margin-bottom: 15px;"><strong>Convert</strong> your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS).</p>
|
|
|
|
| 229 |
value="English",
|
| 230 |
label="6. 🌐 Choose the language"
|
| 231 |
),
|
| 232 |
+
gr.Checkbox(
|
| 233 |
+
label="7. 🔄 Use advanced audio generation? (Experimental)",
|
| 234 |
+
value=False
|
| 235 |
+
)
|
| 236 |
],
|
| 237 |
outputs=[
|
| 238 |
gr.Audio(label="Podcast", format="mp3"),
|
|
|
|
| 250 |
"Fun",
|
| 251 |
"Short (1-2 min)",
|
| 252 |
"English",
|
| 253 |
+
True
|
| 254 |
+
],
|
| 255 |
+
[
|
| 256 |
+
[],
|
| 257 |
+
"https://en.wikipedia.org/wiki/Hugging_Face",
|
| 258 |
+
"How did Hugging Face become so successful?",
|
| 259 |
+
"Fun",
|
| 260 |
+
"Short (1-2 min)",
|
| 261 |
+
"English",
|
| 262 |
+
False
|
| 263 |
+
],
|
| 264 |
+
[
|
| 265 |
+
[],
|
| 266 |
+
"https://simple.wikipedia.org/wiki/Taylor_Swift",
|
| 267 |
+
"Why is Taylor Swift so popular?",
|
| 268 |
+
"Fun",
|
| 269 |
+
"Short (1-2 min)",
|
| 270 |
+
"English",
|
| 271 |
+
False
|
| 272 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
],
|
| 274 |
cache_examples=True,
|
| 275 |
)
|
utils.py
CHANGED
|
@@ -24,7 +24,7 @@ client = OpenAI(
|
|
| 24 |
api_key=os.getenv("FIREWORKS_API_KEY"),
|
| 25 |
)
|
| 26 |
|
| 27 |
-
|
| 28 |
|
| 29 |
# download and load all models
|
| 30 |
preload_models()
|
|
@@ -78,34 +78,35 @@ def parse_url(url: str) -> str:
|
|
| 78 |
return response.text
|
| 79 |
|
| 80 |
|
| 81 |
-
def generate_podcast_audio(text: str, speaker: str, language: str) -> str:
|
| 82 |
|
| 83 |
-
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
|
| 90 |
-
|
| 91 |
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
| 24 |
api_key=os.getenv("FIREWORKS_API_KEY"),
|
| 25 |
)
|
| 26 |
|
| 27 |
+
hf_client = Client("mrfakename/MeloTTS")
|
| 28 |
|
| 29 |
# download and load all models
|
| 30 |
preload_models()
|
|
|
|
| 78 |
return response.text
|
| 79 |
|
| 80 |
|
| 81 |
+
def generate_podcast_audio(text: str, speaker: str, language: str, use_advanced_audio: bool) -> str:
    """Synthesize one line of dialogue and return the resulting audio reference.

    Args:
        text: The sentence(s) to speak.
        speaker: Speaker label; "Host (Jane)" and "Guest" are the values this
            function distinguishes.
        language: Language code. The advanced path interpolates it into the
            voice preset name; the MeloTTS path compares it against "EN" and
            forwards it unchanged to the remote endpoint.
        use_advanced_audio: When true, synthesize locally via generate_audio()
            (presumably Suno Bark -- confirm against the file's imports);
            otherwise call the remote MeloTTS Space through hf_client.

    Returns:
        In the advanced path, the path of the file written to disk. Otherwise
        whatever hf_client.predict() returns (the caller treats it as an audio
        file path).
    """
    if use_advanced_audio:
        # The host uses voice preset 1, every other speaker uses preset 3.
        preset_index = "1" if speaker == "Host (Jane)" else "3"
        waveform = generate_audio(
            text, history_prompt=f"v2/{language}_speaker_{preset_index}"
        )

        # NOTE(review): the data is written with write_wav() although the
        # file name ends in ".mp3" -- confirm downstream readers sniff the
        # real format rather than trusting the extension.
        file_path = f"audio_{language}_{speaker}.mp3"
        write_wav(file_path, SAMPLE_RATE, waveform)
        return file_path

    # MeloTTS path: English has dedicated accents per speaker; for any other
    # language the language code itself is used as the speaker/accent name.
    is_english = language == "EN"
    if speaker == "Guest":
        accent = "EN-US" if is_english else language
        speed = 0.9
    else:  # host
        accent = "EN-Default" if is_english else language
        # Non-English hosts are sped up slightly.
        speed = 1 if is_english else 1.1

    return hf_client.predict(
        text=text,
        language=language,
        speaker=accent,
        speed=speed,
        api_name="/synthesize",
    )
|