wsntxxn committed on
Commit
893c559
·
1 Parent(s): cd13388

Add examples

Browse files
Files changed (1) hide show
  1. app.py +85 -15
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import spaces
2
  import gradio as gr
3
  import os
@@ -7,11 +8,10 @@ import soundfile as sf
7
  import numpy as np
8
  from pathlib import Path
9
  from transformers import AutoModel
10
- #from utils.llm import get_time_info
11
  from utils.llm_xiapi import get_time_info
12
- #retry1
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
- model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)
15
  def is_tdc_format_valid(tdc_str):
16
  try:
17
  for event_onset in tdc_str.split('--'):
@@ -153,7 +153,7 @@ A brief text description for the overall audio scene.
153
  ]
154
  }</pre>
155
 
156
- It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the events of `a man speaks` happen from 5.0 to 6.0 seconds.
157
 
158
  </div>
159
 
@@ -167,22 +167,25 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
167
  with gr.Column():
168
  tcc_input = gr.Textbox(
169
  label="🎯 TCC (Temporal Coarse Caption) - Required",
170
- value="a dog barks",
171
  placeholder="e.g., a dog barks and a man speaks",
172
  lines=2
173
  )
174
 
175
  event_json = gr.Code(
176
- label="Event Timestamp JSON",
177
  value="""{
178
  "a dog barks": [
179
  [3.0, 4.0],
180
  [6.0, 7.0]
 
 
 
181
  ]
182
  }""",
183
  language="json",
184
  lines=10,
185
- interactive=True
186
  )
187
 
188
  clear_btn = gr.Button("🗑️ Clear JSON", size="sm")
@@ -190,7 +193,11 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
190
  gr.Markdown("---")
191
 
192
  with gr.Row():
193
- length_input = gr.Textbox(label="⏱️ Length (seconds)", value="10.0", scale=2)
 
 
 
 
194
  time_control = gr.Checkbox(
195
  label="⚙️ Enable Time Control",
196
  value=True,
@@ -212,13 +219,13 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
212
 
213
  <div style="background-color: #fff5e6; padding: 15px; border-radius: 8px; margin: 10px 0;">
214
 
215
- **TCC** is **required** for audio generation.
216
 
217
- **TDC (JSON)** is **optional** for precise temporal control of events.
218
 
219
- **Length** (in seconds) is optional, but recommended for temporal control. Defaults to 10.0 seconds.
220
 
221
- **Enable Time Control**: Tick to use TDC and length for precise event timing.
222
 
223
  </div>
224
 
@@ -228,11 +235,13 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
228
 
229
  <div style="background-color: #f0fff0; padding: 15px; border-radius: 8px; margin: 10px 0;">
230
 
231
- If TDC format is incorrect or length is missing, the model will generate audio **without precise temporal control**.
232
 
233
- For general audio generation without precise timing, you can leave the JSON empty.
234
 
235
- You may leave TDC blank to let the LLM generate timestamps automatically (subject to API quota).
 
 
236
 
237
  </div>
238
 
@@ -252,6 +261,67 @@ You may leave TDC blank to let the LLM generate timestamps automatically (subjec
252
  inputs=[tcc_input, event_json, length_input, time_control],
253
  outputs=[audio_output, tdc_used]
254
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
  if __name__ == "__main__":
257
  demo.launch()
 
1
+ from textwrap import indent
2
  import spaces
3
  import gradio as gr
4
  import os
 
8
  import numpy as np
9
  from pathlib import Path
10
  from transformers import AutoModel
 
11
  from utils.llm_xiapi import get_time_info
12
+
13
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+ # model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)
15
  def is_tdc_format_valid(tdc_str):
16
  try:
17
  for event_onset in tdc_str.split('--'):
 
153
  ]
154
  }</pre>
155
 
156
+ It means the event `a dog barks` happens from 3.0 to 4.0 seconds, and the event `a man speaks` happens from 5.0 to 6.0 seconds.
157
 
158
  </div>
159
 
 
167
  with gr.Column():
168
  tcc_input = gr.Textbox(
169
  label="🎯 TCC (Temporal Coarse Caption) - Required",
170
+ value="a dog barks and a man speaks",
171
  placeholder="e.g., a dog barks and a man speaks",
172
  lines=2
173
  )
174
 
175
  event_json = gr.Code(
176
+ label="📋 TDC (Event Timestamp JSON) - Optional",
177
  value="""{
178
  "a dog barks": [
179
  [3.0, 4.0],
180
  [6.0, 7.0]
181
+ ],
182
+ "a man speaks": [
183
+ [8.0, 9.5]
184
  ]
185
  }""",
186
  language="json",
187
  lines=10,
188
+ interactive=True,
189
  )
190
 
191
  clear_btn = gr.Button("🗑️ Clear JSON", size="sm")
 
193
  gr.Markdown("---")
194
 
195
  with gr.Row():
196
+ length_input = gr.Textbox(
197
+ label="⏱️ Length (seconds)",
198
+ value="10.0",
199
+ placeholder="e.g., 10.0 (optional but recommended)",
200
+ scale=2)
201
  time_control = gr.Checkbox(
202
  label="⚙️ Enable Time Control",
203
  value=True,
 
219
 
220
  <div style="background-color: #fff5e6; padding: 15px; border-radius: 8px; margin: 10px 0;">
221
 
222
+ 1. **TCC** is **required** for audio generation.
223
 
224
+ 2. **TDC (JSON)** is **optional** for precise temporal control of events.
225
 
226
+ 3. **Length** (in seconds) is optional, but recommended for temporal control. Defaults to 10.0 seconds.
227
 
228
+ 4. **Enable Time Control**: Tick to use TDC and length for precise event timing.
229
 
230
  </div>
231
 
 
235
 
236
  <div style="background-color: #f0fff0; padding: 15px; border-radius: 8px; margin: 10px 0;">
237
 
238
+ 1. Currently events with overlapped timestamps will not get good results, so we recommend generating audio **without temporal overlaps between events**.
239
 
240
+ 2. If TDC format is incorrect or length is missing, the model will generate audio **without precise temporal control**.
241
 
242
+ 3. For general audio generation without precise timing, you can leave the JSON empty.
243
+
244
+ 4. You may leave TDC blank to let the LLM generate timestamps automatically (subject to API quota).
245
 
246
  </div>
247
 
 
261
  inputs=[tcc_input, event_json, length_input, time_control],
262
  outputs=[audio_output, tdc_used]
263
  )
264
+
265
+ # Examples
266
+ gr.Markdown("## 🎯 Quick Examples")
267
+ gr.Examples(
268
+ examples=[
269
+ [
270
+ "a dog barks",
271
+ """{
272
+ "a dog barks": [
273
+ [3.0, 4.0],
274
+ [6.0, 7.0]
275
+ ]
276
+ }""",
277
+ "8.0",
278
+ True
279
+ ],
280
+ [
281
+ "door closes then car engine starts",
282
+ """{
283
+ "door closes": [
284
+ [1.0, 1.5]
285
+ ],
286
+ "car engine starts": [
287
+ [2.0, 7.0]
288
+ ]
289
+ }""",
290
+ "8.0",
291
+ True
292
+ ],
293
+ [
294
+ "birds chirping and water flowing",
295
+ """{
296
+ "birds chirping": [
297
+ [0.0, 5.0]
298
+ ],
299
+ "water flowing": [
300
+ [6.0, 9.8]
301
+ ]
302
+ }""",
303
+ "10.0",
304
+ True
305
+ ],
306
+ [
307
+ "thunder roars while heavy rain is falling",
308
+ "",
309
+ "15.0",
310
+ False
311
+ ],
312
+ [
313
+ "a gun shoots twice then a man speaks",
314
+ "",
315
+ "",
316
+ True
317
+ ]
318
+ ],
319
+ inputs=[tcc_input, event_json, length_input, time_control],
320
+ outputs=[audio_output, tdc_used],
321
+ fn=generate_audio,
322
+ cache_examples=False,
323
+ label="Click examples below to try"
324
+ )
325
 
326
  if __name__ == "__main__":
327
  demo.launch()