PicoAudio2

Sleeping

App Files Files Community

wsntxxn committed on Oct 13

Commit

893c559

1 Parent(s): cd13388

Add examples

Browse files

Files changed (1) hide show

app.py +85 -15

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import spaces
 import gradio as gr
 import os
@@ -7,11 +8,10 @@ import soundfile as sf
 import numpy as np
 from pathlib import Path
 from transformers import AutoModel
-#from utils.llm import get_time_info
 from utils.llm_xiapi import get_time_info
-#retry1
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)
 def is_tdc_format_valid(tdc_str):
     try:
         for event_onset in tdc_str.split('--'):
@@ -153,7 +153,7 @@ A brief text description for the overall audio scene.
     ]
 }</pre>
-It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the events of `a man speaks` happen from 5.0 to 6.0 seconds.
 </div>
@@ -167,22 +167,25 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
         with gr.Column():
             tcc_input = gr.Textbox(
                 label="🎯 TCC (Temporal Coarse Caption) - Required",
-                value="a dog barks",
                 placeholder="e.g., a dog barks and a man speaks",
                 lines=2
             )
             event_json = gr.Code(
-                label="Event Timestamp JSON",
                 value="""{
     "a dog barks": [
         [3.0, 4.0],
         [6.0, 7.0]
     ]
 }""",
                 language="json",
                 lines=10,
-                interactive=True
             )
             clear_btn = gr.Button("🗑️ Clear JSON", size="sm")
@@ -190,7 +193,11 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
             gr.Markdown("---")
             with gr.Row():
-                length_input = gr.Textbox(label="⏱️ Length (seconds)", value="10.0", scale=2)
                 time_control = gr.Checkbox(
                     label="⚙️ Enable Time Control",
                     value=True,
@@ -212,13 +219,13 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
 <div style="background-color: #fff5e6; padding: 15px; border-radius: 8px; margin: 10px 0;">
-**TCC** is **required** for audio generation.
-**TDC (JSON)** is **optional** for precise temporal control of events.
-**Length** (in seconds) is optional, but recommended for temporal control. Defaults to 10.0 seconds.
-**Enable Time Control**: Tick to use TDC and length for precise event timing.
 </div>
@@ -228,11 +235,13 @@ It means the events of `a dog barks` happen from 3.0 to 4.0 seconds, and the eve
 <div style="background-color: #f0fff0; padding: 15px; border-radius: 8px; margin: 10px 0;">
-If TDC format is incorrect or length is missing, the model will generate audio **without precise temporal control**.
-For general audio generation without precise timing, you can leave the JSON empty.
-You may leave TDC blank to let the LLM generate timestamps automatically (subject to API quota).
 </div>
@@ -252,6 +261,67 @@ You may leave TDC blank to let the LLM generate timestamps automatically (subjec
         inputs=[tcc_input, event_json, length_input, time_control],
         outputs=[audio_output, tdc_used]
     )
 if __name__ == "__main__":
     demo.launch()

+from textwrap import indent
 import spaces
 import gradio as gr
 import os
 import numpy as np
 from pathlib import Path
 from transformers import AutoModel
 from utils.llm_xiapi import get_time_info
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)
 def is_tdc_format_valid(tdc_str):
     try:
         for event_onset in tdc_str.split('--'):
     ]
 }</pre>
+It means the event `a dog barks` happens from 3.0 to 4.0 seconds, and the event `a man speaks` happens from 5.0 to 6.0 seconds.
 </div>
         with gr.Column():
             tcc_input = gr.Textbox(
                 label="🎯 TCC (Temporal Coarse Caption) - Required",
+                value="a dog barks and a man speaks",
                 placeholder="e.g., a dog barks and a man speaks",
                 lines=2
             )
             event_json = gr.Code(
+                label="📋 TDC (Event Timestamp JSON) - Optional",
                 value="""{
     "a dog barks": [
         [3.0, 4.0],
         [6.0, 7.0]
+    ],
+    "a man speaks": [
+        [8.0, 9.5]
     ]
 }""",
                 language="json",
                 lines=10,
+                interactive=True,
             )
             clear_btn = gr.Button("🗑️ Clear JSON", size="sm")
             gr.Markdown("---")
             with gr.Row():
+                length_input = gr.Textbox(
+                    label="⏱️ Length (seconds)",
+                    value="10.0",
+                    placeholder="e.g., 10.0 (optional but recommended)",
+                    scale=2)
                 time_control = gr.Checkbox(
                     label="⚙️ Enable Time Control",
                     value=True,
 <div style="background-color: #fff5e6; padding: 15px; border-radius: 8px; margin: 10px 0;">
+1. **TCC** is **required** for audio generation.
+2. **TDC (JSON)** is **optional** for precise temporal control of events.
+3. **Length** (in seconds) is optional, but recommended for temporal control. Defaults to 10.0 seconds.
+4. **Enable Time Control**: Tick to use TDC and length for precise event timing.
 </div>
 <div style="background-color: #f0fff0; padding: 15px; border-radius: 8px; margin: 10px 0;">
+1. Currently events with overlapped timestamps will not get good results, so we recommend generating audio **without temporal overlaps between events**.
+2. If TDC format is incorrect or length is missing, the model will generate audio **without precise temporal control**.
+3. For general audio generation without precise timing, you can leave the JSON empty.
+4. You may leave TDC blank to let the LLM generate timestamps automatically (subject to API quota).
 </div>
         inputs=[tcc_input, event_json, length_input, time_control],
         outputs=[audio_output, tdc_used]
     )
+    # Examples
+    gr.Markdown("## 🎯 Quick Examples")
+    gr.Examples(
+        examples=[
+            [
+                "a dog barks",
+                """{
+    "a dog barks": [
+        [3.0, 4.0],
+        [6.0, 7.0]
+    ]
+}""",
+                "8.0",
+                True
+            ],
+            [
+                "door closes then car engine starts",
+                """{
+    "door closes": [
+        [1.0, 1.5]
+    ],
+    "car engine starts": [
+        [2.0, 7.0]
+    ]
+}""",
+                "8.0",
+                True
+            ],
+            [
+                "birds chirping and water flowing",
+                """{
+    "birds chirping": [
+        [0.0, 5.0]
+    ],
+    "water flowing": [
+        [6.0, 9.8]
+    ]
+}""",
+                "10.0",
+                True
+            ],
+            [
+                "thunder roars while heavy rain is falling",
+                "",
+                "15.0",
+                False
+            ],
+            [
+                "a gun shoots twice then a man speaks",
+                "",
+                "",
+                True
+            ]
+        ],
+        inputs=[tcc_input, event_json, length_input, time_control],
+        outputs=[audio_output, tdc_used],
+        fn=generate_audio,
+        cache_examples=False,
+        label="Click examples below to try"
+    )
 if __name__ == "__main__":
     demo.launch()