ACloudCenter committed
Commit c8c7d5c · Parent: 971e72e

Feat: Add title image. Add disable buttons when running.

Files changed (2)
  1. app.py +46 -7
  2. public/canary-qwen.png +3 -0
app.py CHANGED

@@ -30,7 +30,7 @@ def transcribe_audio(audio_filepath):
     if audio_filepath is None:
         return "Please upload an audio file", "", []
 
-    # Load audio with torchaudio (handles all formats)
+    # Load audio with torchaudio
     audio, sample_rate = torchaudio.load(audio_filepath)
 
     # Resample if needed
@@ -101,15 +101,30 @@ def enable_buttons():
 
 # Build the Gradio interface
 with gr.Blocks(theme=theme) as demo:
-    gr.Markdown("# Canary-Qwen Transcriber with Interactive Q&A")
-    gr.Markdown("Upload audio, or record yourself and interact with conversational Q&A")
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.HTML("""
+                <img src="./canary-qwen.png" style='height: 150px; width: 150px;'>
+            """)
+        with gr.Column(scale=4):
+            gr.Markdown("# Canary-Qwen Transcriber with Interactive Q&A")
+            gr.Markdown("## Upload audio, or record yourself then ask questions about the transcript.")
+
+    gr.Markdown('''NVIDIA NeMo Canary-Qwen-2.5B is an English speech recognition model that achieves state-of-the art
+    performance on multiple English speech benchmarks. With 2.5 billion parameters and running at 418 RTFx,
+    Canary-Qwen-2.5B supports automatic speech-to-text recognition (ASR) in English with punctuation and capitalization
+    (PnC). The model works in two modes: as a transcription tool (ASR mode) and as an LLM (LLM mode). In ASR mode, the
+    model is only capable of transcribing the speech into text, but does not retain any LLM-specific skills such as reasoning.
+    In LLM mode, the model retains all of the original LLM capabilities, which can be used to post-process the transcript, e.g.
+    summarize it or answer questions about it. In LLM mode, the model does not "understand" the raw audio anymore - only
+    its transcript. This model is ready for commercial use.''')
 
     # State variables
     transcript_state = gr.State()
 
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Audio Input")
+            gr.Markdown("### Step1 - Audio Input")
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
@@ -118,15 +133,15 @@ with gr.Blocks(theme=theme) as demo:
             transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
 
         with gr.Column(scale=1):
-            gr.Markdown("### Transcript")
+            gr.Markdown("### Step2 - Transcript")
             transcript_output = gr.Textbox(
                 label="",
                 lines=10,
                 placeholder="Transcript will appear here after clicking 'Transcribe Audio'...",
                 max_lines=10
             )
-
-            gr.Markdown("### Interactive Q&A")
+
+            gr.Markdown("### Step3 - Interactive Q&A")
             chatbot = gr.Chatbot(
                 type="messages",
                 height=450,
@@ -158,21 +173,45 @@ with gr.Blocks(theme=theme) as demo:
 
     # Event handlers
     transcribe_btn.click(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcribe_audio,
         inputs=[audio_input],
         outputs=[transcript_output, transcript_state, chatbot]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     ask_btn.click(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcript_qa,
         inputs=[transcript_state, question_input, chatbot],
         outputs=[chatbot, question_input]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     question_input.submit(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcript_qa,
         inputs=[transcript_state, question_input, chatbot],
         outputs=[chatbot, question_input]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     clear_chat_btn.click(
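The `disable_buttons` and `enable_buttons` helpers wired into the handlers above are not part of this diff; the `def enable_buttons():` hunk header shows they already exist in app.py. For context, here is a minimal sketch of what such helpers typically look like in Gradio, assuming they simply toggle the two buttons' interactivity with `gr.update`; the actual implementations in app.py may differ:

```python
import gradio as gr

def disable_buttons():
    # One update per output component: grey out both action buttons
    # while transcription or Q&A is running
    return gr.update(interactive=False), gr.update(interactive=False)

def enable_buttons():
    # Re-enable both action buttons once the work finishes
    return gr.update(interactive=True), gr.update(interactive=True)
```

Chained with `.then()` as in the event handlers above, each click runs the disable step first, then the long-running function, then the enable step, so the user cannot re-trigger transcription or Q&A while a request is still in flight.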
public/canary-qwen.png ADDED

Git LFS Details

  • SHA256: 6ff45be40861ac58873433c0353da69b3f6eb96d80e2c59077434bfcff2434c8
  • Pointer size: 132 Bytes
  • Size of remote file: 2.03 MB
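The new image is referenced in app.py through a raw `<img src="./canary-qwen.png">` tag inside `gr.HTML`, which relies on the file being reachable from the app's web root. As an alternative sketch only (not what this commit does), the asset could be displayed with a `gr.Image` component so that Gradio handles file serving itself; the path below assumes the repository layout added in this commit:

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            # Gradio copies the file into its cache and serves it,
            # so no manual <img> URL handling is needed
            gr.Image(
                value="public/canary-qwen.png",  # path as committed in this repo
                show_label=False,
                show_download_button=False,
                interactive=False,
                height=150,
                width=150,
            )

if __name__ == "__main__":
    demo.launch()
```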