Commit · c8c7d5c
Parent(s): 971e72e

Feat: Add title image. Disable buttons while running.

Files changed:
- app.py (+46, -7)
- public/canary-qwen.png (+3, -0)
app.py
CHANGED
@@ -30,7 +30,7 @@ def transcribe_audio(audio_filepath):
     if audio_filepath is None:
         return "Please upload an audio file", "", []
 
-    # Load audio with torchaudio
+    # Load audio with torchaudio
     audio, sample_rate = torchaudio.load(audio_filepath)
 
     # Resample if needed
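Only the torchaudio.load call is visible in this hunk; the resampling that the trailing comment refers to sits outside the diff context. For reference, a minimal sketch of the usual load-and-resample pattern, assuming a hypothetical 16 kHz mono target (neither the real target rate nor the channel handling is shown in this commit):

```python
import torchaudio

def load_and_resample(audio_filepath, target_sr=16000):
    # torchaudio.load returns a (channels, samples) float tensor and the file's sample rate.
    audio, sample_rate = torchaudio.load(audio_filepath)
    # Mix down to mono if needed (assumption: the ASR model expects a single channel).
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)
    # Resample only if the source rate differs from the target.
    if sample_rate != target_sr:
        audio = torchaudio.functional.resample(audio, orig_freq=sample_rate, new_freq=target_sr)
    return audio, target_sr
```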
@@ -101,15 +101,30 @@ def enable_buttons():
 
 # Build the Gradio interface
 with gr.Blocks(theme=theme) as demo:
-    gr.
-
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.HTML("""
+                <img src="./canary-qwen.png" style='height: 150px; width: 150px;'>
+            """)
+        with gr.Column(scale=4):
+            gr.Markdown("# Canary-Qwen Transcriber with Interactive Q&A")
+            gr.Markdown("## Upload audio, or record yourself then ask questions about the transcript.")
+
+            gr.Markdown('''NVIDIA NeMo Canary-Qwen-2.5B is an English speech recognition model that achieves state-of-the art
+            performance on multiple English speech benchmarks. With 2.5 billion parameters and running at 418 RTFx,
+            Canary-Qwen-2.5B supports automatic speech-to-text recognition (ASR) in English with punctuation and capitalization
+            (PnC). The model works in two modes: as a transcription tool (ASR mode) and as an LLM (LLM mode). In ASR mode, the
+            model is only capable of transcribing the speech into text, but does not retain any LLM-specific skills such as reasoning.
+            In LLM mode, the model retains all of the original LLM capabilities, which can be used to post-process the transcript, e.g.
+            summarize it or answer questions about it. In LLM mode, the model does not "understand" the raw audio anymore - only
+            its transcript. This model is ready for commercial use.''')
 
     # State variables
     transcript_state = gr.State()
 
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Audio Input")
+            gr.Markdown("### Step1 - Audio Input")
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
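The new header row embeds the logo with raw HTML pointing at ./canary-qwen.png, while the asset added in this commit lives under public/; how that path is exposed to the browser is not visible in the diff. A common alternative that avoids raw HTML and static-file routing is gr.Image pointed at the file on disk; a sketch of that variant (not what this commit does):

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            # gr.Image serves a local file directly and scales it to the given height.
            gr.Image("public/canary-qwen.png", height=150, show_label=False,
                     show_download_button=False, container=False)
        with gr.Column(scale=4):
            gr.Markdown("# Canary-Qwen Transcriber with Interactive Q&A")
```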
@@ -118,15 +133,15 @@ with gr.Blocks(theme=theme) as demo:
             transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
 
         with gr.Column(scale=1):
-            gr.Markdown("### Transcript")
+            gr.Markdown("### Step2 - Transcript")
             transcript_output = gr.Textbox(
                 label="",
                 lines=10,
                 placeholder="Transcript will appear here after clicking 'Transcribe Audio'...",
                 max_lines=10
             )
-
-            gr.Markdown("### Interactive Q&A")
+
+            gr.Markdown("### Step3 - Interactive Q&A")
             chatbot = gr.Chatbot(
                 type="messages",
                 height=450,
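Because the chatbot is created with type="messages", the history that transcript_qa receives and returns is a list of role/content dictionaries rather than tuples. A sketch of one hypothetical Q&A turn in that format (transcript_qa itself is defined earlier in app.py and is not part of this hunk; transcript_qa_sketch below is illustrative only):

```python
# Gradio "messages" chat history: OpenAI-style dictionaries.
history = [
    {"role": "user", "content": "What topics does the speaker cover?"},
    {"role": "assistant", "content": "Mainly the new transcription workflow."},
]

def transcript_qa_sketch(transcript, question, history):
    # Hypothetical shape of the handler: append the question, answer from the
    # transcript (model call omitted), and clear the question textbox.
    history = history + [{"role": "user", "content": question}]
    answer = "..."  # produced by the model in LLM mode
    history = history + [{"role": "assistant", "content": answer}]
    return history, ""
```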
@@ -158,21 +173,45 @@ with gr.Blocks(theme=theme) as demo:
 
     # Event handlers
     transcribe_btn.click(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcribe_audio,
         inputs=[audio_input],
         outputs=[transcript_output, transcript_state, chatbot]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     ask_btn.click(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcript_qa,
         inputs=[transcript_state, question_input, chatbot],
         outputs=[chatbot, question_input]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     question_input.submit(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcript_qa,
         inputs=[transcript_state, question_input, chatbot],
         outputs=[chatbot, question_input]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     clear_chat_btn.click(
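disable_buttons and enable_buttons are defined earlier in app.py (the second hunk header above shows enable_buttons) and their bodies are not part of this diff. Chaining them with .click(...).then(...) runs the three steps sequentially, so both buttons stay greyed out while transcription or Q&A is in flight. A plausible sketch of such helpers, assuming they simply toggle the interactive flag (an assumption, since the real implementation is outside the diff):

```python
import gradio as gr

def disable_buttons():
    # One update per output component: [transcribe_btn, ask_btn].
    return gr.update(interactive=False), gr.update(interactive=False)

def enable_buttons():
    # Re-enable both buttons after the main step finishes.
    return gr.update(interactive=True), gr.update(interactive=True)
```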
public/canary-qwen.png
ADDED (Git LFS)