ACloudCenter committed
Commit c8c7d5c · Parent: 971e72e

Feat: Add title image. Add disable buttons when running.

Files changed (2)
  1. app.py +46 -7
  2. public/canary-qwen.png +3 -0
app.py CHANGED

@@ -30,7 +30,7 @@ def transcribe_audio(audio_filepath):
     if audio_filepath is None:
         return "Please upload an audio file", "", []
 
-    # Load audio with torchaudio (handles all formats)
+    # Load audio with torchaudio
     audio, sample_rate = torchaudio.load(audio_filepath)
 
     # Resample if needed
@@ -101,15 +101,30 @@ def enable_buttons():
 
 # Build the Gradio interface
 with gr.Blocks(theme=theme) as demo:
-    gr.Markdown("# Canary-Qwen Transcriber with Interactive Q&A")
-    gr.Markdown("Upload audio, or record yourself and interact with conversational Q&A")
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.HTML("""
+                <img src="./canary-qwen.png" style='height: 150px; width: 150px;'>
+            """)
+        with gr.Column(scale=4):
+            gr.Markdown("# Canary-Qwen Transcriber with Interactive Q&A")
+            gr.Markdown("## Upload audio, or record yourself then ask questions about the transcript.")
+
+    gr.Markdown('''NVIDIA NeMo Canary-Qwen-2.5B is an English speech recognition model that achieves state-of-the art
+    performance on multiple English speech benchmarks. With 2.5 billion parameters and running at 418 RTFx,
+    Canary-Qwen-2.5B supports automatic speech-to-text recognition (ASR) in English with punctuation and capitalization
+    (PnC). The model works in two modes: as a transcription tool (ASR mode) and as an LLM (LLM mode). In ASR mode, the
+    model is only capable of transcribing the speech into text, but does not retain any LLM-specific skills such as reasoning.
+    In LLM mode, the model retains all of the original LLM capabilities, which can be used to post-process the transcript, e.g.
+    summarize it or answer questions about it. In LLM mode, the model does not "understand" the raw audio anymore - only
+    its transcript. This model is ready for commercial use.''')
 
     # State variables
     transcript_state = gr.State()
 
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Audio Input")
+            gr.Markdown("### Step1 - Audio Input")
             audio_input = gr.Audio(
                 sources=["microphone", "upload"],
                 type="filepath",
@@ -118,15 +133,15 @@ with gr.Blocks(theme=theme) as demo:
             transcribe_btn = gr.Button("Transcribe Audio", variant="primary", size="lg")
 
         with gr.Column(scale=1):
-            gr.Markdown("### Transcript")
+            gr.Markdown("### Step2 - Transcript")
             transcript_output = gr.Textbox(
                 label="",
                 lines=10,
                 placeholder="Transcript will appear here after clicking 'Transcribe Audio'...",
                 max_lines=10
             )
-
-            gr.Markdown("### Interactive Q&A")
+
+            gr.Markdown("### Step3 - Interactive Q&A")
             chatbot = gr.Chatbot(
                 type="messages",
                 height=450,
@@ -158,21 +173,45 @@ with gr.Blocks(theme=theme) as demo:
 
     # Event handlers
     transcribe_btn.click(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcribe_audio,
         inputs=[audio_input],
         outputs=[transcript_output, transcript_state, chatbot]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     ask_btn.click(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcript_qa,
         inputs=[transcript_state, question_input, chatbot],
         outputs=[chatbot, question_input]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     question_input.submit(
+        fn=disable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
+    ).then(
         fn=transcript_qa,
         inputs=[transcript_state, question_input, chatbot],
         outputs=[chatbot, question_input]
+    ).then(
+        fn=enable_buttons,
+        inputs=None,
+        outputs=[transcribe_btn, ask_btn]
     )
 
     clear_chat_btn.click(
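The `disable_buttons` and `enable_buttons` helpers wired into the handlers above are not part of this diff; the `def enable_buttons():` hunk header shows they already exist in app.py. For context, here is a minimal sketch of what such helpers typically look like in Gradio, assuming they simply toggle the two buttons' interactivity with `gr.update`; the actual implementations in app.py may differ:

```python
import gradio as gr

def disable_buttons():
    # One update per output component: grey out both action buttons
    # while transcription or Q&A is running
    return gr.update(interactive=False), gr.update(interactive=False)

def enable_buttons():
    # Re-enable both action buttons once the work finishes
    return gr.update(interactive=True), gr.update(interactive=True)
```

Chained with `.then()` as in the event handlers above, each click runs the disable step first, then the long-running function, then the enable step, so the user cannot re-trigger transcription or Q&A while a request is still in flight.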
public/canary-qwen.png ADDED

Git LFS Details

  • SHA256: 6ff45be40861ac58873433c0353da69b3f6eb96d80e2c59077434bfcff2434c8
  • Pointer size: 132 Bytes
  • Size of remote file: 2.03 MB
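The new image is referenced in app.py through a raw `<img src="./canary-qwen.png">` tag inside `gr.HTML`, which relies on the file being reachable from the app's web root. As an alternative sketch only (not what this commit does), the asset could be displayed with a `gr.Image` component so that Gradio handles file serving itself; the path below assumes the repository layout added in this commit:

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            # Gradio copies the file into its cache and serves it,
            # so no manual <img> URL handling is needed
            gr.Image(
                value="public/canary-qwen.png",  # path as committed in this repo
                show_label=False,
                show_download_button=False,
                interactive=False,
                height=150,
                width=150,
            )

if __name__ == "__main__":
    demo.launch()
```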