import json
import logging
import subprocess
import sys

import gradio as gr
import vosk
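# Gradio demo: record audio from the microphone, pick a language, and get an
# offline transcription from the matching Vosk model, with ffmpeg handling
# audio decoding.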
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
LARGE_MODEL_BY_LANGUAGE = {
    "Russian": {"model_id": "vosk-model-ru-0.42"},
    "Chinese": {"model_id": "vosk-model-cn-0.22"},
    "English": {"model_id": "vosk-model-en-us-0.22"},
    "French": {"model_id": "vosk-model-fr-0.22"},
    "German": {"model_id": "vosk-model-de-0.22"},
    "Italian": {"model_id": "vosk-model-it-0.22"},
    "Japanese": {"model_id": "vosk-model-ja-0.22"},
    "Hindi": {"model_id": "vosk-model-hi-0.22"},
    "Persian": {"model_id": "vosk-model-fa-0.5"},
    "Uzbek": {"model_id": "vosk-model-small-uz-0.22"},
}
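# Model names follow the Vosk model list (https://alphacephei.com/vosk/models);
# vosk.Model(model_name=...) downloads a model on first use and caches it
# locally (typically under ~/.cache/vosk).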
LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE.keys())
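# Loaded vosk.Model instances are large, so keep them in memory after the
# first load instead of re-reading them from disk for every request.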
CACHED_MODELS_BY_ID = {}
def asr(model, input_file):
    # Vosk expects 16 kHz mono 16-bit PCM audio; ffmpeg converts whatever the
    # browser recorded and streams the raw samples to stdout.
    rec = vosk.KaldiRecognizer(model, 16000.0)
    results = []
    # Pass the command as an argument list rather than splitting a formatted
    # string, so input paths containing spaces don't break the invocation.
    process = subprocess.Popen(
        ["ffmpeg", "-loglevel", "quiet", "-i", input_file,
         "-ar", "16000", "-ac", "1", "-f", "s16le", "-"],
        stdout=subprocess.PIPE,
    )
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        # AcceptWaveform returns True at an utterance boundary; Result()
        # then yields the finalized text for that segment.
        if rec.AcceptWaveform(data):
            jres = json.loads(rec.Result())
            results.append(jres["text"])
    # Flush the recognizer and reap the ffmpeg process.
    jres = json.loads(rec.FinalResult())
    results.append(jres["text"])
    process.wait()
    return " ".join(results)
def run(input_file, language, history):
    logger.info(f"Running ASR for {language} for {input_file}")
    history = history or []
    model = LARGE_MODEL_BY_LANGUAGE.get(language)
    if model is None:
        history.append({
            "error_message": f"Failed to find a model for the {language} language :("
        })
    elif input_file is None:
        history.append({
            "error_message": "Record input audio first"
        })
    else:
        # The first request per language is slow (model download and load);
        # later requests reuse the cached instance.
        model_instance = CACHED_MODELS_BY_ID.get(model["model_id"])
        if model_instance is None:
            model_instance = vosk.Model(model_name=model["model_id"])
            CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
        transcription = asr(model_instance, input_file)
        logger.info(f"Transcription for {input_file}: {transcription}")
        history.append({
            "model_id": model["model_id"],
            "language": language,
            "transcription": transcription,
            "error_message": None,
        })
    # Render the accumulated history as a column of styled result bubbles.
    html_output = "<div class='result'>"
    for item in history:
        if item["error_message"] is not None:
            html_output += f"<div class='result_item result_item_error'>{item['error_message']}</div>"
        else:
            html_output += "<div class='result_item result_item_success'>"
            html_output += f'{item["transcription"]}<br/>'
            html_output += "</div>"
    html_output += "</div>"
    return html_output, history
gr.Interface(
    run,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
        gr.inputs.Radio(label="Language", choices=LANGUAGES),
        "state",
    ],
    outputs=[
        gr.outputs.HTML(label="Outputs"),
        "state",
    ],
    title="Automatic Speech Recognition",
    description="",
    css="""
    .result {display:flex;flex-direction:column}
    .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
    .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
    .result_item_error {background-color:#ff7070;color:white;align-self:start}
    """,
    allow_flagging="never",
    theme="default",
).launch(enable_queue=True)