"""Streamlit app: Urdu speech-to-text using a fine-tuned Whisper Turbo model.

Uploads an audio file, splits it into 30-second chunks, transcribes each
chunk with the Hugging Face ASR pipeline, and shows/downloads the result.
"""

import math
import os
import tempfile

import streamlit as st
import torch
from pydub import AudioSegment
from transformers import pipeline

st.set_page_config(page_title="Urdu Speech-to-Text", layout="centered")
st.title("🎙️ Urdu Speech-to-Text (Whisper Turbo Urdu)")


# --------------------------
# GPU / CPU detection
# --------------------------
def get_device():
    """Return the transformers device index: 0 for a CUDA GPU, -1 for CPU.

    Also surfaces the choice in the UI so the user knows which mode is active.
    """
    if torch.cuda.is_available():
        st.success("GPU active ✓ (Fast Mode)")
        return 0
    st.warning("GPU not available – switching to CPU (slow mode)")
    return -1


# --------------------------
# Load ASR Model
# --------------------------
@st.cache_resource
def load_asr(device):
    """Load and cache the Urdu Whisper ASR pipeline for *device*.

    float16 is used on GPU for speed/memory; float32 on CPU, where half
    precision is poorly supported.
    """
    return pipeline(
        task="automatic-speech-recognition",
        model="kingabzpro/whisper-large-v3-turbo-urdu",
        return_timestamps=True,
        chunk_length_s=30,
        stride_length_s=5,
        torch_dtype=torch.float16 if device == 0 else torch.float32,
        device=device,
    )


# --------------------------
# Chunk Transcription
# --------------------------
def transcribe_in_chunks(asr, audio_path):
    """Transcribe *audio_path* in 30-second pieces, updating a progress bar.

    Parameters
    ----------
    asr : transformers ASR pipeline returned by ``load_asr``.
    audio_path : path to an audio file readable by pydub/ffmpeg.

    Returns the concatenated transcript as a single stripped string.
    """
    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # audio duration in milliseconds
    chunk_ms = 30 * 1000   # 30-second chunks
    total_chunks = math.ceil(total_ms / chunk_ms)

    st.info(f"⏳ Estimated chunks: {total_chunks}")
    progress = st.progress(0)

    pieces = []
    for i in range(total_chunks):
        start = i * chunk_ms
        end = min((i + 1) * chunk_ms, total_ms)
        segment = audio[start:end]

        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            segment.export(tmp.name, format="wav")
            chunk_path = tmp.name

        try:
            # Chunking/timestamp options were set at pipeline construction,
            # so they are not repeated here.
            result = asr(chunk_path)
            pieces.append(result["text"])
        finally:
            # Delete each temp WAV so long audios don't leak disk space.
            os.remove(chunk_path)

        progress.progress((i + 1) / total_chunks)

    return " ".join(pieces).strip()


# --------------------------
# APP UI
# --------------------------
uploaded_file = st.file_uploader("Upload audio", type=["mp3", "wav", "m4a", "ogg"])

if uploaded_file:
    # Keep only the extension as the suffix so pydub/ffmpeg can sniff the format;
    # fall back to .wav if the name has no extension.
    suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.read())
        audio_path = tmp.name  # fixed: this assignment was broken across lines

    st.success("✔ Audio uploaded")

    device = get_device()
    asr = load_asr(device)

    st.info("⏳ Transcribing audio…")
    try:
        transcript = transcribe_in_chunks(asr, audio_path)
    finally:
        # Remove the uploaded temp file once transcription is done.
        os.remove(audio_path)

    st.subheader("📝 Urdu Transcription")
    st.write(transcript)
    st.download_button("Download Text", transcript, "urdu_transcription.txt")