"""Gradio app that reports the language spoken in an uploaded audio file.

Pipeline: the audio is transcribed with Whisper-Tiny, and the resulting
text is classified by an XLM-RoBERTa language-detection model.
"""

import os
import tempfile

import gradio as gr
import torch
import torchaudio
import numpy as np
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import XLMRobertaTokenizerFast, XLMRobertaForSequenceClassification
import soundfile as sf

# Sample rate (Hz) that audio is resampled to before being passed to Whisper.
TARGET_SAMPLE_RATE = 16000

# --- Load models ---
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

lang_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    "papluca/xlm-roberta-base-language-detection"
)
lang_model = XLMRobertaForSequenceClassification.from_pretrained(
    "papluca/xlm-roberta-base-language-detection"
)


# --- Convert audio to text ---
def audio_to_text(audio_path):
    """Transcribe the audio file at *audio_path* with Whisper-Tiny.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The decoded transcription with surrounding whitespace stripped
        (may be an empty string).
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio returns (channels, frames). Average the channels so the
    # processor always receives one 1-D waveform; a bare squeeze() would
    # leave stereo input 2-D and it would be misread downstream.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    # squeeze(0) drops only the channel axis, so very short clips stay 1-D.
    input_features = whisper_processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    ).input_features

    predicted_ids = whisper_model.generate(input_features)
    transcription = whisper_processor.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]
    return transcription.strip()


# --- Detect language from text ---
def detect_language(text):
    """Classify the language of *text* and format the result for display.

    Args:
        text: The text to classify.

    Returns:
        A string containing the predicted label and its softmax
        probability expressed as a percentage.
    """
    inputs = lang_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = lang_model(**inputs).logits

    # Work on the first (only) row for both the index and the confidence so
    # the two values always refer to the same sample.
    probs = torch.nn.functional.softmax(logits, dim=1)[0]
    pred_idx = probs.argmax().item()
    pred_label = lang_model.config.id2label[pred_idx]
    confidence = probs[pred_idx].item() * 100
    return f"🌐 Language: {pred_label} | Confidence: {confidence:.2f}%"


# --- Gradio function ---
def detect_language_from_audio(audio_file):
    """Gradio callback: detect the language spoken in *audio_file*.

    Args:
        audio_file: Path to the uploaded audio file, or ``None`` when
            nothing was selected.

    Returns:
        The formatted detection result, or a message prefixed with "❌"
        when no file was given, no text could be extracted, or an
        exception occurred while processing.
    """
    if audio_file is None:
        return "❌ No file selected."

    try:
        # Re-encode the upload as a 16 kHz WAV. A per-call temporary
        # directory keeps concurrent requests from overwriting each other's
        # file and guarantees it is removed afterwards.
        data, sr = librosa.load(audio_file, sr=TARGET_SAMPLE_RATE)
        with tempfile.TemporaryDirectory() as tmp_dir:
            temp_wav = os.path.join(tmp_dir, "temp.wav")
            sf.write(temp_wav, data, sr)

            # Step 1: Convert audio to text
            text = audio_to_text(temp_wav)

        if not text:
            return "❌ Failed to extract text from audio."

        # Step 2: Detect language
        return detect_language(text)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return f"❌ Runtime error: {str(e)}"


# --- Gradio Interface ---
iface = gr.Interface(
    fn=detect_language_from_audio,
    inputs=gr.Audio(type="filepath", label="Choose Audio File (WAV/MP3)"),
    outputs=gr.Textbox(label="Result"),
    title="🎙️ Fast Voice Language Detector",
    description="Upload an audio file and get the detected language instantly using Whisper-Tiny + XLM-Roberta.",
)

# --- Entry point ---
if __name__ == "__main__":
    iface.launch()