import traceback

import gradio as gr
import numpy as np
import resampy
import soundfile as sf
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# Sampling rate (Hz) that audio is resampled to before it is handed to either
# processor.
TARGET_SAMPLE_RATE = 16000

# Per-language configuration.
#   "code"      - language code passed as ``tgt_lang`` to the SeamlessM4T model.
#   "model"     - Hugging Face model id that serves this language.
#   "available" - whether transcription is offered for this language.
#   "message"   - optional text returned to the user when unavailable.
LANGUAGE_CONFIG = {
    "Amharic": {
        "code": "amh",
        "model": "facebook/seamless-m4t-v2-large",
        "available": True,
    },
    "Swahili": {
        "code": "swh",
        "model": "facebook/seamless-m4t-v2-large",
        "available": True,
    },
    "Somali": {
        "code": "som",
        "model": "facebook/seamless-m4t-v2-large",
        "available": True,
    },
    "Afan Oromo": {
        # "gaz" (not "orm") is the code used for Afan Oromo with SeamlessM4T.
        "code": "gaz",
        "model": "facebook/seamless-m4t-v2-large",
        "available": True,
    },
    "Tigrinya": {
        "code": "tir",
        "model": "facebook/seamless-m4t-v2-large",
        "available": False,
        "message": "Tigrinya transcription is not currently available",
    },
    "Chichewa": {
        "code": "nya",
        "model": "dmatekenya/wav2vec2-large-xls-r-300m-chichewa",
        "available": True,
    },
}

# Registries filled in by the loaders below, keyed by language display name.
# A language missing from these dicts has no usable model.
models = {}
processors = {}

print("[INFO] Loading transcription models...")

# Load SeamlessM4T once and share the same model/processor objects between
# every available language that is configured to use it.
# NOTE(review): unlike the Chichewa loader below, a failure here leaves the
# SeamlessM4T languages flagged as available; transcribe_audio then reports
# "Model for <language> is not available" for them.
seamless_model_id = "facebook/seamless-m4t-v2-large"
try:
    seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
    seamless_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        seamless_model_id
    ).to("cpu")
    for lang, config in LANGUAGE_CONFIG.items():
        if config["available"] and config["model"] == seamless_model_id:
            models[lang] = seamless_model
            processors[lang] = seamless_processor
    print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali, Afan Oromo")
except Exception as e:
    print("[ERROR] Failed to load SeamlessM4T model:", e)
    traceback.print_exc()

# Load the dedicated Wav2Vec2 CTC model for Chichewa. On failure the language
# is flagged unavailable so the rest of the app can report it.
chichewa_model_id = LANGUAGE_CONFIG["Chichewa"]["model"]
try:
    chichewa_processor = Wav2Vec2Processor.from_pretrained(chichewa_model_id)
    chichewa_model = Wav2Vec2ForCTC.from_pretrained(chichewa_model_id).to("cpu")
    models["Chichewa"] = chichewa_model
    processors["Chichewa"] = chichewa_processor
    print("[SUCCESS] Chichewa model loaded successfully")
except Exception as e:
    print("[ERROR] Failed to load Chichewa model:", e)
    traceback.print_exc()
    LANGUAGE_CONFIG["Chichewa"]["available"] = False


# --- Helper: ASR ---
def _load_audio(audio_file):
    """Read ``audio_file`` and return it as mono audio at TARGET_SAMPLE_RATE."""
    audio, sr = sf.read(audio_file)
    if audio.ndim > 1:
        # Down-mix multi-channel audio by averaging across axis 1.
        audio = audio.mean(axis=1)
    if sr != TARGET_SAMPLE_RATE:
        audio = resampy.resample(audio, sr, TARGET_SAMPLE_RATE)
    return audio


def transcribe_audio(audio_file, language: str) -> str:
    """Transcribe ``audio_file`` in ``language`` and return the text.

    Args:
        audio_file: Audio input accepted by ``soundfile.read``. ``None`` is
            treated as "no audio supplied".
        language: A key of ``LANGUAGE_CONFIG`` (e.g. ``"Amharic"``).

    Returns:
        The stripped transcription on success. Otherwise a human-readable
        message: the language is unknown or has no loaded model, the language
        is flagged unavailable (its configured ``"message"`` is used when
        present), no audio was supplied, or transcription raised (the error
        text is truncated to 100 characters). Exceptions raised while reading
        or transcribing are caught, logged to stdout, and reported as text.
    """
    lang_config = LANGUAGE_CONFIG.get(language)
    if lang_config is None:
        return f"Model for {language} is not available"

    # Availability must be checked before the model lookup: unavailable
    # languages are never registered in ``models``, so the reverse order made
    # this branch (and the configured message) unreachable.
    if not lang_config["available"]:
        return lang_config.get(
            "message", f"{language} transcription is currently unavailable"
        )

    if language not in models or language not in processors:
        return f"Model for {language} is not available"

    if audio_file is None:
        return "No audio provided"

    try:
        audio = _load_audio(audio_file)
        model = models[language]
        processor = processors[language]

        if language == "Chichewa":
            # Wav2Vec2 CTC: take the arg-max token per frame, then decode.
            inputs = processor(
                audio,
                sampling_rate=TARGET_SAMPLE_RATE,
                return_tensors="pt",
                padding=True,
            )
            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
        else:
            # SeamlessM4T: generate text in the configured target language.
            # The processor keyword is ``audio`` (not ``audios``).
            inputs = processor(
                audio=audio,
                sampling_rate=TARGET_SAMPLE_RATE,
                return_tensors="pt",
            )
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs, tgt_lang=lang_config["code"]
                )
            transcription = processor.batch_decode(
                generated_ids, skip_special_tokens=True
            )[0]

        return transcription.strip()
    except Exception as e:
        # Top-level boundary for the UI: log the full traceback and return a
        # short message instead of propagating.
        print(f"[ERROR] ASR transcription failed for {language}:", e)
        traceback.print_exc()
        return f"Transcription failed: {str(e)[:100]}..."
# --- Beautiful Gradio UI --- with gr.Blocks( theme=gr.themes.Soft( primary_hue="blue", secondary_hue="green", ), title="đ GihonTech - Multilingual Speech Recognition", css=""" .gradio-container { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); } .header { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 20px; color: white; } .language-card { background: white; padding: 15px; border-radius: 10px; margin: 10px 0; border-left: 4px solid #667eea; box-shadow: 0 2px 4px rgba(0,0,0,0.1); } .unavailable { background: #ffebee; border-left: 4px solid #f44336; } .available { background: #e8f5e8; border-left: 4px solid #4caf50; } """ ) as demo: # Header Section with gr.Row(): with gr.Column(): gr.HTML("""
Transcribe audio in multiple African languages with state-of-the-art AI models
Model: {model_info}
Language Code: {config['code']}