import traceback

import gradio as gr
import numpy as np
import resampy
import soundfile as sf
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

# Sample rate (Hz) that audio is resampled to before it is handed to a processor.
TARGET_SAMPLE_RATE = 16000

# Language configuration.
# Each entry maps a display name to:
#   code      - language code passed to the model as ``tgt_lang``
#   model     - Hugging Face model id used for that language
#   available - whether transcription is offered for the language
#   message   - optional text shown to the user when the language is unavailable
LANGUAGE_CONFIG = {
    "Amharic": {
        "code": "amh",
        "model": "facebook/seamless-m4t-v2-large",
        "available": True,
    },
    "Swahili": {
        "code": "swh",
        "model": "facebook/seamless-m4t-v2-large",
        "available": True,
    },
    "Somali": {
        "code": "som",
        "model": "facebook/seamless-m4t-v2-large",
        "available": True,
    },
    "Afan Oromo": {
        "code": "gaz",  # "gaz" rather than "orm"
        "model": "facebook/seamless-m4t-v2-large",  # SeamlessM4T is used because it supports "gaz"
        "available": True,
    },
    "Tigrinya": {
        "code": "tir",
        "model": "facebook/seamless-m4t-v2-large",
        "available": False,
        "message": "Tigrinya transcription is not currently available",
    },
    "Chichewa": {
        "code": "nya",
        "model": "dmatekenya/wav2vec2-large-xls-r-300m-chichewa",
        "available": True,
    },
}

# Loaded models and processors, keyed by language display name.
models = {}
processors = {}

print("[INFO] Loading transcription models...")

# Load SeamlessM4T model for Amharic, Swahili, Somali, Afan Oromo
seamless_model_id = "facebook/seamless-m4t-v2-large"
try:
    seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
    seamless_model = AutoModelForSpeechSeq2Seq.from_pretrained(seamless_model_id).to("cpu")
    for lang, config in LANGUAGE_CONFIG.items():
        if config["available"] and config["model"] == seamless_model_id:
            models[lang] = seamless_model
            processors[lang] = seamless_processor
    print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali, Afan Oromo")
except Exception as e:
    print("[ERROR] Failed to load SeamlessM4T model:", e)
    traceback.print_exc()
    # Mirror the Chichewa failure path below: languages backed by a model that
    # failed to load must not be advertised as available.
    for config in LANGUAGE_CONFIG.values():
        if config["model"] == seamless_model_id:
            config["available"] = False

# Load Chichewa model
try:
    chichewa_processor = Wav2Vec2Processor.from_pretrained(
        "dmatekenya/wav2vec2-large-xls-r-300m-chichewa"
    )
    chichewa_model = Wav2Vec2ForCTC.from_pretrained(
        "dmatekenya/wav2vec2-large-xls-r-300m-chichewa"
    ).to("cpu")
    models["Chichewa"] = chichewa_model
    processors["Chichewa"] = chichewa_processor
    print("[SUCCESS] Chichewa model loaded successfully")
except Exception as e:
    print("[ERROR] Failed to load Chichewa model:", e)
    traceback.print_exc()
    LANGUAGE_CONFIG["Chichewa"]["available"] = False


# --- Helper: ASR ---
def transcribe_audio(audio_file, language):
    """Transcribe an audio file in the given language and return the text.

    Args:
        audio_file: Path to the audio file; read with ``soundfile.read``.
        language: Display name of the language, a key of ``LANGUAGE_CONFIG``.

    Returns:
        The stripped transcription on success. Otherwise a human-readable
        message explaining why no transcription was produced (unknown or
        unavailable language, model not loaded, missing audio, or a failure
        during processing). This function does not raise for those cases
        because its return value is shown directly to the user.
    """
    config = LANGUAGE_CONFIG.get(language)
    if config is None:
        return f"Model for {language} is not available"

    # Availability is checked before the model lookup so that a configured
    # "message" (e.g. for Tigrinya, which never gets a model) is actually shown.
    if not config["available"]:
        return config.get("message", f"{language} transcription is currently unavailable")

    if language not in models or language not in processors:
        return f"Model for {language} is not available"

    if not audio_file:
        return "Please record or upload an audio file first"

    try:
        # Read and preprocess audio
        audio, sr = sf.read(audio_file)
        if audio.ndim > 1:
            # Collapse multi-channel audio to one channel by averaging axis 1.
            audio = audio.mean(axis=1)
        if sr != TARGET_SAMPLE_RATE:
            audio = resampy.resample(audio, sr, TARGET_SAMPLE_RATE)

        model = models[language]
        processor = processors[language]

        # Handle different model types
        if language == "Chichewa":
            # Wav2Vec2 processing: pick the highest-scoring token per frame.
            inputs = processor(
                audio,
                sampling_rate=TARGET_SAMPLE_RATE,
                return_tensors="pt",
                padding=True,
            )
            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]
        else:
            # Standard SeamlessM4T processing for all other languages.
            # The keyword is ``audio`` (not ``audios``).
            inputs = processor(
                audio=audio,
                sampling_rate=TARGET_SAMPLE_RATE,
                return_tensors="pt",
            )
            with torch.no_grad():
                generated_ids = model.generate(**inputs, tgt_lang=config["code"])
            transcription = processor.batch_decode(
                generated_ids, skip_special_tokens=True
            )[0]

        return transcription.strip()
    except Exception as e:
        # Top-level boundary for the UI: log the full traceback, show a short message.
        print(f"[ERROR] ASR transcription failed for {language}:", e)
        traceback.print_exc()
        return f"Transcription failed: {str(e)[:100]}..."
# --- Beautiful Gradio UI ---

# Language pre-selected in the dropdown when the page loads.
DEFAULT_LANGUAGE = "Swahili"

CUSTOM_CSS = """
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
}
.header {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 15px;
    margin-bottom: 20px;
    color: white;
}
.language-card {
    background: white;
    padding: 15px;
    border-radius: 10px;
    margin: 10px 0;
    border-left: 4px solid #667eea;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.unavailable {
    background: #ffebee;
    border-left: 4px solid #f44336;
}
.available {
    background: #e8f5e8;
    border-left: 4px solid #4caf50;
}
"""

# NOTE(review): the HTML tags in the snippets below were reconstructed from the
# CSS classes declared in CUSTOM_CSS (.header, .language-card, .available,
# .unavailable); confirm the exact markup against the intended design.
HEADER_HTML = """
<div class="header">
    <h1>🌍 GihonTech Multilingual Speech Recognition</h1>
    <p>Transcribe audio in multiple African languages with state-of-the-art AI models</p>
</div>
"""

# Kept at column 0 so Markdown never interprets leading indentation as a code block.
ABOUT_MARKDOWN = """
---
### ℹ️ About This Service

**Powered by:**
- Facebook SeamlessM4T
- Hugging Face Transformers
- Specialized African Language Models

**Supported Languages & Codes:**
- Amharic (amh)
- Swahili (swh)
- Somali (som)
- Afan Oromo (gaz)
- Chichewa (nya)

**Supported Formats:** WAV, MP3, M4A, FLAC

**Maximum Duration:** 30 seconds per audio

*For best results, use clear audio with minimal background noise*
"""


def _status_html(text, available):
    """Wrap a status text in a card styled as available or unavailable."""
    css_class = "available" if available else "unavailable"
    return f'<div class="language-card {css_class}">{text}</div>'


def _language_card_html(lang, config):
    """Build the info card for one ``LANGUAGE_CONFIG`` entry."""
    if config["available"]:
        status_class = "available"
        status_text = "🟢 Available"
        model_info = config["model"]
    else:
        status_class = "unavailable"
        status_text = "🔴 Not Available"
        model_info = config.get("message", "Not available")
    return f"""
<div class="language-card {status_class}">
    <h4>{lang} <span>{status_text}</span></h4>
    <p><strong>Model:</strong> {model_info}</p>
    <p><strong>Language Code:</strong> {config['code']}</p>
</div>
"""


# Event handlers
def update_status(language):
    """Return the status-indicator HTML for the selected language.

    Args:
        language: Display name of the language, a key of ``LANGUAGE_CONFIG``.

    Returns:
        An HTML snippet: the language's configured "message" (or a generic
        unavailable notice) when the language is not available, otherwise a
        "Ready to transcribe" notice.
    """
    config = LANGUAGE_CONFIG[language]
    if not config["available"]:
        message = config.get(
            "message", f"{language} transcription is currently unavailable"
        )
        return _status_html(f"⛔ {message}", available=False)
    return _status_html("✅ Ready to transcribe", available=True)


with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="green",
    ),
    title="🌍 GihonTech - Multilingual Speech Recognition",
    css=CUSTOM_CSS,
) as demo:
    # Header Section
    with gr.Row():
        with gr.Column():
            gr.HTML(HEADER_HTML)

    # Main Content
    with gr.Row():
        # Input Section
        with gr.Column(scale=1):
            gr.Markdown("### 🎤 Upload Audio")
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Audio",
                elem_classes="audio-input",
            )
            language_select = gr.Dropdown(
                choices=list(LANGUAGE_CONFIG),
                value=DEFAULT_LANGUAGE,
                label="Select Language",
                info="Choose the language of your audio",
            )
            submit_btn = gr.Button(
                "🎯 Transcribe Audio",
                variant="primary",
                size="lg",
            )

        # Output Section
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Transcription Result")
            transcription_output = gr.Textbox(
                label="Transcribed Text",
                placeholder="Your transcription will appear here...",
                lines=5,
                show_copy_button=True,
            )
            # Status indicator: derived from the default language so it is
            # correct even if that language's model failed to load.
            status_indicator = gr.HTML(update_status(DEFAULT_LANGUAGE))

    # Language Information Section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🌐 Supported Languages")
            for lang, config in LANGUAGE_CONFIG.items():
                gr.HTML(_language_card_html(lang, config))

    # Footer
    with gr.Row():
        with gr.Column():
            gr.Markdown(ABOUT_MARKDOWN)

    # Connect events
    language_select.change(
        fn=update_status,
        inputs=[language_select],
        outputs=status_indicator,
    )
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_select],
        outputs=transcription_output,
    ).then(
        # Re-evaluate the status for the selected language instead of always
        # showing "Ready", which was wrong for unavailable languages.
        fn=update_status,
        inputs=[language_select],
        outputs=status_indicator,
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )