Spaces:

minte-atnafu
/

GihonTech_Local_Language_Transcription

Sleeping

App Files Files Community

Minte committed on Oct 8

Commit

d191a12

1 Parent(s): 8f055e9

Enhance multilingual ASR functionality with improved language configuration and model loading

Browse files

Files changed (1) hide show

app.py +248 -41

app.py CHANGED Viewed

@@ -2,89 +2,296 @@ import traceback
 import soundfile as sf
 import torch
 import numpy as np
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import gradio as gr
 import resampy
-# Language code mapping
-LANGUAGE_CODES = {
-    "Amharic": "amh",
-    "Swahili": "swh",
-    "Somali": "som",
-    "Afan Oromo": "orm",
-    "Tigrinya": "tir",
-    "Chichewa": "nya"
 }
-# --- Load ASR model ---
 try:
-    model_id = "facebook/seamless-m4t-v2-large"
-    processor = AutoProcessor.from_pretrained(model_id)
-    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to("cpu")
-    print("[INFO] ASR model loaded successfully.")
 except Exception as e:
-    print("[ERROR] Failed to load ASR model:", e)
     traceback.print_exc()
-    asr_model = None
-    processor = None
 # --- Helper: ASR ---
 def transcribe_audio(audio_file, language):
-    if asr_model is None or processor is None:
-        return "ASR Model loading failed"
     try:
-        # Get language code
-        lang_code = LANGUAGE_CODES.get(language)
-        if not lang_code:
-            return f"Unsupported language: {language}"
         # Read and preprocess audio
         audio, sr = sf.read(audio_file)
         if audio.ndim > 1:
             audio = audio.mean(axis=1)
         audio = resampy.resample(audio, sr, 16000)
-        # Process with model
-        inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
-        with torch.no_grad():
-            generated_ids = asr_model.generate(**inputs, tgt_lang=lang_code)
-        # Decode the transcription
-        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         return transcription.strip()
     except Exception as e:
         print(f"[ERROR] ASR transcription failed for {language}:", e)
         traceback.print_exc()
-        return f"ASR failed: {str(e)[:50]}..."
-# --- Gradio UI ---
-with gr.Blocks(title="🌍 Multilingual ASR") as demo:
-    gr.Markdown("# 🌍 Multilingual Speech Recognition")
-    gr.Markdown("Transcribe audio in Amharic, Swahili, Somali, Afan Oromo, Tigrinya, or Chichewa")
     with gr.Row():
         with gr.Column():
-            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or upload audio")
             language_select = gr.Dropdown(
-                choices=list(LANGUAGE_CODES.keys()),
                 value="Swahili",
-                label="Select Language"
             )
-    submit_btn = gr.Button("Transcribe", variant="primary")
     with gr.Row():
         with gr.Column():
-            transcription_output = gr.Textbox(label="Transcription")
     submit_btn.click(
         fn=transcribe_audio,
         inputs=[audio_input, language_select],
         outputs=transcription_output
     )
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import soundfile as sf
 import torch
 import numpy as np
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Wav2Vec2ForCTC, Wav2Vec2Processor
 import gradio as gr
 import resampy
+# Language configuration
+# Maps each language name shown in the UI dropdown to its settings:
+#   "code"      - language code passed as tgt_lang to generate() for the
+#                 sequence-to-sequence models (not used by the Chichewa CTC path)
+#   "model"     - model repository id associated with the language
+#   "available" - whether transcription is offered; may be switched to False
+#                 at startup when a model fails to load
+#   "message"   - optional text shown to the user when the language is unavailable
+LANGUAGE_CONFIG = {
+    "Amharic": {
+        "code": "amh",
+        "model": "facebook/seamless-m4t-v2-large",
+        "available": True
+    },
+    "Swahili": {
+        "code": "swh",
+        "model": "facebook/seamless-m4t-v2-large",
+        "available": True
+    },
+    "Somali": {
+        "code": "som",
+        "model": "facebook/seamless-m4t-v2-large",
+        "available": True
+    },
+    "Afan Oromo": {
+        "code": "orm",
+        "model": "osanseviero/seamless-copy",
+        "available": True
+    },
+    "Tigrinya": {
+        "code": "tir",
+        "model": "facebook/seamless-m4t-v2-large",
+        "available": False,
+        "message": "Tigrinya transcription is not currently available"
+    },
+    "Chichewa": {
+        "code": "nya",
+        "model": "dmatekenya/wav2vec2-large-xls-r-300m-chichewa",
+        "available": True
+    }
 }
+# Initialize models
+# Both registries are keyed by the language name used in LANGUAGE_CONFIG.
+# A language only gets an entry once its model and processor loaded successfully.
+models = {}
+processors = {}
+print("[INFO] Loading transcription models...")
+# Load SeamlessM4T model for Amharic, Swahili, Somali
+# Defined outside the try block so the failure handler can reference it.
+seamless_model_id = "facebook/seamless-m4t-v2-large"
+try:
+    seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
+    seamless_model = AutoModelForSpeechSeq2Seq.from_pretrained(seamless_model_id).to("cpu")
+    # One shared model/processor instance serves every available language
+    # that is configured with the SeamlessM4T repository.
+    for lang, config in LANGUAGE_CONFIG.items():
+        if config["available"] and config["model"] == seamless_model_id:
+            models[lang] = seamless_model
+            processors[lang] = seamless_processor
+    print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali")
+except Exception as e:
+    print("[ERROR] Failed to load SeamlessM4T model:", e)
+    traceback.print_exc()
+    # Mirror the Afan Oromo / Chichewa handlers below: without this the UI
+    # would still advertise these languages as available although no model
+    # was registered for them.
+    for config in LANGUAGE_CONFIG.values():
+        if config["model"] == seamless_model_id:
+            config["available"] = False
+# Load Afan Oromo model
+# The repository id comes from LANGUAGE_CONFIG so it is defined in one place.
+oromo_model_id = LANGUAGE_CONFIG["Afan Oromo"]["model"]
 try:
+    oromo_processor = AutoProcessor.from_pretrained(oromo_model_id)
+    oromo_model = AutoModelForSpeechSeq2Seq.from_pretrained(oromo_model_id).to("cpu")
+    models["Afan Oromo"] = oromo_model
+    processors["Afan Oromo"] = oromo_processor
+    print("[SUCCESS] Afan Oromo model loaded successfully")
 except Exception as e:
+    print("[ERROR] Failed to load Afan Oromo model:", e)
     traceback.print_exc()
+    LANGUAGE_CONFIG["Afan Oromo"]["available"] = False
+# Load Chichewa model
+chichewa_model_id = LANGUAGE_CONFIG["Chichewa"]["model"]
+try:
+    chichewa_processor = Wav2Vec2Processor.from_pretrained(chichewa_model_id)
+    chichewa_model = Wav2Vec2ForCTC.from_pretrained(chichewa_model_id).to("cpu")
+    models["Chichewa"] = chichewa_model
+    processors["Chichewa"] = chichewa_processor
+    print("[SUCCESS] Chichewa model loaded successfully")
+except Exception as e:
+    print("[ERROR] Failed to load Chichewa model:", e)
+    traceback.print_exc()
+    LANGUAGE_CONFIG["Chichewa"]["available"] = False
 # --- Helper: ASR ---
 def transcribe_audio(audio_file, language):
+    """Transcribe an audio file in the selected language.
+
+    Args:
+        audio_file: Path to the audio file to transcribe (the UI passes the
+            value of the audio component, which may be empty).
+        language: Language name; expected to be a key of LANGUAGE_CONFIG.
+
+    Returns:
+        The stripped transcription on success, otherwise a human-readable
+        message explaining why no transcription was produced. Errors are
+        reported through the return value; nothing is raised to the caller.
+    """
+    config = LANGUAGE_CONFIG.get(language)
+    if config is None:
+        return f"Model for {language} is not available"
+    # Availability must be checked before the model registry: a disabled
+    # language never gets a registry entry, so testing the registry first
+    # would hide its configured message (e.g. the Tigrinya notice).
+    if not config["available"]:
+        return config.get("message", f"{language} transcription is currently unavailable")
+    if language not in models or language not in processors:
+        return f"Model for {language} is not available"
+    if not audio_file:
+        # Nothing was recorded or uploaded; say so instead of surfacing the
+        # reader's exception text as a "Transcription failed" message.
+        return "Please record or upload an audio file first"
     try:
         # Read and preprocess audio
         audio, sr = sf.read(audio_file)
         if audio.ndim > 1:
+            # Down-mix multi-channel audio by averaging the channels.
             audio = audio.mean(axis=1)
         audio = resampy.resample(audio, sr, 16000)
+        model = models[language]
+        processor = processors[language]
+        # Dispatch on the model type rather than the language name
+        if isinstance(model, Wav2Vec2ForCTC):
+            # Wav2Vec2 processing: greedy decode of the per-frame logits
+            inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
+            with torch.no_grad():
+                logits = model(**inputs).logits
+            predicted_ids = torch.argmax(logits, dim=-1)
+            transcription = processor.batch_decode(predicted_ids)[0]
+        else:
+            # Sequence-to-sequence processing (SeamlessM4T and seamless-copy
+            # previously used two identical branches).
+            inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
+            with torch.no_grad():
+                generated_ids = model.generate(**inputs, tgt_lang=config["code"])
+            transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         return transcription.strip()
     except Exception as e:
         print(f"[ERROR] ASR transcription failed for {language}:", e)
         traceback.print_exc()
+        return f"Transcription failed: {str(e)[:100]}..."
+# --- Beautiful Gradio UI ---
+with gr.Blocks(
+    theme=gr.themes.Soft(
+        primary_hue="blue",
+        secondary_hue="green",
+    ),
+    title="🌍 GihonTech - Multilingual Speech Recognition",
+    css="""
+    .gradio-container {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    }
+    .header {
+        text-align: center;
+        padding: 20px;
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 15px;
+        margin-bottom: 20px;
+        color: white;
+    }
+    .language-card {
+        background: white;
+        padding: 15px;
+        border-radius: 10px;
+        margin: 10px 0;
+        border-left: 4px solid #667eea;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    }
+    .unavailable {
+        background: #ffebee;
+        border-left: 4px solid #f44336;
+    }
+    .available {
+        background: #e8f5e8;
+        border-left: 4px solid #4caf50;
+    }
+    """
+) as demo:
+    # Header Section
     with gr.Row():
         with gr.Column():
+            gr.HTML("""
+            <div class="header">
+                <h1>🌍 GihonTech Multilingual Speech Recognition</h1>
+                <p>Transcribe audio in multiple African languages with state-of-the-art AI models</p>
+            </div>
+            """)
+    # Main Content
+    with gr.Row():
+        # Input Section
+        with gr.Column(scale=1):
+            gr.Markdown("### 🎤 Upload Audio")
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
+                type="filepath",
+                label="Record or Upload Audio",
+                elem_classes="audio-input"
+            )
             language_select = gr.Dropdown(
+                choices=list(LANGUAGE_CONFIG.keys()),
                 value="Swahili",
+                label="Select Language",
+                info="Choose the language of your audio"
+            )
+            submit_btn = gr.Button(
+                "🎯 Transcribe Audio",
+                variant="primary",
+                size="lg"
             )
+        # Output Section
+        with gr.Column(scale=1):
+            gr.Markdown("### 📝 Transcription Result")
+            transcription_output = gr.Textbox(
+                label="Transcribed Text",
+                placeholder="Your transcription will appear here...",
+                lines=5,
+                show_copy_button=True
+            )
+            # Status indicator
+            status_indicator = gr.HTML("""
+            <div style="text-align: center; padding: 10px;">
+                <span style="color: #4caf50;">✅ Ready to transcribe</span>
+            </div>
+            """)
+    # Language Information Section
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### 🌐 Supported Languages")
+            for lang, config in LANGUAGE_CONFIG.items():
+                status_class = "unavailable" if not config["available"] else "available"
+                status_text = "🔴 Not Available" if not config["available"] else "🟢 Available"
+                model_info = config["model"] if config["available"] else config.get("message", "Not available")
+                gr.HTML(f"""
+                <div class="language-card {status_class}">
+                    <h4>{lang} {status_text}</h4>
+                    <p><strong>Model:</strong> {model_info}</p>
+                </div>
+                """)
+    # Footer
     with gr.Row():
         with gr.Column():
+            gr.Markdown("""
+            ---
+            ### ℹ️ About This Service
+            **Powered by:**
+            - Facebook SeamlessM4T
+            - Hugging Face Transformers
+            - Specialized African Language Models
+            **Supported Formats:** WAV, MP3, M4A, FLAC
+            **Maximum Duration:** 30 seconds per audio
+            *For best results, use clear audio with minimal background noise*
+            """)
+    # Event handlers
+    def update_status(language):
+        """Return the status-banner HTML for the given language.
+
+        Shows a red banner with the language's configured "message" (or a
+        generic notice when none is configured) if the language is flagged
+        unavailable in LANGUAGE_CONFIG, otherwise the green "ready" banner.
+        """
+        config = LANGUAGE_CONFIG[language]
+        if not config["available"]:
+            reason = config.get("message", f"{language} transcription is currently unavailable")
+            return f'<div style="text-align: center; padding: 10px; background: #ffebee; border-radius: 5px;"><span style="color: #f44336;">⛔ {reason}</span></div>'
+        return '<div style="text-align: center; padding: 10px; background: #e8f5e8; border-radius: 5px;"><span style="color: #4caf50;">✅ Ready to transcribe</span></div>'
+    # Connect events
+    language_select.change(
+        fn=update_status,
+        inputs=[language_select],
+        outputs=status_indicator
+    )
     submit_btn.click(
         fn=transcribe_audio,
         inputs=[audio_input, language_select],
         outputs=transcription_output
+    ).then(
+        # Recompute the banner from the selected language after each run; an
+        # unconditional "Ready" banner would be wrong for unavailable languages.
+        fn=update_status,
+        inputs=[language_select],
+        outputs=status_indicator
     )
 if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )