Spaces:

minte-atnafu
/

GihonTech_Local_Language_Transcription

Sleeping

App Files Files Community

Minte committed on Oct 8

Commit

133a63b

1 Parent(s): e61d7b5

Fix Afan Oromo language configuration and model loading

Browse files

Files changed (1) hide show

app.py +15 -39

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Wav2Vec2ForCT
 import gradio as gr
 import resampy
-# Language configuration
 LANGUAGE_CONFIG = {
     "Amharic": {
         "code": "amh",
@@ -24,8 +24,8 @@ LANGUAGE_CONFIG = {
         "available": True
     },
     "Afan Oromo": {
-        "code": "orm",
-        "model": "osanseviero/seamless-copy",
         "available": True
     },
     "Tigrinya": {
@@ -47,7 +47,7 @@ processors = {}
 print("[INFO] Loading transcription models...")
-# Load SeamlessM4T model for Amharic, Swahili, Somali
 try:
     seamless_model_id = "facebook/seamless-m4t-v2-large"
     seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
@@ -58,23 +58,11 @@ try:
             models[lang] = seamless_model
             processors[lang] = seamless_processor
-    print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali")
 except Exception as e:
     print("[ERROR] Failed to load SeamlessM4T model:", e)
     traceback.print_exc()
-# Load Afan Oromo model - FIXED IMPLEMENTATION
-try:
-    oromo_processor = AutoProcessor.from_pretrained("osanseviero/seamless-copy")
-    oromo_model = AutoModelForSpeechSeq2Seq.from_pretrained("osanseviero/seamless-copy").to("cpu")
-    models["Afan Oromo"] = oromo_model
-    processors["Afan Oromo"] = oromo_processor
-    print("[SUCCESS] Afan Oromo model loaded successfully")
-except Exception as e:
-    print("[ERROR] Failed to load Afan Oromo model:", e)
-    traceback.print_exc()
-    LANGUAGE_CONFIG["Afan Oromo"]["available"] = False
 # Load Chichewa model
 try:
     chichewa_processor = Wav2Vec2Processor.from_pretrained("dmatekenya/wav2vec2-large-xls-r-300m-chichewa")
@@ -116,29 +104,9 @@ def transcribe_audio(audio_file, language):
             predicted_ids = torch.argmax(logits, dim=-1)
             transcription = processor.batch_decode(predicted_ids)[0]
-        elif language == "Afan Oromo":
-            # FIXED: Afan Oromo uses different processing
-            # The seamless-copy model might work differently
-            try:
-                # Try without tgt_lang first
-                inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")  # FIXED: audio instead of audios
-                with torch.no_grad():
-                    generated_ids = model.generate(**inputs)
-                transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            except Exception as oromo_error:
-                print(f"[WARNING] Afan Oromo standard processing failed: {oromo_error}")
-                # Fallback: try with text generation
-                try:
-                    inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
-                    with torch.no_grad():
-                        outputs = model(**inputs)
-                    transcription = processor.decode(outputs.logits.argmax(dim=-1)[0])
-                except Exception as fallback_error:
-                    transcription = f"Afan Oromo transcription failed: {str(fallback_error)[:100]}"
         else:
-            # Standard SeamlessM4T processing - FIXED: audio instead of audios
-            inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")  # FIXED HERE
             with torch.no_grad():
                 generated_ids = model.generate(**inputs, tgt_lang=LANGUAGE_CONFIG[language]["code"])
             transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -255,6 +223,7 @@ with gr.Blocks(
                 <div class="language-card {status_class}">
                     <h4>{lang} {status_text}</h4>
                     <p><strong>Model:</strong> {model_info}</p>
                 </div>
                 """)
@@ -270,6 +239,13 @@ with gr.Blocks(
             - Hugging Face Transformers
             - Specialized African Language Models
             **Supported Formats:** WAV, MP3, M4A, FLAC
             **Maximum Duration:** 30 seconds per audio

 import gradio as gr
 import resampy
+# Language configuration - UPDATED with correct Afan Oromo code
 LANGUAGE_CONFIG = {
     "Amharic": {
         "code": "amh",
         "available": True
     },
     "Afan Oromo": {
+        "code": "gaz",  # FIXED: Changed from "orm" to "gaz"
+        "model": "facebook/seamless-m4t-v2-large",  # Using SeamlessM4T since it supports gaz
         "available": True
     },
     "Tigrinya": {
 print("[INFO] Loading transcription models...")
+# Load SeamlessM4T model for Amharic, Swahili, Somali, Afan Oromo
 try:
     seamless_model_id = "facebook/seamless-m4t-v2-large"
     seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
             models[lang] = seamless_model
             processors[lang] = seamless_processor
+    print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali, Afan Oromo")
 except Exception as e:
     print("[ERROR] Failed to load SeamlessM4T model:", e)
     traceback.print_exc()
 # Load Chichewa model
 try:
     chichewa_processor = Wav2Vec2Processor.from_pretrained("dmatekenya/wav2vec2-large-xls-r-300m-chichewa")
             predicted_ids = torch.argmax(logits, dim=-1)
             transcription = processor.batch_decode(predicted_ids)[0]
         else:
+            # Standard SeamlessM4T processing for all other languages
+            inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")  # Fixed: audio instead of audios
             with torch.no_grad():
                 generated_ids = model.generate(**inputs, tgt_lang=LANGUAGE_CONFIG[language]["code"])
             transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                 <div class="language-card {status_class}">
                     <h4>{lang} {status_text}</h4>
                     <p><strong>Model:</strong> {model_info}</p>
+                    <p><strong>Language Code:</strong> {config['code']}</p>
                 </div>
                 """)
             - Hugging Face Transformers
             - Specialized African Language Models
+            **Supported Languages & Codes:**
+            - Amharic (amh)
+            - Swahili (swh)
+            - Somali (som)
+            - Afan Oromo (gaz)
+            - Chichewa (nya)
             **Supported Formats:** WAV, MP3, M4A, FLAC
             **Maximum Duration:** 30 seconds per audio