Minte
committed on
Commit
·
133a63b
1
Parent(s):
e61d7b5
Fix Afan Oromo language configuration and model loading
Browse files
app.py
CHANGED
|
@@ -6,7 +6,7 @@ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Wav2Vec2ForCT
|
|
| 6 |
import gradio as gr
|
| 7 |
import resampy
|
| 8 |
|
| 9 |
-
# Language configuration
|
| 10 |
LANGUAGE_CONFIG = {
|
| 11 |
"Amharic": {
|
| 12 |
"code": "amh",
|
|
@@ -24,8 +24,8 @@ LANGUAGE_CONFIG = {
|
|
| 24 |
"available": True
|
| 25 |
},
|
| 26 |
"Afan Oromo": {
|
| 27 |
-
"code": "orm"
|
| 28 |
-
"model": "
|
| 29 |
"available": True
|
| 30 |
},
|
| 31 |
"Tigrinya": {
|
|
@@ -47,7 +47,7 @@ processors = {}
|
|
| 47 |
|
| 48 |
print("[INFO] Loading transcription models...")
|
| 49 |
|
| 50 |
-
# Load SeamlessM4T model for Amharic, Swahili, Somali
|
| 51 |
try:
|
| 52 |
seamless_model_id = "facebook/seamless-m4t-v2-large"
|
| 53 |
seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
|
|
@@ -58,23 +58,11 @@ try:
|
|
| 58 |
models[lang] = seamless_model
|
| 59 |
processors[lang] = seamless_processor
|
| 60 |
|
| 61 |
-
print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali")
|
| 62 |
except Exception as e:
|
| 63 |
print("[ERROR] Failed to load SeamlessM4T model:", e)
|
| 64 |
traceback.print_exc()
|
| 65 |
|
| 66 |
-
# Load Afan Oromo model - FIXED IMPLEMENTATION
|
| 67 |
-
try:
|
| 68 |
-
oromo_processor = AutoProcessor.from_pretrained("osanseviero/seamless-copy")
|
| 69 |
-
oromo_model = AutoModelForSpeechSeq2Seq.from_pretrained("osanseviero/seamless-copy").to("cpu")
|
| 70 |
-
models["Afan Oromo"] = oromo_model
|
| 71 |
-
processors["Afan Oromo"] = oromo_processor
|
| 72 |
-
print("[SUCCESS] Afan Oromo model loaded successfully")
|
| 73 |
-
except Exception as e:
|
| 74 |
-
print("[ERROR] Failed to load Afan Oromo model:", e)
|
| 75 |
-
traceback.print_exc()
|
| 76 |
-
LANGUAGE_CONFIG["Afan Oromo"]["available"] = False
|
| 77 |
-
|
| 78 |
# Load Chichewa model
|
| 79 |
try:
|
| 80 |
chichewa_processor = Wav2Vec2Processor.from_pretrained("dmatekenya/wav2vec2-large-xls-r-300m-chichewa")
|
|
@@ -116,29 +104,9 @@ def transcribe_audio(audio_file, language):
|
|
| 116 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 117 |
transcription = processor.batch_decode(predicted_ids)[0]
|
| 118 |
|
| 119 |
-
elif language == "Afan Oromo":
|
| 120 |
-
# FIXED: Afan Oromo uses different processing
|
| 121 |
-
# The seamless-copy model might work differently
|
| 122 |
-
try:
|
| 123 |
-
# Try without tgt_lang first
|
| 124 |
-
inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt") # FIXED: audio instead of audios
|
| 125 |
-
with torch.no_grad():
|
| 126 |
-
generated_ids = model.generate(**inputs)
|
| 127 |
-
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 128 |
-
except Exception as oromo_error:
|
| 129 |
-
print(f"[WARNING] Afan Oromo standard processing failed: {oromo_error}")
|
| 130 |
-
# Fallback: try with text generation
|
| 131 |
-
try:
|
| 132 |
-
inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
|
| 133 |
-
with torch.no_grad():
|
| 134 |
-
outputs = model(**inputs)
|
| 135 |
-
transcription = processor.decode(outputs.logits.argmax(dim=-1)[0])
|
| 136 |
-
except Exception as fallback_error:
|
| 137 |
-
transcription = f"Afan Oromo transcription failed: {str(fallback_error)[:100]}"
|
| 138 |
-
|
| 139 |
else:
|
| 140 |
-
# Standard SeamlessM4T processing
|
| 141 |
-
inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt") #
|
| 142 |
with torch.no_grad():
|
| 143 |
generated_ids = model.generate(**inputs, tgt_lang=LANGUAGE_CONFIG[language]["code"])
|
| 144 |
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
|
@@ -255,6 +223,7 @@ with gr.Blocks(
|
|
| 255 |
<div class="language-card {status_class}">
|
| 256 |
<h4>{lang} {status_text}</h4>
|
| 257 |
<p><strong>Model:</strong> {model_info}</p>
|
|
|
|
| 258 |
</div>
|
| 259 |
""")
|
| 260 |
|
|
@@ -270,6 +239,13 @@ with gr.Blocks(
|
|
| 270 |
- Hugging Face Transformers
|
| 271 |
- Specialized African Language Models
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
**Supported Formats:** WAV, MP3, M4A, FLAC
|
| 274 |
**Maximum Duration:** 30 seconds per audio
|
| 275 |
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
import resampy
|
| 8 |
|
| 9 |
+
# Language configuration - UPDATED with correct Afan Oromo code
|
| 10 |
LANGUAGE_CONFIG = {
|
| 11 |
"Amharic": {
|
| 12 |
"code": "amh",
|
|
|
|
| 24 |
"available": True
|
| 25 |
},
|
| 26 |
"Afan Oromo": {
|
| 27 |
+
"code": "gaz", # FIXED: Changed from "orm" to "gaz"
|
| 28 |
+
"model": "facebook/seamless-m4t-v2-large", # Using SeamlessM4T since it supports gaz
|
| 29 |
"available": True
|
| 30 |
},
|
| 31 |
"Tigrinya": {
|
|
|
|
| 47 |
|
| 48 |
print("[INFO] Loading transcription models...")
|
| 49 |
|
| 50 |
+
# Load SeamlessM4T model for Amharic, Swahili, Somali, Afan Oromo
|
| 51 |
try:
|
| 52 |
seamless_model_id = "facebook/seamless-m4t-v2-large"
|
| 53 |
seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
|
|
|
|
| 58 |
models[lang] = seamless_model
|
| 59 |
processors[lang] = seamless_processor
|
| 60 |
|
| 61 |
+
print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali, Afan Oromo")
|
| 62 |
except Exception as e:
|
| 63 |
print("[ERROR] Failed to load SeamlessM4T model:", e)
|
| 64 |
traceback.print_exc()
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# Load Chichewa model
|
| 67 |
try:
|
| 68 |
chichewa_processor = Wav2Vec2Processor.from_pretrained("dmatekenya/wav2vec2-large-xls-r-300m-chichewa")
|
|
|
|
| 104 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 105 |
transcription = processor.batch_decode(predicted_ids)[0]
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
else:
|
| 108 |
+
# Standard SeamlessM4T processing for all other languages
|
| 109 |
+
inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt") # Fixed: audio instead of audios
|
| 110 |
with torch.no_grad():
|
| 111 |
generated_ids = model.generate(**inputs, tgt_lang=LANGUAGE_CONFIG[language]["code"])
|
| 112 |
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
|
|
|
| 223 |
<div class="language-card {status_class}">
|
| 224 |
<h4>{lang} {status_text}</h4>
|
| 225 |
<p><strong>Model:</strong> {model_info}</p>
|
| 226 |
+
<p><strong>Language Code:</strong> {config['code']}</p>
|
| 227 |
</div>
|
| 228 |
""")
|
| 229 |
|
|
|
|
| 239 |
- Hugging Face Transformers
|
| 240 |
- Specialized African Language Models
|
| 241 |
|
| 242 |
+
**Supported Languages & Codes:**
|
| 243 |
+
- Amharic (amh)
|
| 244 |
+
- Swahili (swh)
|
| 245 |
+
- Somali (som)
|
| 246 |
+
- Afan Oromo (gaz)
|
| 247 |
+
- Chichewa (nya)
|
| 248 |
+
|
| 249 |
**Supported Formats:** WAV, MP3, M4A, FLAC
|
| 250 |
**Maximum Duration:** 30 seconds per audio
|
| 251 |
|