Minte committed on
Commit
133a63b
·
1 Parent(s): e61d7b5

Fix Afan Oromo language configuration and model loading

Browse files
Files changed (1) hide show
  1. app.py +15 -39
app.py CHANGED
@@ -6,7 +6,7 @@ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Wav2Vec2ForCT
6
  import gradio as gr
7
  import resampy
8
 
9
- # Language configuration
10
  LANGUAGE_CONFIG = {
11
  "Amharic": {
12
  "code": "amh",
@@ -24,8 +24,8 @@ LANGUAGE_CONFIG = {
24
  "available": True
25
  },
26
  "Afan Oromo": {
27
- "code": "orm",
28
- "model": "osanseviero/seamless-copy",
29
  "available": True
30
  },
31
  "Tigrinya": {
@@ -47,7 +47,7 @@ processors = {}
47
 
48
  print("[INFO] Loading transcription models...")
49
 
50
- # Load SeamlessM4T model for Amharic, Swahili, Somali
51
  try:
52
  seamless_model_id = "facebook/seamless-m4t-v2-large"
53
  seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
@@ -58,23 +58,11 @@ try:
58
  models[lang] = seamless_model
59
  processors[lang] = seamless_processor
60
 
61
- print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali")
62
  except Exception as e:
63
  print("[ERROR] Failed to load SeamlessM4T model:", e)
64
  traceback.print_exc()
65
 
66
- # Load Afan Oromo model - FIXED IMPLEMENTATION
67
- try:
68
- oromo_processor = AutoProcessor.from_pretrained("osanseviero/seamless-copy")
69
- oromo_model = AutoModelForSpeechSeq2Seq.from_pretrained("osanseviero/seamless-copy").to("cpu")
70
- models["Afan Oromo"] = oromo_model
71
- processors["Afan Oromo"] = oromo_processor
72
- print("[SUCCESS] Afan Oromo model loaded successfully")
73
- except Exception as e:
74
- print("[ERROR] Failed to load Afan Oromo model:", e)
75
- traceback.print_exc()
76
- LANGUAGE_CONFIG["Afan Oromo"]["available"] = False
77
-
78
  # Load Chichewa model
79
  try:
80
  chichewa_processor = Wav2Vec2Processor.from_pretrained("dmatekenya/wav2vec2-large-xls-r-300m-chichewa")
@@ -116,29 +104,9 @@ def transcribe_audio(audio_file, language):
116
  predicted_ids = torch.argmax(logits, dim=-1)
117
  transcription = processor.batch_decode(predicted_ids)[0]
118
 
119
- elif language == "Afan Oromo":
120
- # FIXED: Afan Oromo uses different processing
121
- # The seamless-copy model might work differently
122
- try:
123
- # Try without tgt_lang first
124
- inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt") # FIXED: audio instead of audios
125
- with torch.no_grad():
126
- generated_ids = model.generate(**inputs)
127
- transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
128
- except Exception as oromo_error:
129
- print(f"[WARNING] Afan Oromo standard processing failed: {oromo_error}")
130
- # Fallback: try with text generation
131
- try:
132
- inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
133
- with torch.no_grad():
134
- outputs = model(**inputs)
135
- transcription = processor.decode(outputs.logits.argmax(dim=-1)[0])
136
- except Exception as fallback_error:
137
- transcription = f"Afan Oromo transcription failed: {str(fallback_error)[:100]}"
138
-
139
  else:
140
- # Standard SeamlessM4T processing - FIXED: audio instead of audios
141
- inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt") # FIXED HERE
142
  with torch.no_grad():
143
  generated_ids = model.generate(**inputs, tgt_lang=LANGUAGE_CONFIG[language]["code"])
144
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -255,6 +223,7 @@ with gr.Blocks(
255
  <div class="language-card {status_class}">
256
  <h4>{lang} {status_text}</h4>
257
  <p><strong>Model:</strong> {model_info}</p>
 
258
  </div>
259
  """)
260
 
@@ -270,6 +239,13 @@ with gr.Blocks(
270
  - Hugging Face Transformers
271
  - Specialized African Language Models
272
 
 
 
 
 
 
 
 
273
  **Supported Formats:** WAV, MP3, M4A, FLAC
274
  **Maximum Duration:** 30 seconds per audio
275
 
 
6
  import gradio as gr
7
  import resampy
8
 
9
+ # Language configuration - UPDATED with correct Afan Oromo code
10
  LANGUAGE_CONFIG = {
11
  "Amharic": {
12
  "code": "amh",
 
24
  "available": True
25
  },
26
  "Afan Oromo": {
27
+ "code": "gaz", # FIXED: Changed from "orm" to "gaz"
28
+ "model": "facebook/seamless-m4t-v2-large", # Using SeamlessM4T since it supports gaz
29
  "available": True
30
  },
31
  "Tigrinya": {
 
47
 
48
  print("[INFO] Loading transcription models...")
49
 
50
+ # Load SeamlessM4T model for Amharic, Swahili, Somali, Afan Oromo
51
  try:
52
  seamless_model_id = "facebook/seamless-m4t-v2-large"
53
  seamless_processor = AutoProcessor.from_pretrained(seamless_model_id)
 
58
  models[lang] = seamless_model
59
  processors[lang] = seamless_processor
60
 
61
+ print("[SUCCESS] SeamlessM4T model loaded for Amharic, Swahili, Somali, Afan Oromo")
62
  except Exception as e:
63
  print("[ERROR] Failed to load SeamlessM4T model:", e)
64
  traceback.print_exc()
65
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # Load Chichewa model
67
  try:
68
  chichewa_processor = Wav2Vec2Processor.from_pretrained("dmatekenya/wav2vec2-large-xls-r-300m-chichewa")
 
104
  predicted_ids = torch.argmax(logits, dim=-1)
105
  transcription = processor.batch_decode(predicted_ids)[0]
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  else:
108
+ # Standard SeamlessM4T processing for all other languages
109
+ inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt") # Fixed: audio instead of audios
110
  with torch.no_grad():
111
  generated_ids = model.generate(**inputs, tgt_lang=LANGUAGE_CONFIG[language]["code"])
112
  transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
223
  <div class="language-card {status_class}">
224
  <h4>{lang} {status_text}</h4>
225
  <p><strong>Model:</strong> {model_info}</p>
226
+ <p><strong>Language Code:</strong> {config['code']}</p>
227
  </div>
228
  """)
229
 
 
239
  - Hugging Face Transformers
240
  - Specialized African Language Models
241
 
242
+ **Supported Languages & Codes:**
243
+ - Amharic (amh)
244
+ - Swahili (swh)
245
+ - Somali (som)
246
+ - Afan Oromo (gaz)
247
+ - Chichewa (nya)
248
+
249
  **Supported Formats:** WAV, MP3, M4A, FLAC
250
  **Maximum Duration:** 30 seconds per audio
251