Minte committed
Commit cd5cc96 · 1 Parent(s): 3edf9d4

ASR for Local Languages

Files changed (3)
  1. .gitignore +0 -0
  2. app.py +90 -0
  3. requirements.txt +12 -0
.gitignore ADDED
File without changes
app.py ADDED
@@ -0,0 +1,90 @@
+ import traceback
+ import soundfile as sf
+ import torch
+ import numpy as np
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+ import gradio as gr
+ import resampy
+
+ # Language code mapping
+ LANGUAGE_CODES = {
+     "Amharic": "amh",
+     "Swahili": "swh",
+     "Somali": "som",
+     "Afan Oromo": "orm",
+     "Tigrinya": "tir",
+     "Chichewa": "nya"
+ }
+
+ # --- Load ASR model ---
+ try:
+     model_id = "facebook/seamless-m4t-v2-large"
+     processor = AutoProcessor.from_pretrained(model_id)
+     asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to("cpu")
+     print("[INFO] ASR model loaded successfully.")
+ except Exception as e:
+     print("[ERROR] Failed to load ASR model:", e)
+     traceback.print_exc()
+     asr_model = None
+     processor = None
+
+ # --- Helper: ASR ---
+ def transcribe_audio(audio_file, language):
+     if asr_model is None or processor is None:
+         return "ASR Model loading failed"
+
+     try:
+         # Get language code
+         lang_code = LANGUAGE_CODES.get(language)
+         if not lang_code:
+             return f"Unsupported language: {language}"
+
+         # Read and preprocess audio
+         audio, sr = sf.read(audio_file)
+         if audio.ndim > 1:
+             audio = audio.mean(axis=1)
+         audio = resampy.resample(audio, sr, 16000)
+
+         # Process with model
+         inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
+
+         with torch.no_grad():
+             generated_ids = asr_model.generate(**inputs, tgt_lang=lang_code)
+
+         # Decode the transcription
+         transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         return transcription.strip()
+
+     except Exception as e:
+         print(f"[ERROR] ASR transcription failed for {language}:", e)
+         traceback.print_exc()
+         return f"ASR failed: {str(e)[:50]}..."
+
+ # --- Gradio UI ---
+ with gr.Blocks(title="🌍 Multilingual ASR") as demo:
+     gr.Markdown("# 🌍 Multilingual Speech Recognition")
+     gr.Markdown("Transcribe audio in Amharic, Swahili, Somali, Afan Oromo, Tigrinya, or Chichewa")
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or upload audio")
+             language_select = gr.Dropdown(
+                 choices=list(LANGUAGE_CODES.keys()),
+                 value="Swahili",
+                 label="Select Language"
+             )
+
+             submit_btn = gr.Button("Transcribe", variant="primary")
+
+     with gr.Row():
+         with gr.Column():
+             transcription_output = gr.Textbox(label="Transcription")
+
+     submit_btn.click(
+         fn=transcribe_audio,
+         inputs=[audio_input, language_select],
+         outputs=transcription_output
+     )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ torch
+ torchaudio
+ transformers
+ gradio
+ soundfile
+ resampy
+ accelerate
+ sentencepiece
+ scipy
+ numpy
+ sacremoses
+ librosa
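
Note: the transcribe_audio helper added in app.py can also be exercised without the Gradio UI. A minimal sketch, assuming the dependencies above are installed and that a readable audio file such as sample_swahili.wav exists (the filename is only an example):

    # Hypothetical local smoke test for the new transcribe_audio helper.
    # Importing app loads the SeamlessM4T model but does not launch the UI
    # (demo.launch is guarded by the __main__ check).
    from app import transcribe_audio

    # Any WAV/FLAC readable by soundfile works; it is downmixed to mono
    # and resampled to 16 kHz inside the helper.
    print(transcribe_audio("sample_swahili.wav", "Swahili"))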