Spaces:

chrisjay
/

afro-speech

Build error

App Files Files Community

chrisjay committed on Sep 14, 2022

Commit

794ebc0

1 Parent(s): 210f8d3

modules to test the model

Browse files

Files changed (8) hide show

.gitignore +2 -1
app.py +25 -10
data +1 -1
inference.py +119 -0
loss_main_plot.png +0 -0
requirements.txt +7 -1
run.sh +3 -0
val_accuracy_plot.png +0 -0

.gitignore CHANGED Viewed

@@ -2,4 +2,5 @@ data/*
 gradio_queue.db
 data
 __pycache__/*
-data_local/*

 gradio_queue.db
 data
 __pycache__/*
+data_local/*
+afro-speech/__pycache__

app.py CHANGED Viewed

@@ -11,8 +11,7 @@ from utils import *
 import matplotlib.pyplot as plt
 import scipy.io.wavfile as wavf
 from huggingface_hub import Repository, upload_file
 HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -31,7 +30,6 @@ os.makedirs(LOCAL_DIR,exist_ok=True)
 GENDER = ['Choose Gender','Male','Female','Other','Prefer not to say']
 #------------------Work on Languages--------------------
 DEFAULT_LANGS =   {}
 languages = read_json_lines('clean_languages.json')
@@ -50,8 +48,6 @@ repo.git_pull()
 with open('app.css','r') as f:
     BLOCK_CSS = f.read()
 def save_record(language,text,record,number,age,gender,accent,number_history,current_number,country,email,done_recording):
     # set default
     number_history = number_history if number_history is not None else [0]
@@ -273,6 +269,7 @@ __Note:__  You should record all numbers shown till the end. It does not count i
 PLOTS_FOR_GRADIO = []
 FUNCTIONS_FOR_GRADIO = []
 # Interface design begins
 block = gr.Blocks(css=BLOCK_CSS)
 with block:
@@ -366,12 +363,30 @@ with block:
             #listen = gr.Button("Listen")
             listen_tab.select(show_records,inputs=[],outputs=[display_html,plot]+PLOTS_FOR_GRADIO)
-            # Have a list of the languages. lang
-            # We want digits per language and gender per language
-            # for l in range(len(lang),step =4)
-            #   with Row()....   d
     gr.Markdown(ARTICLE)
 block.launch()

 import matplotlib.pyplot as plt
 import scipy.io.wavfile as wavf
 from huggingface_hub import Repository, upload_file
+from inference import make_inference
 HF_TOKEN = os.environ.get("HF_TOKEN")
 GENDER = ['Choose Gender','Male','Female','Other','Prefer not to say']
 #------------------Work on Languages--------------------
 DEFAULT_LANGS =   {}
 languages = read_json_lines('clean_languages.json')
 with open('app.css','r') as f:
     BLOCK_CSS = f.read()
 def save_record(language,text,record,number,age,gender,accent,number_history,current_number,country,email,done_recording):
     # set default
     number_history = number_history if number_history is not None else [0]
 PLOTS_FOR_GRADIO = []
 FUNCTIONS_FOR_GRADIO = []
 # Interface design begins
 block = gr.Blocks(css=BLOCK_CSS)
 with block:
             #listen = gr.Button("Listen")
             listen_tab.select(show_records,inputs=[],outputs=[display_html,plot]+PLOTS_FOR_GRADIO)
+        with gr.TabItem('Test Model') as listen_tab:
+            # Dropdown to choose a language from any of the 6
+            # When you choose, it will load the corresponding model
+            # And then one can record a voice and get the model prediction
+            #Igbo - ibo
+            #Oshiwambo - kua
+            #Yoruba - yor
+            #Oromo (note: all of these audios are from female speakers) - gax
+            #Shona (all male) - sna
+            #Rundi (all male) - run
+            gr.Markdown('''Here we are testing the models which we trained on the dataset collected.
+            Choose a language from the dropdown, record your voice and select `Submit`.''')
+            with gr.Row():
+                language_choice = gr.Dropdown(["Choose language","Igbo", "Oshiwambo", "Yoruba","Oromo","Shona","Rundi","MULTILINGUAL"],label="Choose language",default="Choose language")
+                input_audio = gr.Audio(source="microphone",label='Record your voice',type='filepath')
+                output_pred = gr.Label(num_top_classes=5)
+            submit = gr.Button('Submit')
+            submit.click(make_inference, inputs = [language_choice,input_audio], outputs = [output_pred])
     gr.Markdown(ARTICLE)
 block.launch()

data CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~83312f698fc01a05a7f3c5c45da93220cd2278c5~~


1	+ Subproject commit ebedcd8c55c90d39fd27126d29d8484566cd27ca

inference.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import torch
+import torchaudio
+from torch import nn
+from transformers import AutoFeatureExtractor,AutoModelForAudioClassification,pipeline
+# Preprocessing the data
+# Feature extractor loaded from the facebook/wav2vec2-base checkpoint.
+feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+max_duration = 2.0  # seconds
+# Use the GPU when torch reports one, otherwise stay on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Normalise over the last (class) axis explicitly: constructing Softmax
+# without ``dim`` relies on a deprecated implicit choice and emits a warning.
+softmax = nn.Softmax(dim=-1)
+# The ten digit classes, mapped both ways between label and stringified index.
+label2id, id2label = dict(), dict()
+labels = ['0','1','2','3','4','5','6','7','8','9']
+num_labels = 10
+for i, label in enumerate(labels):
+    label2id[label] = str(i)
+    id2label[str(i)] = label
+def get_pipeline(model_name):
+    """Return an audio-classification pipeline for ``model_name``, or ``None``.
+
+    A pipeline is only built when the text after the last ``-`` in
+    ``model_name`` (surrounding whitespace ignored) is ``ibo``; every other
+    name yields ``None``.
+    """
+    suffix = model_name.rsplit('-', 1)[-1].strip()
+    if suffix == 'ibo':
+        return pipeline(task="audio-classification", model=model_name)
+    return None
+def load_model(model_checkpoint):
+    """Load the audio-classification model stored at ``model_checkpoint``.
+
+    The model is created with the module-level ``num_labels``, ``label2id``
+    and ``id2label`` and is moved to ``device`` before being returned.
+    """
+    classifier = AutoModelForAudioClassification.from_pretrained(
+        model_checkpoint,
+        num_labels=num_labels,
+        label2id=label2id,
+        id2label=id2label,
+    )
+    return classifier.to(device)
+# Language name as shown in the UI -> short code used to key the models.
+language_dict = {
+    "Igbo": 'ibo',
+    "Oshiwambo": 'kua',
+    "Yoruba": 'yor',
+    "Oromo": 'gax',
+    "Shona": 'sna',
+    "Rundi": 'run',
+    # Dropdown placeholder entry.
+    "Choose language": 'none',
+    "MULTILINGUAL": 'all',
+}
+# Language code -> loaded model. Every checkpoint is loaded when this module
+# is imported, in the order listed here.
+AUDIO_CLASSIFICATION_MODELS = {
+    code: load_model('chrisjay/afrospeech-wav2vec-' + checkpoint_suffix)
+    for code, checkpoint_suffix in (
+        ('ibo', 'ibo'),
+        ('kua', 'kua'),
+        ('sna', 'sna'),
+        ('yor', 'yor'),
+        ('gax', 'gax'),
+        ('run', 'run'),
+        ('all', 'all-6'),
+    )
+}
+def cut_if_necessary(signal,num_samples):
+    """Keep at most ``num_samples`` entries of ``signal`` along dimension 1.
+
+    A signal that is already short enough is returned unchanged.
+    """
+    too_long = signal.shape[1] > num_samples
+    return signal[:, :num_samples] if too_long else signal
+def right_pad_if_necessary(signal,num_samples):
+    """Pad ``signal`` on the right of its last dimension up to ``num_samples``.
+
+    The length is read from dimension 1. A signal that is already long enough
+    is returned unchanged; otherwise ``torch.nn.functional.pad`` is applied
+    with its default fill.
+    """
+    shortfall = num_samples - signal.shape[1]
+    if shortfall <= 0:
+        return signal
+    return torch.nn.functional.pad(signal, (0, shortfall))
+def resample_if_necessary(signal, sr,target_sample_rate,device):
+    """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ.
+
+    Args:
+        signal: Audio tensor to resample.
+        sr: Sample rate of ``signal``.
+        target_sample_rate: Desired sample rate.
+        device: Device on which the resampling transform is run.
+
+    Returns:
+        ``signal`` itself when the rates already match, otherwise the
+        resampled tensor, placed on the device ``signal`` came from.
+    """
+    if sr == target_sample_rate:
+        return signal
+    resampler = torchaudio.transforms.Resample(sr, target_sample_rate).to(device)
+    # The transform lives on ``device``, so the signal must be moved there too
+    # (otherwise a CUDA ``device`` fails with a device mismatch). The result is
+    # handed back on the signal's original device so later steps are unaffected.
+    return resampler(signal.to(device)).to(signal.device)
+def mix_down_if_necessary(signal):
+    """Average ``signal`` over dimension 0 when that dimension has several entries.
+
+    The averaged dimension is kept with size 1; a signal whose first
+    dimension is already 1 (or empty) is returned unchanged.
+    """
+    if signal.shape[0] <= 1:
+        return signal
+    return signal.mean(dim=0, keepdim=True)
+def preprocess_audio(waveform,sample_rate,feature_extractor):
+    """Turn a loaded waveform into feature-extractor output.
+
+    Steps, in order: resample to 16000 if ``sample_rate`` differs, mix down
+    over dimension 0, cut or right-pad dimension 1 to exactly 16000 entries,
+    then call ``feature_extractor`` with its own ``sampling_rate``.
+    """
+    target_rate = 16000
+    clip_samples = 16000
+    resampled = resample_if_necessary(waveform, sample_rate, target_rate, device)
+    mono = mix_down_if_necessary(resampled)
+    fixed_length = right_pad_if_necessary(
+        cut_if_necessary(mono, clip_samples), clip_samples
+    )
+    return feature_extractor(
+        fixed_length,
+        sampling_rate=feature_extractor.sampling_rate,
+        max_length=clip_samples,
+        truncation=True,
+    )
+def make_inference(drop_down,audio):
+    """Classify a recording with the model for the selected language.
+
+    Args:
+        drop_down: Language name chosen in the UI; looked up in ``language_dict``.
+        audio: Path of the recorded audio file.
+
+    Returns:
+        Dict mapping each class index (as a string) to its softmax
+        confidence, ordered from most to least confident.
+
+    Raises:
+        ValueError: If no recording was supplied, or if the selection has no
+            model (for example the "Choose language" placeholder).
+    """
+    # Validate the cheap inputs first so the user gets a readable message
+    # instead of a bare KeyError or a loader failure.
+    if not audio:
+        raise ValueError("Please record your voice before submitting.")
+    language_code_chosen = language_dict.get(drop_down)
+    model = AUDIO_CLASSIFICATION_MODELS.get(language_code_chosen)
+    if model is None:
+        raise ValueError("Please choose a language before submitting.")
+
+    waveform, sample_rate = torchaudio.load(audio)
+    preprocessed_audio = preprocess_audio(waveform,sample_rate,feature_extractor)
+    model.eval()
+    # Models are moved to ``device`` when loaded, so the input must follow.
+    torch_preprocessed_audio = torch.from_numpy(preprocessed_audio.input_values[0]).to(device)
+    # make prediction (inference only, so no gradient tracking)
+    with torch.no_grad():
+        logits = model(torch_preprocessed_audio).logits
+        prediction = torch.softmax(logits, dim=-1)
+    sorted_prediction = torch.sort(prediction,descending=True)
+    # ``tolist`` copies to host memory, so this also works for GPU tensors.
+    class_ids = sorted_prediction.indices[0].tolist()
+    scores = sorted_prediction.values[0].tolist()
+    # String keys: the label component expects class names, not ints.
+    return {str(class_id): score for class_id, score in zip(class_ids, scores)}

loss_main_plot.png ADDED Viewed

requirements.txt CHANGED Viewed

@@ -2,4 +2,10 @@ pandas
 scipy
 pycountry
 numpy
-matplotlib

 scipy
 pycountry
 numpy
+matplotlib
+datasets==1.14
+transformers
+librosa
+torch
+huggingface-hub
+torchaudio

run.sh ADDED Viewed

	@@ -0,0 +1,3 @@

+#!/bin/bash
+#cd afro-speech
+# Launch the app under pdb.
+# The Hugging Face token must come from the caller's environment: a secret
+# must never be hard-coded in a tracked script. Any token that was committed
+# here earlier is exposed in the repository history and should be revoked.
+: "${HF_TOKEN:?HF_TOKEN must be set in the environment before running this script}"
+export HF_TOKEN
+python -m pdb app.py

val_accuracy_plot.png ADDED Viewed