# Hugging Face Spaces page residue ("Spaces: Running on Zero"): this app is
# deployed as a Space on ZeroGPU ("Zero") hardware.
# Third-party dependencies (grouped alphabetically).
import gradio as gr
import librosa
import numpy as np
import pandas as pd
import spaces  # ZeroGPU helper; NOTE(review): imported but @spaces.GPU is never applied — confirm Space config
import torch
from torchmetrics.functional.audio.nisqa import (
    non_intrusive_speech_quality_assessment as tm_nisqa,
)

# Target sample rate (Hz) that all input audio is resampled to before NISQA.
SR = 16000
def label_mos(x: float) -> str:
    """Map an overall MOS score (1-5 scale) to a categorical quality label.

    Buckets are inspired by the ITU-T P.800 ACR opinion scale.

    Args:
        x: Mean opinion score; higher means better perceived quality.

    Returns:
        One of "Bad", "Poor", "Fair", "Good", "Excellent".
    """
    # Thresholds are half-open: e.g. exactly 2.5 falls into "Fair".
    if x < 1.5:
        return "Bad"
    if x < 2.5:
        return "Poor"
    if x < 3.5:
        return "Fair"
    if x < 4.3:
        return "Good"
    return "Excellent"
def label_dim(x: float) -> str:
    """Map a NISQA quality-dimension score (1-5 scale) to a severity label.

    Dimension scores read "higher = fewer issues", so labels describe how
    severe the impairment is (e.g. a low Noisiness score means severe noise).

    Args:
        x: Dimension score; higher means fewer issues in that aspect.

    Returns:
        One of "Severe", "High", "Moderate", "Low", "Negligible".
    """
    # Same half-open bucket boundaries as label_mos, different wording.
    if x < 1.5:
        return "Severe"
    if x < 2.5:
        return "High"
    if x < 3.5:
        return "Moderate"
    if x < 4.3:
        return "Low"
    return "Negligible"
| def explain_dim(name: str): | |
| return { | |
| "Noisiness": "How noisy it sounds (higher = less noise).", | |
| "Discontinuity": "Dropouts/glitches (higher = fewer glitches).", | |
| "Coloration": "Tone/timbre coloration (higher = more natural).", | |
| "Loudness": "Perceived loudness appropriateness (higher = more appropriate)." | |
| }[name] | |
def predict_nisqa(audio):
    """Score speech quality with NISQA and prepare Gradio display frames.

    NOTE(review): this runs model inference; on a ZeroGPU Space it presumably
    needs the ``@spaces.GPU`` decorator (``spaces`` is imported but unused) —
    confirm against the Space configuration.

    Args:
        audio: Either a file path (the ``gr.Audio(type="filepath")`` case) or
            a ``(sample_rate, samples)`` tuple of raw numpy audio.

    Returns:
        Tuple of two DataFrames: a results table (Metric/Score/Label/Notes)
        and a two-column frame (Metric/Score) for the bar plot.
    """
    if isinstance(audio, tuple):
        # Raw numpy input: (sample_rate, samples).
        in_sr, samples = audio
        y = np.asarray(samples)
        if y.ndim > 1:
            # Downmix multi-channel audio to mono (matches librosa.load path).
            y = y.mean(axis=-1)
        if np.issubdtype(y.dtype, np.integer):
            # Integer PCM (e.g. int16) must be scaled to [-1, 1] floats;
            # casting alone would feed the model values ~32768x too large.
            y = y.astype(np.float32) / np.iinfo(y.dtype).max
        y = librosa.resample(
            y.astype(np.float32), orig_sr=in_sr, target_sr=SR
        )
    else:
        # File path input: librosa handles decoding, resampling and downmix.
        y, _ = librosa.load(audio, sr=SR, mono=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    wav = torch.tensor(y, dtype=torch.float32, device=device)

    # torchmetrics NISQA returns 5 scores: MOS + four quality dimensions.
    mos, noisiness, discontinuity, coloration, loudness = (
        tm_nisqa(wav, SR).detach().to("cpu").numpy().tolist()
    )

    metrics = [
        ("MOS (overall)", mos, label_mos(mos), "Higher = better perceived quality."),
        ("Noisiness", noisiness, label_dim(noisiness), explain_dim("Noisiness")),
        ("Discontinuity", discontinuity, label_dim(discontinuity), explain_dim("Discontinuity")),
        ("Coloration", coloration, label_dim(coloration), explain_dim("Coloration")),
        ("Loudness", loudness, label_dim(loudness), explain_dim("Loudness")),
    ]
    df_table = pd.DataFrame(
        {
            "Metric": [m[0] for m in metrics],
            "Score": [round(float(m[1]), 3) for m in metrics],
            "Label": [m[2] for m in metrics],
            "Notes": [m[3] for m in metrics],
        }
    )
    df_bars = pd.DataFrame(
        {"Metric": [m[0] for m in metrics], "Score": [float(m[1]) for m in metrics]}
    )
    return df_table, df_bars
| with gr.Blocks(title="NISQA Speech Quality (MOS) Demo") as demo: | |
| gr.Markdown( | |
| """ | |
| # 🎧 NISQA Speech Quality (MOS) | |
| Upload or record speech and get **MOS + quality dimensions**. | |
| **Scale:** 1–5 where higher = better. | |
| **Dimensions:** higher = fewer issues in that aspect. | |
| """ | |
| ) | |
| audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input audio") | |
| btn = gr.Button("Predict") | |
| out_table = gr.Dataframe(interactive=False, label="Results") | |
| bars = gr.BarPlot( | |
| x="Metric", y="Score", | |
| y_lim=(0, 5), | |
| label="Scores (0–5, higher = better)", | |
| interactive=False, | |
| ) | |
| btn.click(fn=predict_nisqa, inputs=audio, outputs=[out_table, bars]) | |
| if __name__ == "__main__": | |
| demo.launch() | |