"""Gradio demo: NISQA speech quality assessment (MOS + quality dimensions)."""

import gradio as gr
import librosa
import numpy as np
import pandas as pd
import spaces
import torch
from torchmetrics.functional.audio.nisqa import (
    non_intrusive_speech_quality_assessment as tm_nisqa,
)

# NISQA expects 16 kHz input; all audio is resampled to this rate.
SR = 16000


def label_mos(x: float) -> str:
    """Map an overall MOS score to an ITU-T P.800 ACR-inspired bucket."""
    if x < 1.5:
        return "Bad"
    if x < 2.5:
        return "Poor"
    if x < 3.5:
        return "Fair"
    if x < 4.3:
        return "Good"
    return "Excellent"


def label_dim(x: float) -> str:
    """Map a quality-dimension score to a severity label (higher score = fewer issues)."""
    if x < 1.5:
        return "Severe"
    if x < 2.5:
        return "High"
    if x < 3.5:
        return "Moderate"
    if x < 4.3:
        return "Low"
    return "Negligible"


def explain_dim(name: str) -> str:
    """Return a one-line explanation of a NISQA quality dimension."""
    return {
        "Noisiness": "How noisy it sounds (higher = less noise).",
        "Discontinuity": "Dropouts/glitches (higher = fewer glitches).",
        "Coloration": "Tone/timbre coloration (higher = more natural).",
        "Loudness": "Perceived loudness appropriateness (higher = more appropriate).",
    }[name]


@spaces.GPU()
def predict_nisqa(audio):
    """Run NISQA on the input audio and return a results table plus bar-plot data."""
    if isinstance(audio, tuple):
        # Numpy-style input: (sample_rate, samples). Samples may be integer PCM
        # and/or multi-channel, so downmix and normalize before resampling.
        _sr, y = audio
        y = np.asarray(y, dtype=np.float32)
        if y.ndim > 1:
            y = y.mean(axis=1)  # downmix to mono
        if np.abs(y).max() > 1.0:
            y = y / 32768.0  # int16-range PCM -> [-1, 1]
        y = librosa.resample(y, orig_sr=_sr, target_sr=SR)
    else:
        # Filepath input (the type configured on the Audio component below).
        y, _ = librosa.load(audio, sr=SR, mono=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    wav = torch.tensor(y, dtype=torch.float32, device=device)

    # The functional returns five scores: MOS plus four quality dimensions.
    mos, noisiness, discontinuity, coloration, loudness = (
        tm_nisqa(wav, SR).detach().to("cpu").numpy().tolist()
    )

    metrics = [
        ("MOS (overall)", mos, label_mos(mos), "Higher = better perceived quality."),
        ("Noisiness", noisiness, label_dim(noisiness), explain_dim("Noisiness")),
        ("Discontinuity", discontinuity, label_dim(discontinuity), explain_dim("Discontinuity")),
        ("Coloration", coloration, label_dim(coloration), explain_dim("Coloration")),
        ("Loudness", loudness, label_dim(loudness), explain_dim("Loudness")),
    ]

    df_table = pd.DataFrame(
        {
            "Metric": [m[0] for m in metrics],
            "Score": [round(float(m[1]), 3) for m in metrics],
            "Label": [m[2] for m in metrics],
            "Notes": [m[3] for m in metrics],
        }
    )
    df_bars = pd.DataFrame(
        {"Metric": [m[0] for m in metrics], "Score": [float(m[1]) for m in metrics]}
    )
    return df_table, df_bars


with gr.Blocks(title="NISQA Speech Quality (MOS) Demo") as demo:
    gr.Markdown(
        """
        # 🎧 NISQA Speech Quality (MOS)

        Upload or record speech and get **MOS + quality dimensions**.

        **Scale:** 1–5, where higher = better.
        **Dimensions:** higher = fewer issues in that aspect.
        """
    )
    audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input audio")
    btn = gr.Button("Predict")
    out_table = gr.Dataframe(interactive=False, label="Results")
    bars = gr.BarPlot(
        x="Metric",
        y="Score",
        y_lim=(0, 5),
        label="Scores (1–5 scale, higher = better)",
        interactive=False,
    )
    btn.click(fn=predict_nisqa, inputs=audio, outputs=[out_table, bars])

if __name__ == "__main__":
    demo.launch()
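# ---------------------------------------------------------------------------
# Hedged usage sketch (not wired into the app): scoring a file directly with
# the torchmetrics NISQA functional, bypassing the Gradio UI. The filename
# "sample.wav" is hypothetical and only for illustration.
#
#   y, _ = librosa.load("sample.wav", sr=SR, mono=True)
#   scores = tm_nisqa(torch.tensor(y, dtype=torch.float32), SR)
#   # scores is a 5-element tensor: [MOS, noisiness, discontinuity,
#   # coloration, loudness], each roughly on the 1-5 scale.
#   print(scores.tolist())
# ---------------------------------------------------------------------------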