# nisqa-v2.0 / app.py
import gradio as gr
import torch
import numpy as np
import librosa
from torchmetrics.functional.audio.nisqa import non_intrusive_speech_quality_assessment as tm_nisqa
import spaces
import pandas as pd

# Target sample rate: all input audio is resampled to 16 kHz before scoring.
SR = 16000


def label_mos(x: float) -> str:
    # ITU-T P.800 ACR-inspired buckets
    if x < 1.5: return "Bad"
    if x < 2.5: return "Poor"
    if x < 3.5: return "Fair"
    if x < 4.3: return "Good"
    return "Excellent"


def label_dim(x: float) -> str:
    # Severity buckets for the per-dimension scores (higher score = fewer issues).
    if x < 1.5: return "Severe"
    if x < 2.5: return "High"
    if x < 3.5: return "Moderate"
    if x < 4.3: return "Low"
    return "Negligible"


def explain_dim(name: str) -> str:
    return {
        "Noisiness": "How noisy it sounds (higher = less noise).",
        "Discontinuity": "Dropouts/glitches (higher = fewer glitches).",
        "Coloration": "Tone/timbre coloration (higher = more natural).",
        "Loudness": "Perceived loudness appropriateness (higher = more appropriate).",
    }[name]


@spaces.GPU()
def predict_nisqa(audio):
    if isinstance(audio, tuple):
        # Numpy input from Gradio: (sample_rate, waveform), typically int16.
        _sr, y = audio
        y = np.asarray(y, dtype=np.float32)
        if y.ndim > 1:
            y = y.mean(axis=1)  # downmix to mono
        if np.abs(y).max() > 1.0:
            y /= 32768.0  # rescale int16-range samples to [-1, 1]
        y = librosa.resample(y, orig_sr=_sr, target_sr=SR)
    else:
        y, _ = librosa.load(audio, sr=SR, mono=True)  # filepath input (the default here)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    wav = torch.tensor(y, dtype=torch.float32, device=device)

    # torchmetrics returns five scores: [MOS, noisiness, discontinuity, coloration, loudness].
    mos, noisiness, discontinuity, coloration, loudness = (
        tm_nisqa(wav, SR).detach().cpu().numpy().tolist()
    )

    metrics = [
        ("MOS (overall)", mos, label_mos(mos), "Higher = better perceived quality."),
        ("Noisiness", noisiness, label_dim(noisiness), explain_dim("Noisiness")),
        ("Discontinuity", discontinuity, label_dim(discontinuity), explain_dim("Discontinuity")),
        ("Coloration", coloration, label_dim(coloration), explain_dim("Coloration")),
        ("Loudness", loudness, label_dim(loudness), explain_dim("Loudness")),
    ]

    df_table = pd.DataFrame(
        {
            "Metric": [m[0] for m in metrics],
            "Score": [round(float(m[1]), 3) for m in metrics],
            "Label": [m[2] for m in metrics],
            "Notes": [m[3] for m in metrics],
        }
    )
    df_bars = pd.DataFrame(
        {"Metric": [m[0] for m in metrics], "Score": [float(m[1]) for m in metrics]}
    )
    return df_table, df_bars
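
# Minimal local sanity check (a sketch, not part of the app; "speech.wav" is a
# placeholder path you would supply yourself):
#
#   y, _ = librosa.load("speech.wav", sr=SR, mono=True)
#   scores = tm_nisqa(torch.tensor(y), SR)  # tensor of [MOS, noi, dis, col, loud]
#   print(scores)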


with gr.Blocks(title="NISQA Speech Quality (MOS) Demo") as demo:
    gr.Markdown(
        """
        # 🎧 NISQA Speech Quality (MOS)
        Upload or record speech and get **MOS + quality dimensions**.

        **Scale:** 1–5, where higher = better.
        **Dimensions:** higher = fewer issues in that aspect.
        """
    )
    audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input audio")
    btn = gr.Button("Predict")
    out_table = gr.Dataframe(interactive=False, label="Results")
    bars = gr.BarPlot(
        x="Metric",
        y="Score",
        y_lim=(0, 5),
        label="Scores (1–5 scale, higher = better)",
        interactive=False,
    )

    btn.click(fn=predict_nisqa, inputs=audio, outputs=[out_table, bars])


if __name__ == "__main__":
    demo.launch()
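
# Example of calling the running demo programmatically (a sketch; assumes a recent
# gradio_client and that the endpoint name defaults to the handler's function name;
# "speech.wav" is a placeholder path):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860")
#   table, bars = client.predict(handle_file("speech.wav"), api_name="/predict_nisqa")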