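"""Gradio demo: score speech with NISQA (via torchmetrics) and report the overall MOS
plus the noisiness, discontinuity, coloration, and loudness dimensions on a 1-5 scale."""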
import gradio as gr
import torch
import numpy as np
import librosa
from torchmetrics.functional.audio.nisqa import non_intrusive_speech_quality_assessment as tm_nisqa
import spaces
SR = 16000  # target sample rate: audio is resampled to 16 kHz before scoring
def label_mos(x: float):
# ITU-T P.800 ACR-inspired buckets
if x < 1.5: return "Bad"
if x < 2.5: return "Poor"
if x < 3.5: return "Fair"
if x < 4.3: return "Good"
return "Excellent"
def label_dim(x: float):
if x < 1.5: return "Severe"
if x < 2.5: return "High"
if x < 3.5: return "Moderate"
if x < 4.3: return "Low"
return "Negligible"
def explain_dim(name: str):
return {
"Noisiness": "How noisy it sounds (higher = less noise).",
"Discontinuity": "Dropouts/glitches (higher = fewer glitches).",
"Coloration": "Tone/timbre coloration (higher = more natural).",
"Loudness": "Perceived loudness appropriateness (higher = more appropriate)."
}[name]
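# spaces.GPU() requests a GPU for the duration of this call on Hugging Face ZeroGPU hardware.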
@spaces.GPU()
def predict_nisqa(audio):
    if isinstance(audio, tuple):
        # Gradio "numpy" audio: (sample_rate, array); scale integer PCM to [-1, 1], downmix to mono, resample
        _sr, y = audio
        y = np.asarray(y)
        if np.issubdtype(y.dtype, np.integer):
            y = y / np.iinfo(y.dtype).max
        if y.ndim > 1:
            y = y.mean(axis=-1)
        y = librosa.resample(y.astype(np.float32), orig_sr=_sr, target_sr=SR)
    else:
        y, _ = librosa.load(audio, sr=SR, mono=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
wav = torch.tensor(y, dtype=torch.float32, device=device)
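    # torchmetrics NISQA returns five scores: MOS, noisiness, discontinuity, coloration, loudness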
mos, noisiness, discontinuity, coloration, loudness = (
tm_nisqa(wav, SR).detach().to("cpu").numpy().tolist()
)
metrics = [
("MOS (overall)", mos, label_mos(mos), "Higher = better perceived quality."),
("Noisiness", noisiness, label_dim(noisiness), explain_dim("Noisiness")),
("Discontinuity", discontinuity, label_dim(discontinuity), explain_dim("Discontinuity")),
("Coloration", coloration, label_dim(coloration), explain_dim("Coloration")),
("Loudness", loudness, label_dim(loudness), explain_dim("Loudness")),
]
table = {
"Metric": [m[0] for m in metrics],
"Score": [round(m[1], 3) for m in metrics],
"Label": [m[2] for m in metrics],
"Notes": [m[3] for m in metrics],
}
bars = {m[0]: float(m[1]) for m in metrics}
return table, bars
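# UI: audio input, a results table, and a bar chart of the five scores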
with gr.Blocks(title="NISQA Speech Quality (MOS) Demo") as demo:
gr.Markdown(
"""
# 🎧 NISQA Speech Quality (MOS)
Upload or record speech and get **MOS + quality dimensions**.
**Scale:** 1–5 where higher = better.
**Dimensions:** higher = fewer issues in that aspect.
"""
)
audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input audio")
btn = gr.Button("Predict")
out_table = gr.Dataframe(headers=["Metric", "Score", "Label", "Notes"], interactive=False)
bars = gr.BarPlot(
x="Metric", y="Score",
y_lim=(0, 5),
tooltip=["Score"],
width=0.6,
interactive=False,
label="Scores (0–5, higher = better)"
)
def _bars_to_df(table_dict, bars_dict):
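        # gr.BarPlot needs a DataFrame; rebuild one from the {metric: score} dict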
import pandas as pd
df = pd.DataFrame({"Metric": list(bars_dict.keys()), "Score": list(bars_dict.values())})
return table_dict, df
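    # Run NISQA first (postprocess=False passes the raw dicts through), then build the plot DataFrame.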
    btn.click(
        fn=predict_nisqa, inputs=audio, outputs=[out_table, bars], postprocess=False
    ).then(fn=_bars_to_df, inputs=[out_table, bars], outputs=[out_table, bars])
if __name__ == "__main__":
demo.launch()