import gradio as gr
import torch
import numpy as np
import librosa
from torchmetrics.functional.audio.nisqa import non_intrusive_speech_quality_assessment as tm_nisqa
import spaces
import pandas as pd

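# Target sample rate for preprocessing; the same value is passed to tm_nisqa
# as its fs argument, so the metric sees correctly labeled audio.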
SR = 16000

def label_mos(x: float):
    # ITU-T P.800 ACR-inspired buckets
    if x < 1.5:   return "Bad"
    if x < 2.5:   return "Poor"
    if x < 3.5:   return "Fair"
    if x < 4.3:   return "Good"
    return "Excellent"

def label_dim(x: float):
    if x < 1.5:   return "Severe"
    if x < 2.5:   return "High"
    if x < 3.5:   return "Moderate"
    if x < 4.3:   return "Low"
    return "Negligible"

def explain_dim(name: str):
    return {
        "Noisiness": "How noisy it sounds (higher = less noise).",
        "Discontinuity": "Dropouts/glitches (higher = fewer glitches).",
        "Coloration": "Tone/timbre coloration (higher = more natural).",
        "Loudness": "Perceived loudness appropriateness (higher = more appropriate)."
    }[name]

@spaces.GPU()
def predict_nisqa(audio):
    # The Audio component below uses type="filepath", but handle the
    # (sample_rate, samples) tuple form too, normalizing to 16 kHz float32 mono.
    if isinstance(audio, tuple):
        _sr, y = audio
        y = np.asarray(y)
        if np.issubdtype(y.dtype, np.integer):
            # Integer PCM (e.g. int16 from a microphone) must be scaled to [-1, 1].
            y = y.astype(np.float32) / np.iinfo(y.dtype).max
        else:
            y = y.astype(np.float32)
        if y.ndim > 1:
            y = y.mean(axis=1)  # down-mix multi-channel audio to mono
        y = librosa.resample(y, orig_sr=_sr, target_sr=SR)
    else:
        y, _ = librosa.load(audio, sr=SR, mono=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    wav = torch.tensor(y, dtype=torch.float32, device=device)
    # torchmetrics' NISQA returns five scores in this fixed order.
    mos, noisiness, discontinuity, coloration, loudness = (
        tm_nisqa(wav, SR).detach().to("cpu").numpy().tolist()
    )

    metrics = [
        ("MOS (overall)", mos, label_mos(mos), "Higher = better perceived quality."),
        ("Noisiness", noisiness, label_dim(noisiness), explain_dim("Noisiness")),
        ("Discontinuity", discontinuity, label_dim(discontinuity), explain_dim("Discontinuity")),
        ("Coloration", coloration, label_dim(coloration), explain_dim("Coloration")),
        ("Loudness", loudness, label_dim(loudness), explain_dim("Loudness")),
    ]
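
    # Each tuple is (name, raw score, bucket label, plain-language note);
    # the same list feeds both the results table and the bar plot below.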

    df_table = pd.DataFrame(
        {
            "Metric": [m[0] for m in metrics],
            "Score":  [round(float(m[1]), 3) for m in metrics],
            "Label":  [m[2] for m in metrics],
            "Notes":  [m[3] for m in metrics],
        }
    )
    df_bars = pd.DataFrame(
        {"Metric": [m[0] for m in metrics], "Score": [float(m[1]) for m in metrics]}
    )
    return df_table, df_bars
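
# Direct (non-UI) usage sketch, assuming a local recording "sample.wav" exists:
#   table, bars = predict_nisqa("sample.wav")
#   print(table.to_string(index=False))  # MOS row first, then the four dimensions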

with gr.Blocks(title="NISQA Speech Quality (MOS) Demo") as demo:
    gr.Markdown(
        """
        # 🎧 NISQA Speech Quality (MOS)
        Upload or record speech and get **MOS + quality dimensions**.

        - **Scale:** 1–5, where higher = better.
        - **Dimensions:** higher = fewer issues in that aspect.
        """
    )
    audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input audio")
    btn = gr.Button("Predict")

    out_table = gr.Dataframe(interactive=False, label="Results")

    bars = gr.BarPlot(
        x="Metric", y="Score",
        y_lim=(0, 5),
        label="Scores (0–5, higher = better)",
        interactive=False,
    )

    btn.click(fn=predict_nisqa, inputs=audio, outputs=[out_table, bars])

if __name__ == "__main__":
    demo.launch()
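
# Local run sketch (assumed dependencies; outside a ZeroGPU Space the
# `spaces` decorator should be a harmless no-op, but verify for your version):
#   pip install gradio torch "torchmetrics[audio]" librosa pandas spaces
#   python app.py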