Spaces:

AIvry
/

MAPSS-measures

Running on Zero

File size: 13,571 Bytes

"""

Distortions banks for the PS and the PM computations.

"""

import librosa
import numpy as np
from numpy.fft import irfft, rfft, rfftfreq
from scipy.signal import butter, filtfilt, lfilter

from config import ENERGY_WIN_MS, EPS, SR


def sig_stats(x):
    A_pk = max(np.max(np.abs(x)), EPS)
    A_rms = max(np.sqrt(np.mean(x**2)), EPS)
    A_95 = max(np.percentile(np.abs(x), 95), EPS)
    return A_pk, A_rms, A_95


def frame_distortions(

    frame,

    sr,

    distortion_keys,

    notch_freqs=None,

    low_cutoffs=None,

    high_cutoffs=None,

    frame_start=0,

):
    notch_freqs = [] if notch_freqs is None else notch_freqs
    low_cutoffs = [] if low_cutoffs is None else low_cutoffs
    high_cutoffs = [] if high_cutoffs is None else high_cutoffs
    distortions = {}

    A_pk, A_rms, A_95 = sig_stats(frame)
    frame_len = len(frame)
    X = rfft(frame)
    freqs = rfftfreq(frame_len, 1 / sr)
    t = np.arange(frame_len) / sr

    if ("notch" in distortion_keys) or distortion_keys == "all":
        bw = 60.0
        for f0 in notch_freqs:
            Y = X.copy()
            band = (freqs > f0 - bw) & (freqs < f0 + bw)
            Y[band] = 0
            distortions[f"Notch_{int(round(f0))}Hz"] = irfft(Y, n=len(frame))

    if ("comb" in distortion_keys) or distortion_keys == "all":
        for d_ms, decay in zip([2.5, 5, 7.5, 10, 12.5, 15], [0.4, 0.5, 0.6, 0.7, 0.9]):
            D = int(sr * d_ms / 1000)
            if D >= frame_len:
                continue
            out = frame.copy()
            out[:-D] += decay * frame[D:]
            distortions[f"Comb_{int(d_ms)}ms"] = out

    if ("tremolo" in distortion_keys) or distortion_keys == "all":
        depth = 1.0
        t_centre = (frame_start + 0.5 * len(frame)) / sr
        for r_hz in [1, 2, 4, 6]:
            mod = (1 - depth) + depth * 0.5 * (1 + np.sin(2 * np.pi * r_hz * t_centre))
            distortions[f"Tremolo_{r_hz}Hz"] = frame * mod

    if ("noise" in distortion_keys) or distortion_keys == "all":
        nyq = sr / 2
        low_norm = 20 / nyq
        high_freq = min(20_000, 0.45 * sr)
        high_norm = min(high_freq / nyq, 0.99)
        b_band, a_band = butter(5, [low_norm, high_norm], btype="band")

        def add_noise(sig, snr_db, color="white"):
            nl_target = 10 ** (snr_db / 10)
            n = np.random.randn(len(sig))
            if color == "pink":
                n = np.cumsum(n)
                n /= max(np.max(np.abs(n)), 1e-12)
            elif color == "brown":
                n = np.cumsum(np.cumsum(n))
                n /= max(np.max(np.abs(n)), 1e-12)
            n = lfilter(b_band, a_band, n)
            rms_sig = np.sqrt(np.mean(sig**2))
            rms_n = np.sqrt(np.mean(n**2)) + 1e-12
            noise_rms = rms_sig / np.sqrt(nl_target)
            noise_rms = max(noise_rms, rms_sig / np.sqrt(10 ** (15 / 10)))
            n *= noise_rms / rms_n
            return sig + n

        for snr in [-15, -10, -5, 0, 5, 10, 15, 20, 25]:
            for clr in ["white", "pink", "brown"]:
                if (snr in [-15, -10, -5]) and (clr == "white"):
                    continue
                distortions[f"{clr.capitalize()}Noise_{snr}dB"] = add_noise(
                    frame, snr, clr
                )

    if ("harmonic" in distortion_keys) or distortion_keys == "all":
        for f_h, rel_amp in zip([100, 500, 1000, 4000], [0.4, 0.6, 0.8, 1.0]):
            tone = (rel_amp * A_rms) * np.sin(2 * np.pi * f_h * t)
            distortions[f"Harmonic_{f_h}Hz"] = frame + tone

    if ("reverb" in distortion_keys) or distortion_keys == "all":
        for tail_ms, decay in zip([50, 100, 200, 400], [0.3, 0.5, 0.7, 0.9]):
            L = int(sr * tail_ms / 1000)
            if L >= frame_len:
                continue
            irv = np.exp(-np.linspace(0, 6, L)) * decay
            reverbed = np.convolve(frame, irv)[:frame_len]
            distortions[f"Reverb_{tail_ms}ms"] = reverbed

    if ("noisegate" in distortion_keys) or distortion_keys == "all":
        for pct in [0.05, 0.10, 0.20, 0.40]:
            thr = pct * A_95
            g = frame.copy()
            g[np.abs(g) < thr] = 0
            distortions[f"NoiseGate_{int(pct * 100)}pct"] = g

    if ("pitch_shift" in distortion_keys) or distortion_keys == "all":
        n_fft = min(2048, frame_len // 2)
        for shift in [-4, -2, 2, 4]:
            y = librosa.effects.pitch_shift(frame, sr=sr, n_steps=shift, n_fft=n_fft)
            distortions[f"PitchShift_{shift}st"] = y[:frame_len]

    if ("lowpass" in distortion_keys) or distortion_keys == "all":
        for fc in low_cutoffs:
            if fc >= sr / 2 * 0.99:
                continue
            b, a = butter(6, fc / (sr / 2), btype="low")
            distortions[f"Lowpass_{fc}Hz"] = filtfilt(b, a, frame)

    if ("highpass" in distortion_keys) or distortion_keys == "all":
        for fc in high_cutoffs:
            if fc <= 20:
                continue
            b, a = butter(6, fc / (sr / 2), btype="high")
            distortions[f"Highpass_{fc}Hz"] = filtfilt(b, a, frame)

    if ("echo" in distortion_keys) or distortion_keys == "all":
        for delay_ms, amp in zip([50, 100, 150], [0.4, 0.5, 0.7]):
            D = int(sr * delay_ms / 1000)
            if D >= frame_len:
                continue
            echo = np.pad(frame, (D, 0), "constant")[:-D] * amp
            distortions[f"Echo_{delay_ms}ms"] = frame + echo

    if ("clipping" in distortion_keys) or distortion_keys == "all":
        for frac in [0.70, 0.50, 0.30]:
            thr = frac * A_95
            distortions[f"Clipping_{frac:.2f}p95"] = np.clip(frame, -thr, thr)

    if ("vibrato" in distortion_keys) or distortion_keys == "all":
        n_fft = min(2048, frame_len // 2)
        base_depth = 0.03 * (A_rms / A_pk)
        for rate_hz, scale in zip([3, 5, 7], [1.0, 1.3, 1.6]):
            depth = np.clip(base_depth * scale, 0.01, 0.05)
            y = librosa.effects.time_stretch(frame, rate=1 + depth, n_fft=n_fft)
            distortions[f"Vibrato_{rate_hz}Hz"] = librosa.util.fix_length(
                y, size=frame_len
            )

    return distortions


def apply_pm_distortions(ref, distortion_keys, sr=SR):
    frame_len = int(ENERGY_WIN_MS * sr / 1000)
    n_frames = int(np.ceil(len(ref) / frame_len))
    pad_len = n_frames * frame_len - len(ref)
    ref_padded = (
        np.concatenate([ref, np.zeros(pad_len, dtype=ref.dtype)]) if pad_len else ref
    )

    X_full = rfft(ref_padded)
    freqs_f = rfftfreq(len(ref_padded), 1 / sr)
    mag_full = np.abs(X_full)

    valid = (freqs_f > 80) & (freqs_f < 0.45 * sr)
    cand_indices = np.argsort(mag_full[valid])[-60:]
    cand_freqs = freqs_f[valid][cand_indices]
    cand_freqs = cand_freqs[np.argsort(mag_full[valid][cand_indices])[::-1]]

    selected_notch_freqs = []
    for f0 in cand_freqs:
        if all(abs(f0 - f_sel) > 300 for f_sel in selected_notch_freqs):
            selected_notch_freqs.append(float(f0))
        if len(selected_notch_freqs) >= 20:
            break

    mag2 = np.abs(X_full) ** 2
    total_p = mag2.sum()
    cum_low = np.cumsum(mag2)
    q_low = [0.50, 0.70, 0.85, 0.95]
    lowpass_cutoffs = []
    for q in q_low:
        idx = np.searchsorted(cum_low, q * total_p)
        f_c = float(freqs_f[idx])
        lowpass_cutoffs.append(round(f_c / 100.0) * 100)

    cum_high = np.cumsum(mag2[::-1])
    q_high = [0.05, 0.15, 0.30, 0.50]
    highpass_cutoffs = []
    for q in q_high:
        idx = np.searchsorted(cum_high, q * total_p)
        f_c = float(freqs_f[-1 - idx])
        highpass_cutoffs.append(round(f_c / 100.0) * 100)

    lowpass_cutoffs = sorted(set(lowpass_cutoffs))
    highpass_cutoffs = sorted(set(highpass_cutoffs))

    out = {}
    for f in range(n_frames):
        start, end = f * frame_len, (f + 1) * frame_len
        frame = ref_padded[start:end]
        frame_dists = frame_distortions(
            frame,
            sr,
            distortion_keys,
            notch_freqs=selected_notch_freqs,
            low_cutoffs=lowpass_cutoffs,
            high_cutoffs=highpass_cutoffs,
            frame_start=start,
        )
        for lbl, sig in frame_dists.items():
            if lbl not in out:
                out[lbl] = np.zeros_like(ref_padded)
            out[lbl][start:end] = sig

    return list(out.values())


def apply_ps_distortions(ref, distortion_keys, sr=SR):
    distortions = {}
    X = rfft(ref)
    freqs = rfftfreq(len(ref), 1 / sr)
    t = np.arange(len(ref)) / sr

    if ("notch" in distortion_keys) or distortion_keys == "all":
        for c in [500, 1000, 2000, 4000, 8000]:
            Y = X.copy()
            Y[(freqs > c - 50) & (freqs < c + 50)] = 0
            distortions[f"Notch_{c}Hz"] = irfft(Y, n=len(ref))

    if ("comb" in distortion_keys) or distortion_keys == "all":
        for d, decay in zip([2.5, 5, 7.5, 10, 12.5, 15], [0.4, 0.5, 0.6, 0.7, 0.9]):
            D = int(sr * d / 1000)
            if D >= len(ref):
                continue
            cpy = ref.copy()
            if len(ref) > D:
                cpy[:-D] += decay * ref[D:]
            distortions[f"Comb_{int(d)}ms"] = cpy

    if ("tremolo" in distortion_keys) or distortion_keys == "all":
        for r, depth in zip([1, 2, 4, 6], [0.3, 0.5, 0.8, 1.0]):
            mod = (1 - depth) + depth * 0.5 * (1 + np.sin(2 * np.pi * r * t))
            distortions[f"Tremolo_{r}Hz"] = ref * mod

    if ("noise" in distortion_keys) or distortion_keys == "all":

        def add_noise(signal, snr_db, color):
            rms = np.sqrt(np.mean(signal**2))
            nl = 10 ** (snr_db / 10)
            noise_rms = rms / np.sqrt(nl)
            n = np.random.randn(len(signal))
            if color == "pink":
                n = np.cumsum(n)
                n /= max(np.max(np.abs(n)), 1e-12)
            elif color == "brown":
                n = np.cumsum(np.cumsum(n))
                n /= max(np.max(np.abs(n)), 1e-12)
            return signal + noise_rms * n

        for snr in [-15, -10, -5, 0, 5, 10, 15, 20, 25]:
            for clr in ["white", "pink", "brown"]:
                if snr in [-15, -10, -5] and clr in ["white"]:
                    continue
                distortions[f"{clr.capitalize()}Noise_{snr}dB"] = add_noise(
                    ref, snr, clr
                )

    if ("harmonic" in distortion_keys) or distortion_keys == "all":
        for f_h, amp in zip([100, 500, 1000, 4000], [0.02, 0.03, 0.05, 0.08]):
            tone = amp * np.sin(2 * np.pi * f_h * t)
            distortions[f"Harmonic_{f_h}Hz"] = ref + tone

    if ("reverb" in distortion_keys) or distortion_keys == "all":
        for tail_ms, decay in zip([5, 10, 15, 20], [0.3, 0.5, 0.7, 0.9, 1.1]):
            L = int(sr * tail_ms / 1000)
            if L >= len(ref):
                continue
            irv = np.exp(-np.linspace(0, 3, L)) * decay
            reverbed = np.convolve(ref, irv)[: len(ref)]
            distortions[f"Reverb_{tail_ms}ms"] = reverbed

    if ("noisegate" in distortion_keys) or distortion_keys == "all":
        for thr in [0.005, 0.01, 0.02, 0.04]:
            g = ref.copy()
            g[np.abs(g) < thr] = 0
            distortions[f"NoiseGate_{thr}"] = g

    if ("pitch_shift" in distortion_keys) or distortion_keys == "all":
        n_fft = min(2048, len(ref) // 2)
        for shift in [-4, -2, 2, 4]:
            shifted = librosa.effects.pitch_shift(
                y=ref, sr=sr, n_steps=shift, n_fft=n_fft
            )
            distortions[f"PitchShift_{shift}st"] = shifted[: len(ref)]

    if ("lowpass" in distortion_keys) or distortion_keys == "all":
        for freq in [2000, 3000, 4000, 6000]:
            if freq >= (sr / 2):
                continue
            b, a = butter(4, freq / (sr / 2), "low")
            distortions[f"Lowpass_{freq}Hz"] = filtfilt(b, a, ref)

    if ("highpass" in distortion_keys) or distortion_keys == "all":
        for freq in [100, 300, 500, 800]:
            if freq >= (sr / 2):
                continue
            b, a = butter(4, freq / (sr / 2), "high")
            distortions[f"Highpass_{freq}Hz"] = filtfilt(b, a, ref)

    if ("echo" in distortion_keys) or distortion_keys == "all":
        for delay_ms, amp in zip([5, 10, 15, 20], [0.3, 0.5, 0.7]):
            delay = int(sr * delay_ms / 1000)
            if delay >= len(ref):
                continue
            echo = np.pad(ref, (delay, 0), "constant")[:-delay] * amp
            distortions[f"Echo_{delay_ms}ms"] = ref + echo

    if ("clipping" in distortion_keys) or distortion_keys == "all":
        for thr in [0.3, 0.5, 0.7]:
            distortions[f"Clipping_{thr}"] = np.clip(ref, -thr, thr)

    if ("vibrato" in distortion_keys) or distortion_keys == "all":
        for rate, depth in zip([3, 5, 7], [0.001, 0.002, 0.003]):
            vibrato = np.sin(2 * np.pi * rate * t) * depth
            vibrato_signal = librosa.effects.time_stretch(
                ref, rate=1 + float(vibrato.mean()), n_fft=min(2048, len(ref) // 2)
            )
            distortions[f"Vibrato_{rate}Hz"] = librosa.util.fix_length(
                vibrato_signal, size=len(ref)
            )

    return list(distortions.values())