# MAPSS-measures / audio.py
import warnings

import librosa
import numpy as np
import pyloudnorm as pyln
import torch

from config import SILENCE_RATIO, SR
from utils import hungarian, safe_corr_np

# pyloudnorm warns when loudness normalization pushes samples past full
# scale; clipping is handled explicitly below, so suppress the warning.
warnings.filterwarnings("ignore", message="Possible clipped samples in output.")

def loudness_normalize(wav, sr=SR, target_lufs=-23.0):
    """Normalize a waveform to the target integrated loudness (LUFS)."""
    meter = pyln.Meter(sr)
    loudness = meter.integrated_loudness(wav)
    normalized_wav = pyln.normalize.loudness(wav, loudness, target_lufs)
    # Rescale if normalization pushed the signal past full scale, then clip
    # to guard against residual floating-point overshoot.
    peak = np.max(np.abs(normalized_wav))
    if peak > 1.0:
        normalized_wav = normalized_wav / max(peak, 1e-12)
    return np.clip(normalized_wav, -1.0, 1.0)
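
# Illustrative usage (not part of the original module; the file name is
# hypothetical):
#
#   y, _ = librosa.load("mix.wav", sr=SR)
#   y_norm = loudness_normalize(y)  # -23 LUFS, peak-safe, clipped to [-1, 1]
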
def frame_rms_torch(sig, win, hop):
    """Frame-wise RMS of a 1-D tensor using windows of `win` samples every `hop`."""
    dev = sig.device
    frames = sig.unfold(0, win, hop)
    # Drop the final frame when it ends exactly at the signal boundary.
    if frames.size(0) and (frames.size(0) - 1) * hop == sig.numel() - win:
        frames = frames[:-1]
    rms = torch.sqrt((frames**2).mean(1) + 1e-12)  # epsilon avoids sqrt(0)
    return rms.to(dev)
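
# Illustrative usage (not part of the original module; the 25 ms / 10 ms
# window and hop are assumptions, not values taken from config):
#
#   t = torch.arange(SR, dtype=torch.float32) / SR
#   sig = torch.sin(2 * torch.pi * 440.0 * t)  # 1 s, 440 Hz sine
#   rms = frame_rms_torch(sig, win=int(0.025 * SR), hop=int(0.010 * SR))
#   # each entry is ~0.707, the RMS of a unit-amplitude sine
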
def make_union_voiced_mask(refs_tensors, win, hop):
    """Boolean mask of frames in which no reference is silent.

    A frame counts as silent for a reference when its RMS falls at or below
    SILENCE_RATIO times that reference's global RMS; silent frames are
    unioned across references and the returned mask is the complement.
    """
    device = refs_tensors[0].device
    rms_vecs = [frame_rms_torch(r, win, hop) for r in refs_tensors]
    lengths = [v.numel() for v in rms_vecs]
    L_max = max(lengths)
    silent_union = torch.zeros(L_max, dtype=torch.bool, device=device)
    for idx, (rms, L) in enumerate(zip(rms_vecs, lengths)):
        # Per-reference silence threshold relative to its global RMS.
        thr = SILENCE_RATIO * torch.sqrt((refs_tensors[idx] ** 2).mean())
        sil = rms <= thr
        silent_union[:L] |= sil
    return ~silent_union
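
# Illustrative usage (not part of the original module; the paths and the
# 2048/512 framing are assumptions):
#
#   refs = [torch.as_tensor(loudness_normalize(librosa.load(str(p), sr=SR)[0]))
#           for p in ("ref1.wav", "ref2.wav")]
#   mask = make_union_voiced_mask(refs, win=2048, hop=512)
#   rms0 = frame_rms_torch(refs[0], 2048, 512)
#   voiced_rms0 = rms0[mask[: rms0.numel()]]  # frames active in every reference
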
def assign_outputs_to_refs_by_corr(ref_paths, out_paths):
    """Match outputs to references by correlation via Hungarian assignment.

    Returns a list where entry i is the index into out_paths assigned to
    ref_paths[i], or None when no output was matched.
    """
    if not out_paths:
        return [None] * len(ref_paths)
    refs = [loudness_normalize(librosa.load(str(p), sr=SR)[0]) for p in ref_paths]
    outs = [loudness_normalize(librosa.load(str(p), sr=SR)[0]) for p in out_paths]
    n, m = len(refs), len(outs)
    # Pad to a square cost matrix so the Hungarian solver handles n != m;
    # padded cells cost 1.0, worse than any real pairing.
    K = max(n, m)
    C = np.ones((K, K), dtype=np.float64)
    for i in range(n):
        for j in range(m):
            r = safe_corr_np(refs[i], outs[j])
            # Map correlation in [-1, 1] to cost in [1, 0]; lower = better.
            C[i, j] = 1.0 - (r + 1.0) * 0.5
    ri, cj = hungarian(C)
    mapping = [None] * n
    for i, j in zip(ri, cj):
        if i < n and j < m:  # ignore assignments involving padded rows/cols
            mapping[i] = int(j)
    return mapping
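
# Minimal end-to-end sketch (not part of the original module). The WAV file
# names are hypothetical; run only with real reference/output files in place.
if __name__ == "__main__":
    ref_paths = ["ref_a.wav", "ref_b.wav"]
    out_paths = ["sep_0.wav", "sep_1.wav"]
    mapping = assign_outputs_to_refs_by_corr(ref_paths, out_paths)
    for i, j in enumerate(mapping):
        matched = out_paths[j] if j is not None else "<unmatched>"
        print(f"{ref_paths[i]} -> {matched}")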