import warnings

import numpy as np
import pyloudnorm as pyln
import torch

from config import SILENCE_RATIO, SR

# pyloudnorm warns when normalization may clip; peaks are handled explicitly
# in loudness_normalize below.
warnings.filterwarnings("ignore", message="Possible clipped samples in output.")

def loudness_normalize(wav, sr=SR, target_lufs=-23.0):
    """
    Apply loudness normalization to an audio signal.

    :param wav: waveform signal to normalize.
    :param sr: sampling rate.
    :param target_lufs: target integrated loudness in LUFS.
    :return: normalized signal, clipped to [-1.0, 1.0].
    """
    meter = pyln.Meter(sr)
    loudness = meter.integrated_loudness(wav)
    normalized_wav = pyln.normalize.loudness(wav, loudness, target_lufs)
    # Rescale if normalization pushed the peak above full scale.
    peak = np.max(np.abs(normalized_wav))
    if peak > 1.0:
        normalized_wav = normalized_wav / peak
    return np.clip(normalized_wav, -1.0, 1.0)
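
# Usage sketch (illustrative: a one-second noise signal at the module's SR;
# the 0.1 scale is an arbitrary assumption, not a pipeline value):
#
#   wav = 0.1 * np.random.randn(SR)
#   wav_norm = loudness_normalize(wav)   # integrated loudness -> -23 LUFS
#   assert np.abs(wav_norm).max() <= 1.0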

def frame_rms_torch(sig, win, hop):
    """
    Compute the RMS of a signal over a sliding window.

    :param sig: signal for calculation.
    :param win: analysis window size in samples.
    :param hop: analysis window hop size in samples.
    :return: per-frame RMS of the signal.
    """
    frames = sig.unfold(0, win, hop)
    # Drop the final frame when it ends exactly at the signal boundary.
    if frames.size(0) and (frames.size(0) - 1) * hop == sig.numel() - win:
        frames = frames[:-1]
    # The epsilon keeps sqrt (and its gradient) finite on silent frames.
    return torch.sqrt((frames ** 2).mean(1) + 1e-12)
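
# Usage sketch (the 25 ms window / 10 ms hop are illustrative values, not
# taken from config):
#
#   sig = torch.randn(SR)                        # one second of audio
#   win, hop = int(0.025 * SR), int(0.010 * SR)
#   rms = frame_rms_torch(sig, win, hop)         # shape: (num_frames,)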

def compute_speaker_activity_masks(refs_tensors, win, hop):
    """
    Computes individual voice activity for each speaker and determines which
    frames have at least two active speakers.

    :param refs_tensors: reference signals that compose the mixture.
    :param win: analysis window size in samples.
    :param hop: analysis window hop size in samples.
    :return: (multi_speaker_mask, individual_speaker_masks)
        - multi_speaker_mask: boolean mask of frames where at least 2 speakers are active
        - individual_speaker_masks: list of boolean masks, one per speaker
    """
    device = refs_tensors[0].device
    individual_masks = []
    lengths = []
    for ref in refs_tensors:
        # A frame is voiced when its RMS exceeds a fixed fraction of the
        # reference's global RMS.
        rms = frame_rms_torch(ref, win, hop)
        threshold = SILENCE_RATIO * torch.sqrt((ref ** 2).mean())
        voiced = rms > threshold
        individual_masks.append(voiced)
        lengths.append(voiced.numel())
    # Pad shorter masks with False so all speakers share a common frame axis.
    L_max = max(lengths)
    padded_masks = []
    for mask, L in zip(individual_masks, lengths):
        if L < L_max:
            padded = torch.cat([mask, torch.zeros(L_max - L, dtype=torch.bool, device=device)])
        else:
            padded = mask
        padded_masks.append(padded)
    # Count active speakers per frame; overlap means two or more are voiced.
    stacked = torch.stack(padded_masks, dim=0)
    active_count = stacked.sum(dim=0)
    multi_speaker_mask = active_count >= 2
    return multi_speaker_mask, padded_masks
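
if __name__ == "__main__":
    # Minimal end-to-end sketch with synthetic references. The window and hop
    # sizes are illustrative assumptions, not values from config.
    win, hop = int(0.025 * SR), int(0.010 * SR)
    spk1 = torch.randn(SR)                                          # active throughout
    spk2 = torch.cat([torch.zeros(SR // 2), torch.randn(SR // 2)])  # second half only
    overlap, per_speaker = compute_speaker_activity_masks([spk1, spk2], win, hop)
    print(f"overlapped frames: {int(overlap.sum())}/{overlap.numel()}")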