Spaces:
Build error
Build error
| #!/usr/bin/env python3 -u | |
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| import argparse | |
| import sys | |
| from copy import deepcopy | |
| from scipy.signal import lfilter | |
| import numpy as np | |
| from tqdm import tqdm | |
| import soundfile as sf | |
| import os.path as osp | |
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: a parser exposing one required option,
        ``--rvad-home`` (short form ``-r``), stored as ``rvad_home``.
    """
    cli = argparse.ArgumentParser(description="compute vad segments")
    rvad_home_options = {
        "help": "path to rvad home (see https://github.com/zhenghuatan/rVADfast)",
        "required": True,
    }
    cli.add_argument("--rvad-home", "-r", **rvad_home_options)
    return cli
def rvad(speechproc, path):
    """Run the rVAD voice-activity pipeline on one audio file.

    Args:
        speechproc: the ``speechproc`` module from an rVADfast checkout. This
            function calls its ``sflux``, ``pitchblockdetect``,
            ``snre_highenergy`` and ``snre_vad`` functions.
        path: path of the audio file, read with ``soundfile.read``.

    Returns:
        tuple: ``(vad_seg, data)`` where ``vad_seg`` is whatever
        ``speechproc.snre_vad`` returns and ``data`` is the sample array as
        read from ``path`` (the filtering below works on a separate array).

    Raises:
        AssertionError: if the file's sample rate is not 16000 Hz.
    """
    winlen, ovrlen, nftt = 0.025, 0.01, 512
    ftThres = 0.5
    vadThres = 0.4
    opts = 1

    data, fs = sf.read(path)
    assert fs == 16_000, "sample rate must be 16khz"
    ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, winlen, ovrlen, nftt)

    # --spectral flatness --
    # pv01 is 1 for every frame whose ft value is at or below ftThres, else 0.
    pv01 = np.zeros(ft.shape[0])
    pv01[np.less_equal(ft, ftThres)] = 1
    # Hand pitchblockdetect its own copy so ft itself cannot be modified.
    pitch = deepcopy(ft)

    pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts)

    # --filtering--
    ENERGYFLOOR = np.exp(-50)
    # First-order IIR filter; the numerator sums to zero, so DC is removed.
    b = np.array([0.9770, -0.9770])
    a = np.array([1.0000, -0.9540])
    fdata = lfilter(b, a, data, axis=0)

    # --pass 1--
    noise_samp, noise_seg, n_noise_samp = speechproc.snre_highenergy(
        fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk
    )

    # sets noisy segments to zero
    # Each row of noise_samp is an inclusive [first, last] sample span; the
    # span is contiguous, so a slice zeroes it without building an index array.
    for j in range(n_noise_samp):
        first, last = int(noise_samp[j, 0]), int(noise_samp[j, 1])
        fdata[first : last + 1] = 0

    vad_seg = speechproc.snre_vad(
        fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres
    )
    return vad_seg, data
def _vad_to_segments(vads, stride, num_samples):
    """Turn per-frame 0/1 VAD decisions into ``(start, end)`` sample spans."""
    segments = []
    start = None
    for i, v in enumerate(vads):
        if start is None and v == 1:
            start = i * stride
        elif start is not None and v == 0:
            segments.append((start, i * stride))
            start = None
    # Speech still active at the last frame: close the span at end of audio.
    if start is not None:
        segments.append((start, num_samples))
    return segments


def main():
    """Read a manifest from stdin and print VAD segments for each audio file.

    The first stdin line is the root directory; on every following line the
    first whitespace-separated field is an audio path relative to that root.
    For each audio file one line is written to stdout containing
    space-separated ``start:end`` sample spans (an empty line when no span is
    found).

    Raises:
        SystemExit: if stdin is empty (exit status 1 with a message on stderr),
            or from argparse on invalid command-line arguments.
    """
    parser = get_parser()
    args = parser.parse_args()

    # speechproc lives in the rVADfast checkout, so it only becomes importable
    # once that directory is on sys.path.
    sys.path.append(args.rvad_home)
    import speechproc

    # Samples per VAD decision: 160 samples is 10 ms at 16 kHz.
    # NOTE(review): presumably matches the frame shift used by rvad - confirm.
    stride = 160

    lines = sys.stdin.readlines()
    if not lines:
        # Previously this surfaced as a bare IndexError on lines[0].
        sys.exit(
            "error: empty manifest on stdin; expected a root directory line "
            "followed by one audio path per line"
        )

    root = lines[0].rstrip()
    for fpath in tqdm(lines[1:]):
        path = osp.join(root, fpath.split()[0])
        vads, wav = rvad(speechproc, path)
        vad_segs = _vad_to_segments(vads, stride, len(wav))
        print(" ".join(f"{seg_start}:{seg_end}" for seg_start, seg_end in vad_segs))
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()