Spaces:
Running
on
Zero
Running
on
Zero
import json
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Tuple

import ffmpy
import numpy as np
import torch
def r128stats(filepath: str, quiet: bool):
    """Takes a path to an audio file, returns a dict with the loudness
    stats computed by the ffmpeg ebur128 filter.

    Parameters
    ----------
    filepath : str
        Path to compute loudness stats on.
    quiet : bool
        When True, ``-hide_banner`` is passed to ffmpeg. Note that ffmpeg's
        stderr is always captured for parsing, so it is not echoed to the
        console either way.

    Returns
    -------
    dict
        Dictionary containing loudness stats, with keys ``"I"``,
        ``"I Threshold"``, ``"LRA"``, ``"LRA Threshold"``, ``"LRA Low"``
        and ``"LRA High"`` (all floats).

    Raises
    ------
    ValueError
        If ffmpeg's output contains no ebur128 ``Summary:`` section (for
        example because ffmpeg failed to decode the file), or if an expected
        label is missing from that section.
    """
    # Global options go before the input so ffmpeg does not treat them as
    # trailing options after the output target.
    ffargs = ["ffmpeg", "-nostats"]
    if quiet:
        ffargs.append("-hide_banner")
    ffargs += ["-i", filepath, "-filter_complex", "ebur128", "-f", "null", "-"]

    # The ebur128 filter reports its measurements on stderr.
    proc = subprocess.run(ffargs, stderr=subprocess.PIPE, universal_newlines=True)
    stats = proc.stderr

    # Only the final summary matters; rfind skips any earlier occurrences.
    summary_index = stats.rfind("Summary:")
    if summary_index == -1:
        raise ValueError(
            f"ffmpeg produced no ebur128 summary for {filepath!r} "
            f"(exit code {proc.returncode})"
        )
    tokens = stats[summary_index:].split()

    def read(label: str, offset: int = 1) -> float:
        # offset=1 is the value right after the label; offset=4 skips the
        # value, its unit and the "Threshold:" label to reach the threshold.
        return float(tokens[tokens.index(label) + offset])

    return {
        "I": read("I:"),
        "I Threshold": read("I:", 4),
        "LRA": read("LRA:"),
        "LRA Threshold": read("LRA:", 4),
        "LRA Low": read("low:"),
        "LRA High": read("high:"),
    }
def ffprobe_offset_and_codec(path: str) -> Tuple[float, Optional[str]]:
    """Given a path to a file, returns the start time offset and codec of
    the first audio stream.

    Parameters
    ----------
    path : str
        Path of the media file to probe with ffprobe.

    Returns
    -------
    Tuple[float, Optional[str]]
        Start time of the first audio stream in seconds (0.0 when the
        stream reports none) and its codec name. When the file has no audio
        stream at all, ``(0.0, None)`` is returned.
    """
    ff = ffmpy.FFprobe(
        inputs={path: None},
        global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
    )
    probe = json.loads(ff.run(stdout=subprocess.PIPE)[0])

    # Return the offset and codec of the first audio stream we find.
    # Fields are read with .get so a stream entry that lacks one of them
    # is skipped (or defaulted) instead of raising KeyError.
    for stream in probe.get("streams", []):
        if stream.get("codec_type") == "audio":
            return float(stream.get("start_time", 0.0)), stream.get("codec_name")

    return 0.0, None
class FFMPEGMixin:
    """Mixin adding FFMPEG-backed loudness, resampling and loading helpers.

    The methods here rely on the host class providing ``batch_size``,
    ``sample_rate``, ``write(path)``, ``loudness()`` and integer indexing
    (``self[i]``), and on its constructor accepting a file path.
    """

    # Cached loudness values, filled in by ``ffmpeg_loudness``.
    _loudness = None

    def ffmpeg_loudness(self, quiet: bool = True):
        """Computes loudness of audio file using FFMPEG.

        Parameters
        ----------
        quiet : bool, optional
            Forwarded to ``r128stats``, by default True

        Returns
        -------
        torch.Tensor
            Loudness of every item in the batch, computed via
            FFMPEG.
        """
        loudness = []

        # One temp file is reused: each batch item overwrites it in turn.
        with tempfile.NamedTemporaryFile(suffix=".wav") as f:
            for i in range(self.batch_size):
                self[i].write(f.name)
                loudness_stats = r128stats(f.name, quiet=quiet)
                loudness.append(loudness_stats["I"])

        self._loudness = torch.from_numpy(np.array(loudness)).float()
        return self.loudness()

    def ffmpeg_resample(self, sample_rate: int, quiet: bool = True):
        """Resamples AudioSignal using FFMPEG. More memory-efficient
        than using julius.resample for long audio files.

        Parameters
        ----------
        sample_rate : int
            Sample rate to resample to.
        quiet : bool, optional
            Whether to suppress FFMPEG's banner and non-error log output,
            by default True

        Returns
        -------
        AudioSignal
            Resampled AudioSignal. When ``sample_rate`` already matches
            ``self.sample_rate``, ``self`` is returned unchanged.

        Raises
        ------
        subprocess.CalledProcessError
            If the ffmpeg command exits with a non-zero status.
        """
        if sample_rate == self.sample_rate:
            return self

        # Imported inside the method rather than at module top - presumably
        # to avoid a circular import with the audiotools package (confirm).
        from audiotools import AudioSignal

        # A private temp directory gives both files predictable names and
        # guarantees they are removed even if ffmpeg or loading fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            src = str(Path(tmp_dir) / "input.wav")
            dst = str(Path(tmp_dir) / "input.rs.wav")
            self.write(src)

            # Build argv as a list (no shell-style splitting of paths) and
            # put global options before the input/output files.
            command = ["ffmpeg"]
            if quiet:
                command += ["-hide_banner", "-loglevel", "error"]
            command += ["-i", src, "-ar", str(sample_rate), dst]
            subprocess.check_call(command)

            # Must be loaded before the directory is cleaned up.
            resampled = AudioSignal(dst)
        return resampled

    @classmethod
    def load_from_file_with_ffmpeg(cls, audio_path: str, quiet: bool = True, **kwargs):
        """Loads AudioSignal object after decoding it to a wav file using FFMPEG.
        Useful for loading audio that isn't covered by librosa's loading mechanism. Also
        useful for loading mp3 files, without any offset.

        Parameters
        ----------
        audio_path : str
            Path to load AudioSignal from.
        quiet : bool, optional
            Whether to limit FFMPEG's log output to errors,
            by default True
        **kwargs
            Forwarded to the class constructor along with the decoded
            wav path.

        Returns
        -------
        AudioSignal
            AudioSignal loaded from file with FFMPEG.
        """
        audio_path = str(audio_path)
        with tempfile.TemporaryDirectory() as d:
            wav_file = str(Path(d) / "extracted.wav")
            padded_wav = str(Path(d) / "padded.wav")

            global_options = "-y"
            if quiet:
                global_options += " -loglevel error"

            # First pass: decode the input to a plain wav file.
            ff = ffmpy.FFmpeg(
                inputs={audio_path: None},
                outputs={wav_file: None},
                global_options=global_options,
            )
            ff.run()

            # We pad the file using the start time offset in case it's an audio
            # stream starting at some offset in a video container.
            pad, codec = ffprobe_offset_and_codec(audio_path)

            # For mp3s, don't pad files with discrepancies less than 0.027s -
            # it's likely due to codec latency. The latency introduced by mp3
            # is 1152 samples, which is about 0.0261s at 44.1kHz. So we set
            # the threshold here slightly above that.
            # Source: https://lame.sourceforge.io/tech-FAQ.txt.
            if codec == "mp3" and pad < 0.027:
                pad = 0.0

            # Second pass: prepend the offset as silence (adelay takes
            # milliseconds) on all channels.
            ff = ffmpy.FFmpeg(
                inputs={wav_file: None},
                outputs={padded_wav: f"-af 'adelay={pad*1000}:all=true'"},
                global_options=global_options,
            )
            ff.run()

            # Must be loaded before the temp directory is cleaned up.
            signal = cls(padded_wav, **kwargs)

        return signal