# File size: 4,260 Bytes | 7c08dc3
import os, re, json
import random
import argparse
import moviepy.editor as mp
from os import path
from pathlib import Path
from typing import List
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from scipy.spatial.distance import cosine
def extract_random_audio_segment(video_path: str, output_wav_path: str, duration: float = 5.0):
    """Write a randomly positioned audio excerpt of a video to a WAV file.

    The excerpt is written with the ``pcm_s16le`` codec at 16000 Hz.

    Args:
        video_path: Path of the video whose audio track is sampled.
        output_wav_path: Destination path of the audio file.
        duration: Requested excerpt length in seconds. When the audio track
            is not longer than this, the excerpt starts at 0 and is clamped
            to the end of the track.

    Raises:
        ValueError: If the video has no audio track.
    """
    print(video_path)
    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_path}")
        total_duration = audio.duration
        if duration >= total_duration:
            start_time = 0
        else:
            start_time = random.uniform(0, total_duration - duration)
        # Never request audio past the end of the track.
        end_time = min(start_time + duration, total_duration)
        audio_subclip = audio.subclip(start_time, end_time)
        audio_subclip.write_audiofile(output_wav_path, codec='pcm_s16le', fps=16000)
    finally:
        # Release the reader resources held by the clip, even when extraction
        # fails; this function is called repeatedly in a loop.
        video.close()
# Loaded embedding models keyed by device string, so repeated calls reuse
# one model instead of reloading the pretrained weights every time.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(device):
    """Return the speaker-embedding model for *device*, loading it on first use."""
    cache_key = str(device)
    if cache_key not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[cache_key] = PretrainedSpeakerEmbedding(
            "speechbrain/spkrec-ecapa-voxceleb", device=device
        )
    return _EMBEDDING_MODEL_CACHE[cache_key]


def compute_speaker_similarity(audio_path_1: str, audio_path_2: str, device: str = "cuda") -> float:
    """Return the cosine similarity between the speaker embeddings of two audio files.

    Args:
        audio_path_1: Path of the first audio file.
        audio_path_2: Path of the second audio file.
        device: Device passed to the pretrained embedding model.

    Returns:
        ``1 - cosine_distance`` of the two embeddings; higher means the
        embeddings are more alike.
    """
    embedding_model = _get_embedding_model(device)
    audio_loader = Audio(sample_rate=16000)
    wav1, _ = audio_loader(audio_path_1)
    wav2, _ = audio_loader(audio_path_2)
    # Keep only the first row of each waveform and add a leading axis
    # (presumably channel and batch respectively -- TODO confirm against the
    # pyannote Audio loader contract).
    wav1 = wav1[0:1].unsqueeze(0)
    wav2 = wav2[0:1].unsqueeze(0)
    embedding1 = embedding_model(wav1)
    embedding2 = embedding_model(wav2)
    # Flatten each embedding to 1-D, as scipy's cosine() expects vectors.
    embedding1 = embedding1.reshape(embedding1.shape[1])
    embedding2 = embedding2.reshape(embedding2.shape[1])
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity
def get_audio_sim_score(gen_video_path, gt_video_path):
    """Score speaker similarity between a generated and a ground-truth video.

    A random 5-second audio excerpt is extracted from each video into a
    ``.wav`` file next to it (same path, extension swapped), and the two
    excerpts are compared with ``compute_speaker_similarity``.

    Args:
        gen_video_path: Path of the generated video.
        gt_video_path: Path of the ground-truth video.

    Returns:
        The similarity value returned by ``compute_speaker_similarity``.
    """
    # Swap only the file extension. A plain str.replace('.mp4', '.wav') would
    # also rewrite '.mp4' inside directory names, and would leave a path with
    # any other extension unchanged -- making the extraction overwrite the
    # input video itself.
    gen_wav_path = path.splitext(gen_video_path)[0] + '.wav'
    gt_wav_path = path.splitext(gt_video_path)[0] + '.wav'
    extract_random_audio_segment(gen_video_path, gen_wav_path, duration=5)
    extract_random_audio_segment(gt_video_path, gt_wav_path, duration=5)
    return compute_speaker_similarity(gen_wav_path, gt_wav_path)
# Matches a run of digits at the start of a name, allowing leading whitespace
# and one optional opening quote character.
_num_at_start = re.compile(r'^\s*["\']?(\d+)')


def sort_by_leading_number(paths: List[str]) -> List[str]:
    """Return *paths* ordered by the integer that starts each path's final component.

    Entries whose final component has no leading number are placed after all
    numbered ones. Ties are broken by the final component's text.
    """
    def _rank(entry: str):
        base = Path(entry).name
        found = _num_at_start.match(base)
        if found is None:
            return (float('inf'), base)
        return (int(found.group(1)), base)

    ordered = list(paths)
    ordered.sort(key=_rank)
    return ordered
if __name__ == "__main__":
    # Score audio similarity between each generated video and its ground-truth
    # counterpart, checkpointing results to <save_dir>/<method>/audio_sim.json.
    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--result_dir", default="/path/to/result_dir")
    parser.add_argument("-g", "--gt_dir", default="/path/to/gt_dir")
    parser.add_argument("-s", "--save_dir", default="/path/to/save_dir")
    parser.add_argument("-n", "--num_samples", type=int, default=40,
                        help="maximum number of (result, ground-truth) pairs to score")
    args = parser.parse_args()

    # normpath strips a trailing slash; without it basename("dir/") is "" and
    # both the method dispatch below and the save location would be wrong.
    method_name = path.basename(path.normpath(args.result_dir))

    # Load existing results, if any, so an interrupted run can be resumed.
    save_dir = path.join(args.save_dir, method_name)
    save_path = path.join(save_dir, "audio_sim.json")
    os.makedirs(save_dir, exist_ok=True)
    if path.exists(save_path):
        with open(save_path, 'r') as f:
            audio_similarity_list = json.load(f)
    else:
        audio_similarity_list = []
    # Resume by recorded index rather than by list length: skipped samples
    # make the list shorter than the last processed index, which would cause
    # already-scored samples to be scored and appended again.
    done_indices = {entry["data_idx"] for entry in audio_similarity_list}

    # Pair result and ground-truth entries by their sorted position.
    gt_dir, result_dir = args.gt_dir, args.result_dir
    groundtruth_list = sort_by_leading_number([path.join(gt_dir, name) for name in os.listdir(gt_dir)])
    result_list = sort_by_leading_number([path.join(result_dir, name) for name in os.listdir(result_dir)])
    # NOTE(review): for "veo3" the .wav excerpts are written into result_dir
    # itself, so a later run may list them here too -- confirm the pairing
    # still holds on resume.

    # Never index past the end of either listing.
    num_samples = min(args.num_samples, len(result_list), len(groundtruth_list))
    for index in range(num_samples):
        if index in done_indices:
            continue
        if method_name == "paper2video":
            p2v_video_path = path.join(result_list[index], "3_merage.mp4")
        elif method_name == "veo3":
            # The listed entry is used directly as the video path.
            p2v_video_path = result_list[index]
        else:
            p2v_video_path = path.join(result_list[index], "result.mp4")
        if not path.exists(p2v_video_path):
            continue
        gt_video_path = path.join(groundtruth_list[index], "gt_presentation_video.mp4")
        if not path.exists(gt_video_path):
            continue
        print(p2v_video_path, gt_video_path)
        similarity = get_audio_sim_score(p2v_video_path, gt_video_path)
        audio_similarity_list.append({
            "data_idx": index,
            # float() accepts NumPy scalars as well as plain Python numbers,
            # unlike .item(), and always yields a JSON-serializable value.
            "score": float(similarity)
        })
        # Samples skipped in an earlier run may be filled in later; keep the
        # saved list ordered by sample index.
        audio_similarity_list.sort(key=lambda entry: entry["data_idx"])
        print(audio_similarity_list)
        # Save after every sample so progress survives an interruption.
        with open(save_path, 'w') as f:
            json.dump(audio_similarity_list, f, indent=4)