# Subtitle cleaning + preparation — reads .vtt, cleans text, chunks, punctuates, and outputs blocks

import os
import re
import json
from pathlib import Path

import webvtt
import pandas as pd

from punctuation import punctuate_text
from config import SUBS_DIR, META_CSV, VIDEO_METADATA, LINES_PER_CHUNK


# Helpers

def clean_text(text: str) -> str:
    """Lowercase, strip tags/brackets, keep basic punctuation, collapse spaces."""
    text = text.lower()
    # Drop markup such as <c> / <00:00:01.000> and bracketed cues such as [music].
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    # Keep only lowercase ASCII letters, digits, whitespace and basic punctuation.
    text = re.sub(r"[^a-z0-9.,!?;:'\"()\-\s]", '', text)
    # Collapse any whitespace run (including newlines) into a single space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def vtt_time_to_seconds(t: str) -> float:
    """Convert a WebVTT timestamp to seconds.

    Accepts both the full form 'HH:MM:SS.mmm' and the short form
    'MM:SS.mmm' (hours omitted).

    Raises:
        ValueError: if the timestamp does not have two or three
            colon-separated fields, or a field is not numeric.
    """
    parts = t.strip().split(':')
    if len(parts) not in (2, 3):
        raise ValueError(f"Invalid WebVTT timestamp: {t!r}")
    hours = int(parts[0]) if len(parts) == 3 else 0
    minutes = int(parts[-2])
    seconds = float(parts[-1])
    return hours * 3600 + minutes * 60 + seconds


def _collect_caption_lines(filepath: str, video_id) -> list[dict]:
    """Read one .vtt file and return its non-empty cleaned captions as dicts."""
    lines = []
    for caption in webvtt.read(filepath):
        cleaned = clean_text(caption.text)
        if not cleaned:
            # Caption contained only tags/brackets/stripped characters.
            continue
        lines.append({
            "timestamp": caption.start,
            "start_sec": vtt_time_to_seconds(caption.start),
            "end_sec": vtt_time_to_seconds(caption.end),
            "text": cleaned,
            "video_id": video_id,
        })
    return lines


def _join_chunk_text(chunk_lines: list[dict]) -> str:
    """Join the stripped, non-empty 'text' of each line with single spaces."""
    stripped = (line["text"].strip() for line in chunk_lines)
    return " ".join(text for text in stripped if text)


# Main

def load_and_prepare_subtitles(folder_path: str | os.PathLike, lines_per_chunk: int = LINES_PER_CHUNK) -> list[dict]:
    """
    Reads .vtt files, cleans captions, chunks by N lines,
    punctuates the WHOLE chunk once, and returns blocks suitable for indexing.

    Files are processed in sorted filename order so the output is
    deterministic. A file is skipped when its lowercased stem has no entry
    (or an entry with a falsy "id") in VIDEO_METADATA, or when it yields no
    non-empty captions after cleaning.

    Args:
        folder_path: Directory containing the .vtt files.
        lines_per_chunk: Number of caption lines per output block (>= 1).

    Returns:
        A list of dicts with keys "text", "video_id", "timestamp",
        "lines" (JSON string of the chunk's line dicts), "chunk_start"
        and "chunk_end" (seconds).

    Raises:
        ValueError: if lines_per_chunk is less than 1.
    """
    if lines_per_chunk < 1:
        raise ValueError(f"lines_per_chunk must be >= 1, got {lines_per_chunk!r}")

    folder_path = os.fspath(folder_path)
    subtitle_blocks = []

    # os.listdir order is arbitrary; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".vtt"):
            continue

        stem = Path(filename).stem.strip().lower()
        meta = VIDEO_METADATA.get(stem)
        real_video_id = meta["id"] if meta else None
        if not real_video_id:
            continue

        filepath = os.path.join(folder_path, filename)

        # Collect cleaned, unpunctuated lines with original timestamps
        raw_lines = _collect_caption_lines(filepath, real_video_id)
        if not raw_lines:
            continue

        # Chunk by N lines, then punctuate per chunk
        for i in range(0, len(raw_lines), lines_per_chunk):
            chunk_lines = raw_lines[i:i + lines_per_chunk]
            chunk_raw_text = _join_chunk_text(chunk_lines)

            # Fall back to the unpunctuated text if punctuation returns nothing.
            chunk_text = punctuate_text(chunk_raw_text) or chunk_raw_text

            subtitle_blocks.append({
                "text": chunk_text.strip(),
                "video_id": real_video_id,
                "timestamp": chunk_lines[0]["timestamp"],
                "lines": json.dumps(chunk_lines),
                "chunk_start": chunk_lines[0]["start_sec"],
                "chunk_end": chunk_lines[-1]["end_sec"],
            })

    return subtitle_blocks


# process all subs and save META_CSV
if __name__ == "__main__":
    out_dir = os.path.dirname(META_CSV)
    # dirname is "" for a bare filename; os.makedirs("") would raise.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    blocks = load_and_prepare_subtitles(SUBS_DIR, lines_per_chunk=LINES_PER_CHUNK)
    pd.DataFrame(blocks).to_csv(META_CSV, index=False)