Spaces:
Sleeping
Sleeping
| # Subtitle cleaning + preparation — reads .vtt, cleans text, chunks, punctuates, and outputs blocks | |
| import os | |
| import re | |
| import json | |
| from pathlib import Path | |
| import webvtt | |
| import pandas as pd | |
| from punctuation import punctuate_text | |
| from config import SUBS_DIR, META_CSV, VIDEO_METADATA, LINES_PER_CHUNK | |
| # Helpers | |
def clean_text(text: str) -> str:
    """Normalize one raw caption string.

    Steps, applied in order:

    1. Lowercase the text.
    2. Remove ``<...>`` markup tags and ``[...]`` annotations. Captions can
       span several lines, so both patterns match across newlines.
    3. Drop every character that is not ``a-z``, ``0-9``, whitespace, or one
       of ``. , ! ? ; : ' " ( ) -``.
    4. Collapse whitespace runs to a single space and strip the ends.

    Returns the cleaned string, which may be empty.
    """
    text = text.lower()
    # DOTALL lets "." cross line breaks; without it a tag or annotation
    # wrapped over two caption lines is not removed, and its words leak into
    # the transcript once the brackets are stripped by the filter below.
    text = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    text = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL)
    text = re.sub(r"[^a-z0-9.,!?;:'\"()\-\s]", '', text)
    return re.sub(r'\s+', ' ', text).strip()
def vtt_time_to_seconds(t: str) -> float:
    """Convert a WebVTT timestamp to seconds.

    Accepts both ``'HH:MM:SS.mmm'`` and the short ``'MM:SS.mmm'`` form, in
    which the hours field is omitted.

    Raises:
        ValueError: if the timestamp does not have two or three
            colon-separated fields, or a field is not numeric.
    """
    fields = t.strip().split(':')
    if len(fields) not in (2, 3):
        raise ValueError(f"Invalid WebVTT timestamp: {t!r}")
    *whole_fields, seconds = fields
    # Fold the optional hours field and the minutes field into whole minutes.
    minutes = 0
    for field in whole_fields:
        minutes = minutes * 60 + int(field)
    return minutes * 60 + float(seconds)
| # Main | |
def load_and_prepare_subtitles(folder_path: str | os.PathLike, lines_per_chunk: int = LINES_PER_CHUNK):
    """Read the .vtt files in a folder and return punctuated text blocks.

    For each ``.vtt`` file whose lowercased stem has an entry with a truthy
    ``"id"`` in ``VIDEO_METADATA``, every caption is cleaned with
    ``clean_text``, captions that clean to an empty string are dropped, and
    the remaining lines are grouped into chunks of ``lines_per_chunk``. Each
    chunk's text is joined with single spaces and passed once to
    ``punctuate_text``; if that returns a falsy value the unpunctuated text
    is used instead.

    Args:
        folder_path: Directory containing the ``.vtt`` files.
        lines_per_chunk: Number of cleaned caption lines per block (>= 1).

    Returns:
        A list of dicts, one per chunk, with keys ``text``, ``video_id``,
        ``timestamp`` (start timestamp string of the first line), ``lines``
        (JSON string of the chunk's per-line dicts), ``chunk_start`` and
        ``chunk_end`` (seconds).

    Raises:
        ValueError: if ``lines_per_chunk`` is less than 1.
    """
    # Fail early and clearly: a negative step would silently yield no blocks,
    # and a zero step would surface as an opaque range() error.
    if lines_per_chunk < 1:
        raise ValueError(f"lines_per_chunk must be >= 1, got {lines_per_chunk}")

    folder_path = os.fspath(folder_path)
    subtitle_blocks = []

    # os.listdir returns entries in arbitrary order; sort so the block order
    # is the same on every run.
    for filename in sorted(os.listdir(folder_path)):
        # The stem is matched case-insensitively below, so treat the
        # extension the same way.
        if not filename.lower().endswith(".vtt"):
            continue

        stem = Path(filename).stem.strip().lower()
        meta = VIDEO_METADATA.get(stem)
        real_video_id = meta["id"] if meta else None
        if not real_video_id:
            # No metadata (or no id) for this file: skip it.
            continue

        filepath = os.path.join(folder_path, filename)

        # Collect cleaned, unpunctuated lines with their original timestamps.
        raw_lines = []
        for caption in webvtt.read(filepath):
            cleaned = clean_text(caption.text)
            if cleaned:
                raw_lines.append({
                    "timestamp": caption.start,
                    "start_sec": vtt_time_to_seconds(caption.start),
                    "end_sec": vtt_time_to_seconds(caption.end),
                    "text": cleaned,
                    "video_id": real_video_id,
                })

        # Chunk by N lines, then punctuate each chunk once. An empty
        # raw_lines list simply produces no chunks.
        for i in range(0, len(raw_lines), lines_per_chunk):
            chunk_lines = raw_lines[i:i + lines_per_chunk]
            stripped = (line["text"].strip() for line in chunk_lines)
            chunk_raw_text = " ".join(piece for piece in stripped if piece)

            chunk_text = punctuate_text(chunk_raw_text) or chunk_raw_text

            subtitle_blocks.append({
                "text": chunk_text.strip(),
                "video_id": real_video_id,
                "timestamp": chunk_lines[0]["timestamp"],
                "lines": json.dumps(chunk_lines),
                "chunk_start": chunk_lines[0]["start_sec"],
                "chunk_end": chunk_lines[-1]["end_sec"],
            })

    return subtitle_blocks
| # process all subs and save META_CSV | |
if __name__ == "__main__":
    # Create the output directory if META_CSV has one. os.path.dirname
    # returns "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only call it for a non-empty directory part.
    output_dir = os.path.dirname(META_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process every subtitle file in SUBS_DIR and write the blocks as CSV.
    blocks = load_and_prepare_subtitles(SUBS_DIR, lines_per_chunk=LINES_PER_CHUNK)
    pd.DataFrame(blocks).to_csv(META_CSV, index=False)