ks-version-1-1 / backend /clean_subtitles.py
NIKKI77's picture
Deploy: GPU-ready HF Space (Docker)
903b444
# Subtitle cleaning + preparation — reads .vtt, cleans text, chunks, punctuates, and outputs blocks
import html
import json
import os
import re
from pathlib import Path

import pandas as pd
import webvtt

from config import SUBS_DIR, META_CSV, VIDEO_METADATA, LINES_PER_CHUNK
from punctuation import punctuate_text
# Helpers
def clean_text(text: str) -> str:
    """Normalize one caption string for indexing.

    Lowercases the text, removes ``<...>`` tags and ``[...]`` bracketed
    spans, decodes HTML entities, drops every character other than
    ``a-z``, ``0-9``, ``. , ! ? ; : ' " ( ) -`` and whitespace, and finally
    collapses whitespace runs into single spaces.

    Args:
        text: Raw caption text.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace. May be
        an empty string if nothing survives the filtering.
    """
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    # Decode entities only after tag stripping, so an escaped "&lt;b&gt;" is
    # treated as literal text rather than as a tag. Without this step,
    # "&amp;" would survive the character filter below as the residue "amp;".
    text = html.unescape(text)
    text = re.sub(r"[^a-z0-9.,!?;:'\"()\-\s]", '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def vtt_time_to_seconds(t: str) -> float:
    """Convert a WebVTT timestamp to seconds.

    Accepts both the full ``'HH:MM:SS.mmm'`` form and the short
    ``'MM:SS.mmm'`` form (WebVTT allows the hours field to be omitted).

    Args:
        t: Timestamp string with two or three colon-separated fields.

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If ``t`` does not have two or three fields, or a field
            is not numeric.
    """
    parts = t.strip().split(':')
    if len(parts) not in (2, 3):
        raise ValueError(f"Invalid WebVTT timestamp: {t!r}")
    # Fold the leading integer fields (hours and/or minutes) into minutes,
    # then add the fractional seconds field.
    minutes = 0
    for field in parts[:-1]:
        minutes = minutes * 60 + int(field)
    return minutes * 60 + float(parts[-1])
# Main
def load_and_prepare_subtitles(folder_path: str | os.PathLike, lines_per_chunk: int = LINES_PER_CHUNK) -> list[dict]:
    """Read .vtt files, clean captions, chunk by N lines, and punctuate each chunk.

    Every ``.vtt`` file in ``folder_path`` whose lowercased stem has an entry
    with a truthy ``"id"`` in ``VIDEO_METADATA`` is processed; other files are
    skipped. Captions are cleaned with ``clean_text``, grouped into chunks of
    ``lines_per_chunk`` lines, and the WHOLE chunk is punctuated once.

    Args:
        folder_path: Directory containing the ``.vtt`` files.
        lines_per_chunk: Number of cleaned caption lines per block (>= 1).

    Returns:
        A list of block dicts with keys ``text``, ``video_id``, ``timestamp``
        (start timestamp string of the first line), ``lines`` (JSON string of
        the chunk's line records), ``chunk_start`` and ``chunk_end`` (seconds).

    Raises:
        ValueError: If ``lines_per_chunk`` is less than 1.
    """
    # Fail fast: a zero step would otherwise blow up inside range() and a
    # negative one would silently produce no blocks at all.
    if lines_per_chunk < 1:
        raise ValueError(f"lines_per_chunk must be >= 1, got {lines_per_chunk!r}")

    folder_path = os.fspath(folder_path)
    subtitle_blocks = []

    # os.listdir returns entries in arbitrary order; sort so the output
    # block order is reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".vtt"):
            continue

        stem = Path(filename).stem.strip().lower()
        meta = VIDEO_METADATA.get(stem)
        real_video_id = meta["id"] if meta else None
        if not real_video_id:
            continue

        filepath = os.path.join(folder_path, filename)

        # Collect cleaned, unpunctuated lines with original timestamps
        raw_lines = []
        for caption in webvtt.read(filepath):
            cleaned = clean_text(caption.text)
            if cleaned:
                raw_lines.append({
                    "timestamp": caption.start,
                    "start_sec": vtt_time_to_seconds(caption.start),
                    "end_sec": vtt_time_to_seconds(caption.end),
                    "text": cleaned,
                    "video_id": real_video_id,
                })
        if not raw_lines:
            continue

        # Chunk by N lines, then punctuate per chunk
        for i in range(0, len(raw_lines), lines_per_chunk):
            chunk_lines = raw_lines[i:i + lines_per_chunk]

            # Every stored line is already stripped and non-empty, so a plain
            # space-join yields the same text as joining line by line.
            chunk_raw_text = " ".join(line["text"] for line in chunk_lines)

            # Fall back to the raw text if punctuation returns nothing
            chunk_text = punctuate_text(chunk_raw_text) or chunk_raw_text

            subtitle_blocks.append({
                "text": chunk_text.strip(),
                "video_id": real_video_id,
                "timestamp": chunk_lines[0]["timestamp"],
                "lines": json.dumps(chunk_lines),
                "chunk_start": chunk_lines[0]["start_sec"],
                "chunk_end": chunk_lines[-1]["end_sec"],
            })

    return subtitle_blocks
# process all subs and save META_CSV
if __name__ == "__main__":
    # Path.parent is "." for a bare filename, so this also works when META_CSV
    # has no directory component (os.makedirs("") would raise FileNotFoundError).
    Path(META_CSV).parent.mkdir(parents=True, exist_ok=True)
    blocks = load_and_prepare_subtitles(SUBS_DIR, lines_per_chunk=LINES_PER_CHUNK)
    # One CSV row per subtitle block; the DataFrame index is not written.
    pd.DataFrame(blocks).to_csv(META_CSV, index=False)