Spaces:

NIKKI77
/

ks-version-1-1

Sleeping

App Files Files Community

ks-version-1-1 / backend /clean_subtitles.py

NIKKI77

Deploy: GPU-ready HF Space (Docker)

903b444 3 months ago

raw

history blame contribute delete

3.58 kB

	# Subtitle cleaning + preparation — reads .vtt, cleans text, chunks, punctuates, and outputs blocks
	import os
	import re
	import json
	from pathlib import Path
	import webvtt
	import pandas as pd
	from punctuation import punctuate_text
	from config import SUBS_DIR, META_CSV, VIDEO_METADATA, LINES_PER_CHUNK

	# Helpers
	def clean_text(text: str) -> str:
	    """Normalize a caption string for chunking.

	    The input is lowercased and then passed through these substitutions,
	    in order:

	    1. ``<...>`` spans (markup tags) are removed.
	    2. ``[...]`` spans (bracketed annotations) are removed.
	    3. Every character other than ``a-z``, ``0-9``, whitespace and the
	       punctuation ``. , ! ? ; : ' " ( ) -`` is dropped.
	    4. Runs of whitespace are collapsed to a single space.

	    Returns the result with leading/trailing whitespace stripped; the
	    result may be an empty string.
	    """
	    # (pattern, replacement) pairs; order matters because tags and
	    # brackets must be removed before their delimiters are filtered out.
	    substitutions = (
	        (r'<.*?>', ''),
	        (r'\[.*?\]', ''),
	        (r"[^a-z0-9.,!?;:'\"()\-\s]", ''),
	        (r'\s+', ' '),
	    )
	    normalized = text.lower()
	    for pattern, replacement in substitutions:
	        normalized = re.sub(pattern, replacement, normalized)
	    return normalized.strip()

	def vtt_time_to_seconds(t: str) -> float:
	    """Convert a WebVTT timestamp to seconds.

	    Accepts both the full form ``'HH:MM:SS.mmm'`` and the short form
	    ``'MM:SS.mmm'`` (WebVTT allows the hours field to be omitted).

	    Args:
	        t: Timestamp string with colon-separated fields.

	    Returns:
	        The timestamp as a number of seconds (float).

	    Raises:
	        ValueError: If ``t`` does not have two or three colon-separated
	            fields, or a field is not numeric.
	    """
	    fields = t.split(':')
	    if len(fields) == 2:
	        # Short form without hours: treat it as 0 hours.
	        fields.insert(0, '0')
	    if len(fields) != 3:
	        raise ValueError(f"Invalid WebVTT timestamp: {t!r}")
	    h, m, s = fields
	    return int(h) * 3600 + int(m) * 60 + float(s)

	# Main
	def load_and_prepare_subtitles(folder_path: str | os.PathLike, lines_per_chunk: int = LINES_PER_CHUNK):
	    """Read .vtt files from a folder and build punctuated text blocks.

	    For every ``*.vtt`` file whose lowercased stem has an entry with a
	    truthy ``"id"`` in ``VIDEO_METADATA``, the captions are cleaned with
	    ``clean_text``, grouped into chunks of ``lines_per_chunk`` captions,
	    and each chunk's joined text is passed through ``punctuate_text`` once.
	    Files without a metadata entry, and captions that clean to an empty
	    string, are skipped.

	    Args:
	        folder_path: Directory containing the .vtt files.
	        lines_per_chunk: Number of cleaned caption lines per block
	            (must be >= 1).

	    Returns:
	        A list of dicts, one per chunk, with keys:
	        ``text`` (punctuated chunk text, or the raw joined text when
	        ``punctuate_text`` returns a falsy value), ``video_id``,
	        ``timestamp`` (start timestamp string of the first caption),
	        ``lines`` (JSON string of the chunk's per-caption dicts),
	        ``chunk_start`` and ``chunk_end`` (seconds).

	    Raises:
	        ValueError: If ``lines_per_chunk`` is less than 1.
	    """
	    # Fail early: a step of 0 would only blow up inside range() once a
	    # file is found, and a negative step would silently yield no blocks.
	    if lines_per_chunk < 1:
	        raise ValueError(f"lines_per_chunk must be >= 1, got {lines_per_chunk!r}")

	    folder_path = os.fspath(folder_path)
	    subtitle_blocks = []

	    # os.listdir() returns entries in arbitrary order; sort so the block
	    # order is reproducible between runs and machines.
	    for filename in sorted(os.listdir(folder_path)):
	        if not filename.endswith(".vtt"):
	            continue

	        # Metadata is looked up by the normalized (stripped, lowercased) stem.
	        stem = Path(filename).stem.strip().lower()
	        meta = VIDEO_METADATA.get(stem)
	        real_video_id = meta["id"] if meta else None
	        if not real_video_id:
	            continue

	        filepath = os.path.join(folder_path, filename)
	        raw_lines = []

	        # Collect cleaned, unpunctuated lines with original timestamps
	        for caption in webvtt.read(filepath):
	            cleaned = clean_text(caption.text)
	            if cleaned:
	                raw_lines.append({
	                    "timestamp": caption.start,
	                    "start_sec": vtt_time_to_seconds(caption.start),
	                    "end_sec": vtt_time_to_seconds(caption.end),
	                    "text": cleaned,
	                    "video_id": real_video_id,
	                })

	        if not raw_lines:
	            continue

	        # Chunk by N lines, then punctuate per chunk
	        for i in range(0, len(raw_lines), lines_per_chunk):
	            chunk_lines = raw_lines[i:i + lines_per_chunk]

	            # Join the non-empty line texts with single spaces.
	            stripped_texts = (line["text"].strip() for line in chunk_lines)
	            chunk_raw_text = " ".join(piece for piece in stripped_texts if piece)

	            # Punctuate the whole chunk once; fall back to the raw text if
	            # the punctuator returns a falsy value.
	            chunk_text = punctuate_text(chunk_raw_text) or chunk_raw_text

	            subtitle_blocks.append({
	                "text": chunk_text.strip(),
	                "video_id": real_video_id,
	                "timestamp": chunk_lines[0]["timestamp"],
	                "lines": json.dumps(chunk_lines),
	                "chunk_start": chunk_lines[0]["start_sec"],
	                "chunk_end": chunk_lines[-1]["end_sec"],
	            })

	    return subtitle_blocks

	# Process all subtitles in SUBS_DIR and save the resulting blocks to META_CSV
	if __name__ == "__main__":
	    # Path.parent is "." when META_CSV is a bare filename, so this also
	    # works without a directory component (os.makedirs("") would raise
	    # FileNotFoundError in that case).
	    Path(META_CSV).parent.mkdir(parents=True, exist_ok=True)
	    blocks = load_and_prepare_subtitles(SUBS_DIR, lines_per_chunk=LINES_PER_CHUNK)
	    pd.DataFrame(blocks).to_csv(META_CSV, index=False)