# backend/embed_index.py
# Embedding + autocomplete index builder: creates the FAISS vector index and the bigram index
import os
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from config import (
    META_CSV,
    INDEX_DIR,
    FAISS_PATH,
    EMBEDDING_MODEL,
    VIDEO_METADATA,
)
# Autocomplete index builder
from autocomplete import build_bigrams_index, BIGRAMS_PATH
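
# A hedged sketch of the idea behind the imported build_bigrams_index, for
# orientation only; the real implementation lives in autocomplete.py and may
# differ. The expected behavior: count adjacent word pairs across block texts
# and keep those that occur at least `min_count` times.
def _bigram_sketch(subtitle_blocks: list[dict], min_count: int = 2) -> dict:
    from collections import Counter

    counts = Counter()
    for block in subtitle_blocks:
        words = (block.get("text") or "").lower().split()
        counts.update(zip(words, words[1:]))
    return {bigram: n for bigram, n in counts.items() if n >= min_count}
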
# Build the FAISS embedding index and the bigram autocomplete index
def build_embedding_index(subtitle_blocks: list[dict]):
    texts = [(s.get("text") or "") for s in subtitle_blocks]
    if not texts:
        raise ValueError("No texts found in subtitle blocks. Did you generate metadata.csv?")

    # Encode every subtitle block with the configured sentence-transformers model
    model = SentenceTransformer(EMBEDDING_MODEL)
    vectors = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    vectors = np.asarray(vectors, dtype=np.float32)

    # Exact (brute-force) L2 index; row i corresponds to subtitle_blocks[i]
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    os.makedirs(INDEX_DIR, exist_ok=True)
    faiss.write_index(index, os.fspath(FAISS_PATH))

    # Build bigrams for autocomplete
    build_bigrams_index(subtitle_blocks, out_path=BIGRAMS_PATH, min_count=2)
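
# A hedged sketch of query-time usage; the actual search code lives elsewhere
# in the app, and `search_blocks` / `top_k` are illustrative names, not part of
# this module. Queries must be encoded with the same EMBEDDING_MODEL used above.
def search_blocks(query: str, subtitle_blocks: list[dict], top_k: int = 5) -> list[dict]:
    model = SentenceTransformer(EMBEDDING_MODEL)
    q = model.encode([query], convert_to_numpy=True).astype(np.float32)
    index = faiss.read_index(os.fspath(FAISS_PATH))
    # IndexFlatL2 returns squared-L2 distances and row positions; rows line up
    # with the order of subtitle_blocks at build time.
    _distances, ids = index.search(q, top_k)
    return [subtitle_blocks[i] for i in ids[0] if i != -1]
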
# Load subtitle blocks from CSV and attach friendly video titles
def load_blocks_from_csv(csv_path) -> list[dict]:
    df = pd.read_csv(csv_path)
    records = df.to_dict("records")
    for r in records:
        vid = r.get("video_id")
        # Reverse lookup: find the friendly key whose configured id matches this row's video_id
        friendly_key = next((k for k, v in VIDEO_METADATA.items() if v["id"] == vid), None)
        if friendly_key:
            r["video_title"] = VIDEO_METADATA[friendly_key]["title"]
        else:
            r["video_title"] = "Unknown Video"
    return records
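
# Design note: the next(...) scan above costs O(len(VIDEO_METADATA)) per CSV
# row. For a large catalog, a precomputed reverse map gives O(1) lookups; a
# minimal equivalent sketch (not wired into the pipeline):
def _video_title_map() -> dict:
    # Map raw video id -> friendly title, built once from VIDEO_METADATA
    return {v["id"]: v["title"] for v in VIDEO_METADATA.values()}
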
# Build FAISS + autocomplete indexes
if __name__ == "__main__":
    if not META_CSV.exists():
        raise FileNotFoundError(
            f"metadata.csv not found at {META_CSV}. Run clean_subtitles.py first to generate it."
        )
    blocks = load_blocks_from_csv(META_CSV)
    build_embedding_index(blocks)
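    # Optional sanity check (a sketch, not part of the original pipeline): the
    # written index should hold exactly one vector per subtitle block.
    written = faiss.read_index(os.fspath(FAISS_PATH))
    assert written.ntotal == len(blocks), "FAISS index size mismatch"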