# backend/embed_index.py
# Embedding + autocomplete index builder: creates the FAISS vector index and the bigram index
import os
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from config import (
    META_CSV,
    INDEX_DIR,
    FAISS_PATH,
    EMBEDDING_MODEL,
    VIDEO_METADATA,
)
# Autocomplete index builder
from autocomplete import build_bigrams_index, BIGRAMS_PATH
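
# A hedged sketch of the idea behind the imported build_bigrams_index, for
# orientation only; the real implementation lives in autocomplete.py and may
# differ. The expected behavior: count adjacent word pairs across block texts
# and keep those that occur at least `min_count` times.
def _bigram_sketch(subtitle_blocks: list[dict], min_count: int = 2) -> dict:
    from collections import Counter

    counts = Counter()
    for block in subtitle_blocks:
        words = (block.get("text") or "").lower().split()
        counts.update(zip(words, words[1:]))
    return {bigram: n for bigram, n in counts.items() if n >= min_count}
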
# Build the FAISS embedding index and the bigram autocomplete index
def build_embedding_index(subtitle_blocks: list[dict]):
    texts = [(s.get("text") or "") for s in subtitle_blocks]
    if not texts:
        raise ValueError("No texts found in subtitle blocks. Did you generate metadata.csv?")

    # Encode every subtitle block with the configured sentence-transformers model
    model = SentenceTransformer(EMBEDDING_MODEL)
    vectors = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    vectors = np.asarray(vectors, dtype=np.float32)

    # Exact (brute-force) L2 index; row i corresponds to subtitle_blocks[i]
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    os.makedirs(INDEX_DIR, exist_ok=True)
    faiss.write_index(index, os.fspath(FAISS_PATH))

    # Build bigrams for autocomplete
    build_bigrams_index(subtitle_blocks, out_path=BIGRAMS_PATH, min_count=2)
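
# A hedged sketch of query-time usage; the actual search code lives elsewhere
# in the app, and `search_blocks` / `top_k` are illustrative names, not part of
# this module. Queries must be encoded with the same EMBEDDING_MODEL used above.
def search_blocks(query: str, subtitle_blocks: list[dict], top_k: int = 5) -> list[dict]:
    model = SentenceTransformer(EMBEDDING_MODEL)
    q = model.encode([query], convert_to_numpy=True).astype(np.float32)
    index = faiss.read_index(os.fspath(FAISS_PATH))
    # IndexFlatL2 returns squared-L2 distances and row positions; rows line up
    # with the order of subtitle_blocks at build time.
    _distances, ids = index.search(q, top_k)
    return [subtitle_blocks[i] for i in ids[0] if i != -1]
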
# Load subtitle blocks from CSV and attach friendly video titles
def load_blocks_from_csv(csv_path) -> list[dict]:
    df = pd.read_csv(csv_path)
    records = df.to_dict("records")
    for r in records:
        vid = r.get("video_id")
        # Reverse lookup: find the friendly key whose configured id matches this row's video_id
        friendly_key = next((k for k, v in VIDEO_METADATA.items() if v["id"] == vid), None)
        if friendly_key:
            r["video_title"] = VIDEO_METADATA[friendly_key]["title"]
        else:
            r["video_title"] = "Unknown Video"
    return records
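
# Design note: the next(...) scan above costs O(len(VIDEO_METADATA)) per CSV
# row. For a large catalog, a precomputed reverse map gives O(1) lookups; a
# minimal equivalent sketch (not wired into the pipeline):
def _video_title_map() -> dict:
    # Map raw video id -> friendly title, built once from VIDEO_METADATA
    return {v["id"]: v["title"] for v in VIDEO_METADATA.values()}
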
# Build FAISS + autocomplete indexes
if __name__ == "__main__":
    if not META_CSV.exists():
        raise FileNotFoundError(
            f"metadata.csv not found at {META_CSV}. Run clean_subtitles.py first to generate it."
        )
    blocks = load_blocks_from_csv(META_CSV)
    build_embedding_index(blocks)
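    # Optional sanity check (a sketch, not part of the original pipeline): the
    # written index should hold exactly one vector per subtitle block.
    written = faiss.read_index(os.fspath(FAISS_PATH))
    assert written.ntotal == len(blocks), "FAISS index size mismatch"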