Spaces:
Sleeping
Sleeping
| # Embedding + autocomplete index builder - creates FAISS vector index and bigram index | |
| import os | |
| import numpy as np | |
| import pandas as pd | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| from config import ( | |
| META_CSV, | |
| INDEX_DIR, | |
| FAISS_PATH, | |
| EMBEDDING_MODEL, | |
| VIDEO_METADATA, | |
| ) | |
| # Autocomplete index builder | |
| from autocomplete import build_bigrams_index, BIGRAMS_PATH | |
| # Build FAISS embedding index + bigram autocomplete index | |
def build_embedding_index(subtitle_blocks: list[dict]):
    """Embed every subtitle block and write the FAISS and bigram indexes to disk.

    One vector is produced per block, in input order, and all vectors are
    added to a flat L2 FAISS index that is written to ``FAISS_PATH``. The
    same blocks are then passed to ``build_bigrams_index`` to produce the
    autocomplete index at ``BIGRAMS_PATH``.

    Args:
        subtitle_blocks: Dicts whose ``"text"`` value is the string to embed.
            Missing, ``None`` and NaN values (e.g. empty CSV cells loaded
            through pandas) are embedded as the empty string so that every
            block still gets exactly one vector.

    Raises:
        ValueError: If ``subtitle_blocks`` is empty.
    """
    texts = [_block_text(block) for block in subtitle_blocks]
    if not texts:
        raise ValueError("No texts found in subtitle blocks. Did you generate metadata.csv?")

    model = SentenceTransformer(EMBEDDING_MODEL)
    vectors = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    # FAISS only accepts float32 input.
    vectors = np.asarray(vectors, dtype=np.float32)

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    os.makedirs(INDEX_DIR, exist_ok=True)
    faiss.write_index(index, os.fspath(FAISS_PATH))

    # Build bigrams for autocomplete
    build_bigrams_index(subtitle_blocks, out_path=BIGRAMS_PATH, min_count=2)


def _block_text(block: dict) -> str:
    """Return the block's ``"text"`` as a string, mapping None/NaN to ``""``."""
    value = block.get("text")
    # NaN is truthy, so ``value or ""`` would let it through; ``value != value``
    # is true only for NaN.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return value if isinstance(value, str) else str(value)
| # Load subtitle blocks from CSV and attach video titles | |
def load_blocks_from_csv(csv_path) -> list[dict]:
    """Load subtitle rows from a CSV file and attach a ``"video_title"`` to each.

    Each row's ``"video_id"`` is matched against the ``"id"`` field of the
    entries in ``VIDEO_METADATA``; the matching entry's ``"title"`` is stored
    under ``"video_title"``. Rows with no matching entry get
    ``"Unknown Video"``.

    Args:
        csv_path: Path of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        One dict per CSV row (column name -> value) with ``"video_title"`` added.
    """
    records = pd.read_csv(csv_path).to_dict("records")

    # Build the id -> entry lookup once instead of rescanning VIDEO_METADATA
    # for every row. ``setdefault`` keeps the first entry for a duplicated id,
    # matching what a front-to-back linear scan would find.
    entry_by_id: dict = {}
    for entry in VIDEO_METADATA.values():
        entry_by_id.setdefault(entry["id"], entry)

    for record in records:
        entry = entry_by_id.get(record.get("video_id"))
        # Compare against None explicitly so a match is never discarded just
        # because its metadata key happens to be falsy.
        record["video_title"] = entry["title"] if entry is not None else "Unknown Video"
    return records
| # build FAISS + autocomplete indexes | |
if __name__ == "__main__":
    # Script entry point: rebuild both indexes from the metadata CSV, or fail
    # with a pointer to the step that generates it when the file is absent.
    if META_CSV.exists():
        build_embedding_index(load_blocks_from_csv(META_CSV))
    else:
        raise FileNotFoundError(
            f"metadata.csv not found at {META_CSV}. Run clean_subtitles.py first to generate it."
        )