Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| # modules/indexer.py | |
| from typing import Dict, List | |
| from modules.youtube_metadata.db import get_youtube_metadata_collection | |
| from modules.youtube_metadata.embeddings import get_embedding | |
| import logging | |
# Install a default handler on the root logger at import time.
# basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig()
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Set this logger's threshold to INFO so the progress messages below pass it.
logger.setLevel(logging.INFO)
def _video_text(vid: Dict) -> str:
    """Build the "<title> - <description>" text that is embedded for one video."""
    # `or ""` also covers keys that are present but explicitly set to None,
    # which would otherwise be rendered as the literal string "None".
    return f"{vid.get('title') or ''} - {vid.get('description') or ''}"


def _video_metadata(vid: Dict, channel_url: str) -> Dict:
    """Build the metadata record stored next to one video's embedding."""
    metadata = {
        "video_id": vid.get("video_id"),
        "video_title": vid.get("title") or "",
        "description": vid.get("description") or "",
        "channel_url": channel_url,
    }
    # Optional channel fields are copied only when they carry a value.
    for key in ("channel_id", "channel_title"):
        if vid.get(key) is not None:
            metadata[key] = vid[key]
    return metadata


def index_videos(
    videos: List[Dict], channel_url: str, batch_size: int = 50
) -> int:
    """Embed video metadata and add it to the YouTube metadata collection.

    Videos are processed in slices of ``batch_size``. For every video the
    text ``"<title> - <description>"`` is embedded with ``get_embedding`` and
    passed to ``collection.add`` together with a metadata dict and the
    video's ``video_id`` as the record id.

    Videos without a ``video_id`` are skipped with a warning, because they
    have no id to be stored under.
    NOTE(review): this assumes the collection rejects ``None`` ids - confirm
    against the backing store.

    Args:
        videos: Video dicts. The keys read are ``video_id``, ``title``,
            ``description`` and, optionally, ``channel_id`` and
            ``channel_title``.
        channel_url: Channel URL recorded in every metadata record.
        batch_size: Maximum number of videos per ``collection.add`` call.

    Returns:
        The number of videos added to the collection. This equals
        ``len(videos)`` when every video has a ``video_id``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and report success without
    # indexing anything; zero would fail later with an opaque range() error.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    collection = get_youtube_metadata_collection()
    total = len(videos)
    indexed = 0
    logger.info(
        "index_videos: [INDEX] Starting indexing for %d videos (channel=%s)",
        total,
        channel_url,
    )

    for start in range(0, total, batch_size):
        batch = videos[start : start + batch_size]
        end = start + len(batch)
        # The loop body only runs when total > 0, so the division is safe.
        percent = round((end / total) * 100, 1)
        logger.info(
            "index_videos: [INDEX] Processing batch %d -> %d of %d - %s%%",
            start + 1,
            end,
            total,
            percent,
        )

        valid = [vid for vid in batch if vid.get("video_id")]
        skipped = len(batch) - len(valid)
        if skipped:
            logger.warning(
                "index_videos: [INDEX] Skipping %d videos without a video_id",
                skipped,
            )
        if not valid:
            # Nothing to store; avoid calling add() with empty lists.
            continue

        texts = [_video_text(vid) for vid in valid]
        embeddings = [get_embedding(text) for text in texts]
        metadatas = [_video_metadata(vid, channel_url) for vid in valid]
        ids = [vid["video_id"] for vid in valid]

        # Insert the whole batch with a single call.
        collection.add(
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )
        indexed += len(valid)
        logger.info(
            "index_videos: [INDEX] Indexed %d videos "
            "(total so far: %d/%d - %s%%)",
            len(valid),
            end,
            total,
            percent,
        )

    logger.info(
        "index_videos: [INDEX] Finished indexing %d of %d videos for channel=%s",
        indexed,
        total,
        channel_url,
    )
    return indexed