vikramvasudevan's picture
Upload folder using huggingface_hub
73a6587 verified
# modules/indexer.py
from typing import Dict, List
from modules.youtube_metadata.db import get_youtube_metadata_collection
from modules.youtube_metadata.embeddings import get_embedding
import logging
# Ensure the root logger has a handler so this module's records are emitted;
# basicConfig() does nothing if the root logger is already configured.
logging.basicConfig()
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Let INFO-and-above records from this module pass this logger's level check.
logger.setLevel(logging.INFO)
def index_videos(
    videos: List[Dict], channel_url: str, batch_size: int = 50
) -> int:
    """Embed and store video metadata in the YouTube metadata collection.

    Videos are processed in consecutive batches of at most ``batch_size``.
    For each video the text ``"<title> - <description>"`` is embedded with
    ``get_embedding`` and added to the collection together with a metadata
    dict, using the video's ``video_id`` as the record id.

    Args:
        videos: Video dicts. Keys read here are ``video_id``, ``title``,
            ``description`` and, when present, ``channel_id`` and
            ``channel_title``.
        channel_url: Channel URL stored in every record's metadata.
        batch_size: Maximum number of videos per ``collection.add`` call.
            Must be at least 1.

    Returns:
        The number of videos in ``videos``.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Validate up front: a negative step makes ``range`` empty, so nothing
    # would be indexed while the function still reported success, and zero
    # would only fail later inside ``range`` with an unrelated message.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    collection = get_youtube_metadata_collection()
    total = len(videos)
    logger.info(
        "index_videos: [INDEX] Starting indexing for %s videos (channel=%s)",
        total,
        channel_url,
    )

    # Split into batches. With total == 0 the loop body never runs, so the
    # percentage division below cannot divide by zero.
    for start in range(0, total, batch_size):
        batch = videos[start : start + batch_size]
        end = start + len(batch)
        percent = round((end / total) * 100, 1)
        logger.info(
            "index_videos: [INDEX] Processing batch %s → %s of %s — %s%%",
            start + 1,
            end,
            total,
            percent,
        )

        # Prepare text inputs: one "<title> - <description>" string per video.
        texts = [
            f"{vid.get('title', '')} - {vid.get('description', '')}"
            for vid in batch
        ]
        embeddings = [get_embedding(text) for text in texts]

        # Build metadata + ids, kept in the same order as ``texts``.
        metadatas, ids = [], []
        for vid in batch:
            # NOTE(review): a video without "video_id" produces a None id and
            # a None metadata value here — confirm how the collection backend
            # treats that before relying on partially populated input.
            video_id = vid.get("video_id")
            metadata = {
                "video_id": video_id,
                "video_title": vid.get("title", ""),
                "description": vid.get("description", ""),
                "channel_url": channel_url,
            }
            # Channel fields are optional; copy them only when supplied.
            if "channel_id" in vid:
                metadata["channel_id"] = vid["channel_id"]
            if "channel_title" in vid:
                metadata["channel_title"] = vid["channel_title"]
            metadatas.append(metadata)
            ids.append(video_id)

        # Insert in bulk: one add() call per batch.
        collection.add(
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
        )
        logger.info(
            "index_videos: [INDEX] ✅ Indexed %s videos "
            "(total so far: %s/%s — %s%%)",
            len(batch),
            end,
            total,
            percent,
        )

    logger.info(
        "index_videos: [INDEX] 🎉 Finished indexing %s videos for channel=%s",
        total,
        channel_url,
    )
    return total