vikramvasudevan committed
Commit 73a6587 · verified · 1 Parent(s): bb80c5f

Upload folder using huggingface_hub

config.py CHANGED
@@ -754,6 +754,13 @@ class SanatanConfig:
             if scripture["collection_name"] == collection_name
         ][0]
 
+    def get_scripture_by_name(self, scripture_name: str):
+        return [
+            scripture
+            for scripture in self.scriptures
+            if scripture["name"] == scripture_name
+        ][0]
+
     def is_metadata_field_allowed(
         self, collection_name: str, metadata_where_clause: MetadataWhereClause
     ):
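
Note: like get_scripture_by_collection_name above it, the new get_scripture_by_name indexes [0] into a filtered list, so an unknown name raises IndexError rather than returning None. A minimal caller sketch (not part of this commit; it assumes SanatanConfig().scriptures is a list of dicts with "name" and "collection_name" keys):

from config import SanatanConfig

config = SanatanConfig()
try:
    scripture = config.get_scripture_by_name("yt_metadata")
    print(scripture["collection_name"])
except IndexError:
    # no scripture with that name exists in config.scriptures
    print("unknown scripture name")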
db.py CHANGED
@@ -7,9 +7,8 @@ import re, unicodedata
 from config import SanatanConfig
 from embeddings import get_embedding
 import logging
-from pydantic import BaseModel
 
-from metadata import MetadataFilter, MetadataWhereClause
+from metadata import MetadataWhereClause
 from modules.db.relevance import validate_relevance_queryresult
 from tqdm import tqdm
 
@@ -59,7 +58,7 @@ class SanatanDatabase:
             metadata_where_clause.to_chroma_where()
             if metadata_where_clause is not None
             else None
-            )
+            ),
         )
         docs = data["documents"]  # list of all verse texts
         ids = data["ids"]
@@ -79,9 +78,7 @@ class SanatanDatabase:
         )
 
     def fetch_first_match(
-        self,
-        collection_name: str,
-        metadata_where_clause: MetadataWhereClause = None
+        self, collection_name: str, metadata_where_clause: MetadataWhereClause = None
     ):
         """This version is created to support the browse module"""
         logger.info(
@@ -96,14 +93,14 @@ class SanatanDatabase:
             metadata_where_clause.to_chroma_where()
             if metadata_where_clause is not None
             else None
-            )
+            ),
         )
 
         if data["metadatas"]:
             # find index of record with lowest _global_index
             min_index = min(
                 range(len(data["metadatas"])),
-                key=lambda i: data["metadatas"][i].get("_global_index", float("inf"))
+                key=lambda i: data["metadatas"][i].get("_global_index", float("inf")),
             )
 
             # shrink data to keep same structure but only one record
@@ -521,151 +518,149 @@ class SanatanDatabase:
 
         return sorted(list(values))
 
-    def build_global_index_for_all_scriptures(self, force: bool = False):
-        import pandas as pd
-        import numpy as np
-
-        logger.info("build_global_index_for_all_scriptures: started")
-        config = SanatanConfig()
-
-        for scripture in config.scriptures:
-            scripture_name = scripture["name"]
-            chapter_order = scripture.get("chapter_order", None)
-            # if scripture_name != "vishnu_sahasranamam":
-            #     continue
-            logger.info(
-                "build_global_index_for_all_scriptures:%s: Processing", scripture_name
-            )
-            collection_name = scripture["collection_name"]
-            collection = self.chroma_client.get_or_create_collection(
-                name=collection_name
-            )
-            metadata_fields = scripture.get("metadata_fields", [])
-
-            # Get metadata field names marked as unique
-            unique_fields = [f["name"] for f in metadata_fields if f.get("is_unique")]
-            if not unique_fields:
-                if metadata_fields:
-                    unique_fields = [metadata_fields[0]["name"]]
-                else:
-                    logger.warning(
-                        f"No metadata fields defined for {collection_name}, skipping"
-                    )
-                    continue
-
-            logger.info(
-                "build_global_index_for_all_scriptures:%s:unique fields: %s",
-                scripture_name,
-                unique_fields,
-            )
-
-            # Build chapter_order mapping if defined
-            chapter_order_mapping = {}
-            for field in metadata_fields:
-                if callable(chapter_order):
-                    chapter_order_mapping = chapter_order()
-                    logger.info(
-                        "build_global_index_for_all_scriptures:%s:chapter_order_mapping: %s",
-                        scripture_name,
-                        chapter_order_mapping,
-                    )
-
-            # Fetch all records (keep embeddings for upsert)
-            try:
-                results = collection.get(
-                    include=["metadatas", "documents", "embeddings"]
-                )
-            except Exception as e:
-                logger.error(
-                    "build_global_index_for_all_scriptures:%s Error getting data from chromadb",
-                    scripture_name,
-                    exc_info=True,
-                )
-                continue
-
-            ids = results["ids"]
-            metadatas = results["metadatas"]
-            documents = results["documents"]
-            embeddings = results.get("embeddings", [None] * len(ids))
-
-            if not force and metadatas and "_global_index" in metadatas[0]:
-                logger.warning(
-                    "build_global_index_for_all_scriptures:%s: global index already available. skipping collection",
-                    scripture_name,
-                )
-                continue
-
-            # Create a DataFrame for metadata sorting
-            df = pd.DataFrame(metadatas)
-            df["_id"] = ids
-            df["_doc"] = documents
-
-            # Add sortable columns for each unique field
-            for field_name in unique_fields:
-                if field_name.lower() == "chapter" and chapter_order_mapping:
-                    # Map chapter names to their defined order
-                    df["_sort_" + field_name] = (
-                        df[field_name].map(chapter_order_mapping).fillna(np.inf)
-                    )
-                else:
-                    # Try numeric, fallback to string lowercase
-                    def parse_val(v):
-                        if v is None:
-                            return float("inf")
-                        if isinstance(v, int):
-                            return v
-                        if isinstance(v, str):
-                            v = v.strip()
-                            return int(v) if v.isdigit() else v.lower()
-                        return str(v)
-
-                    df["_sort_" + field_name] = df[field_name].apply(parse_val)
-
-            sort_cols = ["_sort_" + f for f in unique_fields]
-            df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
-
-            # Assign global index
-            df["_global_index"] = range(1, len(df) + 1)
-
-            logger.info(
-                "build_global_index_for_all_scriptures:%s: updating database",
-                scripture_name,
-            )
-
-            # Batch upsert
-            BATCH_SIZE = 5000  # safely below max batch size
-            for i in range(0, len(df), BATCH_SIZE):
-                batch_df = df.iloc[i : i + BATCH_SIZE]
-                batch_ids = batch_df["_id"].tolist()
-                batch_docs = batch_df["_doc"].tolist()
-                batch_metas = [
-                    {k: record[k] for k in metadatas[0].keys() if k in record}
-                    | {"_global_index": record["_global_index"]}
-                    for record in batch_df.to_dict(orient="records")
-                ]
-                # Use original metadata keys for upsert
-                batch_metas = [
-                    {k: record[k] for k in metadatas[0].keys() if k in record}
-                    | {"_global_index": record["_global_index"]}
-                    for record in batch_df.to_dict(orient="records")
-                ]
-                batch_embeds = [embeddings[idx] for idx in batch_df.index]
-
-                collection.update(
-                    ids=batch_ids,
-                    # documents=batch_docs,
-                    metadatas=batch_metas,
-                    # embeddings=batch_embeds,
-                )
-
-            logger.info(
-                "build_global_index_for_all_scriptures:%s: Updated with %d records",
-                scripture_name,
-                len(df),
-            )
+    def build_global_index_for_scripture(self, scripture: dict, force: bool = False):
+        import pandas as pd
+        import numpy as np
+
+        scripture_name = scripture["name"]
+        chapter_order = scripture.get("chapter_order", None)
+        logger.info(
+            "build_global_index_for_all_scriptures:%s: Processing", scripture_name
+        )
+        collection_name = scripture["collection_name"]
+        collection = self.chroma_client.get_or_create_collection(name=collection_name)
+        metadata_fields = scripture.get("metadata_fields", [])
+
+        # Get metadata field names marked as unique
+        unique_fields = [f["name"] for f in metadata_fields if f.get("is_unique")]
+        if not unique_fields:
+            if metadata_fields:
+                unique_fields = [metadata_fields[0]["name"]]
+            else:
+                logger.warning(
+                    f"No metadata fields defined for {collection_name}, skipping"
+                )
+                return
+
+        logger.info(
+            "build_global_index_for_all_scriptures:%s:unique fields: %s",
+            scripture_name,
+            unique_fields,
+        )
+
+        # Build chapter_order mapping if defined
+        chapter_order_mapping = {}
+        for field in metadata_fields:
+            if callable(chapter_order):
+                chapter_order_mapping = chapter_order()
+                logger.info(
+                    "build_global_index_for_all_scriptures:%s:chapter_order_mapping: %s",
+                    scripture_name,
+                    chapter_order_mapping,
+                )
+
+        # Fetch all records (keep embeddings for upsert)
+        try:
+            results = collection.get(include=["metadatas", "documents", "embeddings"])
+        except Exception as e:
+            logger.error(
+                "build_global_index_for_all_scriptures:%s Error getting data from chromadb",
+                scripture_name,
+                exc_info=True,
+            )
+            return
+
+        ids = results["ids"]
+        metadatas = results["metadatas"]
+        documents = results["documents"]
+        embeddings = results.get("embeddings", [None] * len(ids))
+
+        if not force and metadatas and "_global_index" in metadatas[0]:
+            logger.warning(
+                "build_global_index_for_all_scriptures:%s: global index already available. skipping collection",
+                scripture_name,
+            )
+            return
+
+        # Create a DataFrame for metadata sorting
+        df = pd.DataFrame(metadatas)
+        df["_id"] = ids
+        df["_doc"] = documents
+
+        # Add sortable columns for each unique field
+        for field_name in unique_fields:
+            if field_name.lower() == "chapter" and chapter_order_mapping:
+                # Map chapter names to their defined order
+                df["_sort_" + field_name] = (
+                    df[field_name].map(chapter_order_mapping).fillna(np.inf)
+                )
+            else:
+                # Try numeric, fallback to string lowercase
+                def parse_val(v):
+                    if v is None:
+                        return float("inf")
+                    if isinstance(v, int):
+                        return v
+                    if isinstance(v, str):
+                        v = v.strip()
+                        return int(v) if v.isdigit() else v.lower()
+                    return str(v)
+
+                df["_sort_" + field_name] = df[field_name].apply(parse_val)
+
+        sort_cols = ["_sort_" + f for f in unique_fields]
+        df = df.sort_values(by=sort_cols, kind="stable").reset_index(drop=True)
+
+        # Assign global index
+        df["_global_index"] = range(1, len(df) + 1)
+
+        logger.info(
+            "build_global_index_for_all_scriptures:%s: updating database",
+            scripture_name,
+        )
+
+        # Batch upsert
+        BATCH_SIZE = 5000  # safely below max batch size
+        for i in range(0, len(df), BATCH_SIZE):
+            batch_df = df.iloc[i : i + BATCH_SIZE]
+            batch_ids = batch_df["_id"].tolist()
+            batch_docs = batch_df["_doc"].tolist()
+            # Use original metadata keys for upsert
+            batch_metas = [
+                {k: record[k] for k in metadatas[0].keys() if k in record}
+                | {"_global_index": record["_global_index"]}
+                for record in batch_df.to_dict(orient="records")
+            ]
+            batch_embeds = [embeddings[idx] for idx in batch_df.index]
+
+            collection.update(
+                ids=batch_ids,
+                # documents=batch_docs,
+                metadatas=batch_metas,
+                # embeddings=batch_embeds,
+            )
+
+        logger.info(
+            "build_global_index_for_all_scriptures:%s: ✅ Updated with %d records",
+            scripture_name,
+            len(df),
+        )
+
+    def build_global_index_for_all_scriptures(self, force: bool = False):
+        logger.info("build_global_index_for_all_scriptures: started")
+        config = SanatanConfig()
+
+        for scripture in config.scriptures:
+            self.build_global_index_for_scripture(scripture=scripture, force=force)
+
     def fix_taniyans_in_divya_prabandham(self):
-        nalayiram_helper.reorder_taniyan(self.chroma_client.get_collection("divya_prabandham"))
+        nalayiram_helper.reorder_taniyan(
+            self.chroma_client.get_collection("divya_prabandham")
+        )
 
     def delete_taniyans_in_divya_prabandham(self):
-        nalayiram_helper.delete_taniyan(self.chroma_client.get_collection("divya_prabandham"))
+        nalayiram_helper.delete_taniyan(
+            self.chroma_client.get_collection("divya_prabandham")
+        )
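
The refactor extracts the per-scripture loop body into build_global_index_for_scripture, so a single collection can be reindexed without touching the others; each continue becomes a return, and the old entry point survives as a thin wrapper. A usage sketch (assuming the scripture dicts defined in config.py; force=True rebuilds _global_index even when the collection already has one):

from config import SanatanConfig
from db import SanatanDatabase

db = SanatanDatabase()

# Rebuild the ordering index for one scripture only ...
db.build_global_index_for_scripture(
    scripture=SanatanConfig().get_scripture_by_name("yt_metadata"),
    force=True,
)

# ... or for every configured scripture; collections that already carry
# a _global_index are skipped unless force=True.
db.build_global_index_for_all_scriptures()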
modules/youtube_metadata/answerer.py CHANGED
@@ -4,6 +4,7 @@
 from typing import List
 from pydantic import BaseModel
 from openai import OpenAI
+from modules.youtube_metadata.db import get_youtube_metadata_collection
 from modules.youtube_metadata.retriever import retrieve_videos
 
 
@@ -26,12 +27,13 @@ class LLMAnswer(BaseModel):
 # Main Function
 # -------------------------------
 def answer_query(
-    query: str, collection, top_k: int = 5, channel_id: str = None
+    query: str, top_k: int = 5, channel_id: str = None
 ) -> LLMAnswer:
     """
     Answer a user query using YouTube video metadata.
     Returns an LLMAnswer object with textual answer + list of videos.
     """
+    collection = get_youtube_metadata_collection()
     results = retrieve_videos(query, collection, top_k=top_k, channel_id=channel_id)
 
     if not results:
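
answer_query now resolves the Chroma collection itself through get_youtube_metadata_collection(), so callers pass only the query and options. A minimal sketch of the new call shape (the query string and channel id are illustrative, not from this commit):

from modules.youtube_metadata.answerer import answer_query

# channel_id=None searches across all indexed channels
answer = answer_query("thiruppavai upanyasam", top_k=10, channel_id=None)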
modules/youtube_metadata/app.py CHANGED
@@ -1,14 +1,13 @@
-import asyncio
 import os
 import re
-import threading
 import gradio as gr
 from gradio_modal import Modal
+from config import SanatanConfig
+from db import SanatanDatabase
 from modules.youtube_metadata.downloader import export_channel_json
 from modules.youtube_metadata.channel_utils import fetch_channel_dataframe
 from modules.youtube_metadata.db import (
     delete_channel_from_collection,
-    get_collection,
     get_indexed_channels,
 )
 from modules.youtube_metadata.answerer import answer_query
@@ -16,6 +15,11 @@ from dotenv import load_dotenv
 
 from modules.youtube_metadata.youtube_poller import start_poll
 from modules.youtube_metadata.youtube_sync import sync_channels_from_youtube
+import logging
+
+logging.basicConfig()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 
 load_dotenv()
 
@@ -89,9 +93,9 @@ def index_channels(channel_urls: str):
 def youtube_metadata_init(progress: gr.Progress = None):
     channels = (
         "https://www.youtube.com/@onedayonepasuram6126,"
-        "https://www.youtube.com/@srisookthi,"
-        "https://www.youtube.com/@learn-aksharam,"
-        "https://www.youtube.com/@SriYadugiriYathirajaMutt,"
+        # "https://www.youtube.com/@srisookthi,"
+        # "https://www.youtube.com/@learn-aksharam,"
+        # "https://www.youtube.com/@SriYadugiriYathirajaMutt,"
         "https://www.youtube.com/@akivasudev,"
         "https://www.youtube.com/@Arulicheyal_Amutham"
     )
@@ -102,7 +106,7 @@ def youtube_metadata_init(progress: gr.Progress = None):
 
 def refresh_all_channels():
     yt_api_key = os.environ["YOUTUBE_API_KEY"]
-    channels = get_indexed_channels(get_collection())
+    channels = get_indexed_channels()
 
     if not channels:
         return "⚠️ No channels available to refresh.", refresh_channel_list()
@@ -127,7 +131,7 @@ def refresh_all_channels():
 # Channel selection as radio
 # -------------------------------
 def list_channels_radio():
-    channels = get_indexed_channels(get_collection())
+    channels = get_indexed_channels()
     choices = []
     for key, val in channels.items():
         if isinstance(val, dict):
@@ -155,7 +159,7 @@ def delete_channel(channel_url: str):
 # -------------------------------
 def handle_query(query: str, search_channel_id: str):
     answer_text, video_html = answer_query(
-        query, get_collection(), channel_id=search_channel_id, top_k=10
+        query, channel_id=search_channel_id, top_k=10
     )
     if not answer_text:
         answer_text = "No answer available."
@@ -480,15 +484,13 @@ with gr.Blocks(title="Sanatana AI - Youtube Metadata Surfer") as youtube_metadat
 
 
 def initialize_youtube_metadata_and_poll():
-    # Step 1: Initialize metadata
     for msg in youtube_metadata_init():
-        print(msg)
+        logger.info("initialize_youtube_metadata_and_poll: %s", msg)
+    SanatanDatabase().build_global_index_for_scripture(
+        scripture=SanatanConfig().get_scripture_by_name("yt_metadata"), force=True
+    )
+    start_poll()
 
-    # Step 2: Start polling after init
-    start_poll()  # run in the same thread
-    # OR if you want it in a separate daemon thread:
-    # poll_thread = threading.Thread(target=start_poll, daemon=True)
-    # poll_thread.start()
 
 if __name__ == "__main__":
     initialize_youtube_metadata_and_poll()
modules/youtube_metadata/channel_utils.py CHANGED
@@ -1,4 +1,4 @@
-from modules.youtube_metadata.db import get_collection
+from modules.youtube_metadata.db import get_youtube_metadata_collection
 import pandas as pd
 
 page_size = 10  # change if you like
@@ -8,7 +8,7 @@ page_size = 10  # change if you like
 # Fetch channel videos as HTML table with pagination
 # -------------------------------
 def fetch_channel_html(channel_id: str, page: int = 1, page_size: int = 10):
-    collection = get_collection()
+    collection = get_youtube_metadata_collection()
     offset = (page - 1) * page_size
 
     all_results = collection.get(
@@ -73,7 +73,7 @@ def fetch_channel_html(channel_id: str, page: int = 1, page_size: int = 10):
 # Fetch channel videos as HTML table with pagination
 # -------------------------------
 def fetch_channel_dataframe(channel_id: str):
-    collection = get_collection()
+    collection = get_youtube_metadata_collection()
 
     results = collection.get(
         where={"channel_id": channel_id}, include=["documents", "metadatas"]
modules/youtube_metadata/db.py CHANGED
@@ -1,38 +1,19 @@
 import chromadb
 
 from config import SanatanConfig
+from db import SanatanDatabase
 
 config = SanatanConfig()
YT_METADATA_COLLECTION_NAME = config.get_collection_name(scripture_name="yt_metadata")
+db = SanatanDatabase()
 
-def get_client():
-    client = chromadb.PersistentClient(path=config.dbStorePath)
-    return client
+def get_youtube_metadata_collection():
+    client = db.chroma_client
+    return client.get_or_create_collection(YT_METADATA_COLLECTION_NAME)
 
 
-def get_collection():
-    client = get_client()
-
-    # Ensure fresh collection with correct dimension
-    try:
-        collection = client.get_collection(YT_METADATA_COLLECTION_NAME)
-    except Exception:
-        collection = client.create_collection(YT_METADATA_COLLECTION_NAME)
-
-    # # Check dimension mismatch
-    # try:
-    #     # quick test query
-    #     collection.query(query_embeddings=[[0.0] * 1536], n_results=1)
-    # except Exception:
-    #     # Delete and recreate with fresh schema
-    #     client.delete_collection("yt_metadata")
-    #     collection = client.create_collection("yt_metadata")
-
-    return collection
-
-
-# modules/db.py
-def get_indexed_channels(collection=get_collection()):
+def get_indexed_channels():
+    collection = get_youtube_metadata_collection()
     results = collection.get(include=["metadatas"])
     channels = {}
 
@@ -55,11 +36,11 @@ def delete_channel_from_collection(channel_id: str):
     # print("Deleting channel", channel_id)
 
     # print("data = ", data)
-    get_collection().delete(where={"channel_id": channel_id})
+    get_youtube_metadata_collection().delete(where={"channel_id": channel_id})
 
 
 def fetch_channel_data(channel_id: str):
-    data = get_collection().get(
+    data = get_youtube_metadata_collection().get(
         where={"channel_id": channel_id}, include=["embeddings", "metadatas", "documents"]
     )
     return data
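
Design note: the module previously built its own chromadb.PersistentClient in get_client() while SanatanDatabase held a second client on the same store; the rewrite shares one client through db.chroma_client. It also drops the get_indexed_channels(collection=get_collection()) default argument, which Python evaluated once at import time; the collection is now resolved on every call. A small sketch of the resulting API (assuming the module-level SanatanDatabase() instance shown above):

from modules.youtube_metadata.db import (
    get_youtube_metadata_collection,
    get_indexed_channels,
)

collection = get_youtube_metadata_collection()
print(collection.count())      # number of indexed videos
print(get_indexed_channels())  # mapping of channel metadata per indexed channel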
modules/youtube_metadata/indexer.py CHANGED
@@ -1,6 +1,6 @@
 # modules/indexer.py
 from typing import Dict, List
-from openai import OpenAI
+from modules.youtube_metadata.db import get_youtube_metadata_collection
 from modules.youtube_metadata.embeddings import get_embedding
 import logging
 
@@ -10,10 +10,9 @@ logger.setLevel(logging.INFO)
 
 
 def index_videos(
-    videos: List[Dict], collection, channel_url: str, batch_size: int = 50
+    videos: List[Dict], channel_url: str, batch_size: int = 50
 ):
-    client = OpenAI()
-
+    collection = get_youtube_metadata_collection()
     total = len(videos)
     logger.info(
         f"index_videos: [INDEX] Starting indexing for {total} videos (channel={channel_url})"
modules/youtube_metadata/youtube_poller.py CHANGED
@@ -1,6 +1,6 @@
 from chromadb import Collection
 import feedparser
-from modules.youtube_metadata.db import get_collection, get_indexed_channels
+from modules.youtube_metadata.db import get_youtube_metadata_collection, get_indexed_channels
 from modules.youtube_metadata.embeddings import get_embedding
 import logging
 
@@ -65,22 +65,20 @@ def filter_new_videos(videos, existing_ids):
 def add_to_chroma(collection: Collection, new_videos):
     if not new_videos:
         return
+    count = collection.count()
     collection.add(
-        documents=[v["title"] for v in new_videos],
+        documents=[f"{v['title']} - {v['description']}" for v in new_videos],
         embeddings=[get_embedding(v["title"]) for v in new_videos],
         metadatas=[
-            {
-                "video_id": v["video_id"],
-                "channel_id": v["channel_id"],
-                "link": v["link"],
-            }
-            for v in new_videos
+            {**v, "_global_index": i}
+            for i, v in enumerate(new_videos, start=count)
         ],
         ids=[v["video_id"] for v in new_videos],
     )
 
 
-def incremental_update(collection, channel_id):
+def incremental_update(channel_id):
+    collection = get_youtube_metadata_collection()
     existing_ids = get_existing_video_ids(collection, channel_id)
     latest_videos = fetch_channel_videos_rss(channel_id)
     new_videos = filter_new_videos(latest_videos, existing_ids)
@@ -88,10 +86,10 @@ def incremental_update(collection, channel_id):
     if new_videos:
         add_to_chroma(collection, new_videos)
         logger.info(
-            f"incremental_update: Added {len(new_videos)} new videos from {channel_id}"
+            f"youtube_poller: incremental_update: Added {len(new_videos)} new videos from {channel_id}"
         )
     else:
-        logger.info(f"incremental_uddate: No new videos for {channel_id}")
+        logger.info(f"youtube_poller: incremental_update: No new videos for {channel_id}")
 
 
 def start_poll():
@@ -101,5 +99,6 @@ def start_poll():
 
     while True:
         for channel_id in configured_channels:
-            incremental_update(get_collection(), channel_id)
+            incremental_update(channel_id)
+        logger.info("youtube_poller: Sleeping for 10 minutes")
         time.sleep(600)  # 10 minutes
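
The poller now stores the whole video dict as metadata and appends a _global_index derived from the current collection size, so newly polled videos sort after everything already indexed; note the documents entry assumes each RSS item carries a description field. A worked sketch of just the enumeration (dummy data, not from this commit; Chroma metadata values must stay scalar, which holds as long as each video dict contains only strings):

# Suppose the collection currently holds 120 records and two new videos arrive.
count = 120
new_videos = [
    {"video_id": "a1", "channel_id": "c1", "link": "https://example.com/a1", "title": "t1"},
    {"video_id": "b2", "channel_id": "c1", "link": "https://example.com/b2", "title": "t2"},
]
metadatas = [{**v, "_global_index": i} for i, v in enumerate(new_videos, start=count)]
# _global_index becomes 120 and 121, placing the records after the existing ones
print([m["_global_index"] for m in metadatas])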
modules/youtube_metadata/youtube_sync.py CHANGED
@@ -3,7 +3,6 @@ import gradio as gr
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from modules.youtube_metadata.collector import fetch_all_channel_videos
-from modules.youtube_metadata.db import get_collection
 from modules.youtube_metadata.indexer import index_videos
 
 # global stop signal
@@ -51,7 +50,7 @@ def _refresh_single_channel(api_key, channel_url, progress):
 
     with ThreadPoolExecutor(max_workers=4) as executor:
         futures = [
-            executor.submit(index_videos, batch, get_collection(), channel_url=channel_url)
+            executor.submit(index_videos, batch, channel_url=channel_url)
             for _, batch in fetched_batches
         ]
 