Spaces:

vikramvasudevan
/

sanatan_ai

Running on CPU Upgrade

App Files Files Community

vikramvasudevan committed on Aug 29

Commit

74c37c0

verified ·

1 Parent(s): cbc9372

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

Dockerfile +0 -8
README.md +3 -3
app.py +1 -3
config.py +11 -4
copy_chromadb.py +41 -12
db.py +7 -2
embeddings.py +69 -6

Dockerfile CHANGED Viewed

@@ -1,9 +1,5 @@
 FROM python:3.12-slim
-# Add near the top of Dockerfile
-ENV HF_HOME=/app/hf_cache
-RUN mkdir -p $HF_HOME && chmod 777 $HF_HOME
 # Avoid interactive prompts during build
 ENV DEBIAN_FRONTEND=noninteractive
@@ -34,9 +30,5 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . /app
 WORKDIR /app
-RUN useradd -m appuser
-RUN mkdir -p /app/chroma_db && chown -R appuser:appuser /app
-USER appuser
 # Default command (Gradio, Streamlit, or Python)
 CMD ["python", "app.py"]

 FROM python:3.12-slim
 # Avoid interactive prompts during build
 ENV DEBIAN_FRONTEND=noninteractive
 COPY . /app
 WORKDIR /app
 # Default command (Gradio, Streamlit, or Python)
 CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 title: sanatan_ai
 app_file: app.py
-sdk: docker
 python_version: 3.12
-emoji: 👀
----

 ---
 title: sanatan_ai
 app_file: app.py
+sdk: gradio
+sdk_version: 5.38.0
 python_version: 3.12
+---

app.py CHANGED Viewed

@@ -468,6 +468,4 @@ with gr.Blocks(
         textbox=message_textbox,
     )
-port = int(os.environ.get("PORT", 7860))
-app.launch(server_name="0.0.0.0", server_port=port)

         textbox=message_textbox,
     )
+app.launch()

config.py CHANGED Viewed

@@ -159,6 +159,7 @@ class SanatanConfig:
             "title": "4000 Divya Prabandham",
             "output_dir": "./output/divya_prabandham",
             "collection_name": "divya_prabandham",
             "metadata_fields": [
                 {
                     "name": "prabandham_code",
@@ -381,8 +382,7 @@ class SanatanConfig:
                 "Show detailed commentary for sloka 2 from Chathusloki",
                 "What is the role of Sri Devi in the universe according to the Chathusloki?",
             ],
-            "llm_hints" : [
-            ]
         },
         {
             "name": "sri_stavam",
@@ -420,9 +420,9 @@ class SanatanConfig:
                 "Show detailed commentary for sloka 2 from Sri Stavam",
                 "What is the role of Sri Devi in the universe according to the Sri Stavam?",
             ],
-            "llm_hints" : [
                 "if the user asks for nth sloka, do a metadata search on the `verse` field."
-            ]
         },
     ]
@@ -445,3 +445,10 @@ class SanatanConfig:
                     f"metadata_field: [{filter.metadata_field}] not allowed in collection [{collection_name}]. Here are the allowed fields with their descriptions: {scripture["metadata_fields"]}"
                 )
         return True

             "title": "4000 Divya Prabandham",
             "output_dir": "./output/divya_prabandham",
             "collection_name": "divya_prabandham",
+            "collection_embedding_fn": "openai",
             "metadata_fields": [
                 {
                     "name": "prabandham_code",
                 "Show detailed commentary for sloka 2 from Chathusloki",
                 "What is the role of Sri Devi in the universe according to the Chathusloki?",
             ],
+            "llm_hints": [],
         },
         {
             "name": "sri_stavam",
                 "Show detailed commentary for sloka 2 from Sri Stavam",
                 "What is the role of Sri Devi in the universe according to the Sri Stavam?",
             ],
+            "llm_hints": [
                 "if the user asks for nth sloka, do a metadata search on the `verse` field."
+            ],
         },
     ]
                     f"metadata_field: [{filter.metadata_field}] not allowed in collection [{collection_name}]. Here are the allowed fields with their descriptions: {scripture["metadata_fields"]}"
                 )
         return True
+    def get_embedding_for_collection(self, collection_name: str):
+        """Return the embedding backend name configured for a collection.
+
+        Looks up the scripture entry for ``collection_name`` and returns its
+        ``collection_embedding_fn`` value when the entry defines one;
+        otherwise falls back to ``"hf"`` (HuggingFace sentence transformers).
+        """
+        entry = self.get_scripture_by_collection(collection_name)
+        # An explicit per-collection setting in the config overrides the default.
+        if "collection_embedding_fn" in entry:
+            return entry["collection_embedding_fn"]
+        return "hf"

copy_chromadb.py CHANGED Viewed

@@ -1,22 +1,51 @@
 import chromadb
 from tqdm import tqdm  # Optional: For progress bar
-# Connect to source and destination local persistent clients
-source_client = chromadb.PersistentClient(
-    path="../vedam_ai/chromadb-store"
 )
 destination_client = chromadb.PersistentClient(path="./chromadb-store")
-source_collection_name = "sri_stavam"
-destination_collection_name = "sri_stavam"
 # Get the source collection
 source_collection = source_client.get_collection(source_collection_name)
 # Retrieve all data from the source collection
-source_data = source_collection.get(
-    include=["documents", "metadatas", "embeddings"]
-)
 # Create or get the destination collection
 if destination_client.get_or_create_collection(destination_collection_name):
@@ -35,11 +64,11 @@ total_records = len(source_data["ids"])
 print(f"Copying {total_records} records in batches of {BATCH_SIZE}...")
 for i in tqdm(range(0, total_records, BATCH_SIZE)):
-    batch_ids = source_data["ids"][i:i + BATCH_SIZE]
-    batch_docs = source_data["documents"][i:i + BATCH_SIZE]
-    batch_metas = source_data["metadatas"][i:i + BATCH_SIZE]
     batch_embeds = (
-        source_data["embeddings"][i:i + BATCH_SIZE]
         if "embeddings" in source_data and source_data["embeddings"] is not None
         else None
     )

+import argparse
 import chromadb
 from tqdm import tqdm  # Optional: For progress bar
+db_config = {
+    "youtube_db": {
+        "source_db_path": "../youtube_surfer_ai_agent/youtube_db",
+        "source_collection_name": "yt_metadata",
+        "destination_collection_name": "yt_metadata",
+    },
+    "divya_prabandham": {
+        "source_db_path": "../uveda_analyzer/chromadb_store",
+        "source_collection_name": "divya_prabandham",
+        "destination_collection_name": "divya_prabandham",
+    },
+}
+parser = argparse.ArgumentParser(description="My app with database parameter")
+parser.add_argument(
+    "--db",
+    type=str,
+    required=True,
+    choices=list(db_config.keys()),
+    help=f"Id of the database to use. allowed_values : {', '.join(db_config.keys())}",
 )
+args = parser.parse_args()
+db_id = args.db
+if db_id is None:
+    raise Exception(f"No db provided!")
+if db_id not in db_config:
+    raise Exception(f"db with id {db_id} not found!")
+# Connect to source and destination local persistent clients
+source_client = chromadb.PersistentClient(path=db_config[db_id]["source_db_path"])
 destination_client = chromadb.PersistentClient(path="./chromadb-store")
+source_collection_name = db_config[db_id]["source_collection_name"]
+destination_collection_name = db_config[db_id]["destination_collection_name"]
 # Get the source collection
 source_collection = source_client.get_collection(source_collection_name)
 # Retrieve all data from the source collection
+source_data = source_collection.get(include=["documents", "metadatas", "embeddings"])
 # Create or get the destination collection
 if destination_client.get_or_create_collection(destination_collection_name):
 print(f"Copying {total_records} records in batches of {BATCH_SIZE}...")
 for i in tqdm(range(0, total_records, BATCH_SIZE)):
+    batch_ids = source_data["ids"][i : i + BATCH_SIZE]
+    batch_docs = source_data["documents"][i : i + BATCH_SIZE]
+    batch_metas = source_data["metadatas"][i : i + BATCH_SIZE]
     batch_embeds = (
+        source_data["embeddings"][i : i + BATCH_SIZE]
         if "embeddings" in source_data and source_data["embeddings"] is not None
         else None
     )

db.py CHANGED Viewed

@@ -34,10 +34,13 @@ class SanatanDatabase:
         logger.info("Vector Semantic Search for [%s] in [%s]", query, collection_name)
         collection = self.chroma_client.get_or_create_collection(name=collection_name)
         response = collection.query(
-            query_embeddings=[get_embedding(query)],
             # query_texts=[query],
             n_results=n_results,
         )
         return response
     def search_for_literal(
@@ -137,7 +140,9 @@ class SanatanDatabase:
         )
         collection = self.chroma_client.get_or_create_collection(name=collection_name)
         response = collection.query(
-            query_embeddings=[get_embedding(query)],
             where=metadata_where_clause.to_chroma_where(),
             # query_texts=[query],
             n_results=n_results,

         logger.info("Vector Semantic Search for [%s] in [%s]", query, collection_name)
         collection = self.chroma_client.get_or_create_collection(name=collection_name)
         response = collection.query(
+            query_embeddings=get_embedding(
+                [query], SanatanConfig().get_embedding_for_collection(collection_name)
+            ),
             # query_texts=[query],
             n_results=n_results,
         )
+        # logger.info("number of matches = %d", len(response["metadatas"]))
         return response
     def search_for_literal(
         )
         collection = self.chroma_client.get_or_create_collection(name=collection_name)
         response = collection.query(
+            query_embeddings=get_embedding(
+                [query], SanatanConfig().get_embedding_for_collection(collection_name)
+            ),
             where=metadata_where_clause.to_chroma_where(),
             # query_texts=[query],
             n_results=n_results,

embeddings.py CHANGED Viewed

@@ -1,9 +1,72 @@
 from sentence_transformers import SentenceTransformer
-# Step 1: Load SentenceTransformer model
-# model = SentenceTransformer("all-MiniLM-L6-v2")
-model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
-def get_embedding(text: str) -> list:
-    return model.encode(text).tolist()

+from typing import Literal
+import numpy as np
 from sentence_transformers import SentenceTransformer
+from openai import OpenAI
+from dotenv import load_dotenv
+import tiktoken
+# Runs before the clients below are constructed.
+# NOTE(review): presumably loads API credentials from a local .env file - confirm.
+load_dotenv()
+# Local HuggingFace model (instantiated once at module import and shared by all calls)
+hf_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
+# OpenAI client (constructed with no explicit arguments;
+# presumably it picks up its API key from the environment - confirm)
+client = OpenAI()
+# Choose tokenizer for embeddings model; used below to count tokens and to
+# split over-long texts before they are sent to the embeddings endpoint
+tokenizer = tiktoken.encoding_for_model("text-embedding-3-large")
+# -------------------------------
+# Helpers
+# -------------------------------
+def _get_hf_embedding(texts: list[str]) -> list[list[float]]:
+    """Embed ``texts`` with the module-level SentenceTransformer model.
+
+    The result of ``hf_model.encode`` is converted with ``.tolist()`` so that
+    callers receive plain Python lists.
+    """
+    encoded = hf_model.encode(texts)
+    return encoded.tolist()
+def chunk_text(text: str, max_tokens: int = 1000) -> list[str]:
+    """Split ``text`` into pieces of at most ``max_tokens`` tokens each.
+
+    The text is tokenized with the module-level ``tokenizer``; consecutive
+    windows of the token sequence are decoded back into strings.
+    """
+    token_ids = tokenizer.encode(text)
+    pieces = []
+    for start in range(0, len(token_ids), max_tokens):
+        window = token_ids[start : start + max_tokens]
+        pieces.append(tokenizer.decode(window))
+    return pieces
+def _get_openai_embedding(texts: list[str]) -> list[list[float]]:
+    """Get one OpenAI embedding per input text.
+
+    Each text is sent to the ``text-embedding-3-large`` model. A text whose
+    token count exceeds the per-input limit is first split with
+    ``chunk_text`` and the chunk embeddings are averaged element-wise into a
+    single vector.
+
+    Args:
+        texts: Texts to embed.
+
+    Returns:
+        A list with one embedding (list of floats) per input text, in input
+        order.
+    """
+    # Per-input token limit assumed for the embedding model. The OpenAI
+    # cookbook recipe for long inputs uses 8191, so a text of exactly 8192
+    # tokens is chunked as well instead of being sent whole.
+    # NOTE(review): confirm the limit against the current model documentation.
+    max_input_tokens = 8191
+    final_embeddings = []
+    for text in texts:
+        # Split into chunks only when the text is too long for a single input
+        if len(tokenizer.encode(text)) > max_input_tokens:
+            chunks = chunk_text(text)
+        else:
+            chunks = [text]
+        # Call API on all chunks of this text at once
+        response = client.embeddings.create(
+            model="text-embedding-3-large",
+            input=chunks,
+        )
+        chunk_embeddings = [np.array(d.embedding) for d in response.data]
+        # Average embeddings across chunks (a single chunk averages to itself)
+        avg_embedding = np.mean(chunk_embeddings, axis=0)
+        final_embeddings.append(avg_embedding.tolist())
+    return final_embeddings
+def get_embedding(texts: list[str], backend: Literal["hf","openai"] = "hf") -> list[list[float]]:
+    """Get embeddings for a list of texts.
+
+    Args:
+        texts: Texts to embed.
+        backend: ``"hf"`` for the local SentenceTransformer model or
+            ``"openai"`` for the OpenAI embeddings API.
+
+    Returns:
+        The list of embeddings produced by the selected backend.
+
+    Raises:
+        ValueError: If ``backend`` is neither ``"hf"`` nor ``"openai"``.
+    """
+    if backend == "hf":
+        return _get_hf_embedding(texts)
+    if backend == "openai":
+        return _get_openai_embedding(texts)
+    # Fail loudly instead of silently sending text to the OpenAI API when the
+    # backend name is misspelled or unsupported.
+    raise ValueError(
+        f"Unknown embedding backend: {backend!r} (expected 'hf' or 'openai')"
+    )
+# -------------------------------
+# Example
+# -------------------------------
+if __name__ == "__main__":
+    # Manual smoke test against the OpenAI backend (performs real API calls).
+    short_sample = "short text example"
+    # Meant to exercise the chunking path.
+    # NOTE(review): verify this input really exceeds the token threshold.
+    long_sample = "very long text " * 2000
+    embs = get_embedding([short_sample, long_sample], backend="openai")
+    print(len(embs), "embeddings returned")