feat: Enhance blog data processing with vector database creation and update functionality
- Dockerfile +4 -2
- py-src/app.py +7 -3
- py-src/pipeline.py +42 -10
- stats/blog_stats_20250511_095935.json +8 -0
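In this change, vector store creation moves into a reusable create_vector_database helper in py-src/pipeline.py, py-src/app.py runs the pipeline once at startup before the Chainlit app is wired up, the Dockerfile copies application code after dependency installation, and a stats snapshot from a pipeline run is checked in.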
Dockerfile
CHANGED

@@ -17,8 +17,6 @@ ENV UVICORN_WS_PROTOCOL=websockets
 # Set the working directory
 WORKDIR $HOME/app
 
-# Copy the app to the container
-COPY --chown=user ./py-src/ $HOME/app
 COPY --chown=user ./pyproject.toml $HOME/app
 COPY --chown=user ./uv.lock $HOME/app
 
@@ -26,6 +24,10 @@ COPY --chown=user ./uv.lock $HOME/app
 # RUN uv sync --frozen
 RUN uv sync
 
+# Copy the app to the container
+COPY --chown=user ./py-src/ $HOME/app
+
+
 #TODO: Fix this to download
 #copy posts to container
 COPY --chown=user ./data/ $HOME/app/data
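The reordering matters for build caching: with pyproject.toml and uv.lock copied before RUN uv sync, and the application source under py-src/ copied only afterwards, Docker can reuse the cached dependency-install layer whenever only application code changes.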
py-src/app.py
CHANGED

@@ -7,6 +7,11 @@ from dotenv import load_dotenv
 
 # Load environment variables from .env file
 load_dotenv()
+import pipeline
+#build vector store
+print("=== Blog Data Update ===")
+pipeline.main()
+print("========================")
 
 import chainlit as cl
 from langchain.prompts import ChatPromptTemplate
@@ -19,11 +24,10 @@ from qdrant_client.http.models import Distance, VectorParams
 from lets_talk.config import LLM_MODEL, LLM_TEMPERATURE
 import lets_talk.utils.blog as blog
 from lets_talk.agent import build_agent,parse_output
-import pipeline
 
 
-
-
+
+
 
 tdg_agent = build_agent()
 
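Because pipeline.main() now runs at module import time, the vector store is rebuilt on every app start, before Chainlit loads its handlers. A minimal sketch of one way to make that step skippable, assuming a hypothetical SKIP_PIPELINE environment variable that is not part of this commit:

import os

import pipeline

# Build the vector store at startup unless explicitly skipped.
# SKIP_PIPELINE is a hypothetical flag, not part of this commit.
if os.getenv("SKIP_PIPELINE", "false").lower() != "true":
    print("=== Blog Data Update ===")
    pipeline.main()
    print("========================")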
py-src/pipeline.py
CHANGED

@@ -5,7 +5,7 @@ This script updates the blog data vector store when new posts are added.
 It can be scheduled to run periodically or manually executed.
 
 Usage:
-    python
+    python pipeline.py [--force-recreate] [--data-dir DATA_DIR]
 
 Options:
     --force-recreate    Force recreation of the vector store even if it exists
@@ -57,6 +57,37 @@ def save_stats(stats, output_dir="./stats"):
     print(f"Saved stats to {filename}")
     return filename
 
+def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
+    """
+    Create or update the vector database with blog documents.
+
+    Args:
+        documents: List of document objects to store in the vector database
+        data_dir: Directory containing the blog posts (for reporting)
+        storage_path: Path where the vector database will be stored
+        force_recreate: Whether to force recreation of the vector store
+
+    Returns:
+        Tuple of (success status, message)
+    """
+    try:
+        create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
+
+        if create_vector_store:
+            print("\nAttempting to save vector store reference file...")
+            vector_store = blog.create_vector_store(
+                documents,
+                storage_path=storage_path,
+                force_recreate=force_recreate
+            )
+            vector_store.client.close()
+            print("Vector store reference file saved.")
+            return True, f"Vector store successfully created at {storage_path}"
+        else:
+            return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)"
+    except Exception as e:
+        return False, f"Error creating vector store: {str(e)}"
+
 def main():
     """Main function to update blog data"""
     args = parse_args()
@@ -66,7 +97,6 @@ def main():
     print(f"Force recreate: {args.force_recreate}")
     print("========================")
 
-    # Process blog posts without creating embeddings
     try:
         # Load and process documents
         documents = blog.load_blog_posts(args.data_dir)
@@ -79,20 +109,22 @@ def main():
         # Save stats for tracking
         stats_file = save_stats(stats)
 
-
-
-
-
-
-
-
-        print("Vector store reference file saved.")
+        # Create or update vector database
+        success, message = create_vector_database(
+            documents,
+            args.data_dir,
+            storage_path=VECTOR_STORAGE_PATH,
+            force_recreate=args.force_recreate
+        )
 
         print("\n=== Update Summary ===")
         print(f"Processed {stats['total_documents']} documents")
         print(f"Stats saved to: {stats_file}")
+        print(f"Vector DB status: {message}")
         print("=====================")
 
+        if not success:
+            return 1
         return 0
     except Exception as e:
         print(f"Error: {e}")
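Two notes on the new helper. The existence check calls Path.exists as an unbound method; not Path(storage_path).exists() is the more conventional spelling of the same test. The helper can also be driven outside of main(); a minimal sketch, assuming blog.load_blog_posts returns the same documents main() processes, with a hypothetical storage path:

import lets_talk.utils.blog as blog
from pipeline import create_vector_database

# Rebuild the store directly, bypassing main()'s argument parsing and stats.
documents = blog.load_blog_posts("./data")
success, message = create_vector_database(
    documents,
    "./data",
    storage_path="./db/vector_store",  # hypothetical path; the default is VECTOR_STORAGE_PATH
    force_recreate=True,
)
print(message)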
stats/blog_stats_20250511_095935.json
ADDED

@@ -0,0 +1,8 @@
+{
+  "timestamp": "20250511_095935",
+  "total_documents": 14,
+  "total_characters": 106275,
+  "min_length": 1900,
+  "max_length": 13468,
+  "avg_length": 7591.071428571428
+}
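The snapshot records simple length statistics over the 14 loaded posts. A minimal sketch of how fields like these could be derived; the actual computation lives in the blog utilities (not shown in this diff), and the LangChain-style page_content attribute is an assumption:

from datetime import datetime

def compute_stats(documents):
    """Summarize document lengths; keys mirror the checked-in JSON."""
    lengths = [len(doc.page_content) for doc in documents]  # assumes LangChain-style Documents
    return {
        "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        "total_documents": len(lengths),
        "total_characters": sum(lengths),
        "min_length": min(lengths),
        "max_length": max(lengths),
        "avg_length": sum(lengths) / len(lengths),
    }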