feat: Enhance blog data processing with vector database creation and update functionality
- Dockerfile +4 -2
- py-src/app.py +7 -3
- py-src/pipeline.py +42 -10
- stats/blog_stats_20250511_095935.json +8 -0
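In this change, vector store creation moves into a reusable create_vector_database helper in py-src/pipeline.py, py-src/app.py runs the pipeline once at startup before the Chainlit app is wired up, the Dockerfile copies application code after dependency installation, and a stats snapshot from a pipeline run is checked in.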
Dockerfile
CHANGED

@@ -17,8 +17,6 @@ ENV UVICORN_WS_PROTOCOL=websockets
 # Set the working directory
 WORKDIR $HOME/app
 
-# Copy the app to the container
-COPY --chown=user ./py-src/ $HOME/app
 COPY --chown=user ./pyproject.toml $HOME/app
 COPY --chown=user ./uv.lock $HOME/app
 
@@ -26,6 +24,10 @@ COPY --chown=user ./uv.lock $HOME/app
 # RUN uv sync --frozen
 RUN uv sync
 
+# Copy the app to the container
+COPY --chown=user ./py-src/ $HOME/app
+
+
 #TODO: Fix this to download
 #copy posts to container
 COPY --chown=user ./data/ $HOME/app/data
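The reordering matters for build caching: with pyproject.toml and uv.lock copied before RUN uv sync, and the application source under py-src/ copied only afterwards, Docker can reuse the cached dependency-install layer whenever only application code changes.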
py-src/app.py
CHANGED

@@ -7,6 +7,11 @@ from dotenv import load_dotenv
 
 # Load environment variables from .env file
 load_dotenv()
+import pipeline
+#build vector store
+print("=== Blog Data Update ===")
+pipeline.main()
+print("========================")
 
 import chainlit as cl
 from langchain.prompts import ChatPromptTemplate
@@ -19,11 +24,10 @@ from qdrant_client.http.models import Distance, VectorParams
 from lets_talk.config import LLM_MODEL, LLM_TEMPERATURE
 import lets_talk.utils.blog as blog
 from lets_talk.agent import build_agent,parse_output
-import pipeline
 
 
-
-
+
+
 
 tdg_agent = build_agent()
 
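Because pipeline.main() now runs at module import time, the vector store is rebuilt on every app start, before Chainlit loads its handlers. A minimal sketch of one way to make that step skippable, assuming a hypothetical SKIP_PIPELINE environment variable that is not part of this commit:

import os

import pipeline

# Build the vector store at startup unless explicitly skipped.
# SKIP_PIPELINE is a hypothetical flag, not part of this commit.
if os.getenv("SKIP_PIPELINE", "false").lower() != "true":
    print("=== Blog Data Update ===")
    pipeline.main()
    print("========================")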
py-src/pipeline.py
CHANGED

@@ -5,7 +5,7 @@ This script updates the blog data vector store when new posts are added.
 It can be scheduled to run periodically or manually executed.
 
 Usage:
-    python
+    python pipeline.py [--force-recreate] [--data-dir DATA_DIR]
 
 Options:
     --force-recreate    Force recreation of the vector store even if it exists
@@ -57,6 +57,37 @@ def save_stats(stats, output_dir="./stats"):
     print(f"Saved stats to {filename}")
     return filename
 
+def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
+    """
+    Create or update the vector database with blog documents.
+
+    Args:
+        documents: List of document objects to store in the vector database
+        data_dir: Directory containing the blog posts (for reporting)
+        storage_path: Path where the vector database will be stored
+        force_recreate: Whether to force recreation of the vector store
+
+    Returns:
+        Tuple of (success status, message)
+    """
+    try:
+        create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
+
+        if create_vector_store:
+            print("\nAttempting to save vector store reference file...")
+            vector_store = blog.create_vector_store(
+                documents,
+                storage_path=storage_path,
+                force_recreate=force_recreate
+            )
+            vector_store.client.close()
+            print("Vector store reference file saved.")
+            return True, f"Vector store successfully created at {storage_path}"
+        else:
+            return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)"
+    except Exception as e:
+        return False, f"Error creating vector store: {str(e)}"
+
 def main():
     """Main function to update blog data"""
     args = parse_args()
@@ -66,7 +97,6 @@ def main():
     print(f"Force recreate: {args.force_recreate}")
     print("========================")
 
-    # Process blog posts without creating embeddings
     try:
         # Load and process documents
         documents = blog.load_blog_posts(args.data_dir)
@@ -79,20 +109,22 @@ def main():
         # Save stats for tracking
         stats_file = save_stats(stats)
 
-
-
-
-
-
-
-
-        print("Vector store reference file saved.")
+        # Create or update vector database
+        success, message = create_vector_database(
+            documents,
+            args.data_dir,
+            storage_path=VECTOR_STORAGE_PATH,
+            force_recreate=args.force_recreate
+        )
 
         print("\n=== Update Summary ===")
         print(f"Processed {stats['total_documents']} documents")
         print(f"Stats saved to: {stats_file}")
+        print(f"Vector DB status: {message}")
         print("=====================")
 
+        if not success:
+            return 1
         return 0
     except Exception as e:
         print(f"Error: {e}")
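Two notes on the new helper. The existence check calls Path.exists as an unbound method; not Path(storage_path).exists() is the more conventional spelling of the same test. The helper can also be driven outside of main(); a minimal sketch, assuming blog.load_blog_posts returns the same documents main() processes, with a hypothetical storage path:

import lets_talk.utils.blog as blog
from pipeline import create_vector_database

# Rebuild the store directly, bypassing main()'s argument parsing and stats.
documents = blog.load_blog_posts("./data")
success, message = create_vector_database(
    documents,
    "./data",
    storage_path="./db/vector_store",  # hypothetical path; the default is VECTOR_STORAGE_PATH
    force_recreate=True,
)
print(message)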
stats/blog_stats_20250511_095935.json
ADDED

@@ -0,0 +1,8 @@
+{
+  "timestamp": "20250511_095935",
+  "total_documents": 14,
+  "total_characters": 106275,
+  "min_length": 1900,
+  "max_length": 13468,
+  "avg_length": 7591.071428571428
+}
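The snapshot records simple length statistics over the 14 loaded posts. A minimal sketch of how fields like these could be derived; the actual computation lives in the blog utilities (not shown in this diff), and the LangChain-style page_content attribute is an assumption:

from datetime import datetime

def compute_stats(documents):
    """Summarize document lengths; keys mirror the checked-in JSON."""
    lengths = [len(doc.page_content) for doc in documents]  # assumes LangChain-style Documents
    return {
        "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
        "total_documents": len(lengths),
        "total_characters": sum(lengths),
        "min_length": min(lengths),
        "max_length": max(lengths),
        "avg_length": sum(lengths) / len(lengths),
    }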