mafzaal committed on
Commit
f5df877
·
1 Parent(s): 6d85ef1

feat: Enhance blog data processing with vector database creation and update functionality

Browse files
Dockerfile CHANGED
@@ -17,8 +17,6 @@ ENV UVICORN_WS_PROTOCOL=websockets
17
  # Set the working directory
18
  WORKDIR $HOME/app
19
 
20
- # Copy the app to the container
21
- COPY --chown=user ./py-src/ $HOME/app
22
  COPY --chown=user ./pyproject.toml $HOME/app
23
  COPY --chown=user ./uv.lock $HOME/app
24
 
@@ -26,6 +24,10 @@ COPY --chown=user ./uv.lock $HOME/app
26
  # RUN uv sync --frozen
27
  RUN uv sync
28
 
 
 
 
 
29
  #TODO: Fix this to download
30
  #copy posts to container
31
  COPY --chown=user ./data/ $HOME/app/data
 
17
  # Set the working directory
18
  WORKDIR $HOME/app
19
 
 
 
20
  COPY --chown=user ./pyproject.toml $HOME/app
21
  COPY --chown=user ./uv.lock $HOME/app
22
 
 
24
  # RUN uv sync --frozen
25
  RUN uv sync
26
 
27
+ # Copy the app to the container
28
+ COPY --chown=user ./py-src/ $HOME/app
29
+
30
+
31
  #TODO: Fix this to download
32
  #copy posts to container
33
  COPY --chown=user ./data/ $HOME/app/data
py-src/app.py CHANGED
@@ -7,6 +7,11 @@ from dotenv import load_dotenv
7
 
8
  # Load environment variables from .env file
9
  load_dotenv()
 
 
 
 
 
10
 
11
  import chainlit as cl
12
  from langchain.prompts import ChatPromptTemplate
@@ -19,11 +24,10 @@ from qdrant_client.http.models import Distance, VectorParams
19
  from lets_talk.config import LLM_MODEL, LLM_TEMPERATURE
20
  import lets_talk.utils.blog as blog
21
  from lets_talk.agent import build_agent,parse_output
22
- import pipeline
23
 
24
 
25
- #build vector store
26
- pipeline.main()
27
 
28
  tdg_agent = build_agent()
29
 
 
7
 
8
  # Load environment variables from .env file
9
  load_dotenv()
10
+ import pipeline
11
+ #build vector store
12
+ print("=== Blog Data Update ===")
13
+ pipeline.main()
14
+ print("========================")
15
 
16
  import chainlit as cl
17
  from langchain.prompts import ChatPromptTemplate
 
24
  from lets_talk.config import LLM_MODEL, LLM_TEMPERATURE
25
  import lets_talk.utils.blog as blog
26
  from lets_talk.agent import build_agent,parse_output
 
27
 
28
 
29
+
30
+
31
 
32
  tdg_agent = build_agent()
33
 
py-src/pipeline.py CHANGED
@@ -5,7 +5,7 @@ This script updates the blog data vector store when new posts are added.
5
  It can be scheduled to run periodically or manually executed.
6
 
7
  Usage:
8
- python update_blog_data.py [--force-recreate] [--data-dir DATA_DIR]
9
 
10
  Options:
11
  --force-recreate Force recreation of the vector store even if it exists
@@ -57,6 +57,37 @@ def save_stats(stats, output_dir="./stats"):
57
  print(f"Saved stats to {filename}")
58
  return filename
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def main():
61
  """Main function to update blog data"""
62
  args = parse_args()
@@ -66,7 +97,6 @@ def main():
66
  print(f"Force recreate: {args.force_recreate}")
67
  print("========================")
68
 
69
- # Process blog posts without creating embeddings
70
  try:
71
  # Load and process documents
72
  documents = blog.load_blog_posts(args.data_dir)
@@ -79,20 +109,22 @@ def main():
79
  # Save stats for tracking
80
  stats_file = save_stats(stats)
81
 
82
- create_vector_store = (not Path.exists(Path(VECTOR_STORAGE_PATH))) or (args.force_recreate)
83
-
84
- # Create a reference file for the vector store
85
- if create_vector_store:
86
- print("\nAttempting to save vector store reference file...")
87
- vector_store = blog.create_vector_store(documents, storage_path=VECTOR_STORAGE_PATH, force_recreate=create_vector_store)
88
- vector_store.client.close()
89
- print("Vector store reference file saved.")
90
 
91
  print("\n=== Update Summary ===")
92
  print(f"Processed {stats['total_documents']} documents")
93
  print(f"Stats saved to: {stats_file}")
 
94
  print("=====================")
95
 
 
 
96
  return 0
97
  except Exception as e:
98
  print(f"Error: {e}")
 
5
  It can be scheduled to run periodically or manually executed.
6
 
7
  Usage:
8
+ python pipeline.py [--force-recreate] [--data-dir DATA_DIR]
9
 
10
  Options:
11
  --force-recreate Force recreation of the vector store even if it exists
 
57
  print(f"Saved stats to {filename}")
58
  return filename
59
 
60
+ def create_vector_database(documents, data_dir, storage_path=VECTOR_STORAGE_PATH, force_recreate=False):
61
+ """
62
+ Create or update the vector database with blog documents.
63
+
64
+ Args:
65
+ documents: List of document objects to store in the vector database
66
+ data_dir: Directory containing the blog posts (for reporting)
67
+ storage_path: Path where the vector database will be stored
68
+ force_recreate: Whether to force recreation of the vector store
69
+
70
+ Returns:
71
+ Tuple of (success status, message)
72
+ """
73
+ try:
74
+ create_vector_store = (not Path.exists(Path(storage_path))) or force_recreate
75
+
76
+ if create_vector_store:
77
+ print("\nAttempting to save vector store reference file...")
78
+ vector_store = blog.create_vector_store(
79
+ documents,
80
+ storage_path=storage_path,
81
+ force_recreate=force_recreate
82
+ )
83
+ vector_store.client.close()
84
+ print("Vector store reference file saved.")
85
+ return True, f"Vector store successfully created at {storage_path}"
86
+ else:
87
+ return True, f"Vector store already exists at {storage_path} (use --force-recreate to rebuild)"
88
+ except Exception as e:
89
+ return False, f"Error creating vector store: {str(e)}"
90
+
91
  def main():
92
  """Main function to update blog data"""
93
  args = parse_args()
 
97
  print(f"Force recreate: {args.force_recreate}")
98
  print("========================")
99
 
 
100
  try:
101
  # Load and process documents
102
  documents = blog.load_blog_posts(args.data_dir)
 
109
  # Save stats for tracking
110
  stats_file = save_stats(stats)
111
 
112
+ # Create or update vector database
113
+ success, message = create_vector_database(
114
+ documents,
115
+ args.data_dir,
116
+ storage_path=VECTOR_STORAGE_PATH,
117
+ force_recreate=args.force_recreate
118
+ )
 
119
 
120
  print("\n=== Update Summary ===")
121
  print(f"Processed {stats['total_documents']} documents")
122
  print(f"Stats saved to: {stats_file}")
123
+ print(f"Vector DB status: {message}")
124
  print("=====================")
125
 
126
+ if not success:
127
+ return 1
128
  return 0
129
  except Exception as e:
130
  print(f"Error: {e}")
stats/blog_stats_20250511_095935.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "20250511_095935",
3
+ "total_documents": 14,
4
+ "total_characters": 106275,
5
+ "min_length": 1900,
6
+ "max_length": 13468,
7
+ "avg_length": 7591.071428571428
8
+ }