Add vector database creation configuration and update related scripts
Browse files- .env.example +6 -0
- CONTRIBUTING.md +6 -0
- README.md +10 -0
- py-src/app.py +2 -1
- py-src/lets_talk/config.py +6 -0
- py-src/pipeline.py +13 -9
- scripts/build-vector-store.sh +9 -2
.env.example
CHANGED
|
@@ -27,3 +27,9 @@ MAX_SEARCH_RESULTS=5
|
|
| 27 |
# Document Chunking Configuration
|
| 28 |
CHUNK_SIZE=1000
|
| 29 |
CHUNK_OVERLAP=200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Document Chunking Configuration
|
| 28 |
CHUNK_SIZE=1000
|
| 29 |
CHUNK_OVERLAP=200
|
| 30 |
+
|
| 31 |
+
# Vector Database Creation Configuration
|
| 32 |
+
FORCE_RECREATE=False
|
| 33 |
+
OUTPUT_DIR=./stats
|
| 34 |
+
USE_CHUNKING=True
|
| 35 |
+
SHOULD_SAVE_STATS=True
|
CONTRIBUTING.md
CHANGED
|
@@ -29,6 +29,12 @@ TheDataGuy Chat is a Q&A chatbot powered by the content from [TheDataGuy blog](h
|
|
| 29 |
VECTOR_STORAGE_PATH=./db/vector_store_tdg
|
| 30 |
LLM_MODEL=gpt-4o-mini
|
| 31 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
```
|
| 33 |
|
| 34 |
3. Install dependencies:
|
|
|
|
| 29 |
VECTOR_STORAGE_PATH=./db/vector_store_tdg
|
| 30 |
LLM_MODEL=gpt-4o-mini
|
| 31 |
EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-l
|
| 32 |
+
|
| 33 |
+
# Vector Database Creation Configuration (optional)
|
| 34 |
+
FORCE_RECREATE=False # Whether to force recreation of the vector store
|
| 35 |
+
OUTPUT_DIR=./stats # Directory to save stats and artifacts
|
| 36 |
+
USE_CHUNKING=True # Whether to split documents into chunks
|
| 37 |
+
SHOULD_SAVE_STATS=True # Whether to save statistics about the documents
|
| 38 |
```
|
| 39 |
|
| 40 |
3. Install dependencies:
|
README.md
CHANGED
|
@@ -94,6 +94,16 @@ CHUNK_SIZE=1000
|
|
| 94 |
CHUNK_OVERLAP=200
|
| 95 |
```
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
## Running Locally
|
| 98 |
|
| 99 |
### Using Docker
|
|
|
|
| 94 |
CHUNK_OVERLAP=200
|
| 95 |
```
|
| 96 |
|
| 97 |
+
Additional configuration options for vector database creation:
|
| 98 |
+
|
| 99 |
+
```
|
| 100 |
+
# Vector Database Creation Configuration
|
| 101 |
+
FORCE_RECREATE=False # Whether to force recreation of the vector store
|
| 102 |
+
OUTPUT_DIR=./stats # Directory to save stats and artifacts
|
| 103 |
+
USE_CHUNKING=True # Whether to split documents into chunks
|
| 104 |
+
SHOULD_SAVE_STATS=True # Whether to save statistics about the documents
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
## Running Locally
|
| 108 |
|
| 109 |
### Using Docker
|
py-src/app.py
CHANGED
|
@@ -10,7 +10,8 @@ load_dotenv()
|
|
| 10 |
import pipeline
|
| 11 |
#build vector store
|
| 12 |
print("=== create vector db ===")
|
| 13 |
-
|
|
|
|
| 14 |
print("========================")
|
| 15 |
|
| 16 |
import chainlit as cl
|
|
|
|
| 10 |
import pipeline
|
| 11 |
#build vector store
|
| 12 |
print("=== create vector db ===")
|
| 13 |
+
# Rely on the defaults defined in lets_talk.config rather than hardcoded values
|
| 14 |
+
pipeline.create_vector_database()
|
| 15 |
print("========================")
|
| 16 |
|
| 17 |
import chainlit as cl
|
py-src/lets_talk/config.py
CHANGED
|
@@ -20,5 +20,11 @@ MAX_SEARCH_RESULTS = int(os.environ.get("MAX_SEARCH_RESULTS", "5"))
|
|
| 20 |
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
|
| 21 |
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
|
|
|
| 20 |
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "1000"))
|
| 21 |
CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "200"))
|
| 22 |
|
| 23 |
+
# Vector database creation configuration
|
| 24 |
+
FORCE_RECREATE = os.environ.get("FORCE_RECREATE", "False").lower() == "true"
|
| 25 |
+
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./stats")
|
| 26 |
+
USE_CHUNKING = os.environ.get("USE_CHUNKING", "True").lower() == "true"
|
| 27 |
+
SHOULD_SAVE_STATS = os.environ.get("SHOULD_SAVE_STATS", "True").lower() == "true"
|
| 28 |
+
|
| 29 |
|
| 30 |
|
py-src/pipeline.py
CHANGED
|
@@ -21,7 +21,10 @@ from datetime import datetime
|
|
| 21 |
import json
|
| 22 |
import logging
|
| 23 |
from pathlib import Path
|
| 24 |
-
from lets_talk.config import
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Import the blog utilities module
|
| 27 |
import lets_talk.utils.blog as blog
|
|
@@ -100,19 +103,20 @@ def save_stats(stats, output_dir="./stats", ci_mode=False):
|
|
| 100 |
return filename, basic_stats
|
| 101 |
|
| 102 |
def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
| 103 |
-
force_recreate=
|
| 104 |
-
use_chunking=
|
|
|
|
| 105 |
"""
|
| 106 |
Create or update the vector database with blog documents.
|
| 107 |
|
| 108 |
Args:
|
| 109 |
-
data_dir: Directory containing the blog posts
|
| 110 |
-
storage_path: Path where the vector database will be stored
|
| 111 |
-
force_recreate: Whether to force recreation of the vector store
|
| 112 |
-
output_dir: Directory to save stats and artifacts
|
| 113 |
ci_mode: Whether to run in CI mode
|
| 114 |
-
use_chunking: Whether to split documents into chunks
|
| 115 |
-
should_save_stats: Whether to save statistics about the documents
|
| 116 |
chunk_size: Size of each chunk in characters (default from config)
|
| 117 |
chunk_overlap: Overlap between chunks in characters (default from config)
|
| 118 |
|
|
|
|
| 21 |
import json
|
| 22 |
import logging
|
| 23 |
from pathlib import Path
|
| 24 |
+
from lets_talk.config import (
|
| 25 |
+
CHUNK_OVERLAP, CHUNK_SIZE, VECTOR_STORAGE_PATH, DATA_DIR,
|
| 26 |
+
FORCE_RECREATE, OUTPUT_DIR, USE_CHUNKING, SHOULD_SAVE_STATS
|
| 27 |
+
)
|
| 28 |
|
| 29 |
# Import the blog utilities module
|
| 30 |
import lets_talk.utils.blog as blog
|
|
|
|
| 103 |
return filename, basic_stats
|
| 104 |
|
| 105 |
def create_vector_database(data_dir=DATA_DIR, storage_path=VECTOR_STORAGE_PATH,
|
| 106 |
+
force_recreate=FORCE_RECREATE, output_dir=OUTPUT_DIR, ci_mode=False,
|
| 107 |
+
use_chunking=USE_CHUNKING, should_save_stats=SHOULD_SAVE_STATS,
|
| 108 |
+
chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
|
| 109 |
"""
|
| 110 |
Create or update the vector database with blog documents.
|
| 111 |
|
| 112 |
Args:
|
| 113 |
+
data_dir: Directory containing the blog posts (default from config)
|
| 114 |
+
storage_path: Path where the vector database will be stored (default from config)
|
| 115 |
+
force_recreate: Whether to force recreation of the vector store (default from config)
|
| 116 |
+
output_dir: Directory to save stats and artifacts (default from config)
|
| 117 |
ci_mode: Whether to run in CI mode
|
| 118 |
+
use_chunking: Whether to split documents into chunks (default from config)
|
| 119 |
+
should_save_stats: Whether to save statistics about the documents (default from config)
|
| 120 |
chunk_size: Size of each chunk in characters (default from config)
|
| 121 |
chunk_overlap: Overlap between chunks in characters (default from config)
|
| 122 |
|
scripts/build-vector-store.sh
CHANGED
|
@@ -1,14 +1,21 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
# Script to build vector store locally
|
| 3 |
# Usage: ./scripts/build-vector-store.sh [--force-recreate]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
|
|
|
| 5 |
FORCE_RECREATE=""
|
| 6 |
if [[ "$1" == "--force-recreate" ]]; then
|
| 7 |
FORCE_RECREATE="--force-recreate"
|
| 8 |
fi
|
| 9 |
|
| 10 |
-
# Set output directory for artifacts
|
| 11 |
-
OUTPUT_DIR
|
| 12 |
mkdir -p $OUTPUT_DIR
|
| 13 |
|
| 14 |
echo "Building vector store with output to $OUTPUT_DIR"
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
# Script to build vector store locally
|
| 3 |
# Usage: ./scripts/build-vector-store.sh [--force-recreate]
|
| 4 |
+
#
|
| 5 |
+
# Environment variables that can be set:
|
| 6 |
+
# FORCE_RECREATE - NOTE: this script reassigns FORCE_RECREATE below, so a value exported in the shell is overwritten; pass --force-recreate instead
|
| 7 |
+
# OUTPUT_DIR - Directory to save stats and artifacts (default in this script: ./artifacts; lets_talk/config.py defaults to ./stats)
|
| 8 |
+
# USE_CHUNKING - Set to "false" to disable document chunking
|
| 9 |
+
# SHOULD_SAVE_STATS - Set to "false" to disable saving document statistics
|
| 10 |
|
| 11 |
+
# Parse command line arguments
|
| 12 |
FORCE_RECREATE=""
|
| 13 |
if [[ "$1" == "--force-recreate" ]]; then
|
| 14 |
FORCE_RECREATE="--force-recreate"
|
| 15 |
fi
|
| 16 |
|
| 17 |
+
# Set output directory for artifacts (use environment variable if set)
|
| 18 |
+
OUTPUT_DIR=${OUTPUT_DIR:-"./artifacts"}
|
| 19 |
mkdir -p $OUTPUT_DIR
|
| 20 |
|
| 21 |
echo "Building vector store with output to $OUTPUT_DIR"
|