lydiasolomon committed on
Commit
a914d48
·
verified ·
1 Parent(s): 897fd91

Update smebuilder_vector.py

Browse files
Files changed (1) hide show
  1. smebuilder_vector.py +55 -23
smebuilder_vector.py CHANGED
@@ -1,54 +1,86 @@
1
  import os
2
  import pandas as pd
3
- from langchain_community.embeddings import HuggingFaceEmbeddings
4
- from langchain_community.vectorstores import Chroma
 
5
  from langchain_core.documents import Document
6
 
7
- # ----------------- CONFIG -----------------
8
- DATASET_PATH = "sme_builder_dataset.csv"
9
- DB_LOCATION = os.getenv("CHROMA_DB_DIR", "./Dev_Assist_SME_Builder_DB")
10
- COLLECTION_NAME = "landing_page_generation_examples"
 
 
 
 
 
 
 
 
11
  EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
12
- HF_CACHE_DIR = os.getenv("HF_CACHE_DIR", "/app/huggingface_cache")
13
 
14
  os.makedirs(HF_CACHE_DIR, exist_ok=True)
15
  os.makedirs(DB_LOCATION, exist_ok=True)
16
 
17
- # ----------------- LOAD DATASET -----------------
 
 
18
  if not os.path.exists(DATASET_PATH):
19
- raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")
20
 
21
  df = pd.read_csv(DATASET_PATH)
 
 
22
 
23
- # ----------------- EMBEDDINGS -----------------
24
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
25
-
26
- # ----------------- VECTOR STORE -----------------
27
- # Only add documents if DB is empty
28
- add_documents = not os.listdir(DB_LOCATION)
 
 
29
 
 
 
 
30
  vector_store = Chroma(
31
  collection_name=COLLECTION_NAME,
32
  persist_directory=DB_LOCATION,
33
  embedding_function=embeddings,
34
  )
35
 
36
- if add_documents:
 
 
37
  documents = []
38
  for i, row in df.iterrows():
39
- content = " ".join([
40
  str(row.get("prompt", "")),
41
  str(row.get("html_code", "")),
42
  str(row.get("css_code", "")),
43
  str(row.get("js_code", "")),
44
- str(row.get("sector", ""))
45
- ]).strip()
 
 
 
46
  documents.append(Document(page_content=content, metadata={"id": str(i)}))
47
-
48
  if documents:
49
  vector_store.add_documents(documents=documents)
 
 
 
 
 
50
 
51
- # ----------------- RETRIEVER -----------------
 
 
52
  retriever = vector_store.as_retriever(search_kwargs={"k": 20})
53
-
54
- print(f"SME vector store initialized. collection={COLLECTION_NAME}, documents={vector_store._collection.count()}")
 
 
 
"""Build or reopen the Chroma vector store of SME landing-page examples.

Importing this module has side effects: it configures logging, creates the
cache and database directories, reads the dataset CSV, loads the embedding
model, opens the Chroma collection and, when that collection holds no
documents yet, embeds every usable dataset row into it.

Public names: ``embeddings``, ``vector_store`` and ``retriever``.

Raises (at import time):
    FileNotFoundError: the dataset CSV does not exist.
    ValueError: the dataset CSV contains no rows.
    RuntimeError: the embedding model could not be loaded.
"""

import logging
import os

import pandas as pd
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

# ==============================
# Logging Setup
# ==============================
logger = logging.getLogger("DevAssist.Vector")
logging.basicConfig(level=logging.INFO)

# ==============================
# Configuration
# ==============================
DATASET_PATH = os.getenv("SME_DATASET_PATH", "sme_builder_dataset.csv")
DB_LOCATION = os.getenv("CHROMA_DB_DIR", "./DevAssist_SME_Builder_DB")
COLLECTION_NAME = os.getenv("CHROMA_COLLECTION", "landing_page_generation_examples")
EMBEDDING_MODEL = os.getenv("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
# NOTE(review): this directory is created below but is not passed to the
# embedding loader; it only influences caching if HF_HOME itself is set in the
# environment. Confirm that is the intended behaviour.
HF_CACHE_DIR = os.getenv("HF_HOME", "/app/huggingface_cache")

# Dataset columns concatenated, in this order, into each document's text.
CONTENT_COLUMNS = ("prompt", "html_code", "css_code", "js_code", "sector")

os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(DB_LOCATION, exist_ok=True)

# ==============================
# Validate Dataset
# ==============================
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"❌ Dataset not found: {DATASET_PATH}")

df = pd.read_csv(DATASET_PATH)
if df.empty:
    raise ValueError("❌ SME dataset is empty — cannot initialize vector DB.")

# ==============================
# Embedding Model
# ==============================
try:
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
except Exception as e:
    # Chain the original exception so the real cause stays in the traceback.
    raise RuntimeError(f"⚠️ Failed to load embedding model: {e}") from e
else:
    logger.info("✅ Embedding model loaded: %s", EMBEDDING_MODEL)

# ==============================
# Initialize Vector Store
# ==============================
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_LOCATION,
    embedding_function=embeddings,
)


def _cell_text(value: object) -> str:
    """Return a dataset cell as text, mapping missing values (NaN/None) to ''."""
    # pandas represents empty CSV cells as NaN; str(NaN) would yield the
    # literal text "nan" and pollute the embedded content.
    if pd.isna(value):
        return ""
    return str(value)


def _build_documents(frame: pd.DataFrame) -> list:
    """Convert dataset rows into ``Document`` objects, skipping blank rows.

    Each document's text is the space-joined, non-blank values of
    ``CONTENT_COLUMNS``; its metadata carries the row index as ``"id"``.
    """
    built = []
    for index, row in frame.iterrows():
        parts = [_cell_text(row.get(column, "")) for column in CONTENT_COLUMNS]
        content = " ".join(part for part in parts if part.strip())
        if not content.strip():
            continue
        built.append(Document(page_content=content, metadata={"id": str(index)}))
    return built


# Only add documents if the collection is new or empty.
#
# The emptiness test must look at the collection, not the directory: the
# persistent Chroma client writes its storage files into ``DB_LOCATION`` as
# soon as it is constructed above, so ``os.listdir(DB_LOCATION)`` is already
# non-empty on a brand-new deployment and the dataset would never be embedded.
if vector_store._collection.count() == 0:
    logger.info("🧩 Initializing new Chroma vector store from dataset...")
    documents = _build_documents(df)

    if documents:
        vector_store.add_documents(documents=documents)
        logger.info("✅ Added %d documents to Chroma DB.", len(documents))
    else:
        logger.warning("⚠️ No valid documents found in dataset to embed.")
else:
    logger.info("💾 Using existing Chroma vector store (no rebuild).")

# ==============================
# Retriever
# ==============================
retriever = vector_store.as_retriever(search_kwargs={"k": 20})
logger.info(
    "SME vector store ready → collection='%s', docs=%d",
    COLLECTION_NAME,
    vector_store._collection.count(),
)