File size: 2,982 Bytes
4d95531
66d0fd5
a914d48
 
 
82fd433
410396e
a914d48
 
 
 
 
 
 
 
 
 
 
 
66d0fd5
a914d48
410396e
66d0fd5
 
 
a914d48
 
 
66d0fd5
a914d48
66d0fd5
 
a914d48
 
66d0fd5
a914d48
 
 
 
 
 
 
 
66d0fd5
a914d48
 
 
4d95531
66d0fd5
 
4d95531
 
 
a914d48
 
 
66d0fd5
 
a914d48
82fd433
 
 
 
a914d48
 
 
 
 
8ec62ff
a914d48
66d0fd5
8ec62ff
a914d48
 
 
 
 
66d0fd5
a914d48
 
 
82fd433
a914d48
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import pandas as pd
import logging
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

# ==============================
# Logging Setup
# ==============================
# Configure the root logger at INFO first, then create this module's
# named logger so its records propagate to that configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("DevAssist.Vector")

# ==============================
# Configuration
# ==============================
# Every setting can be overridden through an environment variable; the second
# argument is the fallback used when that variable is not set.

# CSV file holding the examples to index.
DATASET_PATH = os.environ.get("SME_DATASET_PATH", "sme_builder_dataset.csv")
# Directory in which the Chroma store is persisted.
DB_LOCATION = os.environ.get("CHROMA_DB_DIR", "./DevAssist_SME_Builder_DB")
# Name of the Chroma collection inside that store.
COLLECTION_NAME = os.environ.get(
    "CHROMA_COLLECTION", "landing_page_generation_examples"
)
# Hugging Face model identifier used to compute embeddings.
EMBEDDING_MODEL = os.environ.get("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
# Hugging Face cache directory, read from HF_HOME.
HF_CACHE_DIR = os.environ.get("HF_HOME", "/app/huggingface_cache")

# Create both directories up front (no error if they already exist).
os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(DB_LOCATION, exist_ok=True)

# ==============================
# Validate Dataset
# ==============================
# Fail fast with a clear message when the CSV is missing, so a bad path is
# reported here rather than as a pandas error.
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"❌ Dataset not found: {DATASET_PATH}")

# A zero-byte CSV makes pandas raise EmptyDataError before the `df.empty`
# check can run; treat it as an empty frame so both "no data" cases end in
# the same ValueError below.
try:
    df = pd.read_csv(DATASET_PATH)
except pd.errors.EmptyDataError:
    df = pd.DataFrame()

if df.empty:
    raise ValueError("❌ SME dataset is empty — cannot initialize vector DB.")

# ==============================
# Embedding Model
# ==============================
# Load the Hugging Face embedding model named by EMBEDDING_MODEL. Any failure
# is re-raised as RuntimeError, chained to the original exception so the root
# cause stays visible in the traceback.
try:
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
except Exception as e:
    raise RuntimeError(f"⚠️ Failed to load embedding model: {e}") from e
else:
    # Log only on success; kept outside the try body so the handler above
    # covers nothing but the model construction.
    logger.info("✅ Embedding model loaded: %s", EMBEDDING_MODEL)

# ==============================
# Initialize Vector Store
# ==============================
# Open (or create) the persistent Chroma collection backed by DB_LOCATION.
vector_store = Chroma(
    collection_name=COLLECTION_NAME,
    persist_directory=DB_LOCATION,
    embedding_function=embeddings,
)

# Dataset columns concatenated, in this order, into each document's text.
_TEXT_COLUMNS = ("prompt", "html_code", "css_code", "js_code", "sector")

# Populate the store only when the collection holds no documents yet.
# The check is on the collection's document count rather than on the directory
# listing: constructing the Chroma client above already writes its
# persistence files into DB_LOCATION, so the directory is never empty here.
if vector_store._collection.count() == 0:
    logger.info("🧩 Initializing new Chroma vector store from dataset...")
    documents = []
    for i, row in df.iterrows():
        # Skip missing cells (absent column -> None, empty cell -> NaN) so
        # the literal text "nan" is never embedded, then drop blank strings.
        values = (row.get(column) for column in _TEXT_COLUMNS)
        content_parts = [str(value) for value in values if pd.notna(value)]
        content = " ".join(part for part in content_parts if part.strip())
        if not content:
            continue
        # The DataFrame index label is kept as the document's "id" metadata.
        documents.append(Document(page_content=content, metadata={"id": str(i)}))

    if documents:
        vector_store.add_documents(documents=documents)
        logger.info("✅ Added %d documents to Chroma DB.", len(documents))
    else:
        logger.warning("⚠️ No valid documents found in dataset to embed.")
else:
    logger.info("💾 Using existing Chroma vector store (no rebuild).")

# ==============================
# Retriever
# ==============================
# Expose the store as a retriever; k=20 is passed through as the search
# keyword arguments (number of results requested per query).
retriever = vector_store.as_retriever(search_kwargs={"k": 20})

# Report the collection name and its current document count at startup.
logger.info(
    "SME vector store ready → collection='%s', docs=%d",
    COLLECTION_NAME,
    vector_store._collection.count(),
)