Spaces:
Sleeping
Sleeping
File size: 2,982 Bytes
4d95531 66d0fd5 a914d48 82fd433 410396e a914d48 66d0fd5 a914d48 410396e 66d0fd5 a914d48 66d0fd5 a914d48 66d0fd5 a914d48 66d0fd5 a914d48 66d0fd5 a914d48 4d95531 66d0fd5 4d95531 a914d48 66d0fd5 a914d48 82fd433 a914d48 8ec62ff a914d48 66d0fd5 8ec62ff a914d48 66d0fd5 a914d48 82fd433 a914d48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import os
import pandas as pd
import logging
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
# ==============================
# Logging Setup
# ==============================
# Configure root logging at INFO level, then obtain the named logger that
# every message in this module is emitted through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("DevAssist.Vector")
# ==============================
# Configuration
# ==============================
# Every setting is read from the environment and falls back to the default
# given here when the variable is not set.
DATASET_PATH = os.environ.get("SME_DATASET_PATH", "sme_builder_dataset.csv")
DB_LOCATION = os.environ.get("CHROMA_DB_DIR", "./DevAssist_SME_Builder_DB")
COLLECTION_NAME = os.environ.get(
    "CHROMA_COLLECTION",
    "landing_page_generation_examples",
)
EMBEDDING_MODEL = os.environ.get("HF_EMBEDDING_MODEL", "intfloat/e5-large-v2")
HF_CACHE_DIR = os.environ.get("HF_HOME", "/app/huggingface_cache")

# Create both working directories up front; existing directories are left
# untouched.
os.makedirs(HF_CACHE_DIR, exist_ok=True)
os.makedirs(DB_LOCATION, exist_ok=True)
# ==============================
# Validate Dataset
# ==============================
# Fail fast when the dataset CSV is missing. isfile() (rather than exists())
# also rejects a directory at that path, which would otherwise only fail
# later inside read_csv with a less helpful error.
if not os.path.isfile(DATASET_PATH):
    raise FileNotFoundError(f"❌ Dataset not found: {DATASET_PATH}")

df = pd.read_csv(DATASET_PATH)

# A CSV with a header but no rows yields an empty frame; there is nothing
# to embed in that case, so stop here.
if df.empty:
    raise ValueError("❌ SME dataset is empty — cannot initialize vector DB.")
# ==============================
# Embedding Model
# ==============================
# Load the embedding model named by EMBEDDING_MODEL. Any failure is re-raised
# as RuntimeError, chained to the original exception so the root cause stays
# visible in the traceback.
try:
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
except Exception as e:
    raise RuntimeError(f"⚠️ Failed to load embedding model: {e}") from e
else:
    # Log only after a successful load; kept outside the try body so that
    # only the model construction is guarded.
    logger.info(f"✅ Embedding model loaded: {EMBEDDING_MODEL}")
# ==============================
# Initialize Vector Store
# ==============================
# Build the Chroma store for COLLECTION_NAME, persisted under DB_LOCATION and
# using the embedding model loaded earlier in this module.
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory=DB_LOCATION,
    collection_name=COLLECTION_NAME,
)
# Only add documents if the collection is new or empty.
# The decision is based on the collection's document count rather than on
# os.listdir(DB_LOCATION): the Chroma store has already been constructed on
# that directory by this point, and a persistent client is expected to have
# written its storage files there, which would make a directory-emptiness
# check skip ingestion even for a brand-new database.
if vector_store._collection.count() == 0:
    logger.info("🧩 Initializing new Chroma vector store from dataset...")

    # Columns concatenated (in this order) into each document's text.
    text_columns = ("prompt", "html_code", "css_code", "js_code", "sector")

    documents = []
    for i, row in df.iterrows():
        content_parts = []
        for column in text_columns:
            value = row.get(column, "")
            # Empty CSV cells are read as NaN; str(NaN) would otherwise put
            # the literal text "nan" into the embedded content.
            if pd.isna(value):
                continue
            text = str(value)
            if text.strip():
                content_parts.append(text)

        content = " ".join(content_parts)
        if not content.strip():
            # Row has no usable text in any of the columns above.
            continue

        # The dataframe index is stored as a string id in the metadata.
        documents.append(Document(page_content=content, metadata={"id": str(i)}))

    if documents:
        vector_store.add_documents(documents=documents)
        logger.info(f"✅ Added {len(documents)} documents to Chroma DB.")
    else:
        logger.warning("⚠️ No valid documents found in dataset to embed.")
else:
    logger.info("💾 Using existing Chroma vector store (no rebuild).")
# ==============================
# Retriever
# ==============================
# Expose the store as a retriever; k=20 is passed through as a search
# keyword argument (presumably the number of results per query — confirm
# against the retriever documentation).
retriever = vector_store.as_retriever(search_kwargs={"k": 20})

# Report the collection name and its current document count. The count is
# read from the store's underlying collection object (a private attribute).
logger.info(
    "SME vector store ready — collection='%s', docs=%s",
    COLLECTION_NAME,
    vector_store._collection.count(),
)
|