Yashashvibhardwaj committed on
Commit eb42a98 · verified · 1 Parent(s): 556cbea

Update build_index.py

Files changed (1)
  1. build_index.py +24 -64
build_index.py CHANGED
@@ -1,80 +1,40 @@
 import os
 import json
-import requests
-import io
 import faiss
 import numpy as np
-from PIL import Image
 from sentence_transformers import SentenceTransformer
-from tqdm import tqdm  # progress bar

-# ---------------------------------------------------
-# Locate products.json in the same folder as this script
-# ---------------------------------------------------
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-PRODUCTS_FILE = os.path.join(BASE_DIR, "products.json")
-INDEX_FILE = os.path.join(BASE_DIR, "products.index")
+# Fix caching permissions for Hugging Face
+os.environ["HF_HOME"] = "./cache"
+os.environ["TRANSFORMERS_CACHE"] = "./cache"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./cache"

-# ---------------------------------------------------
-# Load product metadata
-# ---------------------------------------------------
-if not os.path.exists(PRODUCTS_FILE):
-    raise FileNotFoundError(f"❌ Could not find {PRODUCTS_FILE}")
-
-with open(PRODUCTS_FILE, "r", encoding="utf-8") as f:
+# Load products
+with open("products.json", "r", encoding="utf-8") as f:
     products = json.load(f)

-print(f"📦 Loaded {len(products)} products from {PRODUCTS_FILE}")
+print(f"📦 Loaded {len(products)} products from products.json")

-# ---------------------------------------------------
 # Load CLIP model
-# ---------------------------------------------------
-print("🧠 Loading CLIP model (this may take a few seconds)...")
-model = SentenceTransformer("clip-ViT-B-32")
-
-# ---------------------------------------------------
-# Collect unique image URLs (avoid redundant downloads)
-# ---------------------------------------------------
-unique_urls = list({p["image_url"] for p in products})
-print(f"🔗 Found {len(unique_urls)} unique image URLs")
-
-# ---------------------------------------------------
-# Compute embeddings for unique URLs
-# ---------------------------------------------------
-url_to_emb = {}
+print("🧠 Loading CLIP model...")
+model = SentenceTransformer("sentence-transformers/clip-ViT-B-32", cache_folder="./cache")

-for url in tqdm(unique_urls, desc="Embedding unique images"):
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        img = Image.open(io.BytesIO(response.content)).convert("RGB")
-        emb = model.encode(img, convert_to_numpy=True,
-                           normalize_embeddings=True)
-        url_to_emb[url] = emb
-    except Exception as e:
-        print(f"⚠️ Error processing {url}: {e}")
-        url_to_emb[url] = np.zeros(512, dtype=np.float32)  # fallback embedding
+# Encode product names
+print("🔎 Encoding product features...")
+product_names = [p["name"] for p in products]
+embeddings = model.encode(product_names, convert_to_numpy=True, show_progress_bar=True)

-# ---------------------------------------------------
-# Build embeddings array for all products
-# ---------------------------------------------------
-embeddings = []
-for p in products:
-    embeddings.append(url_to_emb[p["image_url"]])
-
-embeddings = np.array(embeddings).astype("float32")
+# Build FAISS index
+dimension = embeddings.shape[1]
+index = faiss.IndexFlatL2(dimension)
+index.add(embeddings)

-print(f"✅ Built embeddings array: {embeddings.shape}")
+# Save index
+faiss.write_index(index, "products.index")
+print("✅ Saved FAISS index to products.index")

-# ---------------------------------------------------
-# Create FAISS index (cosine similarity via inner product)
-# ---------------------------------------------------
-dim = embeddings.shape[1]  # 512 for CLIP
-index = faiss.IndexFlatIP(dim)
-index.add(embeddings)
+# Save mapping
+with open("id_mapping.json", "w", encoding="utf-8") as f:
+    json.dump(products, f, ensure_ascii=False, indent=2)

-# ---------------------------------------------------
-# Save FAISS index
-# ---------------------------------------------------
-faiss.write_index(index, INDEX_FILE)
-print(f"🎉 Saved FAISS index with {index.ntotal} vectors → {INDEX_FILE}")
+print("✅ Saved product ID mapping to id_mapping.json")