bhardwaj08sarthak committed on
Commit
5ad52f6
·
verified ·
1 Parent(s): ecee2c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -17
app.py CHANGED
@@ -70,49 +70,79 @@ except Exception:
70
# Phrase indexes for Bloom's taxonomy and Depth-of-Knowledge lookups.
_BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
_DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)

# Remote dataset repo holding the persisted index, and the folder inside it.
DATASET_REPO = "bhardwaj08sarthak/my-stem-index"  # your dataset repo id
PERSIST_SUBDIR = "index_store"  # the folder you uploaded
77
  def _pick_writable_base() -> Path:
78
- # Prefer home, fall back to /tmp
79
  for base in (Path.home(), Path("/tmp")):
80
  try:
81
  base.mkdir(parents=True, exist_ok=True)
82
  test = base / ".write_test"
83
- with open(test, "w") as f:
84
- f.write("ok")
85
  test.unlink(missing_ok=True)
86
  return base
87
  except Exception:
88
  continue
89
- # Last resort: current working directory
90
  return Path.cwd()
91
 
92
# Local cache location for the downloaded index; created eagerly so
# later downloads have a guaranteed target directory.
WRITABLE_BASE = _pick_writable_base()
LOCAL_BASE = WRITABLE_BASE / "my_app_cache" / "index"
LOCAL_BASE.mkdir(parents=True, exist_ok=True)
- # Recreate the SAME embedding model used to build the index
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
# Choose an embedding device: CUDA when torch is importable and a GPU
# is present, otherwise CPU (also the fallback if torch is missing).
try:
    import torch
    _emb_device = "cuda" if torch.cuda.is_available() else "cpu"
except Exception:
    _emb_device = "cpu"

# NOTE(review): must match the embedding model the index was built with.
emb = HuggingFaceEmbeddings(
    model_name="google/embeddinggemma-300m",
    model_kwargs={"device": _emb_device},
    encode_kwargs={"normalize_embeddings": True},
)

# Pull only the persisted-index subfolder of the dataset, then load it.
snapshot_download(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    local_dir=str(LOCAL_BASE),
    allow_patterns=[f"{PERSIST_SUBDIR}/**"],
    local_dir_use_symlinks=False,
)
persist_dir = str(LOCAL_BASE / PERSIST_SUBDIR)
storage_context = StorageContext.from_defaults(persist_dir=str(persist_dir))
index = load_index_from_storage(storage_context, embed_model=emb)
118
 
 
70
# Phrase indexes used for Bloom's and DOK classification.
_BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
_DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)

# Source dataset repo and the sub-folder that holds the persisted index.
DATASET_REPO = "bhardwaj08sarthak/my-stem-index"  # your dataset repo id
PERSIST_SUBDIR = "index_store"  # folder inside the dataset
75
 
76
+ # Writable cache base (home or /tmp)
77
  def _pick_writable_base() -> Path:
 
78
  for base in (Path.home(), Path("/tmp")):
79
  try:
80
  base.mkdir(parents=True, exist_ok=True)
81
  test = base / ".write_test"
82
+ test.write_text("ok")
 
83
  test.unlink(missing_ok=True)
84
  return base
85
  except Exception:
86
  continue
 
87
  return Path.cwd()
88
 
89
# Prepare the local cache directory for the index download.
WRITABLE_BASE = _pick_writable_base()
LOCAL_BASE = WRITABLE_BASE / "my_app_cache" / "index"
LOCAL_BASE.mkdir(parents=True, exist_ok=True)

# Fetch only the persisted-index folder from the dataset repo.
snapshot_download(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    local_dir=str(LOCAL_BASE),
    allow_patterns=[f"{PERSIST_SUBDIR}/**"],
    local_dir_use_symlinks=False,
)
101
+
102
+ # Resolve the actual persist dir by finding docstore.json
103
+ def _resolve_persist_dir(base: Path, subdir: str) -> Path:
104
+ # Common candidates
105
+ candidates = [
106
+ base / subdir, # <LOCAL_BASE>/index_store
107
+ base, # sometimes files land directly under local base
108
+ ]
109
+ for c in candidates:
110
+ if (c / "docstore.json").exists():
111
+ return c
112
+ # Search anywhere under base for docstore.json
113
+ matches = list(base.rglob("docstore.json"))
114
+ if matches:
115
+ return matches[0].parent
116
+ # Nothing found: print what we actually downloaded
117
+ tree = "\n".join(str(p.relative_to(base)) for p in base.rglob("*") if p.is_file())
118
+ raise FileNotFoundError(
119
+ f"Could not find 'docstore.json' under {base}. "
120
+ f"Expected '{subdir}/docstore.json'. Downloaded files:\n{tree}"
121
+ )
122
+
123
persist_dir = _resolve_persist_dir(Path(LOCAL_BASE), PERSIST_SUBDIR)

# Warn — but don't fail — if the usual LlamaIndex files are absent; the
# exact file set varies by version and vector-store backend.
expected = ["docstore.json", "index_store.json", "vector_store.json"]
missing = [name for name in expected if not (persist_dir / name).exists()]
if missing:
    print(f"[warn] Missing in {persist_dir}: {missing}. If loading fails, re-upload the full '{PERSIST_SUBDIR}' folder.")
131
+
132
# Pick an embedding device that actually exists on this machine.
try:
    import torch
    if torch.cuda.is_available():
        _emb_device = "cuda"
    else:
        _emb_device = "cpu"
except Exception:
    # torch unavailable or broken — embeddings stay on CPU.
    _emb_device = "cpu"

emb = HuggingFaceEmbeddings(
    model_name="google/embeddinggemma-300m",
    model_kwargs={"device": _emb_device, "attn_implementation": "eager"},
    encode_kwargs={"normalize_embeddings": True},
)
144
 
145
# Finally, load the persisted index from the resolved directory using
# the same embedding model that built it.
storage_context = StorageContext.from_defaults(persist_dir=str(persist_dir))
index = load_index_from_storage(storage_context, embed_model=emb)
148