Spaces:

bhardwaj08sarthak
/

STEM-Question-Generator

Sleeping

App Files Community

bhardwaj08sarthak committed on Sep 25

Commit

5ad52f6

verified ·

1 Parent(s): ecee2c9

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -17

app.py CHANGED Viewed

@@ -70,49 +70,79 @@ except Exception:
 _BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
 _DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)
 DATASET_REPO = "bhardwaj08sarthak/my-stem-index"   # your dataset repo id
-PERSIST_SUBDIR = "index_store"                      # the folder you uploaded
 def _pick_writable_base() -> Path:
-    # Prefer home, fall back to /tmp
     for base in (Path.home(), Path("/tmp")):
         try:
             base.mkdir(parents=True, exist_ok=True)
             test = base / ".write_test"
-            with open(test, "w") as f:
-                f.write("ok")
             test.unlink(missing_ok=True)
             return base
         except Exception:
             continue
-    # Last resort: current working directory
     return Path.cwd()
 WRITABLE_BASE = _pick_writable_base()
 LOCAL_BASE = WRITABLE_BASE / "my_app_cache" / "index"
 LOCAL_BASE.mkdir(parents=True, exist_ok=True)
-# Recreate the SAME embedding model used to build the index
 try:
     import torch
     _emb_device = "cuda" if torch.cuda.is_available() else "cpu"
 except Exception:
     _emb_device = "cpu"
 emb = HuggingFaceEmbeddings(
     model_name="google/embeddinggemma-300m",
-    model_kwargs={"device": _emb_device},
     encode_kwargs={"normalize_embeddings": True},
 )
-# Load the index from storage
-snapshot_download(
-    repo_id=DATASET_REPO,
-    repo_type="dataset",
-    local_dir=str(LOCAL_BASE),
-    allow_patterns=[f"{PERSIST_SUBDIR}/**"],
-    local_dir_use_symlinks=False,
-)
-persist_dir = str(LOCAL_BASE / PERSIST_SUBDIR)
 storage_context = StorageContext.from_defaults(persist_dir=str(persist_dir))
 index = load_index_from_storage(storage_context, embed_model=emb)

 _BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
 _DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)
 DATASET_REPO = "bhardwaj08sarthak/my-stem-index"   # your dataset repo id
+PERSIST_SUBDIR = "index_store"                      # folder inside the dataset
+# Writable cache base (home or /tmp)
 def _pick_writable_base() -> Path:
     for base in (Path.home(), Path("/tmp")):
         try:
             base.mkdir(parents=True, exist_ok=True)
             test = base / ".write_test"
+            test.write_text("ok")
             test.unlink(missing_ok=True)
             return base
         except Exception:
             continue
     return Path.cwd()
 WRITABLE_BASE = _pick_writable_base()
 LOCAL_BASE = WRITABLE_BASE / "my_app_cache" / "index"
 LOCAL_BASE.mkdir(parents=True, exist_ok=True)
+# Download only the persisted index folder
+snapshot_download(
+    repo_id=DATASET_REPO,
+    repo_type="dataset",
+    local_dir=str(LOCAL_BASE),
+    allow_patterns=[f"{PERSIST_SUBDIR}/**"],
+    local_dir_use_symlinks=False,
+)
+# Resolve the actual persist dir by finding docstore.json
+def _resolve_persist_dir(base: Path, subdir: str) -> Path:
+    # Common candidates
+    candidates = [
+        base / subdir,       # <LOCAL_BASE>/index_store
+        base,                # sometimes files land directly under local base
+    ]
+    for c in candidates:
+        if (c / "docstore.json").exists():
+            return c
+    # Search anywhere under base for docstore.json
+    matches = list(base.rglob("docstore.json"))
+    if matches:
+        return matches[0].parent
+    # Nothing found: print what we actually downloaded
+    tree = "\n".join(str(p.relative_to(base)) for p in base.rglob("*") if p.is_file())
+    raise FileNotFoundError(
+        f"Could not find 'docstore.json' under {base}. "
+        f"Expected '{subdir}/docstore.json'. Downloaded files:\n{tree}"
+    )
+persist_dir = _resolve_persist_dir(Path(LOCAL_BASE), PERSIST_SUBDIR)
+# Sanity-check typical LlamaIndex files (names may vary by version/vector store)
+expected = ["docstore.json", "index_store.json", "vector_store.json"]
+missing = [name for name in expected if not (persist_dir / name).exists()]
+if missing:
+    # Not fatal for every setup, but warn loudly so you know if upload was incomplete
+    print(f"[warn] Missing in {persist_dir}: {missing}. If loading fails, re-upload the full '{PERSIST_SUBDIR}' folder.")
+# Pick a device that exists for embeddings
 try:
     import torch
     _emb_device = "cuda" if torch.cuda.is_available() else "cpu"
 except Exception:
     _emb_device = "cpu"
 emb = HuggingFaceEmbeddings(
     model_name="google/embeddinggemma-300m",
+    model_kwargs={"device": _emb_device, "attn_implementation": "eager"},
     encode_kwargs={"normalize_embeddings": True},
 )
+# Finally load the index
 storage_context = StorageContext.from_defaults(persist_dir=str(persist_dir))
 index = load_index_from_storage(storage_context, embed_model=emb)