Spaces:

bhardwaj08sarthak
/

STEM-Question-Generator

Sleeping

App Files Files Community

bhardwaj08sarthak committed on Sep 25

Commit

3d115e7

verified ·

1 Parent(s): e2ade71

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -52

app.py CHANGED Viewed

@@ -1,8 +1,18 @@
-# Create a self-contained Gradio app that uses the agent-driven loop (Option A)
-# It expects `level_classifier_tool.py` to be colocated (or installed on PYTHONPATH).
 import spaces
 import sys
-import os
 from huggingface_hub import hf_hub_download
 import pickle
 from huggingface_hub import login
@@ -12,11 +22,11 @@ import gradio as gr
 from huggingface_hub import InferenceClient
 from smolagents import CodeAgent, InferenceClientModel, tool
 from langchain_community.embeddings import HuggingFaceEmbeddings
-#from llama_index.embeddings.huggingface import HuggingFaceEmbeddings
 from llama_index.core import StorageContext, load_index_from_storage
 from huggingface_hub import login, snapshot_download
 from smolagents import tool
-#from all_datasets import *
 from level_classifier_tool_2 import (
     classify_levels_phrases,
     HFEmbeddingBackend,
@@ -25,15 +35,21 @@ from level_classifier_tool_2 import (
 from task_temp import rag_temp, rag_cls_temp, cls_temp, gen_temp
 from all_tools import classify_and_score, QuestionRetrieverTool
 from phrases import BLOOMS_PHRASES, DOK_PHRASES
-# Prebuild embeddings once
 _backend = HFEmbeddingBackend(model_name="google/embeddinggemma-300m")
 _BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
 _DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)
 DATASET_REPO = "bhardwaj08sarthak/my-stem-index"   # your dataset repo id
-PERSIST_SUBDIR = "index_store"             # the folder you uploaded
-LOCAL_BASE = "/data/index"                 # where to place files in the Space
 # Download the persisted index folder into ephemeral storage
 os.makedirs(LOCAL_BASE, exist_ok=True)
@@ -50,46 +66,25 @@ persist_dir = os.path.join(LOCAL_BASE, PERSIST_SUBDIR)
 # Recreate the SAME embedding model used to build the index
 emb = HuggingFaceEmbeddings(
     model_name="google/embeddinggemma-300m",
-    model_kwargs={"device": "cuda"},
     encode_kwargs={"normalize_embeddings": True},
 )
 # Load the index from storage
 storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
 index = load_index_from_storage(storage_context, embed_model=emb)
-#D = {
-#    "GSM8k": GSM8k['question'],
-#    "Olympiad": Olympiad_math['question'],
-#    "Olympiad2": Olympiad_math2['question'],
-#    "DeepMind Math": clean_math['question'],
-#    "MMMLU": MMMLU['question'],
-#    "MMMU": MMMU['question'],
-#    "ScienceQA": ScienceQA['question'],
-#    "PubmedQA": PubmedQA['question']
-#}
-#all_questions = (
-#    list(D["GSM8k"]) +
-#    list(D["Olympiad"]) +
-#    list(D["MMMLU"]) +
-#    list(D["MMMU"]) +
-#    list(D["DeepMind Math"]) +
-#    list(D["Olympiad2"]) +
-#    list(D["ScienceQA"]) +
-#    list(D["PubmedQA"])
-#)
-#texts = all_questions
-#@spaces.GPU(15)
-#def build_indexes_on_gpu(model="google/embeddinggemma-300m"):
-#    device = 'cuda'
-#    emb = HuggingFaceEmbeddings(
-#        model_name="model",
-#        model_kwargs={"device": device},
-#        encode_kwargs={"normalize_embeddings": True})
-#    idx = VectorStoreIndex.from_documents([Document(text=t) for t in texts], embed_model=emb)
-#    return idx
-#   device = "cuda"
-#index = build_indexes_on_gpu(model="google/embeddinggemma-300m")
 # ------------------------ Agent setup with timeout ------------------------
 def make_agent(hf_token: str, model_id: str, provider: str, timeout: int, temperature: float, max_tokens: int):
     client = InferenceClient(
@@ -101,14 +96,11 @@ def make_agent(hf_token: str, model_id: str, provider: str, timeout: int, temper
     # Bind generation params by partially applying via model kwargs.
     # smolagents InferenceClientModel currently accepts client only; we pass runtime params in task text.
-    model = InferenceClientModel(model_id=model_id,client=client)
     agent = CodeAgent(model=model, tools=[classify_and_score, QuestionRetrieverTool])
     agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}  # attach for reference
     return agent
-# ------------------------ Agent task template -----------------------------
 # ------------------------ Gradio glue ------------------------------------
 def run_pipeline(
     hf_token,
@@ -146,7 +138,7 @@ def run_pipeline(
     # The agent will internally call the tool
     try:
-        result_text = agent.run(task, max_steps=int(attempts)*4)
     except Exception as e:
         result_text = f"ERROR: {e}"
@@ -164,7 +156,6 @@ def run_pipeline(
     return final_json, result_text
 with gr.Blocks() as demo:
     gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
     gr.Markdown(
@@ -186,10 +177,11 @@ with gr.Blocks() as demo:
             value="Grade 7",
             label="Grade"
         )
-        subject= gr.Textbox(value="Math", label="Subject")
         task_type = gr.Dropdown(
             choices=["TASK_TMPL", "CLASSIFY_TMPL", "GEN_TMPL", "RAG_TMPL"],
-        label= "task type")
     with gr.Row():
         target_bloom = gr.Dropdown(
@@ -215,7 +207,7 @@ with gr.Blocks() as demo:
     run_btn.click(
         fn=run_pipeline,
-        inputs=[hf_token, topic, grade, subject, target_bloom, target_dok, attempts, model_id, provider, timeout, temperature, max_tokens,task_type],
         outputs=[final_json, transcript]
     )

+# --- MUST be first: disable Hugging Face Spaces ZeroGPU monkey-patch ---
+import os
+os.environ["SPACES_ZERO_DISABLED"] = "1"
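+# (optional but helpful) steer PyTorch to math attention kernels (no Flash/MemEfficient)
+# NOTE(review): torch.backends.cuda.sdp_kernel is a context manager; called without `with` it likely has no lasting effect — confirm.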
+try:
+    import torch
+    torch.backends.cuda.sdp_kernel(enable_math=True, enable_flash=False, enable_mem_efficient=False)
+except Exception:
+    pass
+# If you truly need Spaces, import it AFTER disabling the patch.
 import spaces
 import sys
 from huggingface_hub import hf_hub_download
 import pickle
 from huggingface_hub import login
 from huggingface_hub import InferenceClient
 from smolagents import CodeAgent, InferenceClientModel, tool
 from langchain_community.embeddings import HuggingFaceEmbeddings
+# from llama_index.embeddings.huggingface import HuggingFaceEmbeddings
 from llama_index.core import StorageContext, load_index_from_storage
 from huggingface_hub import login, snapshot_download
 from smolagents import tool
+# from all_datasets import *
 from level_classifier_tool_2 import (
     classify_levels_phrases,
     HFEmbeddingBackend,
 from task_temp import rag_temp, rag_cls_temp, cls_temp, gen_temp
 from all_tools import classify_and_score, QuestionRetrieverTool
 from phrases import BLOOMS_PHRASES, DOK_PHRASES
+# ------------------------ Prebuild embeddings once ------------------------
 _backend = HFEmbeddingBackend(model_name="google/embeddinggemma-300m")
+# Belt-and-suspenders: ensure eager attention even if class wasn't patched
+try:
+    _backend.MODEL.config.attn_implementation = "eager"
+except Exception:
+    pass
 _BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
 _DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)
 DATASET_REPO = "bhardwaj08sarthak/my-stem-index"   # your dataset repo id
+PERSIST_SUBDIR = "index_store"                      # the folder you uploaded
+LOCAL_BASE = "/data/index"                          # where to place files in the Space
 # Download the persisted index folder into ephemeral storage
 os.makedirs(LOCAL_BASE, exist_ok=True)
 # Recreate the SAME embedding model used to build the index
 emb = HuggingFaceEmbeddings(
     model_name="google/embeddinggemma-300m",
+    model_kwargs={"device": "cuda", "attn_implementation": "eager"},
     encode_kwargs={"normalize_embeddings": True},
 )
 # Load the index from storage
 storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
 index = load_index_from_storage(storage_context, embed_model=emb)
+# Datasets & GPU build code remains commented out...
+# @spaces.GPU(15)
+# def build_indexes_on_gpu(model="google/embeddinggemma-300m"):
+#     device = 'cuda'
+#     emb = HuggingFaceEmbeddings(
+#         model_name="model",
+#         model_kwargs={"device": device, "attn_implementation": "eager"},
+#         encode_kwargs={"normalize_embeddings": True})
+#     idx = VectorStoreIndex.from_documents([Document(text=t) for t in texts], embed_model=emb)
+#     return idx
 # ------------------------ Agent setup with timeout ------------------------
 def make_agent(hf_token: str, model_id: str, provider: str, timeout: int, temperature: float, max_tokens: int):
     client = InferenceClient(
     # Bind generation params by partially applying via model kwargs.
     # smolagents InferenceClientModel currently accepts client only; we pass runtime params in task text.
+    model = InferenceClientModel(model_id=model_id, client=client)
     agent = CodeAgent(model=model, tools=[classify_and_score, QuestionRetrieverTool])
     agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}  # attach for reference
     return agent
 # ------------------------ Gradio glue ------------------------------------
 def run_pipeline(
     hf_token,
     # The agent will internally call the tool
     try:
+        result_text = agent.run(task, max_steps=int(attempts) * 4)
     except Exception as e:
         result_text = f"ERROR: {e}"
     return final_json, result_text
 with gr.Blocks() as demo:
     gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
     gr.Markdown(
             value="Grade 7",
             label="Grade"
         )
+        subject = gr.Textbox(value="Math", label="Subject")
         task_type = gr.Dropdown(
             choices=["TASK_TMPL", "CLASSIFY_TMPL", "GEN_TMPL", "RAG_TMPL"],
+            label="task type"
+        )
     with gr.Row():
         target_bloom = gr.Dropdown(
     run_btn.click(
         fn=run_pipeline,
+        inputs=[hf_token, topic, grade, subject, target_bloom, target_dok, attempts, model_id, provider, timeout, temperature, max_tokens, task_type],
         outputs=[final_json, transcript]
     )