Spaces:

bhardwaj08sarthak
/

STEM-Question-Generator

Sleeping

App Files Files Community

bhardwaj08sarthak committed on Sep 25

Commit

f296c60

verified ·

1 Parent(s): c1b36ab

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -339

app.py CHANGED Viewed

@@ -1,282 +1,109 @@
-import os
 import json
 import gradio as gr
-import spaces
 from huggingface_hub import InferenceClient
 from smolagents import CodeAgent, InferenceClientModel, tool
 from huggingface_hub import login
-from smolagents import TransformersModel
-import os
-token = os.getenv("HF_Token")
-login(token=token)
-from level_classifier_tool import (
     classify_levels_phrases,
     HFEmbeddingBackend,
     build_phrase_index
 )
-# ------------------------ Taxonomy phrases ------------------------
-BLOOMS_PHRASES = {
-    "Remember": [
-        "define", "list", "recall", "identify", "state", "label", "name", "recognize", "find",
-        "select", "match", "choose", "give", "write", "tell", "show"
-    ],
-    "Understand": [
-        "classify", "interpret", "summarize", "explain", "estimate", "describe", "discuss",
-        "predict", "paraphrase", "restate", "illustrate", "compare", "contrast", "report"
-    ],
-    "Apply": [
-        "apply", "solve", "use", "demonstrate", "calculate", "implement", "perform",
-        "execute", "carry out", "practice", "employ", "sketch"
-    ],
-    "Analyze": [
-        "analyze", "differentiate", "organize", "structure", "break down", "distinguish",
-        "dissect", "examine", "compare", "contrast", "attribute", "investigate"
-    ],
-    "Evaluate": [
-        "evaluate", "judge", "critique", "assess", "defend", "argue", "select", "support",
-        "appraise", "recommend", "conclude", "review"
-    ],
-    "Create": [
-        "create", "design", "compose", "plan", "construct", "produce", "devise", "generate",
-        "develop", "formulate", "invent", "build"
-    ]
-}
-DOK_PHRASES = {
-    "DOK1": [
-        "define", "list", "recall", "compute", "identify", "state", "label", "how many",
-        "name", "recognize", "find", "determine", "select", "match", "choose", "give",
-        "write", "tell", "show", "point out"
-    ],
-    "DOK2": [
-        "classify", "interpret", "estimate", "organise", "summarise", "explain", "solve",
-        "categorize", "group", "compare", "contrast", "distinguish", "make observations",
-        "collect data", "display data", "arrange", "sort", "paraphrase", "restate", "predict",
-        "approximate", "demonstrate", "illustrate", "describe", "analyze data"
-    ],
-    "DOK3": [
-        "justify", "analyze", "generalise", "compare", "construct", "investigate",
-        "support", "defend", "argue", "examine", "differentiate", "criticize", "debate",
-        "test", "experiment", "hypothesize", "draw conclusions", "break down", "dissect",
-        "probe", "explore", "develop", "formulate"
-    ],
-    "DOK4": [
-        "design", "synthesize", "model", "prove", "evaluate system", "critique", "create",
-        "compose", "plan", "invent", "devise", "generate", "build", "construct", "produce",
-        "formulate", "improve", "revise", "assess", "appraise", "judge", "recommend",
-        "predict outcome", "simulate"
-    ]
-}
-# ------------------------ Prebuild embeddings once ------------------------
 _backend = HFEmbeddingBackend(model_name="sentence-transformers/all-MiniLM-L6-v2")
 _BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
 _DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)
-# ------------------------ Tool: classify and score ------------------------
-@tool
-def classify_and_score(
-    question: str,
-    target_bloom: str,
-    target_dok: str,
-    agg: str = "max"
-) -> dict:
-    """Classify a question against Bloom’s and DOK targets and return guidance.
-    Args:
-        question: The question text to evaluate for cognitive demand.
-        target_bloom: Target Bloom’s level or range. Accepts exact (e.g., "Analyze")
-            or plus form (e.g., "Apply+") meaning that level or higher.
-        target_dok: Target DOK level or range. Accepts exact (e.g., "DOK3")
-            or span (e.g., "DOK2-DOK3").
-        agg: Aggregation method over phrase similarities within a level
-            (choices: "mean", "max", "topk_mean").
-    Returns:
-        A dictionary with:
-            ok: True if both Bloom’s and DOK match the targets.
-            measured: Dict with best levels and per-level scores for Bloom’s and DOK.
-            feedback: Brief guidance describing how to adjust the question to hit targets.
-    """
-    res = classify_levels_phrases(
-        question,
-        BLOOMS_PHRASES,
-        DOK_PHRASES,
-        backend=_backend,
-        prebuilt_bloom_index=_BLOOM_INDEX,
-        prebuilt_dok_index=_DOK_INDEX,
-        agg=agg,
-        return_phrase_matches=True
-    )
-    def _parse_target_bloom(t: str):
-        order = ["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"]
-        if t.endswith("+"):
-            base = t[:-1]
-            if base not in order:
-                raise ValueError(f"Invalid Bloom target '{t}'")
-            return set(order[order.index(base):])
-        if t not in order:
-            raise ValueError(f"Invalid Bloom target '{t}'")
-        return {t}
-    def _parse_target_dok(t: str):
-        order = ["DOK1", "DOK2", "DOK3", "DOK4"]
-        if "-" in t:
-            lo, hi = t.split("-")
-            if lo not in order or hi not in order or order.index(lo) > order.index(hi):
-                raise ValueError(f"Invalid DOK range '{t}'")
-            return set(order[order.index(lo):order.index(hi) + 1])
-        if t not in order:
-            raise ValueError(f"Invalid DOK target '{t}'")
-        return {t}
-    try:
-        bloom_target_set = _parse_target_bloom(target_bloom)
-        dok_target_set = _parse_target_dok(target_dok)
-    except Exception as e:
-        return {
-            "ok": False,
-            "measured": {},
-            "feedback": (
-                f"Invalid targets: {e}. Use Bloom in "
-                "{Remember, Understand, Apply, Analyze, Evaluate, Create} "
-                "and DOK in {DOK1..DOK4} or ranges like 'DOK2-DOK3'."
-            ),
-        }
-    bloom_best = res["blooms"]["best_level"]
-    dok_best = res["dok"]["best_level"]
-    bloom_ok = bloom_best in bloom_target_set
-    dok_ok = dok_best in dok_target_set
-    top_bloom_phrases = res["blooms"].get("top_phrases", {})
-    top_dok_phrases = res["dok"].get("top_phrases", {})
-    feedback_parts = []
-    if not bloom_ok:
-        feedback_parts.append(
-            f"Shift Bloom’s from {bloom_best} toward {sorted(list(bloom_target_set))}. "
-            f"Top cues: {top_bloom_phrases.get(bloom_best, [])[:3]}"
-        )
-    if not dok_ok:
-        feedback_parts.append(
-            f"Shift DOK from {dok_best} toward {sorted(list(dok_target_set))}. "
-            f"Top cues: {top_dok_phrases.get(dok_best, [])[:3]}"
-        )
-    return {
-        "ok": bool(bloom_ok and dok_ok),
-        "measured": {
-            "bloom_best": bloom_best,
-            "bloom_scores": res["blooms"]["scores"],
-            "dok_best": dok_best,
-            "dok_scores": res["dok"]["scores"],
-        },
-        "feedback": " ".join(feedback_parts) if feedback_parts else "On target.",
-    }
-# ------------------------ Backend selection + GPU-wrapped local loader ------------------------
-_LOCAL_MODEL_CACHE = {"model": None, "model_id": None}
-@spaces.GPU(duration=30)  # request GPU only when loading/using local model
-def get_local_model_gpu(model_id: str):
-    """
-    Load and cache a local Transformers model for smolagents on GPU.
-    Decorated so Spaces knows this task needs a GPU.
-    """
-    # Import here to keep Hosted mode lightweight.
-    try:
-        from smolagents import TransformersModel  # provided by smolagents
-    except Exception as e:
-        raise RuntimeError(
-            "Local backend requires 'TransformersModel' from smolagents. "
-            "Please ensure your smolagents version provides it."
-        ) from e
-    if (
-        _LOCAL_MODEL_CACHE["model"] is not None
-        and _LOCAL_MODEL_CACHE["model_id"] == model_id
-    ):
-        return _LOCAL_MODEL_CACHE["model"]
-    local_model = TransformersModel(
-        model_id=model_id,
-        device_map="auto"  # lets accelerate pick the best device(s)
     )
-    _LOCAL_MODEL_CACHE["model"] = local_model
-    _LOCAL_MODEL_CACHE["model_id"] = model_id
-    return local_model
-def make_agent(
-    backend_choice: str,          # "Hosted API" | "Local GPU"
-    hf_token: str,
-    model_id: str,
-    timeout: int,
-    temperature: float,
-    max_tokens: int
-):
-    if backend_choice == "Local GPU":
-        # This call is GPU-annotated; Spaces will allocate a GPU for it.
-        model = get_local_model_gpu(model_id)
-    else:
-        client = InferenceClient(
-            model=model_id,
-            timeout=timeout,
-            token=(hf_token or None),
-        )
-        model = InferenceClientModel(client=client)
-    agent = CodeAgent(model=model, tools=[classify_and_score])
-    agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}
     return agent
-# ------------------------ Agent task template -----------------------------
-TASK_TMPL = '''You generate {subject} question candidates for {grade} on "{topic}".
-After you propose a candidate, you MUST immediately call:
-classify_and_score(
-    question=<just the question text>,
-    target_bloom="{target_bloom}",
-    target_dok="{target_dok}",
-    agg="max"
-)
-Use the returned dict:
-- If ok == True: print ONLY compact JSON {{"question": "...", "answer": "...", "reasoning": "..."}} and finish.
-- If ok == False: briefly explain the needed shift, revise the question, and call classify_and_score again.
-Repeat up to {attempts} attempts.
-Keep answers concise.
-Additionally, when you call classify_and_score, pass the exact question text you propose.
-If you output JSON, ensure it is valid JSON (no trailing commas, use double quotes).
-'''
-# ------------------------ Utility: robust JSON extractor ------------------
-def extract_top_level_json(s: str) -> str:
-    start = s.find("{")
-    if start == -1:
-        return ""
-    depth = 0
-    for i in range(start, len(s)):
-        ch = s[i]
-        if ch == "{":
-            depth += 1
-        elif ch == "}":
-            depth -= 1
-            if depth == 0:
-                candidate = s[start:i + 1]
-                try:
-                    json.loads(candidate)  # validate
-                    return candidate
-                except Exception:
-                    return ""
-    return ""
-# ------------------------ Pipeline ---------------------------------------
 def run_pipeline(
-    backend_choice,
     hf_token,
     topic,
     grade,
@@ -285,24 +112,23 @@ def run_pipeline(
     target_dok,
     attempts,
     model_id,
     timeout,
     temperature,
-    max_tokens
 ):
-    try:
-        agent = make_agent(
-            backend_choice=backend_choice,
-            hf_token=(hf_token or "").strip(),
-            model_id=model_id,
-            timeout=int(timeout),
-            temperature=float(temperature),
-            max_tokens=int(max_tokens),
-        )
-    except Exception as e:
-        err = f"ERROR initializing backend '{backend_choice}': {e}"
-        return "", err
-    task = TASK_TMPL.format(
         grade=grade,
         topic=topic,
         subject=subject,
@@ -311,117 +137,80 @@ def run_pipeline(
         attempts=int(attempts)
     )
     try:
-        result_text = agent.run(task, max_steps=int(attempts) * 4)
     except Exception as e:
-        result_text = f"ERROR while running the agent: {e}"
     final_json = ""
-    candidate = extract_top_level_json(result_text or "")
-    if candidate:
-        try:
             final_json = json.dumps(json.loads(candidate), indent=2)
-        except Exception:
-            final_json = ""
     return final_json, result_text
-# ------------------------ Gradio UI --------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
     gr.Markdown(
-        "Use a **CodeAgent** that calls the scoring tool (`classify_and_score`) after each proposal, "
-        "and revises until it hits your Bloom/DOK target."
     )
-    with gr.Accordion("API / Backend Settings", open=True):
-        backend_choice = gr.Radio(
-            choices=["Hosted API", "Local GPU"],
-            value="Hosted API",
-            label="Inference Backend"
-        )
-        with gr.Row():
-            hf_token = gr.Textbox(
-                label="Hugging Face Token (required for private/hosted endpoints)",
-                type="password",
-                visible=True
-            )
-            model_id = gr.Textbox(
-                value="swiss-ai/Apertus-70B-Instruct-2509",
-                label="Model ID (repo id for Hosted, or local repo for GPU)"
-            )
-        timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s, Hosted API only)")
     with gr.Row():
         topic = gr.Textbox(value="Fractions", label="Topic")
         grade = gr.Dropdown(
-            choices=[
-                "Grade 1", "Grade 2", "Grade 3", "Grade 4", "Grade 5", "Grade 6",
-                "Grade 7", "Grade 8", "Grade 9",
-                "Grade 10", "Grade 11", "Grade 12",
-                "Under Graduate", "Post Graduate"
-            ],
             value="Grade 7",
             label="Grade"
         )
-        subject = gr.Textbox(value="Math", label="Subject")
     with gr.Row():
         target_bloom = gr.Dropdown(
-            choices=["Remember", "Understand", "Apply", "Analyze", "Evaluate", "Create"],
             value="Analyze",
             label="Target Bloom’s"
         )
         target_dok = gr.Dropdown(
-            choices=["DOK1", "DOK2", "DOK3", "DOK4", "DOK1-DOK2", "DOK2-DOK3", "DOK3-DOK4"],
             value="DOK2-DOK3",
-            label="Target Depth of Knowledge"
         )
         attempts = gr.Slider(1, 8, value=5, step=1, label="Max Attempts")
-    with gr.Accordion("⚙️ Generation Controls", open=False):
         temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
         max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens")
-    backend_tips = gr.Markdown(
-        "*Hosted API:* uses Hugging Face Inference endpoints. Provide a token if needed.\n\n"
-        "*Local GPU:* loads the model into the Space with `TransformersModel (device_map='auto')`. "
-        "Ensure your Space has a GPU and enough VRAM for the selected model."
-    )
-    run_btn = gr.Button("Run Agent 🚀")
     final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json")
     transcript = gr.Textbox(label="Agent Transcript", lines=18)
-    def _toggle_backend_fields(choice):
-        return (
-            gr.update(visible=(choice == "Hosted API")),  # hf_token
-            gr.update(visible=True),                      # model_id always visible
-            gr.update(visible=(choice == "Hosted API"))   # timeout slider
-        )
-    backend_choice.change(
-        _toggle_backend_fields,
-        inputs=[backend_choice],
-        outputs=[hf_token, model_id, timeout]
-    )
     run_btn.click(
         fn=run_pipeline,
-        inputs=[
-            backend_choice, hf_token, topic, grade, subject,
-            target_bloom, target_dok, attempts, model_id,
-            timeout, temperature, max_tokens
-        ],
         outputs=[final_json, transcript]
     )
-if __name__ == "__main__" or os.getenv("SYSTEM") == "spaces":
-    try:
-        get_local_model_gpu(model_id)  # triggers GPU allocation during startup
-    except Exception as e:
-        # don't crash the app if warmup fails; logs will show details
-        print("Warmup failed:", e)
-    demo.launch()

+# Create a self-contained Gradio app that uses the agent-driven loop (Option A)
+# It expects `level_classifier_tool.py` to be colocated (or installed on PYTHONPATH).
+import sys
+sys.path.append(r"C:\Users\Sarthak\OneDrive - UT Cloud\thesis\HF_Agent\src")  # raw string so the backslashes are not read as escape sequences
 import json
 import gradio as gr
 from huggingface_hub import InferenceClient
 from smolagents import CodeAgent, InferenceClientModel, tool
+from langchain.embeddings import HuggingFaceEmbeddings
+from llama_index.core import VectorStoreIndex, Document
 from huggingface_hub import login
+from smolagents import tool
+from all_datasets import *
+from level_classifier_tool_2 import (
     classify_levels_phrases,
     HFEmbeddingBackend,
     build_phrase_index
 )
+from task_temp import TASK_TMPL, CLASSIFY_TMPL, GEN_TMPL, RAG_TMPL
+from all_tools import classify_and_score, QuestionRetrieverTool
+from phrases import BLOOMS_PHRASES, DOK_PHRASES
+# Prebuild embeddings once
 _backend = HFEmbeddingBackend(model_name="sentence-transformers/all-MiniLM-L6-v2")
 _BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
 _DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)
+D = {
+    "GSM8k": GSM8k['question'],
+    "Olympiad": Olympiad_math['question'],
+    "Olympiad2": Olympiad_math2['question'],
+    "DeepMind Math": clean_math['question'],
+    "MMMLU": MMMLU['question'],
+    "MMMU": MMMU['question'],
+    "ScienceQA": ScienceQA['question'],
+    "PubmedQA": PubmedQA['question']
+}
+all_questions = (
+    list(D["GSM8k"]) +
+    list(D["Olympiad"]) +
+    list(D["MMMLU"]) +
+    list(D["MMMU"]) +
+    list(D["DeepMind Math"]) +
+    list(D["Olympiad2"]) +
+    list(D["ScienceQA"]) +
+    list(D["PubmedQA"])
+)
+emb = HuggingFaceEmbeddings(
+    model_name="google/embeddinggemma-300m",
+    encode_kwargs={"normalize_embeddings": True},
+)
+texts = all_questions
+index = VectorStoreIndex.from_documents([Document(text=t) for t in texts], embed_model=emb)
+# ------------------------ Scoring TOOL -----------------------------------
+emb = HuggingFaceEmbeddings(
+    model_name="google/embeddinggemma-300m",
+    encode_kwargs={"normalize_embeddings": True},
+)
+D = {
+    "GSM8k": GSM8k['question'],
+    "Olympiad": Olympiad_math['question'],
+    "Olympiad2": Olympiad_math2['question'],
+    "DeepMind Math": clean_math['question'],
+    "MMMLU": MMMLU['question'],
+    "MMMU": MMMU['question'],
+    "ScienceQA": ScienceQA['question'],
+    "PubmedQA": PubmedQA['question']
+}
+all_questions = (
+    list(D["GSM8k"]) +
+    list(D["Olympiad"]) +
+    list(D["MMMLU"]) +
+    list(D["MMMU"]) +
+    list(D["DeepMind Math"]) +
+    list(D["Olympiad2"]) +
+    list(D["ScienceQA"]) +
+    list(D["PubmedQA"])
+)
+texts = all_questions
+index = VectorStoreIndex.from_documents([Document(text=t) for t in texts], embed_model=emb)
+# ------------------------ Retriever TOOL -----------------------------------
+# ------------------------ Agent setup with timeout ------------------------
+def make_agent(hf_token: str, model_id: str, provider: str, timeout: int, temperature: float, max_tokens: int):
+    """Build a CodeAgent backed by a Hugging Face InferenceClient.
+
+    Args:
+        hf_token: Access token for the client; an empty value is passed as None.
+        model_id: Model identifier given to both the client and the model wrapper.
+        provider: Provider name forwarded to InferenceClient.
+        timeout: Timeout forwarded to InferenceClient.
+        temperature: Recorded on the agent only; not forwarded to the model here.
+        max_tokens: Recorded on the agent only; not forwarded to the model here.
+
+    Returns:
+        A CodeAgent whose tools are classify_and_score and QuestionRetrieverTool.
+    """
+    client = InferenceClient(
+        model=model_id,
+        provider=provider,
+        timeout=timeout,
+        token=hf_token if hf_token else None,
     )
+    # NOTE(review): temperature/max_tokens are not bound to the model in this
+    # function; they are only recorded on the agent below. Confirm whether
+    # InferenceClientModel should receive them as generation kwargs.
+    model = InferenceClientModel(model_id=model_id,client=client)
+    # NOTE(review): QuestionRetrieverTool is passed exactly as imported from
+    # all_tools; if it is a Tool class rather than an instance it may need to be
+    # instantiated first - confirm against all_tools.
+    agent = CodeAgent(model=model, tools=[classify_and_score, QuestionRetrieverTool])
+    agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}  # attach for reference
     return agent
+# ------------------------ Agent task template -----------------------------
+# ------------------------ Gradio glue ------------------------------------
 def run_pipeline(
     hf_token,
     topic,
     grade,
     target_dok,
     attempts,
     model_id,
+    provider,
     timeout,
     temperature,
+    max_tokens,
+    task_type
 ):
+    # Build agent per run (or cache if you prefer)
+    agent = make_agent(
+        hf_token=hf_token.strip(),
+        model_id=model_id,
+        provider=provider,
+        timeout=int(timeout),
+        temperature=float(temperature),
+        max_tokens=int(max_tokens),
+    )
+    task = task_type.format(
         grade=grade,
         topic=topic,
         subject=subject,
         attempts=int(attempts)
     )
+    # The agent will internally call the tool
     try:
+        result_text = agent.run(task, max_steps=int(attempts)*4)
     except Exception as e:
+        result_text = f"ERROR: {e}"
+    # Try to extract final JSON
     final_json = ""
+    try:
+        # find JSON object in result_text (simple heuristic)
+        start = result_text.find("{")
+        end = result_text.rfind("}")
+        if start != -1 and end != -1 and end > start:
+            candidate = result_text[start:end+1]
             final_json = json.dumps(json.loads(candidate), indent=2)
+    except Exception:
+        final_json = ""
     return final_json, result_text
 with gr.Blocks() as demo:
     gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
     gr.Markdown(
+        "This app uses a **CodeAgent** that *calls the scoring tool* "
+        "(`classify_and_score`) after each proposal, and revises until it hits the target."
     )
+    with gr.Accordion("API Settings", open=False):
+        hf_token = gr.Textbox(label="Hugging Face Token (required)", type="password")
+        model_id = gr.Textbox(value="meta-llama/Llama-4-Scout-17B-16E-Instruct", label="Model ID")
+        provider = gr.Textbox(value="novita", label="Provider")
+        timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s)")
     with gr.Row():
         topic = gr.Textbox(value="Fractions", label="Topic")
         grade = gr.Dropdown(
+            choices=["Grade 1","Grade 2","Grade 3","Grade4","Grade 5","Grade 6","Grade 7","Grade 8","Grade 9",
+                     "Grade 10","Grade 11","Grade 12","Under Graduate","Post Graduate"],
             value="Grade 7",
             label="Grade"
         )
+        subject= gr.Textbox(value="Math", label="Subject")
+        task_type = gr.Dropdown(
+            choices=["TASK_TMPL", "CLASSIFY_TMPL", "GEN_TMPL", "RAG_TMPL"]
+        label= "task type")
     with gr.Row():
         target_bloom = gr.Dropdown(
+            choices=["Remember","Understand","Apply","Analyze","Evaluate","Create","Apply+","Analyze+","Evaluate+"],
             value="Analyze",
             label="Target Bloom’s"
         )
         target_dok = gr.Dropdown(
+            choices=["DOK1","DOK2","DOK3","DOK4","DOK1-DOK2","DOK2-DOK3","DOK3-DOK4"],
             value="DOK2-DOK3",
+            label="Target DOK"
         )
         attempts = gr.Slider(1, 8, value=5, step=1, label="Max Attempts")
+    with gr.Accordion("" Generation Controls", open=False):
         temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
         max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens")
+    run_btn = gr.Button("Run Agent")
     final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json")
     transcript = gr.Textbox(label="Agent Transcript", lines=18)
     run_btn.click(
         fn=run_pipeline,
+        inputs=[hf_token, topic, grade, subject, target_bloom, target_dok, attempts, model_id, provider, timeout, temperature, max_tokens],
         outputs=[final_json, transcript]
     )
+# Start the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    # NOTE(review): share=True is passed to launch(); confirm it is wanted when
+    # the app is hosted rather than run locally.
+    demo.launch(share=True)