# --- MUST be first: disable Hugging Face Spaces ZeroGPU monkey-patch ---
import os
os.environ["SPACES_ZERO_DISABLED"] = "1"

# (optional but helpful) steer PyTorch to math attention kernels (no Flash/MemEfficient).
# torch.backends.cuda.sdp_kernel() is a context manager: calling it without a
# `with` block never enters it, so no backend flag changes. The enable_*_sdp
# setters below flip the process-wide flags instead.
try:
    import torch
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
except Exception as exc:  # best-effort: start-up must not depend on this tweak
    import warnings
    warnings.warn(f"Could not force math SDP attention kernels: {exc}")

# If you truly need Spaces, import it AFTER disabling the patch.
import spaces

import sys
from huggingface_hub import hf_hub_download
import pickle
from huggingface_hub import login
# Authenticate with the Hub only when a token is configured. login(None) falls
# back to an interactive prompt, which cannot be answered in a headless run.
_HF_ENV_TOKEN = os.getenv("HF_Token")
if _HF_ENV_TOKEN:
    login(_HF_ENV_TOKEN)
import json
import gradio as gr
from huggingface_hub import InferenceClient
from smolagents import CodeAgent, InferenceClientModel, tool
from langchain_community.embeddings import HuggingFaceEmbeddings
# from llama_index.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import StorageContext, load_index_from_storage
from huggingface_hub import login, snapshot_download
from smolagents import tool
# from all_datasets import *
from level_classifier_tool_2 import (
    classify_levels_phrases,
    HFEmbeddingBackend,
    build_phrase_index
)
from task_temp import rag_temp, rag_cls_temp, cls_temp, gen_temp
from all_tools import classify_and_score, QuestionRetrieverTool
from phrases import BLOOMS_PHRASES, DOK_PHRASES

# ------------------------ Prebuild embeddings once ------------------------
# Runs at import time: one embedding backend is created and used to build a
# phrase index for each taxonomy (Bloom's and DOK).
_backend = HFEmbeddingBackend(model_name="google/embeddinggemma-300m")
# Belt-and-suspenders: ensure eager attention even if class wasn't patched
# NOTE(review): this only sets a config attribute after the backend has been
# constructed; whether an already-loaded model honours it is not visible
# here - confirm. Failures (e.g. no MODEL attribute) are ignored on purpose.
try:
    _backend.MODEL.config.attn_implementation = "eager"
except Exception:
    pass

# NOTE(review): neither index is referenced again in the visible part of this
# file - presumably consumed elsewhere or left over; verify.
_BLOOM_INDEX = build_phrase_index(_backend, BLOOMS_PHRASES)
_DOK_INDEX = build_phrase_index(_backend, DOK_PHRASES)

# Location of the persisted vector index on the Hub and where it is mirrored
# locally. All of the steps below run at import time.
DATASET_REPO = "bhardwaj08sarthak/my-stem-index"   # your dataset repo id
PERSIST_SUBDIR = "index_store"                      # the folder you uploaded
LOCAL_BASE = "/data/index"                          # where to place files in the Space

# Download the persisted index folder into local storage
# NOTE(review): the original comment called this "ephemeral" storage; whether
# /data is ephemeral or persistent depends on the Space configuration - confirm.
os.makedirs(LOCAL_BASE, exist_ok=True)
snapshot_download(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    local_dir=LOCAL_BASE,
    allow_patterns=[f"{PERSIST_SUBDIR}/**"],  # only grab the index folder
    local_dir_use_symlinks=False,             # real files (safer in Spaces)
)

# The allow_patterns filter keeps the sub-folder layout, so the index files end
# up under LOCAL_BASE/PERSIST_SUBDIR.
persist_dir = os.path.join(LOCAL_BASE, PERSIST_SUBDIR)

# Recreate the SAME embedding model used to build the index
# NOTE(review): the device is hard-coded to "cuda", so this presumably fails on
# CPU-only hardware - confirm the target runtime.
# NOTE(review): "attn_implementation" is passed alongside "device" in
# model_kwargs; verify the embedding wrapper forwards it to the underlying
# transformer rather than rejecting it as an unknown argument.
emb = HuggingFaceEmbeddings(
    model_name="google/embeddinggemma-300m",
    model_kwargs={"device": "cuda", "attn_implementation": "eager"},
    encode_kwargs={"normalize_embeddings": True},
)

# Load the index from storage
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
index = load_index_from_storage(storage_context, embed_model=emb)

# Datasets & GPU build code remains commented out...
# @spaces.GPU(15)
# def build_indexes_on_gpu(model="google/embeddinggemma-300m"):
#     device = 'cuda'
#     emb = HuggingFaceEmbeddings(
#         model_name="model",
#         model_kwargs={"device": device, "attn_implementation": "eager"},
#         encode_kwargs={"normalize_embeddings": True})
#     idx = VectorStoreIndex.from_documents([Document(text=t) for t in texts], embed_model=emb)
#     return idx

# ------------------------ Agent setup with timeout ------------------------
def make_agent(hf_token: str, model_id: str, provider: str, timeout: int, temperature: float, max_tokens: int):
    """Build a ``CodeAgent`` whose model talks to a Hugging Face inference provider.

    Args:
        hf_token: Access token for the inference client; an empty value is
            normalised to ``None``.
        model_id: Hub id of the chat model to call.
        provider: Name of the inference provider to route requests through.
        timeout: Request timeout handed to the inference client, in seconds.
        temperature: Sampling temperature forwarded with every completion call.
        max_tokens: Completion length limit forwarded with every completion call.

    Returns:
        A ``CodeAgent`` equipped with ``classify_and_score`` and
        ``QuestionRetrieverTool``. The generation settings are also attached
        as ``agent._ui_params`` for reference.
    """
    client = InferenceClient(
        model=model_id,
        provider=provider,
        timeout=timeout,
        token=hf_token or None,
    )

    # Extra keyword arguments given to InferenceClientModel are forwarded to
    # each completion request; previously the two generation settings were
    # only stored on the agent and never reached the model.
    model = InferenceClientModel(
        model_id=model_id,
        client=client,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    # NOTE(review): QuestionRetrieverTool is passed as imported; if it is a
    # Tool class rather than an instance it needs instantiating - confirm
    # against all_tools.
    agent = CodeAgent(model=model, tools=[classify_and_score, QuestionRetrieverTool])
    agent._ui_params = {"temperature": temperature, "max_tokens": max_tokens}  # attach for reference
    return agent

# ------------------------ Gradio glue ------------------------------------
def _resolve_task_template(task_type):
    """Return the prompt template selected by the UI's "task type" value.

    Known dropdown keys are translated to the templates imported from
    ``task_temp``; any other value is returned unchanged, so a literal format
    string passed by a caller keeps working.
    """
    # NOTE(review): the key -> template pairing is inferred from the names
    # (TASK_TMPL is matched to rag_cls_temp by elimination) - confirm.
    # Built inside the function so the templates are only looked up per run.
    known_templates = {
        "TASK_TMPL": rag_cls_temp,
        "CLASSIFY_TMPL": cls_temp,
        "GEN_TMPL": gen_temp,
        "RAG_TMPL": rag_temp,
    }
    return known_templates.get(task_type, task_type)


def _extract_json(text):
    """Pretty-print the span from the first "{" to the last "}" in *text*, or return ""."""
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end <= start:
        return ""
    try:
        return json.dumps(json.loads(text[start:end + 1]), indent=2)
    except ValueError:  # includes json.JSONDecodeError
        return ""


def run_pipeline(
    hf_token,
    topic,
    grade,
    subject,
    target_bloom,
    target_dok,
    attempts,
    model_id,
    provider,
    timeout,
    temperature,
    max_tokens,
    task_type
):
    """Run one agent session for the Gradio "Run Agent" button.

    The selected task template is filled with the form values, a fresh agent
    is built from the API settings, and the agent is run with a step budget
    of ``attempts * 4``.

    Args:
        hf_token: Token typed into the UI (may be empty or ``None``).
        topic, grade, subject, target_bloom, target_dok: Values substituted
            into the task template.
        attempts: Maximum number of attempts; also scales the step budget.
        model_id, provider, timeout, temperature, max_tokens: Passed through
            to ``make_agent``.
        task_type: Dropdown key naming the template (a literal format string
            is also accepted).

    Returns:
        ``(final_json, transcript)``: ``final_json`` is pretty-printed JSON
        found in the agent's answer or ``""``; ``transcript`` is the answer
        as text, or a message starting with ``"ERROR:"`` on failure.
    """
    template = _resolve_task_template(task_type)
    if not template:
        # The dropdown yields None when nothing is selected.
        return "", "ERROR: no task type selected."

    attempts = int(attempts)
    try:
        task = template.format(
            grade=grade,
            topic=topic,
            subject=subject,
            target_bloom=target_bloom,
            target_dok=target_dok,
            attempts=attempts
        )
    except (KeyError, IndexError, ValueError) as e:
        # Raised by str.format for unknown placeholders or unescaped braces.
        return "", f"ERROR: could not build task prompt: {e}"

    # Build agent per run (or cache if you prefer)
    agent = make_agent(
        hf_token=(hf_token or "").strip(),
        model_id=model_id,
        provider=provider,
        timeout=int(timeout),
        temperature=float(temperature),
        max_tokens=int(max_tokens),
    )

    # The agent will internally call the tool
    try:
        result = agent.run(task, max_steps=attempts * 4)
    except Exception as e:
        # UI boundary: report the failure instead of raising, and do not scan
        # the error text for JSON.
        return "", f"ERROR: {e}"

    # The final answer is not guaranteed to be a string; serialise structured
    # answers directly instead of searching their repr for braces.
    if isinstance(result, (dict, list)):
        try:
            final_json = json.dumps(result, indent=2)
        except (TypeError, ValueError):
            final_json = ""
        return final_json, str(result)

    result_text = result if isinstance(result, str) else str(result)
    return _extract_json(result_text), result_text

# Gradio UI: collects API settings, question parameters and generation controls,
# then hands them to run_pipeline when "Run Agent" is clicked.
with gr.Blocks() as demo:
    gr.Markdown("# Agent + Tool: Generate Questions to Target Difficulty")
    gr.Markdown(
        "This app uses a **CodeAgent** that *calls the scoring tool* "
        "(`classify_and_score`) after each proposal, and revises until it hits the target."
    )

    with gr.Accordion("API Settings", open=False):
        hf_token = gr.Textbox(label="Hugging Face Token (required)", type="password")
        model_id = gr.Textbox(value="meta-llama/Llama-4-Scout-17B-16E-Instruct", label="Model ID")
        provider = gr.Textbox(value="novita", label="Provider")
        timeout = gr.Slider(5, 120, value=30, step=1, label="Timeout (s)")

    with gr.Row():
        topic = gr.Textbox(value="Fractions", label="Topic")
        grade = gr.Dropdown(
            # "Grade 4" previously read "Grade4", unlike every other entry.
            choices=["Grade 1","Grade 2","Grade 3","Grade 4","Grade 5","Grade 6","Grade 7","Grade 8","Grade 9",
                     "Grade 10","Grade 11","Grade 12","Under Graduate","Post Graduate"],
            value="Grade 7",
            label="Grade"
        )
        subject = gr.Textbox(value="Math", label="Subject")
        task_type = gr.Dropdown(
            choices=["TASK_TMPL", "CLASSIFY_TMPL", "GEN_TMPL", "RAG_TMPL"],
            # Preselect the first choice so the callback never receives None.
            value="TASK_TMPL",
            label="task type"
        )

    with gr.Row():
        target_bloom = gr.Dropdown(
            choices=["Remember","Understand","Apply","Analyze","Evaluate","Create","Apply+","Analyze+","Evaluate+"],
            value="Analyze",
            label="Target Bloom’s"
        )
        target_dok = gr.Dropdown(
            choices=["DOK1","DOK2","DOK3","DOK4","DOK1-DOK2","DOK2-DOK3","DOK3-DOK4"],
            value="DOK2-DOK3",
            label="Target DOK"
        )
        attempts = gr.Slider(1, 8, value=5, step=1, label="Max Attempts")

    with gr.Accordion("Generation Controls", open=False):
        temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="Temperature")
        max_tokens = gr.Slider(64, 1024, value=300, step=16, label="Max Tokens")

    run_btn = gr.Button("Run Agent")

    final_json = gr.Code(label="Final Candidate (JSON if detected)", language="json")
    transcript = gr.Textbox(label="Agent Transcript", lines=18)

    # The order of `inputs` must match run_pipeline's positional parameters.
    run_btn.click(
        fn=run_pipeline,
        inputs=[hf_token, topic, grade, subject, target_bloom, target_dok, attempts, model_id, provider, timeout, temperature, max_tokens, task_type],
        outputs=[final_json, transcript]
    )

# Start the Gradio server only when this file is executed as a script.
# NOTE(review): share=True requests a public share link; this is presumably
# unnecessary (and ignored) when hosted on Spaces - confirm before relying on it.
if __name__ == "__main__":
    demo.launch(share=True)