import os
import subprocess
import tempfile
import threading

import spaces
import gradio as gr
import torch
import torch.nn.functional as F
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    TextIteratorStreamer,
    AutoModel,
    AutoModelForSequenceClassification,
)
from kernels import get_kernel
from typing import Any, Optional, Dict
import numpy as np

# Login to HF to get access to the model weights
HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')

from huggingface_hub import login

login(token=HF_LE_LLM_READ_TOKEN)

# Constants
DEFAULT_MODEL = "lapa-llm/manipulative-score-model"
DEVICE = "cuda"

MODEL_OPTIONS = [
    "lapa-llm/manipulative-score-model",
    "lapa-llm/gec-score-model",
    "lapa-llm/fineweb-mixtral-edu-score",
    "lapa-llm/fineweb-nemotron-edu-score",
    "lapa-llm/alignment-score-model",
    "lapa-llm/fasttext-quality-score",
]

# --- Cache to avoid repeated reloads ---
_model_cache: Dict[str, tuple[torch.nn.Module, AutoTokenizer]] = {}


def load_model(model_id: str):
    """Load a sequence-classification scorer and its tokenizer, caching both."""
    if model_id in _model_cache:
        return _model_cache[model_id]

    print(f"🔹 Loading model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)
    print(f"Detected model: {model_id}")
    model.to(DEVICE).eval()

    _model_cache[model_id] = (model, tokenizer)
    print(f"✅ Loaded model on {DEVICE}")
    return model, tokenizer


def compute_score(text: str, model: torch.nn.Module, tokenizer: AutoTokenizer) -> dict:
    """Score a single text with a regression-style classification head."""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits.squeeze(-1).float().cpu().numpy()

    res = {}
    res["score"] = logits.tolist()[0]
    res["int_score"] = [int(round(max(0, min(score, 5)))) for score in logits]
    return res


# --- Main scoring logic ---
@spaces.GPU
def bot(user_message: str, history: list[dict[str, Any]]):
    if not user_message.strip():
        return "", history

    res = ""
    history = history + [{"role": "user", "content": user_message}]

    scores = {}
    for model_choice in MODEL_OPTIONS:
        model, tokenizer = load_model(model_choice)  # cached sequence-classification scorer + tokenizer
        score = compute_score(user_message, model, tokenizer)["score"]
        scores[model_choice] = score
        res += f"{model_choice}: {score}\n"

    # Combined score: median of the three quality/edu scores, scaled by the
    # alignment, manipulativeness, and GEC scores.
    formula_score = np.median([
        scores["lapa-llm/fineweb-nemotron-edu-score"],
        scores["lapa-llm/fineweb-mixtral-edu-score"],
        scores["lapa-llm/fasttext-quality-score"],
    ]) * scores["lapa-llm/alignment-score-model"] * scores["lapa-llm/manipulative-score-model"] * scores["lapa-llm/gec-score-model"]
    res += f"Formula (combined) score: {formula_score}\n"

    history.append({"role": "assistant", "content": res.strip()})
    return "", history


# --- UI ---
THEME = gr.themes.Soft(primary_hue="blue", secondary_hue="amber", neutral_hue="stone")


def _clear_chat():
    return "", []


with gr.Blocks(theme=THEME, fill_height=True) as demo:
    gr.Markdown("### 🤔 LAPA Quality Estimation")
    chatbot = gr.Chatbot(type="messages", height=480)
    msg = gr.Textbox(label=None, placeholder="Type your text…", lines=1)
    clear_btn = gr.Button("Clear")

    msg.submit(bot, inputs=[msg, chatbot], outputs=[msg, chatbot])
    clear_btn.click(_clear_chat, outputs=[msg, chatbot])


if __name__ == "__main__":
    demo.queue().launch()
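
# A minimal usage sketch (an assumption, not part of the app): the scorers can
# also be called directly outside the Gradio UI, provided the model repos are
# accessible with the configured read token and a CUDA device is available.
# For example:
#
#     model, tokenizer = load_model("lapa-llm/gec-score-model")
#     print(compute_score("Example text to score.", model, tokenizer)["score"])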