from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch

# --- Model / tokenizer load (your checkpoint) ---
checkpoint = "EpistemeAI/metatune-gpt20b-R0"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto").to(device)
model.eval()


# --- Helper: convert gradio display history (tuples) -> model/chat history (dicts) ---
def display_to_model_history(display_history):
    """
    Convert gradio Chatbot history (a list of (user_text, bot_text) pairs)
    into the list of {"role", "content"} dicts expected by
    tokenizer.apply_chat_template.
    """
    model_history = []
    if not display_history:
        return model_history
    for user_text, bot_text in display_history:
        if user_text:
            model_history.append({"role": "user", "content": user_text})
        if bot_text:
            model_history.append({"role": "assistant", "content": bot_text})
    return model_history


# --- Prediction (generator) that shows "Thinking..." and then the final output ---
def predict(user_message, chat_history):
    """
    Args:
        user_message: string typed by the user
        chat_history: list of (user_text, bot_text) pairs from the gradio Chatbot

    Yields:
        the updated chat_history (so gradio refreshes the UI). The first yield
        shows a "Thinking..." placeholder; the second yields the final
        assistant response.
    """
    # Ensure history is a list
    chat_history = chat_history or []

    # 1) Build the model history from completed turns, plus the new user message
    model_history = display_to_model_history(chat_history)
    model_history.append({"role": "user", "content": user_message})

    # 2) Show the user message with a "Thinking..." placeholder and yield,
    #    so the user sees it immediately
    chat_history.append((user_message, "Thinking..."))
    yield chat_history

    # 3) Build the prompt for the model using the tokenizer's chat template
    input_text = tokenizer.apply_chat_template(
        model_history, tokenize=False, add_generation_prompt=True
    )

    # 4) Tokenize and run generation
    inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(device)

    # Generate (tune args as you prefer)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=512,
            temperature=0.9,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # 5) Extract the assistant response (ChatML-style delimiters; adjust if your
    #    model's chat template uses different markers)
    try:
        response = decoded.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0].strip()
    except Exception:
        # Fallback: use the whole decoded text
        response = decoded.strip()

    # 6) Replace the "Thinking..." placeholder (last element) with the final response
    chat_history[-1] = (user_message, response)

    # 7) Final yield with the assistant output
    yield chat_history


# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("## Episteme Chat — shows 'Thinking...' then final assistant output")
    chatbot = gr.Chatbot(height=600)
    txt = gr.Textbox(show_label=False, placeholder="Type your message and hit Enter")
    clear = gr.Button("Clear")

    # Bind the generator to textbox submit
    txt.submit(predict, inputs=[txt, chatbot], outputs=chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)  # clears the chat (returns None)

demo.launch()