from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch
import time

# --- Model / tokenizer load (your checkpoint) ---
checkpoint = "EpistemeAI/metatune-gpt20b-R0"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto").to(device)
model.eval()
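
# NOTE (assumption): a ~20B-parameter checkpoint needs a large GPU to fit in
# fp16/bf16. If memory is tight, alternatives include device_map="auto" (requires
# the `accelerate` package) or a quantized variant of the model, e.g.:
#   model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
# (with device_map="auto", drop the explicit .to(device) call above).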

# --- Helper: convert gradio display history (message dicts) -> model/chat history ---
def display_to_model_history(display_history):
    """
    Normalize the gradio Chatbot history (type="messages": a list of dicts with
    "role" and "content" keys) into the plain list of dicts expected by
    tokenizer.apply_chat_template. Roles are coerced to 'user'/'assistant'.
    """
    model_history = []
    if not display_history:
        return model_history
    for message in display_history:
        role_key = "user" if message["role"] == "user" else "assistant"
        model_history.append({"role": role_key, "content": message["content"]})
    return model_history
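
# Example (hypothetical values): some Gradio versions attach extra keys such as
# "metadata" to each message; the helper keeps only role/content:
#   display_to_model_history([{"role": "user", "content": "Hi", "metadata": None}])
#   -> [{"role": "user", "content": "Hi"}]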

# --- Prediction (generator) that shows "Thinking..." and then the final output ---
def predict(user_message, chat_history):
    """
    Args:
        user_message: string typed by the user
        chat_history: list of message dicts [{"role": ..., "content": ...}, ...]
                      from the gradio Chatbot (type="messages")
    Yields:
        the updated chat_history list (so gradio refreshes the UI). The first yield
        shows "Thinking...", the second yields the final assistant response.
    """
    # Ensure history is a list
    chat_history = chat_history or []

    # 1) Append the user message to the display history
    chat_history.append({"role": "user", "content": user_message})

    # Convert to model history for the tokenizer (before the placeholder is added)
    model_history = display_to_model_history(chat_history)

    # 2) Append a "Thinking..." placeholder in the UI and yield (so the user sees it)
    chat_history.append({"role": "assistant", "content": "Thinking..."})
    yield chat_history

    # 3) Build the prompt for the model using the chat template; add_generation_prompt
    #    appends the assistant header so the model continues as the assistant
    input_text = tokenizer.apply_chat_template(model_history, tokenize=False, add_generation_prompt=True)
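    # For reference (assumption: this checkpoint uses a ChatML-style template, which
    # the response-splitting logic below relies on), input_text looks roughly like:
    #   <|im_start|>user
    #   Hello!<|im_end|>
    #   <|im_start|>assistant
    # Print tokenizer.chat_template or input_text once to confirm the exact markers.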

    # 4) Tokenize and run generation
    inputs = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(device)

    # Generate (tune args as you prefer)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=512,
            temperature=0.9,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # 5) Extract the assistant response (match your original splitting logic)
    # Keep the same delimiters the chat template uses (adjust if different)
    try:
        response = decoded.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0].strip()
    except Exception:
        # Fallback: use the full decoded text
        response = decoded.strip()

    # 6) Replace the "Thinking..." placeholder with the final response
    # The placeholder is the last element, so update it in place
    if chat_history and chat_history[-1]["role"] == "assistant":
        chat_history[-1] = {"role": "assistant", "content": response}
    else:
        chat_history.append({"role": "assistant", "content": response})

    # 7) Final yield with the assistant output
    yield chat_history
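
# Quick smoke test outside the UI (hypothetical session; run manually once the
# model is loaded):
#   for history in predict("Hello!", []):
#       print(history[-1]["content"])   # prints "Thinking...", then the reply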

# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("## Episteme Chat — shows 'Thinking...' then final assistant output")
    # type="messages" makes the Chatbot use role/content dicts, matching predict()
    chatbot = gr.Chatbot(height=600, type="messages")
    txt = gr.Textbox(show_label=False, placeholder="Type your message and hit Enter")
    clear = gr.Button("Clear")

    # Bind the generator to textbox submit
    txt.submit(predict, inputs=[txt, chatbot], outputs=chatbot)
    clear.click(lambda: [], None, chatbot, queue=False)  # clears the chat history
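
# Note: generator callbacks stream through Gradio's queue, which is enabled by
# default in recent Gradio releases; on older versions call demo.queue() before launch().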
demo.launch()