metatune-20b

Sleeping

App Files Files Community

legolasyiu committed on Oct 29

Commit

4c1a418

verified ·

1 Parent(s): f35ca45

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -183

app.py CHANGED Viewed

@@ -1,189 +1,21 @@
-# save as app.py
-"""
-Gradio streaming chat where:
- - user messages are visible in the UI,
- - system messages are hidden (kept for context),
- - assistant output is streamed and updates in-place.
- - full back-and-forth memory between turns.
-Requirements:
-    pip install torch transformers gradio
-"""
-import threading
 import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
-MODEL_ID = "EpistemeAI/metatune-gpt20b-R0"
-print("Loading tokenizer and model (this may take a while)...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-# Use auto dtype & device mapping
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype="auto",
-    device_map="auto",
-)
-model.eval()
-print("Model loaded. Example param device:", next(model.parameters()).device)
-# Thread-safe global history
-GLOBAL_HISTORY = []  # list of {"role": "system"|"user"|"assistant", "content": "..."}
-HISTORY_LOCK = threading.Lock()
-def build_prompt(system_message: str, history: list, user_message: str) -> str:
-    """
-    Build prompt in the model's expected format. Adjust as needed.
-    """
-    pieces = []
-    if system_message:
-        pieces.append(f"<|system|>\n{system_message}\n")
-    for turn in history:
-        role = turn.get("role", "user")
-        content = turn.get("content", "")
-        pieces.append(f"<|{role}|>\n{content}\n")
-    pieces.append(f"<|user|>\n{user_message}\n<|assistant|>\n")
-    return "\n".join(pieces)
-def generate_stream(prompt: str, max_tokens: int, temperature: float, top_p: float):
-    """
-    Stream partial strings via TextIteratorStreamer.
-    """
-    inputs = tokenizer(prompt, return_tensors="pt")
-    try:
-        input_ids = inputs["input_ids"].to(next(model.parameters()).device)
-    except Exception:
-        input_ids = inputs["input_ids"]
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    gen_kwargs = dict(
-        input_ids=input_ids,
-        max_new_tokens=int(max_tokens),
-        do_sample=True,
-        temperature=float(temperature),
-        top_p=float(top_p),
-        streamer=streamer,
-    )
-    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
-    thread.start()
-    partial = ""
-    for token_str in streamer:
-        partial += token_str
-        yield partial
-def visible_messages_from_history(real_history: list, streaming_partial: str | None):
-    """
-    Convert internal history into Gradio-visible messages.
-    - Show user messages.
-    - Show assistant messages (partial or final).
-    - Hide system messages.
-    """
-    msgs = []
-    for entry in real_history:
-        role = entry.get("role")
-        content = entry.get("content", "")
-        if role == "system":
-            continue
-        msgs.append({"role": role, "content": content or ("thinking..." if role == "assistant" else "")})
-    if streaming_partial is not None:
-        if msgs and msgs[-1]["role"] == "assistant":
-            msgs[-1]["content"] = streaming_partial
-        else:
-            msgs.append({"role": "assistant", "content": streaming_partial})
-    return msgs
-def respond_stream(user_message, system_message, max_tokens, temperature, top_p, history_state):
-    """
-    Gradio streaming handler with persistent memory.
-    """
-    if history_state is None:
-        history_state = []
-    # Sync local and global histories (optional global memory)
-    with HISTORY_LOCK:
-        GLOBAL_HISTORY[:] = history_state
-    # Add the new user message and placeholder assistant
-    with HISTORY_LOCK:
-        if system_message:
-            GLOBAL_HISTORY.append({"role": "system", "content": system_message})
-        GLOBAL_HISTORY.append({"role": "user", "content": user_message})
-        GLOBAL_HISTORY.append({"role": "assistant", "content": ""})
-        snapshot = list(GLOBAL_HISTORY)
-    # Show initial "thinking..." state
-    initial_display = visible_messages_from_history(snapshot, streaming_partial="thinking...")
-    yield initial_display, snapshot
-    # Build prompt excluding assistant placeholder
-    with HISTORY_LOCK:
-        prompt_history = [h for h in GLOBAL_HISTORY[:-1]]
-    prompt = build_prompt(system_message or "", prompt_history, user_message or "")
-    # Stream generation and update assistant output
-    for partial in generate_stream(prompt, max_tokens, temperature, top_p):
-        with HISTORY_LOCK:
-            if GLOBAL_HISTORY and GLOBAL_HISTORY[-1]["role"] == "assistant":
-                GLOBAL_HISTORY[-1]["content"] = partial
-            snapshot = list(GLOBAL_HISTORY)
-        display = visible_messages_from_history(snapshot, streaming_partial=partial)
-        yield display, snapshot
-    # Final display
-    with HISTORY_LOCK:
-        final_snapshot = list(GLOBAL_HISTORY)
-    final_display = visible_messages_from_history(final_snapshot, streaming_partial=final_snapshot[-1].get("content", ""))
-    yield final_display, final_snapshot
-def reset_all():
-    with HISTORY_LOCK:
-        GLOBAL_HISTORY.clear()
-    return [], []
-# --- Gradio UI ---
-with gr.Blocks() as demo:
-    gr.Markdown(f"**Model:** {MODEL_ID} — (system messages hidden; user visible)")
-    chatbot = gr.Chatbot(elem_id="chatbot", label="Chat", type="messages", height=560)
-    history_state = gr.State([])
-    with gr.Row():
-        with gr.Column(scale=4):
-            user_input = gr.Textbox(placeholder="Type a message and press Send", label="Your message")
-        with gr.Column(scale=2):
-            system_input = gr.Textbox(value="You are a Vibe Coder assistant.", label="System message (hidden)")
-            max_tokens = gr.Slider(minimum=1, maximum=4000, value=800, step=1, label="Max new tokens")
-            temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.01, label="Temperature")
-            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p (nucleus sampling)")
-            send_btn = gr.Button("Send")
-    send_btn.click(
-        fn=respond_stream,
-        inputs=[user_input, system_input, max_tokens, temperature, top_p, history_state],
-        outputs=[chatbot, history_state],
-        queue=True,
-    )
-    clear_btn = gr.Button("Reset conversation")
-    clear_btn.click(fn=reset_all, inputs=None, outputs=[chatbot, history_state])
-    gr.Markdown(
-        "Notes: model loading uses `device_map='auto'` and `torch_dtype='auto'`. "
-        "If running multi-worker (gunicorn) you will need an external history store (Redis/DB)."
-    )
-if __name__ == "__main__":
-    demo.launch()

+from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
+checkpoint = "EpistemeAI/metatune-20b"
+device = "cpu"  # "cuda" or "cpu"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
+def predict(message, history):
+    """Generate the assistant's reply to `message` given the prior chat `history`.
+
+    Args:
+        message: The new user message text.
+        history: Prior turns as a list of {"role": ..., "content": ...} dicts
+            (the format requested by `type="messages"` on the ChatInterface).
+
+    Returns:
+        The decoded text of the newly generated tokens only.
+    """
+    # Build a fresh list instead of appending to `history`, so the caller's
+    # list is never mutated; keep only the keys the chat template needs.
+    messages = [{"role": turn["role"], "content": turn["content"]} for turn in history or []]
+    messages.append({"role": "user", "content": message})
+    # add_generation_prompt=True ends the prompt with the assistant header so
+    # the model answers as the assistant instead of continuing the user turn.
+    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # The template already inserted its special tokens; do not add them twice.
+    inputs = tokenizer.encode(input_text, return_tensors="pt", add_special_tokens=False).to(device)
+    outputs = model.generate(inputs, max_new_tokens=64000, temperature=0.9, top_p=0.9, do_sample=True)
+    # Slice off the prompt tokens and decode only the completion; this does not
+    # depend on any particular chat-markup strings appearing in the output.
+    new_tokens = outputs[0][inputs.shape[-1]:]
+    return tokenizer.decode(new_tokens, skip_special_tokens=True)
+demo = gr.ChatInterface(predict, type="messages")
+demo.launch()