import os
from contextlib import nullcontext

# Keep all caches on /tmp so the Space's writable disk is used.
os.environ.setdefault("HF_HOME", "/tmp/hf")
os.environ.setdefault("HF_HUB_CACHE", "/tmp/hf/hub")
os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf/transformers")
os.environ.setdefault("NANOCHAT_BASE_DIR", "/tmp/nanochat")

from huggingface_hub import hf_hub_download
import torch
import gradio as gr
from nanochat.checkpoint_manager import load_model_from_dir
from nanochat.engine import Engine

# Hardcoded model selection for this Space
MODEL_REPO = "loocorez/nanochat-base-d20-step21400"
STEP = "021400"
DEPTH = "20"

# Tokenizer: hf_hub_download preserves the repo-relative path under local_dir,
# so downloading into NANOCHAT_BASE_DIR lands the file at
# /tmp/nanochat/tokenizer/tokenizer.pkl, where nanochat expects it.
nanochat_dir = os.environ["NANOCHAT_BASE_DIR"]
hf_hub_download(MODEL_REPO, "tokenizer/tokenizer.pkl",
                local_dir=nanochat_dir, local_dir_use_symlinks=False)

# Base checkpoint: download weights and metadata, then point
# load_model_from_dir at the directory that actually contains them.
download_root = "/tmp/ckpt"
os.makedirs(download_root, exist_ok=True)
model_path = hf_hub_download(MODEL_REPO, f"base_checkpoints/d{DEPTH}/model_{STEP}.pt",
                             local_dir=download_root, local_dir_use_symlinks=False)
hf_hub_download(MODEL_REPO, f"base_checkpoints/d{DEPTH}/meta_{STEP}.json",
                local_dir=download_root, local_dir_use_symlinks=False)
ckpt_dir = os.path.dirname(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, tokenizer, _ = load_model_from_dir(ckpt_dir, device, phase="eval")
engine = Engine(model, tokenizer)


def chat_fn(history, temperature=0.8, top_k=50, max_new_tokens=256):
    """Render the chat history with nanochat's special tokens and sample a reply."""
    bos = tokenizer.get_bos_token_id()
    user_start = tokenizer.encode_special("<|user_start|>")
    user_end = tokenizer.encode_special("<|user_end|>")
    assistant_start = tokenizer.encode_special("<|assistant_start|>")
    assistant_end = tokenizer.encode_special("<|assistant_end|>")

    # history is a list of {"role": ..., "content": ...} dicts (Gradio "messages" format).
    tokens = [bos]
    for message in history:
        content_tokens = tokenizer.encode(message["content"])
        if message["role"] == "user":
            tokens += [user_start] + content_tokens + [user_end]
        else:
            tokens += [assistant_start] + content_tokens + [assistant_end]
    tokens += [assistant_start]

    # bfloat16 autocast on GPU; plain float32 (no autocast) on CPU.
    amp_ctx = (
        torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
        if device.type == "cuda"
        else nullcontext()
    )

    # engine.generate streams one sampled token column per step; collect the
    # single sample and stop early if the model emits <|assistant_end|>.
    generated = []
    with amp_ctx:
        for token_column, _ in engine.generate(tokens, num_samples=1, max_tokens=max_new_tokens,
                                               temperature=temperature, top_k=top_k):
            token = token_column[0]
            if token == assistant_end:
                break
            generated.append(token)
    return tokenizer.decode(generated)


with gr.Blocks() as demo:
    gr.Markdown("# NanoChat BASE")
    chat = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Message")
    temp = gr.Slider(0.0, 1.5, value=0.8, step=0.05, label="Temperature")
    topk = gr.Slider(1, 200, value=50, step=1, label="Top-k")
    max_toks = gr.Slider(16, 1024, value=256, step=16, label="Max new tokens")

    def respond(user_message, chat_history, temperature, top_k, max_new_tokens):
        chat_history = (chat_history or []) + [{"role": "user", "content": user_message}]
        reply = chat_fn(chat_history, temperature, top_k, max_new_tokens)
        chat_history = chat_history + [{"role": "assistant", "content": reply}]
        return "", chat_history

    msg.submit(respond, [msg, chat, temp, topk, max_toks], [msg, chat])

demo.launch()
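
# ---------------------------------------------------------------------------
# Notes (assumptions, not executed by the Space):
# - The Space is assumed to install its dependencies via a requirements.txt
#   along these lines (nanochat may need a git install or a vendored copy if
#   it is not available on PyPI):
#       torch
#       gradio
#       huggingface_hub
#       git+https://github.com/karpathy/nanochat.git
# - Hypothetical local sanity check of chat_fn (run it before demo.launch(),
#   which blocks), illustrating the "messages"-style history it expects:
#       history = [{"role": "user", "content": "Hello!"}]
#       print(chat_fn(history, temperature=0.8, top_k=50, max_new_tokens=64))
# ---------------------------------------------------------------------------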