import os
import subprocess
import tempfile
import threading

import spaces
import gradio as gr
import torch
import torch.nn.functional as F
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    TextIteratorStreamer,
    AutoModel,
    AutoModelForSequenceClassification,
)
from kernels import get_kernel
from typing import Any, Optional, Dict
import numpy as np

# Login to HF to get access to the model weights
HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')

from huggingface_hub import login

login(token=HF_LE_LLM_READ_TOKEN)

# Constants
DEFAULT_MODEL = "lapa-llm/manipulative-score-model"
DEVICE = "cuda"

MODEL_OPTIONS = [
    "lapa-llm/manipulative-score-model",
    "lapa-llm/gec-score-model",
    "lapa-llm/fineweb-mixtral-edu-score",
    "lapa-llm/fineweb-nemotron-edu-score",
    "lapa-llm/alignment-score-model",
    "lapa-llm/fasttext-quality-score",
]

# --- Cache to avoid repeated reloads ---
_model_cache: Dict[str, tuple[torch.nn.Module, AutoTokenizer]] = {}


def load_model(model_id: str):
    """Load a sequence-classification scorer and its tokenizer, caching both."""
    if model_id in _model_cache:
        return _model_cache[model_id]

    print(f"🔹 Loading model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, torch_dtype=torch.bfloat16)
    print(f"Detected model: {model_id}")
    model.to(DEVICE).eval()

    _model_cache[model_id] = (model, tokenizer)
    print(f"✅ Loaded model on {DEVICE}")
    return model, tokenizer


def compute_score(text: str, model: torch.nn.Module, tokenizer: AutoTokenizer) -> dict:
    """Score a single text with a regression-style classification head."""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding="longest",
        truncation=True,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits.squeeze(-1).float().cpu().numpy()

    res = {}
    res["score"] = logits.tolist()[0]
    res["int_score"] = [int(round(max(0, min(score, 5)))) for score in logits]
    return res


# --- Main scoring logic ---
@spaces.GPU
def bot(user_message: str, history: list[dict[str, Any]]):
    if not user_message.strip():
        return "", history

    res = ""
    history = history + [{"role": "user", "content": user_message}]

    scores = {}
    for model_choice in MODEL_OPTIONS:
        model, tokenizer = load_model(model_choice)  # cached sequence-classification scorer + tokenizer
        score = compute_score(user_message, model, tokenizer)["score"]
        scores[model_choice] = score
        res += f"{model_choice}: {score}\n"

    # Combined score: median of the three quality/edu scores, scaled by the
    # alignment, manipulativeness, and GEC scores.
    formula_score = np.median([
        scores["lapa-llm/fineweb-nemotron-edu-score"],
        scores["lapa-llm/fineweb-mixtral-edu-score"],
        scores["lapa-llm/fasttext-quality-score"],
    ]) * scores["lapa-llm/alignment-score-model"] * scores["lapa-llm/manipulative-score-model"] * scores["lapa-llm/gec-score-model"]
    res += f"Formula (combined) score: {formula_score}\n"

    history.append({"role": "assistant", "content": res.strip()})
    return "", history


# --- UI ---
THEME = gr.themes.Soft(primary_hue="blue", secondary_hue="amber", neutral_hue="stone")


def _clear_chat():
    return "", []


with gr.Blocks(theme=THEME, fill_height=True) as demo:
    gr.Markdown("### 🤔 LAPA Quality Estimation")
    chatbot = gr.Chatbot(type="messages", height=480)
    msg = gr.Textbox(label=None, placeholder="Type your text…", lines=1)
    clear_btn = gr.Button("Clear")

    msg.submit(bot, inputs=[msg, chatbot], outputs=[msg, chatbot])
    clear_btn.click(_clear_chat, outputs=[msg, chatbot])


if __name__ == "__main__":
    demo.queue().launch()
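
# A minimal usage sketch (an assumption, not part of the app): the scorers can
# also be called directly outside the Gradio UI, provided the model repos are
# accessible with the configured read token and a CUDA device is available.
# For example:
#
#     model, tokenizer = load_model("lapa-llm/gec-score-model")
#     print(compute_score("Example text to score.", model, tokenizer)["score"])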