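# Gradio chat demo: Qwen3-4B-Instruct-2507 base model with the
# help2opensource mental-health-therapy LoRA adapter applied on top.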
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr
# -------------------------
# Base + Adapter configuration
# -------------------------
base_model_name = "Qwen/Qwen3-4B-Instruct-2507"
adapter_model_name = "help2opensource/Qwen3-4B-Instruct-2507_mental_health_therapy"
device = "cuda" if torch.cuda.is_available() else "cpu"
# -------------------------
# Load base model and tokenizer
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)
# -------------------------
# Load LoRA adapter
# -------------------------
model = PeftModel.from_pretrained(base_model, adapter_model_name)
# Optional: merge LoRA weights for faster inference
model = model.merge_and_unload()
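# merge_and_unload() folds the LoRA deltas into the base weights and returns a
# plain transformers model, so generation no longer routes through adapter layers.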
def predict(message, history):
    # With type="messages", history already arrives as a list of
    # {"role": ..., "content": ...} dicts; append the new user turn.
    messages = history + [{"role": "user", "content": message}]
    # Build the prompt with the model's chat template
    try:
        input_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    except TypeError:
        # Fallback for older tokenizers that don't support add_generation_prompt
        input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
    # Extract only the assistant's final response between the chat markers
    if "<|im_start|>assistant" in decoded:
        response = (
            decoded.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0].strip()
        )
    else:
        response = decoded
    return response
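# ChatInterface with type="messages" passes history as OpenAI-style role/content
# dicts (matching apply_chat_template above) and accepts a plain string return value.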
demo = gr.ChatInterface(predict, type="messages")
demo.launch()
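# On a Hugging Face Space this file typically runs as app.py; run locally with
# `python app.py`, which serves the UI on Gradio's default port (7860).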