Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,16 +2,16 @@ import torch
|
|
| 2 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 3 |
from peft import PeftModel
|
| 4 |
import gradio as gr
|
|
|
|
| 5 |
|
| 6 |
-
# Load
|
| 7 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 8 |
"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
|
| 9 |
torch_dtype=torch.float16,
|
| 10 |
-
device_map="auto",
|
| 11 |
-
|
| 12 |
)
|
| 13 |
|
| 14 |
-
# Load tokenizer
|
| 15 |
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
|
| 16 |
|
| 17 |
# Add padding token if missing
|
|
@@ -21,6 +21,7 @@ if tokenizer.pad_token is None:
|
|
| 21 |
# Load LoRA adapter
|
| 22 |
model = PeftModel.from_pretrained(base_model, "rezaenayati/RezAi-Model")
|
| 23 |
|
|
|
|
| 24 |
def chat_with_rezAi(messages, history):
|
| 25 |
conversation = "<|start_header_id|>system<|end_header_id|>\nYou are Reza Enayati, a Computer Science student and entrepreneur from Los Angeles, who is eager to work as a software engineer or machine learning engineer. Answer these questions as if you are in an interview.<|eot_id|>"
|
| 26 |
|
|
@@ -32,13 +33,13 @@ def chat_with_rezAi(messages, history):
|
|
| 32 |
# Add current message
|
| 33 |
conversation += f"<|start_header_id|>user<|end_header_id|>\n{messages}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
|
| 34 |
|
| 35 |
-
# Tokenize
|
| 36 |
inputs = tokenizer(conversation, return_tensors="pt", truncate=True, max_length=2048)
|
| 37 |
-
inputs = {k: v.to(model.device) for k, v in inputs.items()} # Move to GPU
|
| 38 |
|
|
|
|
| 39 |
with torch.no_grad():
|
| 40 |
outputs = model.generate(
|
| 41 |
-
**inputs,
|
| 42 |
max_new_tokens=128,
|
| 43 |
temperature=0.5,
|
| 44 |
do_sample=True,
|
|
|
|
| 2 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 3 |
from peft import PeftModel
|
| 4 |
import gradio as gr
|
| 5 |
+
import spaces # Important for ZeroGPU
|
| 6 |
|
| 7 |
+
# Load models (will be moved to GPU when needed)
|
| 8 |
base_model = AutoModelForCausalLM.from_pretrained(
|
| 9 |
"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
|
| 10 |
torch_dtype=torch.float16,
|
| 11 |
+
device_map="auto", # ZeroGPU handles this
|
| 12 |
+
trust_remote_code=True
|
| 13 |
)
|
| 14 |
|
|
|
|
| 15 |
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
|
| 16 |
|
| 17 |
# Add padding token if missing
|
|
|
|
| 21 |
# Load LoRA adapter
|
| 22 |
model = PeftModel.from_pretrained(base_model, "rezaenayati/RezAi-Model")
|
| 23 |
|
| 24 |
+
@spaces.GPU # This decorator is CRITICAL for ZeroGPU
|
| 25 |
def chat_with_rezAi(messages, history):
|
| 26 |
conversation = "<|start_header_id|>system<|end_header_id|>\nYou are Reza Enayati, a Computer Science student and entrepreneur from Los Angeles, who is eager to work as a software engineer or machine learning engineer. Answer these questions as if you are in an interview.<|eot_id|>"
|
| 27 |
|
|
|
|
| 33 |
# Add current message
|
| 34 |
conversation += f"<|start_header_id|>user<|end_header_id|>\n{messages}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
|
| 35 |
|
| 36 |
+
# Tokenize
|
| 37 |
inputs = tokenizer(conversation, return_tensors="pt", truncate=True, max_length=2048)
|
|
|
|
| 38 |
|
| 39 |
+
# Generate response
|
| 40 |
with torch.no_grad():
|
| 41 |
outputs = model.generate(
|
| 42 |
+
**inputs,
|
| 43 |
max_new_tokens=128,
|
| 44 |
temperature=0.5,
|
| 45 |
do_sample=True,
|