Spaces:

rezaenayati
/

RezAi

Running on Zero

App Files Community

rezaenayati committed on May 27

Commit

c646b6b

verified ·

1 Parent(s): a9fc148

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -37

app.py CHANGED Viewed

@@ -2,86 +2,81 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 import gradio as gr
-import spaces  # Important for ZeroGPU
-# Load models (will be moved to GPU when needed)
 base_model = AutoModelForCausalLM.from_pretrained(
     "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
     torch_dtype=torch.float16,
-    device_map="auto",  # ZeroGPU handles this
     trust_remote_code=True
 )
 tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
-# Add padding token if missing
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 # Load LoRA adapter
 model = PeftModel.from_pretrained(base_model, "rezaenayati/RezAi-Model")
-@spaces.GPU  # This decorator is CRITICAL for ZeroGPU
 def chat_with_rezAi(messages, history):
-    conversation = "<|start_header_id|>system<|end_header_id|>\nYou are Reza Enayati, a Computer Science student and entrepreneur from Los Angeles, who is eager to work as a software engineer or machine learning engineer. Answer these questions as if you are in an interview.<|eot_id|>"
     # Add conversation history
     for user_msg, assistant_msg in history:
-        conversation += f"<|start_header_id|>user<|end_header_id|>\n{user_msg}<|eot_id|>"
-        conversation += f"<|start_header_id|>assistant<|end_header_id|>\n{assistant_msg}<|eot_id|>"
     # Add current message
-    conversation += f"<|start_header_id|>user<|end_header_id|>\n{messages}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
-    # Tokenize - fix the max_length parameter
     inputs = tokenizer(
-        conversation,
-        return_tensors="pt",
-        truncation=True,  # Changed from 'truncate=True'
         max_length=2048
     )
-    # Move inputs to the same device as model
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    # Generate response
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=128,
-            temperature=0.7,  # Slightly increased for more variety
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-            repetition_penalty=1.1  # Added to reduce repetition
         )
-    # Decode response
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    new_response = response.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
-    # Clean up response - remove any incomplete tags
     if "<|" in new_response:
         new_response = new_response.split("<|")[0].strip()
     return new_response
-# Create Gradio interface
 demo = gr.ChatInterface(
     fn=chat_with_rezAi,
-    title="💬 Chat with RezAI",
-    description="Hi! I'm RezAI, Reza's AI twin. Ask me about his technical background, projects, or experience!",
-    examples=[
-        "Tell me about your background",
-        "What programming languages do you know?",
-        "Walk me through RezAI",
-        "What's your experience with machine learning?",
-        "How did you get into computer science?"
-    ],
-    retry_btn=None,
-    undo_btn="Delete Previous",
-    clear_btn="Clear Chat",
-    theme=gr.themes.Soft(),  # Added a nice theme
 )
 if __name__ == "__main__":

 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 import gradio as gr
+import spaces
+# Load models
 base_model = AutoModelForCausalLM.from_pretrained(
     "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
     torch_dtype=torch.float16,
+    device_map="auto",
     trust_remote_code=True
 )
 tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 # Load LoRA adapter
 model = PeftModel.from_pretrained(base_model, "rezaenayati/RezAi-Model")
+@spaces.GPU
 def chat_with_rezAi(messages, history):
+    # Build conversation with proper formatting
+    conversation = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are Reza Enayati, a Computer Science student and entrepreneur from Los Angeles, who is eager to work as a software engineer or machine learning engineer. Answer these questions as if you are in an interview.<|eot_id|>"
     # Add conversation history
     for user_msg, assistant_msg in history:
+        conversation += f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg}<|eot_id|>"
+        conversation += f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant_msg}<|eot_id|>"
     # Add current message
+    conversation += f"<|start_header_id|>user<|end_header_id|>\n\n{messages}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    # Tokenize
     inputs = tokenizer(
+        conversation,
+        return_tensors="pt",
+        truncation=True,
         max_length=2048
     )
+    # Move to device
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    # Generate with higher temperature
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=150,
+            temperature=0.5,  # You asked for 5, but that's too high (0.5 is good)
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id
         )
+    # Extract ONLY the new assistant response
+    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Split by the last assistant header and get only the new response
+    if "<|start_header_id|>assistant<|end_header_id|>" in full_response:
+        response_parts = full_response.split("<|start_header_id|>assistant<|end_header_id|>")
+        new_response = response_parts[-1].strip()
+    else:
+        new_response = full_response.strip()
+    # Clean up any remaining special tokens or incomplete parts
+    new_response = new_response.replace("<|eot_id|>", "").strip()
     if "<|" in new_response:
         new_response = new_response.split("<|")[0].strip()
     return new_response
+# Simple Gradio interface
 demo = gr.ChatInterface(
     fn=chat_with_rezAi,
+    title="Chat with RezAI",
+    description="Ask me about Reza's background and experience!"
 )
 if __name__ == "__main__":