Spaces: Running on Zero
import gradio as gr
import subprocess
import sys
import os

# Install the necessary packages that require CUDA
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
except Exception as e:
    print(f"Warning: Could not install CUDA extensions: {e}")
    print("The model might not work correctly or will be slower.")

# Now import the required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Define model repository
repo_name = "hanzla/Falcon3-Mamba-R1-v0"

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_name)

# Load model with appropriate settings
print("Loading model... (this may take some time)")
model = None
try:
    # Try to load the model with GPU acceleration
    model = AutoModelForCausalLM.from_pretrained(
        repo_name,
        device_map="auto",
        torch_dtype=torch.float16,
    )
except Exception as e:
    print(f"Error loading model with GPU: {e}")
    print("Attempting to load with CPU only...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            repo_name,
            device_map="cpu",
            torch_dtype=torch.float32,
        )
    except Exception as e2:
        print(f"Error loading model with CPU: {e2}")

if model is None:
    print("Could not load the model. Please check the logs.")
else:
    print("Model loaded successfully!")

def generate_response(message, history):
    if model is None:
        return "Sorry, the model could not be loaded. Please check the logs."

    messages = [
        {"role": "system", "content": "You are a helpful assistant. You think before answering"},
    ]

    # Add chat history to messages
    for h in history:
        messages.append({"role": "user", "content": h[0]})
        messages.append({"role": "assistant", "content": h[1]})

    # Add current message
    messages.append({"role": "user", "content": message})

    # Generate input text using chat template
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Tokenize input
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    # Generate response
    outputs = model.generate(
        input_ids,
        max_new_tokens=512,  # Reduced from 1024 to improve speed
        temperature=0.7,
        do_sample=True,
    )

    # Decode the generated tokens
    generated_tokens = outputs[0][len(input_ids[0]):]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response

# Create Gradio interface
demo = gr.ChatInterface(
    generate_response,
    title="Falcon3-Mamba-R1-v0 Chat",
    description="Chat with the Falcon3-Mamba-R1-v0 model. This is a hybrid Falcon-Mamba architecture.",
    examples=["Tell me about yourself",
              "Explain quantum computing like I'm 10",
              "Write a short poem about AI"],
    theme="soft"
)

# Launch the interface
demo.launch()
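
Note: since the Space reports "Running on Zero" (ZeroGPU), GPU-backed work is normally wrapped with the `spaces` package's `@spaces.GPU` decorator so that a GPU is attached only while the decorated call runs. The snippet below is a minimal sketch of that pattern under that assumption; it is not part of the original app, and the function body is elided because it would reuse the same generation logic shown above.

import spaces  # preinstalled on Hugging Face Spaces; provides the ZeroGPU integration

@spaces.GPU  # a GPU is allocated only for the duration of this call
def generate_response(message, history):
    # ...same logic as above: build the message list, apply the chat template,
    # call model.generate, and decode the newly generated tokens
    ...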