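Below is the full Streamlit app (in a Hugging Face Space this would typically live in `app.py`, an assumption here). It downloads the 2-bit Q2_K GGUF build of Qwen2.5-7B-Instruct from the Hub, loads it on CPU with `llama-cpp-python`, and serves a streaming chat UI whose sampling parameters can be adjusted from the sidebar: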
```python
import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the 2-bit (Q2_K) GGUF weights from the Hugging Face Hub
hf_hub_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
    filename="qwen2.5-7b-instruct-q2_k.gguf",
    local_dir="./models",
)

# Load the model once (on first run) and cache it across Streamlit reruns
@st.cache_resource
def load_model():
    return Llama(
        model_path="models/qwen2.5-7b-instruct-q2_k.gguf",
        n_ctx=1024,          # context window
        n_threads=2,         # CPU threads for generation
        n_threads_batch=2,   # CPU threads for prompt processing
        n_batch=4,           # prompt-evaluation batch size
        n_gpu_layers=0,      # CPU-only inference
        use_mlock=False,
        use_mmap=True,
        verbose=False,
    )

llm = load_model()

# Session state for chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

st.title("🧠 Qwen2.5-7B-Instruct (Streamlit + GGUF)")
st.caption("Powered by `llama.cpp` and `llama-cpp-python` | 2-bit Q2_K inference")

# Sidebar with sampling settings
with st.sidebar:
    st.header("⚙️ Settings")
    system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
    max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
    temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
    top_k = st.slider("Top-K", 1, 100, 40)
    top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)

# Input box
user_input = st.chat_input("Ask something...")

if user_input:
    # Add the user message to the chat history
    st.session_state.chat_history.append({"role": "user", "content": user_input})

    # Display the user message
    with st.chat_message("user"):
        st.markdown(user_input)

    # Construct the prompt: system prompt followed by the full history
    messages = [{"role": "system", "content": system_prompt}] + st.session_state.chat_history

    # Stream the assistant response token by token
    with st.chat_message("assistant"):
        full_response = ""
        response_area = st.empty()

        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stream=True,
        )

        for chunk in stream:
            if "choices" in chunk:
                delta = chunk["choices"][0]["delta"].get("content", "")
                full_response += delta
                response_area.markdown(full_response)

    st.session_state.chat_history.append({"role": "assistant", "content": full_response})
```
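The app depends only on the three libraries imported at the top. A minimal `requirements.txt` (a sketch; exact version pins are left to the reader) would be:

```
streamlit
llama-cpp-python
huggingface-hub
```

Assuming the script is saved as `app.py`, it runs locally with `streamlit run app.py`; on a Streamlit Space it is launched automatically. Note that each rerun only renders the current exchange; replaying `st.session_state.chat_history` at the top of the script is a common extension if earlier turns should stay visible.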