Spaces:

sitammeur
/

Qwen-Coder-llamacpp

Running

App Files Files Community

Qwen-Coder-llamacpp / app.py

sitammeur

Update app.py

23f3a7b verified 7 months ago

raw

history blame contribute delete

7.96 kB

	# Importing required libraries
	import warnings
	warnings.filterwarnings("ignore")

	import os
	import json
	import subprocess
	import sys
	from typing import List, Tuple
	from llama_cpp import Llama
	from llama_cpp_agent import LlamaCppAgent
	from llama_cpp_agent import MessagesFormatterType
	from llama_cpp_agent.providers import LlamaCppPythonProvider
	from llama_cpp_agent.chat_history import BasicChatHistory
	from llama_cpp_agent.chat_history.messages import Roles
	from huggingface_hub import hf_hub_download
	import gradio as gr
	from logger import logging
	from exception import CustomExceptionHandling


	# Download gguf model files
	if not os.path.exists("./models"):
	os.makedirs("./models")

	hf_hub_download(
	repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
	filename="qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
	local_dir="./models",
	)
	hf_hub_download(
	repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
	filename="qwen2.5-coder-0.5b-instruct-q6_k.gguf",
	local_dir="./models",
	)


	# Set the title and description
	title = "Qwen-Coder Llama.cpp"
	description = """[Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f), a six-model family of LLMs, boasts enhanced code generation, reasoning, and debugging. Trained on 5.5 trillion tokens, its 32B parameter model rivals GPT-4o, offering versatile capabilities for coding and broader applications.
	This interactive chat interface allows you to experiment with the [`Qwen2.5-Coder-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct) and [`Qwen2.5-Coder-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct) coding models using various prompts and generation parameters.
	Users can select different model variants (GGUF format), system prompts, and observe generated responses in real-time.
	Key generation parameters, such as ⁣`temperature`, `max_tokens`, `top_k` and others are exposed below for tuning model behavior."""


	llm = None
	llm_model = None

	def respond(
	message: str,
	history: List[Tuple[str, str]],
	model: str = "qwen2.5-coder-0.5b-instruct-q6_k.gguf", # Set default model
	system_message: str = "You are a helpful assistant.",
	max_tokens: int = 1024,
	temperature: float = 0.7,
	top_p: float = 0.95,
	top_k: int = 40,
	repeat_penalty: float = 1.1,
	):
	"""
	Respond to a message using the Qwen2.5-Coder model via Llama.cpp.

	Args:
	- message (str): The message to respond to.
	- history (List[Tuple[str, str]]): The chat history.
	- model (str): The model to use.
	- system_message (str): The system message to use.
	- max_tokens (int): The maximum number of tokens to generate.
	- temperature (float): The temperature of the model.
	- top_p (float): The top-p of the model.
	- top_k (int): The top-k of the model.
	- repeat_penalty (float): The repetition penalty of the model.

	Returns:
	str: The response to the message.
	"""
	try:
	# Load the global variables
	global llm
	global llm_model

	# Ensure model is not None
	if model is None:
	model = "qwen2.5-coder-0.5b-instruct-q6_k.gguf"

	# Load the model
	if llm is None or llm_model != model:
	# Check if model file exists
	model_path = f"models/{model}"
	if not os.path.exists(model_path):
	yield f"Error: Model file not found at {model_path}. Please check your model path."
	return

	llm = Llama(
	model_path=f"models/{model}",
	flash_attn=False,
	n_gpu_layers=0,
	n_batch=8,
	n_ctx=2048,
	n_threads=8,
	n_threads_batch=8,
	)
	llm_model = model
	provider = LlamaCppPythonProvider(llm)

	# Create the agent
	agent = LlamaCppAgent(
	provider,
	system_prompt=f"{system_message}",
	predefined_messages_formatter_type=MessagesFormatterType.CHATML,
	debug_output=True,
	)

	# Set the settings like temperature, top-k, top-p, max tokens, etc.
	settings = provider.get_provider_default_settings()
	settings.temperature = temperature
	settings.top_k = top_k
	settings.top_p = top_p
	settings.max_tokens = max_tokens
	settings.repeat_penalty = repeat_penalty
	settings.stream = True

	messages = BasicChatHistory()

	# Add the chat history
	for msn in history:
	user = {"role": Roles.user, "content": msn[0]}
	assistant = {"role": Roles.assistant, "content": msn[1]}
	messages.add_message(user)
	messages.add_message(assistant)

	# Get the response stream
	stream = agent.get_chat_response(
	message,
	llm_sampling_settings=settings,
	chat_history=messages,
	returns_streaming_generator=True,
	print_output=False,
	)

	# Log the success
	logging.info("Response stream generated successfully")

	# Generate the response
	outputs = ""
	for output in stream:
	outputs += output
	yield outputs

	# Handle exceptions that may occur during the process
	except Exception as e:
	# Custom exception handling
	raise CustomExceptionHandling(e, sys) from e


	# Create a chat interface
	demo = gr.ChatInterface(
	respond,
	examples=[["Write a quick sort algorithm in Python."], ["What is a function in programming?"], ["Please implement A* using Python."]],
	additional_inputs_accordion=gr.Accordion(
	label="⚙️ Parameters", open=False, render=False
	),
	additional_inputs=[
	gr.Dropdown(
	choices=[
	"qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
	"qwen2.5-coder-0.5b-instruct-q6_k.gguf",
	],
	value="qwen2.5-coder-0.5b-instruct-q6_k.gguf",
	label="Model",
	info="Select the AI model to use for chat",
	),
	gr.Textbox(
	value="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
	label="System Prompt",
	info="Define the AI assistant's personality and behavior",
	lines=2,
	),
	gr.Slider(
	minimum=512,
	maximum=2048,
	value=1024,
	step=1,
	label="Max Tokens",
	info="Maximum length of response (higher = longer replies)",
	),
	gr.Slider(
	minimum=0.1,
	maximum=2.0,
	value=0.7,
	step=0.1,
	label="Temperature",
	info="Creativity level (higher = more creative, lower = more focused)",
	),
	gr.Slider(
	minimum=0.1,
	maximum=1.0,
	value=0.95,
	step=0.05,
	label="Top-p",
	info="Nucleus sampling threshold",
	),
	gr.Slider(
	minimum=1,
	maximum=100,
	value=40,
	step=1,
	label="Top-k",
	info="Limit vocabulary choices to top K tokens",
	),
	gr.Slider(
	minimum=1.0,
	maximum=2.0,
	value=1.1,
	step=0.1,
	label="Repetition Penalty",
	info="Penalize repeated words (higher = less repetition)",
	),
	],
	theme="Ocean",
	submit_btn="Send",
	stop_btn="Stop",
	title=title,
	description=description,
	chatbot=gr.Chatbot(scale=1, show_copy_button=True, resizable=True),
	flagging_mode="never",
	editable=True,
	cache_examples=False,
	)


	# Launch the chat interface
	if __name__ == "__main__":
	demo.launch(
	share=False,
	server_name="0.0.0.0",
	server_port=7860,
	show_api=False,
	)