# Importing required libraries
import warnings

warnings.filterwarnings("ignore")

import os
import json
import subprocess
import sys
from typing import List, Tuple

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from huggingface_hub import hf_hub_download
import gradio as gr

from logger import logging
from exception import CustomExceptionHandling
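# Note: `logger` and `exception` above are assumed to be project-local modules
# (logger.py / exception.py) providing the configured logger and the
# CustomExceptionHandling wrapper used in respond() below.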

# Download the gguf model files
if not os.path.exists("./models"):
    os.makedirs("./models")

hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
    filename="qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
    local_dir="./models",
)
hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
    filename="qwen2.5-coder-0.5b-instruct-q6_k.gguf",
    local_dir="./models",
)

# Set the title and description
title = "Qwen-Coder Llama.cpp"
description = """**[Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)** is a six-model family of LLMs with enhanced code generation, reasoning, and debugging capabilities. Trained on 5.5 trillion tokens, its 32B-parameter model rivals GPT-4o, offering versatile capabilities for coding and broader applications.

This interactive chat interface lets you experiment with the [`Qwen2.5-Coder-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct) and [`Qwen2.5-Coder-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct) coding models using various prompts and generation parameters.

You can select different model variants (GGUF format) and system prompts, and observe the generated responses in real time. Key generation parameters, such as `temperature`, `max_tokens`, and `top_k`, are exposed below for tuning model behavior."""

llm = None
llm_model = None


def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str = "qwen2.5-coder-0.5b-instruct-q6_k.gguf",  # Default model
    system_message: str = "You are a helpful assistant.",
    max_tokens: int = 1024,
    temperature: float = 0.7,
    top_p: float = 0.95,
    top_k: int = 40,
    repeat_penalty: float = 1.1,
):
| """ | |
| Respond to a message using the Qwen2.5-Coder model via Llama.cpp. | |
| Args: | |
| - message (str): The message to respond to. | |
| - history (List[Tuple[str, str]]): The chat history. | |
| - model (str): The model to use. | |
| - system_message (str): The system message to use. | |
| - max_tokens (int): The maximum number of tokens to generate. | |
| - temperature (float): The temperature of the model. | |
| - top_p (float): The top-p of the model. | |
| - top_k (int): The top-k of the model. | |
| - repeat_penalty (float): The repetition penalty of the model. | |
| Returns: | |
| str: The response to the message. | |
| """ | |
    try:
        # Load the global variables
        global llm
        global llm_model

        # Fall back to the default model if none is selected
        if model is None:
            model = "qwen2.5-coder-0.5b-instruct-q6_k.gguf"

        # Load the model (only reload when a different model is selected)
        if llm is None or llm_model != model:
            # Check that the model file exists
            model_path = f"models/{model}"
            if not os.path.exists(model_path):
                yield f"Error: Model file not found at {model_path}. Please check your model path."
                return

            llm = Llama(
                model_path=model_path,
                flash_attn=False,
                n_gpu_layers=0,  # CPU-only inference
                n_batch=8,
                n_ctx=2048,  # context window size
                n_threads=8,
                n_threads_batch=8,
            )
            llm_model = model

        provider = LlamaCppPythonProvider(llm)

        # Create the agent
        agent = LlamaCppAgent(
            provider,
            system_prompt=f"{system_message}",
            predefined_messages_formatter_type=MessagesFormatterType.CHATML,
            debug_output=True,
        )

        # Set the settings like temperature, top-k, top-p, max tokens, etc.
        settings = provider.get_provider_default_settings()
        settings.temperature = temperature
        settings.top_k = top_k
        settings.top_p = top_p
        settings.max_tokens = max_tokens
        settings.repeat_penalty = repeat_penalty
        settings.stream = True

        messages = BasicChatHistory()

        # Add the chat history
        for msn in history:
            user = {"role": Roles.user, "content": msn[0]}
            assistant = {"role": Roles.assistant, "content": msn[1]}
            messages.add_message(user)
            messages.add_message(assistant)

        # Get the response stream
        stream = agent.get_chat_response(
            message,
            llm_sampling_settings=settings,
            chat_history=messages,
            returns_streaming_generator=True,
            print_output=False,
        )

        # Log the success
        logging.info("Response stream generated successfully")

        # Generate the response incrementally
        outputs = ""
        for output in stream:
            outputs += output
            yield outputs

    # Handle exceptions that may occur during the process
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e
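

# Note: gr.ChatInterface passes the values of `additional_inputs` to respond()
# positionally after `message` and `history`, so the component order below is
# assumed to mirror respond()'s parameter order:
# model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty.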
# Create a chat interface
demo = gr.ChatInterface(
    respond,
    examples=[
        ["Write a quick sort algorithm in Python."],
        ["What is a function in programming?"],
        ["Please implement A* using Python."],
    ],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Dropdown(
            choices=[
                "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "qwen2.5-coder-0.5b-instruct-q6_k.gguf",
            ],
            value="qwen2.5-coder-0.5b-instruct-q6_k.gguf",
            label="Model",
            info="Select the AI model to use for chat",
        ),
        gr.Textbox(
            value="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
            label="System Prompt",
            info="Define the AI assistant's personality and behavior",
            lines=2,
        ),
        gr.Slider(
            minimum=512,
            maximum=2048,
            value=1024,
            step=1,
            label="Max Tokens",
            info="Maximum length of response (higher = longer replies)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Creativity level (higher = more creative, lower = more focused)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
            info="Nucleus sampling threshold",
        ),
        gr.Slider(
            minimum=1,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
            info="Limit vocabulary choices to top K tokens",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty",
            info="Penalize repeated words (higher = less repetition)",
        ),
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=gr.Chatbot(scale=1, show_copy_button=True, resizable=True),
    flagging_mode="never",
    editable=True,
    cache_examples=False,
)

# Launch the chat interface
if __name__ == "__main__":
    demo.launch(
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
        show_api=False,
    )
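
# Example usage (not executed here): the respond() generator can also be
# exercised directly as a quick smoke test, assuming the default GGUF model
# has already been downloaded into ./models:
#
#     result = ""
#     for partial in respond("Write a binary search function in Python.", history=[]):
#         result = partial  # each yielded value is the cumulative response so far
#     print(result)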
