Spaces:

bunyaminergen
/

Qwen2.5-Coder-1.5B-Instruct-Reasoning

Runtime error

App Files Files Community

Qwen2.5-Coder-1.5B-Instruct-Reasoning / app.py

bunyaminergen

Initial

6238dbd 8 months ago

raw

history blame contribute delete

2.41 kB

	# Standard library imports
	import os
	import threading

	# Third-party imports
	import gradio as gr
	from peft import PeftModel
	from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

	# Hugging Face Hub access token taken from the environment (None when unset).
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Hub repositories used below: the fine-tuned repo provides both the tokenizer
	# and the PEFT adapter; the base repo provides the underlying causal LM.
	ADAPTER_REPO = "bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning"
	BASE_REPO = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

	tokenizer = AutoTokenizer.from_pretrained(
	    ADAPTER_REPO, token=HF_TOKEN, trust_remote_code=True
	)

	base_model = AutoModelForCausalLM.from_pretrained(
	    BASE_REPO, device_map="auto", torch_dtype="auto", token=HF_TOKEN
	)
	# Size the embedding table to the tokenizer's vocabulary before the adapter is
	# attached (presumably the fine-tuned tokenizer adds tokens -- TODO confirm).
	base_model.resize_token_embeddings(len(tokenizer))

	# Wrap the base model with the adapter weights and switch to evaluation mode.
	model = PeftModel.from_pretrained(base_model, ADAPTER_REPO, token=HF_TOKEN)
	model.eval()


	def _history_to_messages(history) -> list:
	    """Convert chat history into a flat list of chat-template message dicts.

	    Accepts either ``(user, assistant)`` pairs or messages-style dicts that
	    carry ``"role"`` and ``"content"`` keys. Empty turns are skipped.
	    """
	    converted = []
	    for entry in history or []:
	        if isinstance(entry, dict):
	            role = entry.get("role")
	            content = entry.get("content")
	            if role and content:
	                converted.append({"role": role, "content": content})
	            continue
	        user_text, assistant_text = entry
	        if user_text:
	            converted.append({"role": "user", "content": user_text})
	        if assistant_text:
	            converted.append({"role": "assistant", "content": assistant_text})
	    return converted


	def respond(
	    message: str,
	    history: list[tuple[str, str]],
	    system_message: str,
	    max_tokens: int,
	    temperature: float,
	    top_p: float,
	):
	    """Stream the model's reply to ``message`` as a growing string.

	    Args:
	        message: The new user message.
	        history: Earlier turns, as ``(user, assistant)`` pairs or as
	            messages-style dicts with ``"role"`` and ``"content"``.
	        system_message: Content of the leading system message.
	        max_tokens: Upper bound on newly generated tokens.
	        temperature: Sampling temperature forwarded to ``model.generate``.
	        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

	    Yields:
	        The cumulative response text after each streamed chunk.

	    Raises:
	        Exception: Whatever ``model.generate`` raised in the worker thread,
	            re-raised here once the stream has been drained.
	    """
	    messages = [
	        {"role": "system", "content": system_message},
	        *_history_to_messages(history),
	        {"role": "user", "content": message},
	    ]

	    prompt = tokenizer.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )
	    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

	    streamer = TextIteratorStreamer(
	        tokenizer,
	        timeout=600.0,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )
	    generation_kwargs = {
	        **inputs,
	        # Slider values may arrive as floats; generation needs an integer count.
	        "max_new_tokens": int(max_tokens),
	        # Request sampling explicitly so temperature/top_p always take effect
	        # instead of depending on the checkpoint's default generation config.
	        "do_sample": True,
	        "temperature": temperature,
	        "top_p": top_p,
	        "streamer": streamer,
	    }

	    worker_errors = []

	    def _generate():
	        # Runs in the worker thread. On failure, record the error and close the
	        # stream so the consumer loop ends now rather than at the streamer timeout.
	        try:
	            model.generate(**generation_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    # Generate in the background so text can be yielded while tokens are produced.
	    thread = threading.Thread(target=_generate)
	    thread.start()

	    output = ""
	    for chunk in streamer:
	        output += chunk
	        yield output

	    thread.join()
	    if worker_errors:
	        # Surface the real generation failure to the caller.
	        raise worker_errors[0]


	# Extra controls for the chat UI, listed in the same order as respond()'s
	# parameters after (message, history): system prompt, token budget,
	# temperature and top-p.
	additional_controls = [
	    gr.Textbox(label="System message", value="You are a helpful coding assistant."),
	    gr.Slider(label="Max new tokens", minimum=512, maximum=8192, step=1, value=2048),
	    gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
	    gr.Slider(
	        label="Top-p (nucleus sampling)",
	        minimum=0.1,
	        maximum=1.0,
	        step=0.05,
	        value=0.95,
	    ),
	]

	# Chat application whose handler is respond().
	demo = gr.ChatInterface(fn=respond, additional_inputs=additional_controls)

	# Launch the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()