Spaces:

WueNLP
/

Centurio-Demo

Running on Zero

App Files Files Community

Centurio-Demo / app.py

Gregor

Upload 4 files

4f7c605 verified 10 months ago

raw

history blame

4.59 kB

	import gradio as gr
	from transformers import TextIteratorStreamer, AutoModelForCausalLM, AutoProcessor
	from threading import Thread
	import re
	import time
	from PIL import Image
	import torch
	import argparse
	import spaces

	parser = argparse.ArgumentParser()
	parser.add_argument('--model', type=str, default='aya')
	args = parser.parse_args()

	model_name = args.model

	processor = AutoProcessor.from_pretrained(f"WueNLP/centurio_{model_name}", trust_remote_code=True)
	model = AutoModelForCausalLM.from_pretrained(f"WueNLP/centurio_{model_name}",
	trust_remote_code=True,
	torch_dtype=torch.bfloat16,
	low_cpu_mem_usage=True
	).to("cuda:0")

	@spaces.GPU
	def bot_streaming(message, history):
	if message["files"]:
	image = message["files"][-1]
	else:
	# if there's no image uploaded for this turn, look for images in the past turns
	# kept inside tuples, take the last one
	for hist in history:
	if type(hist[0]) == tuple:
	image = hist[0][0]

	if "qwen" in model_name:
	if image is None:
	prompt = f"<\|im_start\|>system\nYou are a helpful assistant.<\|im_end\|>\n<\|im_start\|>user\n{message['text']}<\|im_end\|>\n<\|im_start\|>assistant\n"
	else:
	image = Image.open(image).convert("RGB")
	prompt = f"<\|im_start\|>system\nYou are a helpful assistant.<\|im_end\|>\n<\|im_start\|>user\n<image_placeholder>\n{message['text']}<\|im_end\|>\n<\|im_start\|>assistant\n"
	else:
	if image is None:
	prompt = f"<BOS_TOKEN><\|START_OF_TURN_TOKEN\|><\|USER_TOKEN\|>{message['text']}<\|END_OF_TURN_TOKEN\|><\|START_OF_TURN_TOKEN\|><\|CHATBOT_TOKEN\|>"
	else:
	image = Image.open(image).convert("RGB")
	prompt = f"<BOS_TOKEN><\|START_OF_TURN_TOKEN\|><\|USER_TOKEN\|><image_placeholder>\n{message['text']}<\|END_OF_TURN_TOKEN\|><\|START_OF_TURN_TOKEN\|><\|CHATBOT_TOKEN\|>"

	inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0", torch.bfloat16)

	streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": False})
	generation_kwargs = dict(inputs, streamer=streamer,
	do_sample=True,
	num_beams=1,
	repetition_penalty=1.15,
	temperature=0.7,
	top_p=0.8,
	top_k=20,
	max_new_tokens=512, min_new_tokens=1)

	thread = Thread(target=model.generate, kwargs=generation_kwargs)
	thread.start()

	buffer = ""
	for new_text in streamer:
	buffer += new_text
	if "qwen" in model_name:
	generated_text_without_prompt = buffer.split("<\|im_start\|>assistant\n")[-1].split("<\|im_end\|>")[0]
	else:
	generated_text_without_prompt = buffer.split("<\|CHATBOT_TOKEN\|>")[-1].split("<\|END_OF_TURN_TOKEN\|>")[0]

	time.sleep(0.04)
	yield generated_text_without_prompt


	description = ("""# [Centurio: On Drivers of Multilingual Ability of Large Vision-Language Model](gregor-ge.github.io/Centurio/)
	Try [Centurio](https://huggingface.co/collections/WueNLP/centurio-677cf0ab6ddea874927a154e), a massively multilingual large vision-language model, in this demo (specifically, [Centurio Aya](https://huggingface.co/WueNLP/centurio_aya)).
	Upload an image and start chatting about it, or try one of the examples below.

	Centurio is trained with 100 languages but quality of answers can differ greatly depending on your language.
	Centurio is trained to read text in images but struggles with small text and with non-Latin scripts.

	> If you don't upload an image, you will receive an error.
	> This demo does not support multi-image prompts or multi-turn dialog. Every new prompt will refer to the last image (if no new image is included) without prior dialog as context.""")

	demo = gr.ChatInterface(fn=bot_streaming, title="Centurio Demo",
	examples=[{"text": "What is on the flower?", "files": ["./bee.jpg"]},
	{"text": "How to make this pastry?", "files": ["./baklava.png"]}],
	description=description,
	stop_btn="Stop Generation",
	multimodal=True
	)
	demo.launch(debug=True, share=True)