"""pixtral.py.

File for providing the Pixtral model implementation.
"""

import logging

import torch
from huggingface_hub import snapshot_download
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
                                                       UserMessage)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_inference.transformer import Transformer
from PIL import Image

from src.models.base import ModelBase
from src.models.config import Config


class PixtralModel(ModelBase):
    """Pixtral model implementation."""

    def __init__(self, config: Config) -> None:
        """Initialization of the Pixtral model.

        Args:
            config (Config): Parsed config.
        """
        super().__init__(config)

    def _load_specific_model(self) -> None:
        """Overridden function to populate self.model."""
        # Download only the files Pixtral needs: the model parameters, the
        # consolidated weights, and the tekken tokenizer definition.
        snapshot_download(
            repo_id=self.model_path,
            allow_patterns=['params.json', 'consolidated.safetensors', 'tekken.json'],
            local_dir=self.config.download_path,
        )

        self.model = Transformer.from_folder(self.config.download_path, **getattr(self.config, 'model', {}))
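        # Note (assumption about config layout): the optional `model` config
        # key is this repo's convention for passing extra keyword arguments
        # through to Transformer.from_folder (e.g. dtype or max_batch_size in
        # mistral_inference); it is not part of mistral_inference itself, and
        # it defaults to no extra arguments when unset.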

    def _generate_prompt(self, prompt: str) -> str:
        """Generate the Pixtral model prompt, which does not use the chat template.

        The chat template is applied later by the tokenizer in
        _generate_processor_output, so the prompt passes through unchanged.

        Args:
            prompt (str): The input prompt for the model.

        Returns:
            str: The input prompt, returned unchanged.
        """
        return prompt

    def _init_processor(self) -> None:
        """Initialize the Pixtral tokenizer.

        Pixtral uses no separate processor, so self.processor is set to None.
        """
        self.processor = None
        self.tokenizer = MistralTokenizer.from_file(f'{self.config.download_path}/tekken.json')
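        # tekken.json is the Tekken tokenizer definition downloaded alongside
        # the weights; MistralTokenizer.from_file builds a tokenizer from it
        # that can encode both text and image chunks.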

    def _generate_processor_output(self, prompt: str, img_path: str | None) -> dict:
        """Generate the processor outputs from the prompt and image path.

        Pixtral uses a specific chat template format with special image tokens.

        Args:
            prompt (str): The generated prompt string with the input text and
                the image labels.
            img_path (str or None): The specified image path, or None for
                text-only input.

        Returns:
            dict: The corresponding processor output per image and prompt.
        """
        user_content = [TextChunk(text=prompt)]
        if img_path is not None:
            # Prepend the image chunk so its tokens come before the text in
            # the encoded sequence.
            image = Image.open(img_path)
            user_content = [ImageChunk(image=image)] + user_content

        # Encoding the chat completion request applies Pixtral's chat
        # template, producing token ids and preprocessed image arrays.
        completion_request = ChatCompletionRequest(messages=[UserMessage(content=user_content)])
        encoded = self.tokenizer.encode_chat_completion(completion_request)

        res = {
            'input_ids': torch.tensor(encoded.tokens, dtype=torch.long, device=self.model.device),
            'seqlens': [len(encoded.tokens)],
        }

        if img_path is not None:
            res['images'] = [
                torch.tensor(img, device=self.model.device, dtype=self.model.dtype)
                for img in encoded.images
            ]

        return res
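
    # Sketch of what _generate_processor_output returns for an image prompt
    # (shapes are illustrative assumptions; exact token ids and image tensor
    # shapes depend on the tokenizer and model revision):
    #
    #   out = model._generate_processor_output('Describe this image.', 'cat.png')
    #   out['input_ids']   # 1-D LongTensor: text tokens plus image placeholders
    #   out['seqlens']     # [out['input_ids'].shape[0]]
    #   out['images'][0]   # float image tensor for the vision encoder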

    def _forward(self, data: dict) -> None:
        """Given some input data, performs a single forward pass.

        This function itself can be overridden, while _hook_and_eval
        should be left intact.

        Args:
            data (dict): The processor output to feed to the model.
        """
        # Inference only: no gradients are needed for a single forward pass.
        with torch.no_grad():
            _ = self.model.forward(**data)
        logging.debug('Completed forward pass...')
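

# Minimal usage sketch (illustrative only; assumes Config exposes
# `download_path`, and that ModelBase sets `self.model_path` and invokes the
# _load_specific_model/_init_processor hooks, all defined outside this file):
#
#   config = Config(...)  # hypothetical construction
#   model = PixtralModel(config)
#   data = model._generate_processor_output('Describe this image.', 'cat.png')
#   model._forward(data)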