"""pixtral.py.
File for providing the Pixtral model implementation.
"""
import logging
import torch
from huggingface_hub import snapshot_download
from mistral_common.protocol.instruct.messages import (
    ImageChunk,
    TextChunk,
    UserMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_inference.transformer import Transformer
from PIL import Image
from src.models.base import ModelBase
from src.models.config import Config
class PixtralModel(ModelBase):
"""Pixtral model implementation."""
def __init__(self, config: Config) -> None:
"""Initialization of the Pixtral model.
Args:
config (Config): Parsed config
"""
# initialize the parent class
super().__init__(config)
def _load_specific_model(self) -> None:
"""Overridden function to populate self.model."""
snapshot_download(
repo_id=self.model_path,
allow_patterns=['params.json', 'consolidated.safetensors', 'tekken.json'],
local_dir=self.config.download_path,
)
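        # Any extra keyword arguments under ``config.model`` are forwarded to
        # ``Transformer.from_folder`` (an empty dict if the key is absent).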
        self.model = Transformer.from_folder(
            self.config.download_path,
            **getattr(self.config, 'model', {}),
        )
def _generate_prompt(self, prompt: str) -> str:
"""Generates the Pixtral model prompt which will not use the chat template.
Args:
prompt (str): The input prompt for the model.
Returns:
str: The prompt to return, set by the config.
"""
return prompt
def _init_processor(self) -> None:
"""Initialize the Pixtral Tokenizer."""
self.processor = None # no intended processor here
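        # tekken.json is fetched alongside the weights in _load_specific_model
        # and defines the Pixtral tokenizer.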
self.tokenizer = MistralTokenizer.from_file(f'{self.config.download_path}/tekken.json')
def _generate_processor_output(self, prompt: str, img_path: str | None) -> dict:
"""Generate the processor outputs from the prompt and image path.
Pixtral uses a specific chat template format with special image tokens.
Args:
prompt (str): The generated prompt string with the input text and
the image labels.
img_path (str or None): The specified image path, or None for text-only.
Returns:
dict: The corresponding processor output per image and prompt.
"""
user_content = [TextChunk(text=prompt)]
if img_path is not None:
image = Image.open(img_path)
user_content = [ImageChunk(image=image)] + user_content
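        # Wrap the chunks in a chat completion request and tokenize it; the
        # tokenizer also preprocesses any image into model-ready arrays.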
completion_request = ChatCompletionRequest(messages=[UserMessage(content=user_content)])
encoded = self.tokenizer.encode_chat_completion(completion_request)
res = {
'input_ids': torch.tensor(encoded.tokens, dtype=torch.long, device=self.model.device),
'seqlens': [len(encoded.tokens)],
}
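        # Convert the preprocessed images to tensors on the model's device
        # and dtype so they can be fed to the transformer.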
if img_path is not None:
res['images'] = [
torch.tensor(img, device=self.model.device, dtype=self.model.dtype)
for img in encoded.images
]
return res
def _forward(self, data: dict) -> None:
"""Given some input data, performs a single forward pass.
This function itself can be overriden, while _hook_and_eval
should be left in tact.
Args:
data (dict): The given data tensor.
"""
with torch.no_grad():
_ = self.model.forward(**data)
logging.debug('Completed forward pass...')