"""pixtral.py.

File for providing the Pixtral model implementation.
"""
import logging

import torch
from huggingface_hub import snapshot_download
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
                                                       UserMessage)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_inference.transformer import Transformer
from PIL import Image

from src.models.base import ModelBase
from src.models.config import Config


class PixtralModel(ModelBase):
    """Pixtral model implementation."""

    def __init__(self, config: Config) -> None:
        """Initialization of the Pixtral model.

        Args:
            config (Config): Parsed config
        """
        # initialize the parent class
        super().__init__(config)

    def _load_specific_model(self) -> None:
        """Overridden function to populate self.model."""
        snapshot_download(
            repo_id=self.model_path,
            allow_patterns=['params.json', 'consolidated.safetensors', 'tekken.json'],
            local_dir=self.config.download_path,
        )

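        # Build the transformer from the downloaded folder; any extra keyword
        # arguments come from the config's optional 'model' mapping.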
        self.model = Transformer.from_folder(
            self.config.download_path,
            **getattr(self.config, 'model', {}),
        )

    def _generate_prompt(self, prompt: str) -> str:
        """Generates the Pixtral model prompt which will not use the chat template.

        Args:
            prompt (str): The input prompt for the model.

        Returns:
            str: The prompt to return, set by the config.
        """
        return prompt

    def _init_processor(self) -> None:
        """Initialize the Pixtral Tokenizer."""
        self.processor = None  # Pixtral has no separate processor; the tokenizer handles text and images
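        # tekken.json was downloaded alongside the weights in _load_specific_model.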
        self.tokenizer = MistralTokenizer.from_file(f'{self.config.download_path}/tekken.json')

    def _generate_processor_output(self, prompt: str, img_path: str | None) -> dict:
        """Generate the processor outputs from the prompt and image path.

        Pixtral uses a specific chat template format with special image tokens.

        Args:
            prompt (str): The generated prompt string with the input text and
                the image labels.
            img_path (str or None): The specified image path, or None for text-only.

        Returns:
            dict: The corresponding processor output per image and prompt.
        """
        user_content = [TextChunk(text=prompt)]
        if img_path is not None:
            image = Image.open(img_path)
            user_content = [ImageChunk(image=image)] + user_content

        completion_request = ChatCompletionRequest(messages=[UserMessage(content=user_content)])
        encoded = self.tokenizer.encode_chat_completion(completion_request)

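        # encoded.tokens is a flat list of token ids (including image placeholder
        # tokens); seqlens tells the unpadded transformer where the sequence ends.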
        res = {
            'input_ids': torch.tensor(encoded.tokens, dtype=torch.long, device=self.model.device),
            'seqlens': [len(encoded.tokens)],
        }

        if img_path is not None:
            res['images'] = [
                torch.tensor(img, device=self.model.device, dtype=self.model.dtype)
                for img in encoded.images
            ]

        return res

    def _forward(self, data: dict) -> None:
        """Given some input data, performs a single forward pass.

        This function itself can be overriden, while _hook_and_eval
        should be left in tact.

        Args:
            data (dict): The given data tensor.
        """
        with torch.no_grad():
            _ = self.model.forward(**data)
        logging.debug('Completed forward pass...')
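

# A minimal usage sketch (assumptions: ModelBase's constructor invokes
# _load_specific_model and _init_processor, and Config exposes model_path and
# download_path; the prompt and image path below are illustrative only):
#
#     config = Config(...)  # e.g. parsed from a config file
#     model = PixtralModel(config)
#     data = model._generate_processor_output('Describe the image.', 'example.png')
#     model._forward(data)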