Spaces:

ankandrew
/

MiMo-VL-7B

Running on Zero

App Files Files Community

ankandrew committed on Aug 12

Commit

b59da6d

verified ·

1 Parent(s): 984bd48

Upload infer.py

Browse files

Files changed (1) hide show

infer.py +32 -56

infer.py CHANGED Viewed

@@ -1,36 +1,24 @@
 import os
 import torch
-from transformers import (
-    AutoProcessor,
-    Qwen2_5_VLForConditionalGeneration,
-    TextIteratorStreamer,
-)
-from transformers.generation.logits_process import LogitsProcessor
 from qwen_vl_utils import process_vision_info
 from threading import Thread
class _NanSafeLogitsProcessor(LogitsProcessor):
    """Logits processor that removes non-finite scores and bounds the rest.

    NaN entries are replaced by ``torch.nan_to_num`` (using its default NaN
    substitute), infinities are mapped to +/-1e4, and the result is clamped
    to the closed range [-1e4, 1e4].
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        limit = 1e4
        # nan_to_num returns a fresh tensor, so the in-place clamp that
        # follows does not write into the tensor passed in as ``scores``.
        sanitized = torch.nan_to_num(scores, neginf=-limit, posinf=limit)
        return sanitized.clamp_(min=-limit, max=limit)
 class MiMoVLInfer:
     def __init__(self, checkpoint_path, **kwargs):
         """Load the model and its processor from ``checkpoint_path``.

         The model is loaded in float16 with eager attention, mapped to the
         CPU, and switched to eval mode. Extra keyword arguments are accepted
         but not used.
         """
         load_options = {
             "torch_dtype": torch.float16,
             "device_map": {"": "cpu"},
             "attn_implementation": "eager",
             "trust_remote_code": True,
         }
         loaded = Qwen2_5_VLForConditionalGeneration.from_pretrained(checkpoint_path, **load_options)
         self.model = loaded.eval()
         self.processor = AutoProcessor.from_pretrained(checkpoint_path, trust_remote_code=True)
         # Read by to_device to decide whether a move to CUDA is still pending.
         self._on_cuda = False
         torch.set_float32_matmul_precision("high")
     def to_device(self, device: str):
         if device == "cuda" and not self._on_cuda:
             self.model.to("cuda")
@@ -42,67 +30,55 @@ class MiMoVLInfer:
     def __call__(self, inputs: dict, history: list = [], temperature: float = 1.0):
         """Stream an assistant reply for ``inputs`` appended to ``history``.

         Generation runs on a daemon thread. After every streamed chunk this
         generator yields ``(text_so_far, conversation)``, where
         ``conversation`` is the prior history, the new user turn, and an
         assistant turn holding the text produced so far. ``history`` itself
         is never modified.

         A falsy or near-zero ``temperature`` (<= 1e-3) selects greedy
         decoding; otherwise sampling uses top_p=0.95 and a temperature
         floored at 0.01. The token budget is read from the
         ``MAX_NEW_TOKENS`` environment variable (default 768).
         """
         conversation = history + self.construct_messages(inputs)
         chat_text = self.processor.apply_chat_template(
             conversation, tokenize=False, add_generation_prompt=True
         )
         images, videos = process_vision_info(conversation)
         batch = self.processor(
             text=[chat_text],
             images=images,
             videos=videos,
             padding=True,
             return_tensors="pt",
         )
         batch = batch.to(self.model.device)
         tok = self.processor.tokenizer
         token_stream = TextIteratorStreamer(
             tok, timeout=60.0, skip_prompt=True, skip_special_tokens=True
         )

         temp = float(temperature or 0.0)
         if temp > 1e-3:
             decoding = {"do_sample": True, "temperature": max(temp, 0.01), "top_p": 0.95}
         else:
             decoding = {"do_sample": False}
         token_budget = int(os.getenv("MAX_NEW_TOKENS", "768"))

         # Later keys win on collision, so the explicit settings listed after
         # the processor output take precedence over anything it carries.
         generate_kwargs = {
             **batch,
             "max_new_tokens": token_budget,
             "streamer": token_stream,
             "pad_token_id": tok.pad_token_id or tok.eos_token_id,
             "logits_processor": [_NanSafeLogitsProcessor()],
             **decoding,
         }
         worker = Thread(target=self.model.generate, kwargs=generate_kwargs, daemon=True)
         worker.start()

         reply = ""
         for fragment in token_stream:
             reply = reply + fragment
             assistant_turn = {
                 "role": "assistant",
                 "content": [{"type": "text", "text": reply}],
             }
             yield reply, conversation + [assistant_turn]
     def _is_video_file(self, filename):
         return any(filename.lower().endswith(ext) for ext in
-                   [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg"])
     def construct_messages(self, inputs: dict) -> list:
         content = []
-        for path in inputs.get("files", []):
             if self._is_video_file(path):
-                content.append({"type": "video", "video": f"file://{path}"})
             else:
-                content.append({"type": "image", "image": f"file://{path}"})
-        q = inputs.get("text", "")
-        if q:
-            content.append({"type": "text", "text": q})
-        return [{"role": "user", "content": content}]

+# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/infer.py
 import os
 import torch
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
+from transformers.generation.stopping_criteria import EosTokenCriteria, StoppingCriteriaList
 from qwen_vl_utils import process_vision_info
 from threading import Thread
 class MiMoVLInfer:
     def __init__(self, checkpoint_path, **kwargs):
         """Load the model and its processor from ``checkpoint_path``.

         The model is loaded in float16, mapped to the CPU, and put in eval
         mode. Extra keyword arguments are accepted but not used.
         """
         loaded = Qwen2_5_VLForConditionalGeneration.from_pretrained(
             checkpoint_path,
             torch_dtype=torch.float16,
             device_map={"": "cpu"},
             trust_remote_code=True,
         )
         self.model = loaded.eval()
         self.processor = AutoProcessor.from_pretrained(checkpoint_path, trust_remote_code=True)
         # Read by to_device to decide whether a move to CUDA is still pending.
         self._on_cuda = False
     def to_device(self, device: str):
         if device == "cuda" and not self._on_cuda:
             self.model.to("cuda")
     def __call__(self, inputs: dict, history: "list | None" = None, temperature: float = 1.0):
         """Stream an assistant reply for ``inputs`` given the prior ``history``.

         Args:
             inputs: Mapping handed to ``construct_messages`` to build the new
                 user turn.
             history: Earlier chat messages. ``None`` (the default) means no
                 prior turns. The list is never modified.
             temperature: Sampling temperature. A falsy or near-zero value
                 (<= 1e-3) selects greedy decoding; otherwise sampling uses
                 top_p=0.95 with the temperature floored at 0.01.

         Yields:
             ``(partial_response, messages)`` after each streamed chunk, where
             ``messages`` is the history, the new user turn, and an assistant
             turn holding the text generated so far.

         The generation length limit is read from the ``MAX_NEW_TOKENS``
         environment variable (default 1024). Generation itself runs on a
         daemon thread while this generator drains the streamer.
         """
         # A None default avoids sharing one list object between calls.
         prior_turns = [] if history is None else history
         messages = self.construct_messages(inputs)
         updated_history = prior_turns + messages
         text = self.processor.apply_chat_template(updated_history, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(updated_history)
         model_inputs = self.processor(
             text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt'
         ).to(self.model.device)
         tokenizer = self.processor.tokenizer
         streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
         max_new = int(os.getenv("MAX_NEW_TOKENS", "1024"))
         temp = float(temperature or 0.0)
         do_sample = temp > 1e-3
         if do_sample:
             samp_args = {"do_sample": True, "temperature": max(temp, 0.01), "top_p": 0.95}
         else:
             samp_args = {"do_sample": False}
         eos_token_id = self.model.config.eos_token_id
         gen_kwargs = {
             # Use the MAX_NEW_TOKENS override; a literal 1024 here would
             # silently ignore the environment variable read above.
             "max_new_tokens": max_new,
             "streamer": streamer,
             "stopping_criteria": StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)]),
             "pad_token_id": eos_token_id,
             # Unpacked last: on a key collision these entries override the ones above.
             **model_inputs,
             **samp_args,
         }
         thread = Thread(target=self.model.generate, kwargs=gen_kwargs, daemon=True)
         thread.start()
         partial_response = ""
         for new_text in streamer:
             partial_response += new_text
             yield partial_response, updated_history + [{
                 'role': 'assistant',
                 'content': [{'type': 'text', 'text': partial_response}]
             }]
     def _is_video_file(self, filename):
         return any(filename.lower().endswith(ext) for ext in
+                   ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg'])
     def construct_messages(self, inputs: dict) -> list:
         content = []
+        for path in inputs.get('files', []):
             if self._is_video_file(path):
+                content.append({"type": "video", "video": f'file://{path}'})
             else:
+                content.append({"type": "image", "image": f'file://{path}'})
+        query = inputs.get('text', '')
+        if query:
+            content.append({"type": "text", "text": query})
+        return [{"role": "user", "content": content}]