Update modeling_dots_ocr_vllm.py (#7)

Browse files

- Update modeling_dots_ocr_vllm.py (67cbf202a8f58ec138797afa24090a392d608c1e)
- Update modeling_dots_ocr_vllm.py (4bc8bca53381124acd4bf63f4567b515539b3c7f)

Co-authored-by: Renjie Wu <RenjieWu@users.noreply.huggingface.co>

Files changed (1) hide show

modeling_dots_ocr_vllm.py +22 -0

modeling_dots_ocr_vllm.py CHANGED Viewed

@@ -91,6 +91,17 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
         return config
     def get_hf_processor(
         self,
         *,
@@ -99,6 +110,7 @@ class DotsOCRProcessingInfo(Qwen2_5_VLProcessingInfo):
         size: Optional[dict[str, int]] = None,
         **kwargs: object,
     ) -> Qwen2VLProcessor:
         processor = self.ctx.get_hf_processor(
             Qwen2VLProcessor,
             image_processor=self.get_image_processor(min_pixels=min_pixels, max_pixels=max_pixels, size=size),
@@ -166,6 +178,11 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal):
     )
     _tp_plan = {}
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -409,6 +426,10 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal):
 def patch_vllm_chat_placeholder():
     from vllm.entrypoints.chat_utils import BaseMultiModalItemTracker
     ori = BaseMultiModalItemTracker._placeholder_str
@@ -426,4 +447,5 @@ ModelRegistry.register_model(
     "DotsOCRForCausalLM", DotsOCRForCausalLM,
 )
 patch_vllm_chat_placeholder()

         return config
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        """Report the per-modality item limits advertised by this processor.
+
+        Returns:
+            A mapping with ``"image"`` set to ``None`` and ``"video"`` set
+            to ``0``.
+        """
+        # NOTE(review): presumably ``None`` means "no fixed cap" and ``0``
+        # disables the modality -- confirm against the vLLM base class.
+        limits = dict(image=None, video=0)
+        return limits
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        """Return the maximum token count a single item of each modality uses.
+
+        Args:
+            seq_len: Accepted for interface compatibility; not read here.
+            mm_counts: Accepted for interface compatibility; not read here.
+
+        Returns:
+            A mapping whose ``"image"`` entry is the value of
+            ``self.get_max_image_tokens()`` and whose ``"video"`` entry is
+            ``0``.
+        """
+        return dict(image=self.get_max_image_tokens(), video=0)
     def get_hf_processor(
         self,
         *,
         size: Optional[dict[str, int]] = None,
         **kwargs: object,
     ) -> Qwen2VLProcessor:
+        self.get_tokenizer().image_token = "<|imgpad|>" # Ensure image token is set
         processor = self.ctx.get_hf_processor(
             Qwen2VLProcessor,
             image_processor=self.get_image_processor(min_pixels=min_pixels, max_pixels=max_pixels, size=size),
     )
     _tp_plan = {}
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        """Return the prompt placeholder text for one multimodal item.
+
+        Args:
+            modality: Modality name; only ``"image"`` yields a placeholder.
+            i: Item index; not used when building the placeholder.
+
+        Returns:
+            The image placeholder string for ``"image"``, otherwise ``None``.
+        """
+        if modality != "image":
+            return None
+        return "<|img|><|imgpad|><|endofimg|>"
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 def patch_vllm_chat_placeholder():
+    import vllm
+    # Skip the monkey patch on vLLM releases newer than 0.9.1.
+    # NOTE(review): presumably newer releases no longer need it -- confirm.
+    # Compare (major, minor, patch) as one tuple: checking the fields
+    # independently (minor <= 9 and patch <= 1) wrongly classified older
+    # releases such as 0.8.5 as "newer than 0.9.1" and skipped the patch.
+    if tuple(vllm.__version_tuple__[:3]) > (0, 9, 1):
+        return
     from vllm.entrypoints.chat_utils import BaseMultiModalItemTracker
     ori = BaseMultiModalItemTracker._placeholder_str
     "DotsOCRForCausalLM", DotsOCRForCausalLM,
 )
 patch_vllm_chat_placeholder()