add streamer and verbose params and image_file support for io.BytesIO, PIL.Image.Image in the infer method

#23 by weege007
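With the patched modeling_deepseekocr.py on this branch (refs/pr/23), infer accepts streamer and verbose keyword arguments, and image_file can be a file path, an io.BytesIO buffer, or a PIL.Image.Image. A minimal usage sketch, assuming model and tokenizer are already loaded as in the full Modal script below:

from threading import Thread
from transformers.generation.streamers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
Thread(
    target=model.infer,
    kwargs=dict(
        tokenizer=tokenizer,
        prompt="<image>\n<|grounding|>Convert the document to markdown. ",
        image_file="/DeepSeek-OCR/assets/fig1.png",  # also accepts io.BytesIO or PIL.Image.Image
        base_size=1024,
        image_size=640,
        crop_mode=True,
        save_results=False,
        eval_mode=False,
        streamer=streamer,  # new param: stream decoded text instead of printing inside infer
        verbose=False,  # new param: silence internal prints
    ),
).start()
for new_text in streamer:
    print(new_text, end="", flush=True)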
import io
import os
import sys
import uuid
import asyncio
import subprocess
from threading import Thread
from PIL import Image


import modal


app = modal.App("deepseek-ocr")
IMAGE_GPU = os.getenv("IMAGE_GPU", None)
BACKEND = os.getenv("BACKEND", None)
img = (
    # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags
    modal.Image.from_registry(
        "nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04",
        add_python="3.10",
    )
    .apt_install("git", "git-lfs")
    .run_commands(
        "git clone https://github.com/deepseek-ai/DeepSeek-OCR.git",
        "cd /DeepSeek-OCR && pip install -r requirements.txt",
    )
    .pip_install(
        "torch==2.6.0",
        "torchvision==0.21.0",
        "torchaudio==2.6.0",
        "wheel",
        "matplotlib",
        "accelerate>=0.26.0",  # for device_map="auto" model loading with safetensors slipt
    )
    # .apt_install("clang", "cmake", "ninja-build")
    # https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1 for torch 2.6.0
    .pip_install("flash-attn==2.7.4.post1", extra_options="--no-build-isolation")
    .env(
        {
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
            "ACHATBOT_PKG": "1",
            "CUDA_VISIBLE_DEVICES": "0",
            "LLM_MODEL": os.getenv("LLM_MODEL", "deepseek-ai/DeepSeek-OCR"),
        }
    )
)

if BACKEND == "achatbot":
    img = img.pip_install(
        f"achatbot==0.0.27.dev8",
        extra_index_url=os.getenv("EXTRA_INDEX_URL", "https://test.pypi.org/simple/"),
    )


HF_MODEL_DIR = "/root/.achatbot/models"
hf_model_vol = modal.Volume.from_name("models", create_if_missing=True)
ASSETS_DIR = "/root/.achatbot/assets"
assets_vol = modal.Volume.from_name("assets", create_if_missing=True)
CONFIG_DIR = "/root/.achatbot/config"
config_vol = modal.Volume.from_name("config", create_if_missing=True)

TORCH_CACHE_DIR = "/root/.cache/torch"
torch_cache_vol = modal.Volume.from_name("torch_cache", create_if_missing=True)


with img.imports():
    from queue import Queue

    import torch
    from transformers.generation.streamers import TextStreamer, TextIteratorStreamer
    from transformers import AutoModel, AutoTokenizer

    MODEL_ID = os.getenv("LLM_MODEL", "deepseek-ai/DeepSeek-OCR")
    MODEL_PATH = os.path.join(HF_MODEL_DIR, MODEL_ID)
    DEEPSEEK_ASSETS_DIR = os.path.join(ASSETS_DIR, "DeepSeek")

    # torch.set_float32_matmul_precision("high")


def print_model_params(model: torch.nn.Module, extra_info="", f=None):
    # print the number of parameters in the model
    model_million_params = sum(p.numel() for p in model.parameters()) / 1e6
    print(model, file=f)
    print(f"{extra_info} {model_million_params} M parameters", file=f)




@app.function(
    gpu=IMAGE_GPU,
    cpu=2.0,
    retries=0,
    image=img,
    secrets=[modal.Secret.from_name("achatbot")],
    volumes={
        HF_MODEL_DIR: hf_model_vol,
        ASSETS_DIR: assets_vol,
    },
    timeout=1200,  # default 300s
    scaledown_window=1200,
    max_containers=1,
)
async def run(func, **kwargs):
    os.makedirs(DEEPSEEK_ASSETS_DIR, exist_ok=True)
    subprocess.run("nvidia-smi --version", shell=True)
    subprocess.run("nvcc --version", shell=True)
    gpu_prop = None
    if torch.cuda.is_available():
        gpu_prop = torch.cuda.get_device_properties("cuda")
        print(gpu_prop)

    if asyncio.iscoroutinefunction(func):
        await func(**kwargs)
    else:
        func(**kwargs)


def dump_model(**kwargs):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    print("tokenizer.eos_token_id", tokenizer.eos_token_id)
    print("Tokenizer:", tokenizer)
    model = AutoModel.from_pretrained(
        MODEL_PATH,
        _attn_implementation="flash_attention_2",
        trust_remote_code=True,
        use_safetensors=True,
        torch_dtype=torch.bfloat16,
        device_map="cuda" if torch.cuda.is_available() else "auto",  # need accelerate>=0.26.0
    )
    model = model.eval()

    print_model_params(model, extra_info="DeepSeek-OCR", f=sys.stdout)


def infer(**kwargs):
    class NoEOSTextStreamer(TextStreamer):
        def on_finalized_text(self, text: str, stream_end: bool = False):
            eos_text = self.tokenizer.decode(
                [self.tokenizer.eos_token_id], skip_special_tokens=False
            )
            text = text.replace(eos_text, "\n")
            # print(text, flush=True, end="")
            stream_end and print("stream_end is True", flush=True)

    model = AutoModel.from_pretrained(
        MODEL_PATH,
        _attn_implementation="flash_attention_2",
        trust_remote_code=True,
        use_safetensors=True,
    )
    model = model.eval().cuda().to(torch.bfloat16)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    # prompt = "<image>\nFree OCR. "
    # document: <image>\n<|grounding|>Convert the document to markdown.
    # other image: <image>\n<|grounding|>OCR this image.
    # without layouts: <image>\nFree OCR.
    # figures in document: <image>\nParse the figure.
    # general: <image>\nDescribe this image in detail.
    # rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
    # '先天下之忧而忧'
    # .......

    # create dummy image file
    dummy_img = Image.new("RGB", (640, 640), color="white")
    ioBuff = io.BytesIO()
    dummy_img.save(ioBuff, format="PNG")
    ioBuff.seek(0)

    image_files = [
        dummy_img,
        ioBuff,
        "/DeepSeek-OCR/assets/fig1.png",
        # use OCR-detected "show" pictures; run detection on them again :)
        "/DeepSeek-OCR/assets/show1.jpg",
        # "/DeepSeek-OCR/assets/show2.jpg",
        # "/DeepSeek-OCR/assets/show3.jpg",
        # "/DeepSeek-OCR/assets/show4.jpg",
    ]

    # Tiny: base_size = 512, image_size = 512, crop_mode = False
    # Small: base_size = 640, image_size = 640, crop_mode = False
    # Base: base_size = 1024, image_size = 1024, crop_mode = False
    # Large: base_size = 1280, image_size = 1280, crop_mode = False
    # Gundam: base_size = 1024, image_size = 640, crop_mode = True # default

    for image_file in image_files:
        print("infer image_file:", image_file)
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            # skip_special_tokens=True,
            skip_special_tokens=False,
        )
        # https://huggingface.co/deepseek-ai/DeepSeek-OCR/blob/refs%2Fpr%2F23/modeling_deepseekocr.py#L707
        gen_kwargs = dict(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=image_file,
            # output_path=DEEPSEEK_ASSETS_DIR,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            save_results=False,  # enable save_results to push outputs to S3 or other storage for a CDN :)
            test_compress=False,
            eval_mode=False,
            # streamer=NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False),
            streamer=streamer,
            verbose=False,
        )
        thread = Thread(target=model.infer, kwargs=gen_kwargs)
        thread.start()
        for new_text in streamer:
            if new_text is not None:
                print(new_text, flush=True, end="")  # note: flush the output buffer so text prints immediately
        thread.join()  # ensure generation (and any post-processing inside infer) has finished




@torch.inference_mode()
def infer_filter(**kwargs):
    class NoEOSTextStreamer(TextStreamer):
        def on_finalized_text(self, text: str, stream_end: bool = False):
            eos_text = self.tokenizer.decode(
                [self.tokenizer.eos_token_id], skip_special_tokens=False
            )
            text = text.replace(eos_text, "\n")
            # print(text, flush=True, end="")
            stream_end and print("stream_end is True", flush=True)

    model = AutoModel.from_pretrained(
        MODEL_PATH,
        _attn_implementation="flash_attention_2",
        trust_remote_code=True,
        use_safetensors=True,
        # torch_dtype=torch.bfloat16,
        # device_map="cuda" if torch.cuda.is_available() else "auto",  # need accelerate>=0.26.0
    )
    model = model.eval().cuda().to(torch.bfloat16)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    # NOTE: use a fixed task prompt
    prompt = "<image>\n<|grounding|>Convert the document to markdown. "
    # prompt = "<image>\n<|grounding|>Convert the document to text. "  # has issues
    # prompt = "<image>\nFree OCR. "
    # document: <image>\n<|grounding|>Convert the document to markdown.
    # other image: <image>\n<|grounding|>OCR this image.
    # without layouts: <image>\nFree OCR.
    # figures in document: <image>\nParse the figure.
    # general: <image>\nDescribe this image in detail.
    # rec: <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
    # '先天下之忧而忧'
    # .......

    # create dummy image file
    dummy_img = Image.new("RGB", (640, 640), color="white")
    ioBuff = io.BytesIO()
    dummy_img.save(ioBuff, format="PNG")
    ioBuff.seek(0)

    image_files = [
        dummy_img,
        ioBuff,
        "/DeepSeek-OCR/assets/fig1.png",
        # use OCR-detected "show" pictures; run detection on them again :)
        "/DeepSeek-OCR/assets/show1.jpg",
        # "/DeepSeek-OCR/assets/show2.jpg",
        # "/DeepSeek-OCR/assets/show3.jpg",
        # "/DeepSeek-OCR/assets/show4.jpg",
    ]

    # Tiny: base_size = 512, image_size = 512, crop_mode = False
    # Small: base_size = 640, image_size = 640, crop_mode = False
    # Base: base_size = 1024, image_size = 1024, crop_mode = False
    # Large: base_size = 1280, image_size = 1280, crop_mode = False
    # Gundam: base_size = 1024, image_size = 640, crop_mode = True # default

    for image_file in image_files:
        print("infer image_file:", image_file)
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            # skip_special_tokens=True,
            skip_special_tokens=False,
        )
        # https://huggingface.co/deepseek-ai/DeepSeek-OCR/blob/refs%2Fpr%2F23/modeling_deepseekocr.py#L707
        gen_kwargs = dict(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=image_file,
            # output_path=DEEPSEEK_ASSETS_DIR,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            save_results=False,  # enable save_results to push outputs to S3 or other storage for a CDN :)
            test_compress=False,
            eval_mode=False,
            # streamer=NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False),
            streamer=streamer,
            verbose=False,
        )
        thread = Thread(target=model.infer, kwargs=gen_kwargs)
        thread.start()
        is_ref_det = False
        det_text = ""
        for new_text in streamer:
            if new_text is None:
                continue
            print(new_text, flush=True, end="")  # note: flush the output buffer so text prints immediately
            if "<|ref|>" in new_text:
                is_ref_det = True

            if "<|/det|>" in new_text:
                is_ref_det = False
                new_text = new_text.split("<|/det|>")[1]

            if "<|end▁of▁sentence|>" in new_text:
                if "<|/ref|>" not in new_text:
                    new_text = new_text.split("<|end▁of▁sentence|>")[0]
                print("\n" + 20 * "----")
            if "<|end▁of▁sentence|>" in new_text:
                if "<|/ref|>" not in new_text:
                    new_text = new_text.split("<|end▁of▁sentence|>")[0]
                print("\n" + 20 * "----")

            if is_ref_det is False:
                det_text += new_text
        print("det_text:", det_text)
        torch.cuda.empty_cache()


async def achatbot_infer(**kwargs):
    from achatbot.processors.vision.ocr_processor import OCRProcessor
    from achatbot.modules.vision.ocr import VisionOCREnvInit
    from achatbot.common.session import SessionCtx, Session
    from achatbot.types.frames.data_frames import UserImageRawFrame
    from achatbot.common.logger import Logger

    Logger.init(os.getenv("LOG_LEVEL", "info").upper(), is_file=False, is_console=True)

    ocr = VisionOCREnvInit.initVisionOCREngine(
        "llm_transformers_manual_vision_deepseek_ocr",
        {
            "lm_model_name_or_path": MODEL_PATH,
            "lm_device": "cuda",
            "ocr_base_size": 1024,
            "ocr_image_size": 640,
            "ocr_crop_mode": True,
            "ocr_prompt": "<image>\n<|grounding|>Convert the document to markdown. ",
        },
    )
    session = Session(**SessionCtx(str(uuid.uuid4())).__dict__)
    processor = OCRProcessor(ocr=ocr, session=session)
    image_files = [
        Image.new("RGB", (640, 640), color="white"),
        Image.open("/DeepSeek-OCR/assets/fig1.png"),
        # use OCR-detected "show" pictures; run detection on them again :)
        Image.open("/DeepSeek-OCR/assets/show1.jpg"),
        # Image.open("/DeepSeek-OCR/assets/show2.jpg"),
        # Image.open("/DeepSeek-OCR/assets/show3.jpg"),
        # Image.open("/DeepSeek-OCR/assets/show4.jpg"),
    ]
    for image_obj in image_files:
        image_obj: Image.Image = image_obj
        frame = UserImageRawFrame(
            image=image_obj.tobytes(),
            size=image_obj.size,
            format=image_obj.format,  # raw frame bytes carry no save format, so set one explicitly, e.g. JPEG or PNG
            mode=image_obj.mode,  # default: RGB
            user_id=session.ctx.client_id,
        )
        iter = processor.run_detect(frame)
        async for textFrame in iter:
            print(textFrame)


"""
modal run src/download_models.py --repo-ids "deepseek-ai/DeepSeek-OCR" --revision "refs/pr/23"

IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task dump_model
IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task infer
IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task infer_filter
BACKEND=achatbot IMAGE_GPU=L4 modal run src/llm/transformers/vlm/ocr_deepseek.py --task achatbot_infer
"""




@app.local_entrypoint()
def main(task: str = "dump_model"):
    tasks = {
        "dump_model": dump_model,
        "infer": infer,
        "infer_filter": infer_filter,
        "achatbot_infer": achatbot_infer,
    }
    if task not in tasks:
        raise ValueError(f"task {task} not found")
    print(f"running task {task}")
    run.remote(
        tasks[task],
    )