Spaces:

prithivMLmods
/

Multimodal-OCR2

Running on Zero

App Files Files Community

prithivMLmods committed on Sep 25

Commit

25a44d8

verified ·

1 Parent(s): eb21945

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -149

app.py CHANGED Viewed

@@ -23,7 +23,9 @@ from transformers import (
 )
 from transformers.image_utils import load_image
-from docling_core.types.doc import DoclingDocument, DocTagsDocument
 import re
 import ast
@@ -36,6 +38,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Nanonets-OCR-s
 MODEL_ID_M = "nanonets/Nanonets-OCR-s"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -87,7 +90,8 @@ model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     image = image.convert("RGB")
@@ -121,6 +125,7 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -133,76 +138,11 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
-@spaces.GPU
-def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """Generate responses for image input using the selected model."""
-    if model_name == "Nanonets-OCR-s":
-        processor = processor_m
-        model = model_m
-    elif model_name == "MonkeyOCR-Recognition":
-        processor = processor_g
-        model = model_g
-    elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
-    elif model_name == "Typhoon-OCR-7B":
-        processor = processor_l
-        model = model_l
-    elif model_name == "Thyme-RL":
-        processor = processor_n
-        model = model_n
-    else:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-    if image is None:
-        yield "Please upload an image.", "Please upload an image."
-        return
-    images = [image]
-    if model_name == "SmolDocling-256M-preview":
-        if "OTSL" in text or "code" in text:
-            images = [add_random_padding(img) for img in images]
-        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-            text = normalize_values(text, target_max=500)
-    messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"} for _ in images] + [
-                {"type": "text", "text": text}
-            ]
-        }
-    ]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "")
-        yield buffer, buffer
-    if model_name == "SmolDocling-256M-preview":
-        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
             if "<chart>" in cleaned_output:
                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
@@ -210,43 +150,44 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
             markdown_output = doc.export_to_markdown()
-            yield buffer, markdown_output
-        else:
-            yield buffer, cleaned_output
-@spaces.GPU
-def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """Generate responses for video input using the selected model."""
     if model_name == "Nanonets-OCR-s":
-        processor = processor_m
-        model = model_m
     elif model_name == "MonkeyOCR-Recognition":
-        processor = processor_g
-        model = model_g
     elif model_name == "SmolDocling-256M-preview":
-        processor = processor_x
-        model = model_x
     elif model_name == "Typhoon-OCR-7B":
-        processor = processor_l
-        model = model_l
     elif model_name == "Thyme-RL":
-        processor = processor_n
-        model = model_n
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
-    if video_path is None:
-        yield "Please upload a video.", "Please upload a video."
         return
-    frames = downsample_video(video_path)
-    images = [frame for frame, _ in frames]
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
@@ -255,12 +196,7 @@ def generate_video(model_name: str, text: str, video_path: str,
             text = normalize_values(text, target_max=500)
     messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"} for _ in images] + [
-                {"type": "text", "text": text}
-            ]
-        }
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
@@ -284,19 +220,20 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield buffer, buffer
     if model_name == "SmolDocling-256M-preview":
-        cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
-        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
-            if "<chart>" in cleaned_output:
-                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
-                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
-            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
-            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
-            markdown_output = doc.export_to_markdown()
-            yield buffer, markdown_output
-        else:
-            yield buffer, cleaned_output
-# Define examples for image and video inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
@@ -306,7 +243,7 @@ image_examples = [
     ["Convert chart to OTSL.", "images/4.png"],
     ["Convert code to text", "images/5.jpg"],
     ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to late.", "images/7.jpg"],
 ]
 video_examples = [
@@ -314,84 +251,99 @@ video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
-#css
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
     color: white !important;
 }
 .submit-btn:hover {
     background-color: #3498db !important;
 }
 .canvas-output {
     border: 2px solid #4682B4;
     border-radius: 10px;
     padding: 20px;
 }
 """
-# Create the Gradio Interface
-with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
-        with gr.Column():
             with gr.Tabs():
-                with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    image_upload = gr.Image(type="pil", label="Image", height=290)
-                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
                         examples=image_examples,
-                        inputs=[image_query, image_upload]
                     )
-                with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    video_upload = gr.Video(label="Video", height=290)
-                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(
                         examples=video_examples,
-                        inputs=[video_query, video_upload]
                     )
-            with gr.Accordion("Advanced options", open=False):
-                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-        with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5)
-                with gr.Accordion("(Result.md)", open=False):
-                    formatted_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
                 choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
-                label="Select Model",
                 value="Nanonets-OCR-s"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
-            gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-            gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
-            gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
-            gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-            gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
-            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
     image_submit.click(
-        fn=generate_image,
-        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[raw_output, formatted_output]
     )
     video_submit.click(
-        fn=generate_video,
-        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[raw_output,
-                 formatted_output]
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)

 )
 from transformers.image_utils import load_image
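+# docling_core is an optional third-party dependency (PyPI package "docling-core").
+# While the import below stays commented out, SmolDocling output is returned as
+# plain cleaned text instead of being converted to Markdown; uncomment it once
+# the package is installed.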
+# from docling_core.types.doc import DoclingDocument, DocTagsDocument
 import re
 import ast
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# --- Model Loading ---
 # Load Nanonets-OCR-s
 MODEL_ID_M = "nanonets/Nanonets-OCR-s"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
     torch_dtype=torch.float16
 ).to(device).eval()
+# --- Preprocessing and Helper Functions ---
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""
     image = image.convert("RGB")
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
+    # Use 10 frames for video processing
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
     vidcap.release()
     return frames
+# Post-process SmolDocling output: convert DocTags to Markdown when the
+# docling_core classes have been imported, otherwise return the cleaned text.
+def format_smoldocling_output(buffer_text, images):
+    cleaned_output = buffer_text.replace("<end_of_utterance>", "").strip()
+    # Check if docling_core is available and was imported
+    if 'DocTagsDocument' in globals() and 'DoclingDocument' in globals():
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
             if "<chart>" in cleaned_output:
                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
             markdown_output = doc.export_to_markdown()
+            return buffer_text, markdown_output
+    # Fallback if library is not available or tags are not present
+    return buffer_text, cleaned_output
+# --- Core Generation Logic ---
+def get_model_and_processor(model_name):
+    """Return the (processor, model) pair registered for ``model_name``.
+
+    Note the order: the processor comes first, the model second. An
+    unrecognised name yields ``(None, None)`` instead of raising, so the
+    caller has to check the result before using it.
+    """
     if model_name == "Nanonets-OCR-s":
+        return processor_m, model_m
     elif model_name == "MonkeyOCR-Recognition":
+        return processor_g, model_g
     elif model_name == "SmolDocling-256M-preview":
+        return processor_x, model_x
     elif model_name == "Typhoon-OCR-7B":
+        return processor_l, model_l
     elif model_name == "Thyme-RL":
+        return processor_n, model_n
     else:
+        return None, None
+@spaces.GPU
+def generate_response(model_name: str, text: str, media_input, media_type: str,
+                      max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
+    """Unified generation function for both image and video."""
+    processor, model = get_model_and_processor(model_name)
+    if not processor or not model:
         yield "Invalid model selected.", "Invalid model selected."
         return
+    if media_input is None:
+        yield f"Please upload a {media_type}.", f"Please upload a {media_type}."
         return
+    if media_type == "video":
+        frames = downsample_video(media_input)
+        images = [frame for frame, _ in frames]
+    else: # image
+        images = [media_input]
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
             text = normalize_values(text, target_max=500)
     messages = [
+        {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": text}]}
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
         yield buffer, buffer
     if model_name == "SmolDocling-256M-preview":
+        raw_output, formatted_output = format_smoldocling_output(buffer, images)
+        yield raw_output, formatted_output
+    else:
+        # For other models, the formatted output is just the cleaned buffer
+        yield buffer, buffer.strip()
+def generate_image_wrapper(text, image, model_name, *generation_args):
+    """Adapt the image tab's click inputs to ``generate_response``.
+
+    The click handler supplies values in UI order (query text, uploaded
+    image, model name, then the sampling settings), while
+    ``generate_response`` expects the model name first and ``media_type`` as
+    its fourth positional parameter. Forwarding ``*args`` unchanged together
+    with ``media_type="image"`` raised ``TypeError`` (multiple values for
+    ``media_type``), so the arguments are reordered explicitly here.
+
+    Yields the same ``(raw_output, formatted_output)`` pairs that
+    ``generate_response`` yields.
+    """
+    yield from generate_response(model_name, text, image, "image", *generation_args)
+def generate_video_wrapper(text, video_path, model_name, *generation_args):
+    """Adapt the video tab's click inputs to ``generate_response``.
+
+    The click handler supplies values in UI order (query text, uploaded
+    video, model name, then the sampling settings), while
+    ``generate_response`` expects the model name first and ``media_type`` as
+    its fourth positional parameter. Forwarding ``*args`` unchanged together
+    with ``media_type="video"`` raised ``TypeError`` (multiple values for
+    ``media_type``), so the arguments are reordered explicitly here.
+
+    Yields the same ``(raw_output, formatted_output)`` pairs that
+    ``generate_response`` yields.
+    """
+    yield from generate_response(model_name, text, video_path, "video", *generation_args)
+# --- Examples ---
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
     ["Describe the image!", "images/8.png"],
     ["Convert chart to OTSL.", "images/4.png"],
     ["Convert code to text", "images/5.jpg"],
     ["Convert this table to OTSL.", "images/6.jpg"],
+    ["Convert formula to latex.", "images/7.jpg"],
 ]
 video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
+# --- UI Styling ---
 css = """
 .submit-btn {
     background-color: #2980b9 !important;
     color: white !important;
+    border: none !important;
+    box-shadow: 2px 2px 5px rgba(0,0,0,0.2) !important;
 }
 .submit-btn:hover {
     background-color: #3498db !important;
+    box-shadow: 2px 2px 8px rgba(0,0,0,0.3) !important;
 }
 .canvas-output {
     border: 2px solid #4682B4;
     border-radius: 10px;
     padding: 20px;
+    background-color: #f0f8ff;
 }
 """
+# --- Gradio Interface ---
+with gr.Blocks(css=css) as demo:
     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
+        # Left Column for Inputs and Controls
+        with gr.Column(scale=1):
             with gr.Tabs():
+                with gr.TabItem("🖼️ Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    image_upload = gr.Image(type="pil", label="Upload Image", height=300)
                     gr.Examples(
                         examples=image_examples,
+                        inputs=[image_query, image_upload],
+                        label="Image Examples"
                     )
+                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                with gr.TabItem("🎥 Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    video_upload = gr.Video(label="Upload Video", height=300)
                     gr.Examples(
                         examples=video_examples,
+                        inputs=[video_query, video_upload],
+                        label="Video Examples"
                     )
+                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
+            with gr.Accordion("⚙️ Advanced Options", open=False):
+                max_new_tokens = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+        # Right Column for Outputs and Model Info
+        with gr.Column(scale=1):
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5)
+                with gr.Accordion("📄 Formatted Result (Result.md)", open=True):
+                    formatted_output = gr.Markdown(label="Formatted Output")
             model_choice = gr.Radio(
                 choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
+                label="🤖 Select Model",
                 value="Nanonets-OCR-s"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
+            gr.Markdown("> **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: A powerful, state-of-the-art image-to-markdown OCR model that transforms documents into structured markdown with intelligent content recognition.")
+            gr.Markdown("> **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: A multimodal Image-Text-to-Text model designed for efficient document conversion, retaining key features of the larger Docling model.")
+            gr.Markdown("> **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: Adopts a Structure-Recognition-Relation (SRR) paradigm, simplifying the pipeline for document processing.")
+            gr.Markdown("> **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual document parsing model for real-world documents in Thai and English, capable of extracting text from images and charts.")
+            gr.Markdown("> **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Thyme transcends traditional 'thinking with images' by autonomously generating and executing code for image processing and computation, enhancing performance on complex reasoning tasks.")
+            gr.Markdown("> ⚠️ **Note**: All models in this space are primarily optimized for image tasks and may not perform as well on video inference use cases.")
+    # --- Event Handlers ---
+    common_inputs = [model_choice, max_new_tokens, temperature, top_p, top_k, repetition_penalty]
+    common_outputs = [raw_output, formatted_output]
     image_submit.click(
+        fn=generate_image_wrapper,
+        inputs=[image_query, image_upload] + common_inputs,
+        outputs=common_outputs
     )
     video_submit.click(
+        fn=generate_video_wrapper,
+        inputs=[video_query, video_upload] + common_inputs,
+        outputs=common_outputs
     )
 if __name__ == "__main__":
+    demo.queue(max_size=50).launch(share=True, show_error=True)