update app (#5)
Commit 49a81e87b12c424ddc163194fa0b62d05713c2d0
app.py (CHANGED)
@@ -78,6 +78,15 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
+# Nemesis-VLMer-7B-0818
+MODEL_ID_N = "prithivMLmods/Nemesis-VLMer-7B-0818"
+processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
+model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_N,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+
 # Preprocessing functions for SmolDocling-256M
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
     """Add random padding to an image based on its size."""

@@ -144,6 +153,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
+    elif model_name == "Nemesis-VLMer-7B":
+        processor = processor_n
+        model = model_n
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return

@@ -222,6 +234,9 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
+    elif model_name == "Nemesis-VLMer-7B":
+        processor = processor_n
+        model = model_n
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return

@@ -352,7 +367,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         formatted_output = gr.Markdown(label="(Result.md)")
 
         model_choice = gr.Radio(
-            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
+            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B", "SmolDocling-256M-preview", "Nemesis-VLMer-7B"],
             label="Select Model",
             value="Nanonets-OCR-s"
         )

@@ -362,6 +377,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
         gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
         gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
+        gr.Markdown("> [Nemesis-VLMer-7B-0818](https://huggingface.co/prithivMLmods/Nemesis-VLMer-7B-0818): The Nemesis-VLMer-7B-0818 model is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Reasoning, Content Analysis, and Visual Question Answering. Built on top of the Qwen2.5-VL architecture, this model enhances multimodal comprehension capabilities ")
         gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
         image_submit.click(
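Taken together, the hunks register a fourth large model, Nemesis-VLMer-7B-0818: it is loaded with the same float16 and trust_remote_code settings as the other models, routed through the selection branches in generate_image() and generate_video(), and surfaced in the model radio and the description panel. The body of generate_image() is not part of this diff, so the following is only a minimal sketch of how the newly added processor_n/model_n pair is typically driven with the standard Qwen2.5-VL chat-template API in transformers; the ocr_image helper and its prompt are illustrative and are not the Space's actual code.

```python
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same loading settings as in the diff above.
MODEL_ID_N = "prithivMLmods/Nemesis-VLMer-7B-0818"
processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_N,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

def ocr_image(image: Image.Image, prompt: str = "OCR the text in the image.") -> str:
    """Illustrative single-image inference; not the Space's generate_image()."""
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }]
    # Render the chat template, then let the processor pair the PIL image with
    # the image placeholder tokens it inserted.
    chat_text = processor_n.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor_n(text=[chat_text], images=[image], return_tensors="pt").to(device)
    with torch.inference_mode():
        output_ids = model_n.generate(**inputs, max_new_tokens=1024)
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor_n.batch_decode(new_tokens, skip_special_tokens=True)[0]

# Example usage (assumes a local test image):
# print(ocr_image(Image.open("sample_page.png").convert("RGB")))
```

In the app itself the new pair is simply assigned to the local processor/model variables, so whatever prompting and streaming logic the other Qwen2.5-VL-based models in the Space use applies unchanged to Nemesis-VLMer-7B.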