Update app.py

app.py (CHANGED)
@@ -30,8 +30,8 @@ import ast
 import html
 
 # Constants for text generation
-MAX_MAX_NEW_TOKENS =
-DEFAULT_MAX_NEW_TOKENS =
+MAX_MAX_NEW_TOKENS = 8192
+DEFAULT_MAX_NEW_TOKENS = 4096
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
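The bump to MAX_MAX_NEW_TOKENS = 8192 and DEFAULT_MAX_NEW_TOKENS = 4096 only raises the generation budget; prompt length is still capped by MAX_INPUT_TOKEN_LENGTH. Spaces like this one usually enforce that cap by trimming the tokenized prompt before calling generate(). A minimal sketch of that common pattern follows; the clamp itself is outside this hunk, so treat it as an assumption rather than the Space's exact code:

import os
import torch

MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

def clamp_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
    # Keep only the most recent MAX_INPUT_TOKEN_LENGTH tokens of a
    # [batch, seq_len] prompt so the input budget is never exceeded.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    return input_ids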
@@ -78,8 +78,8 @@ model_x = AutoModelForVision2Seq.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#
-MODEL_ID_N = "
+# Thyme-RL
+MODEL_ID_N = "Kwai-Keye/Thyme-RL"
 processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
 model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_N,
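The new processor_n/model_n pair is loaded the same way as the other Qwen2.5-VL-based models in the file. For orientation, here is a minimal single-image inference sketch in the standard Qwen2.5-VL chat-template style, assuming processor_n, model_n, device, and DEFAULT_MAX_NEW_TOKENS from app.py; it is an illustration, not the Space's generate_image implementation:

import torch
from PIL import Image

image = Image.open("images/8.png")  # one of the Space's example images
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe the image!"},
    ],
}]
# Render the chat template to a prompt string, then tokenize text + image together.
prompt = processor_n.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor_n(text=[prompt], images=[image], return_tensors="pt").to(device)
with torch.inference_mode():
    output_ids = model_n.generate(**inputs, max_new_tokens=DEFAULT_MAX_NEW_TOKENS)
print(processor_n.batch_decode(output_ids, skip_special_tokens=True)[0])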
@@ -153,7 +153,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
@@ -234,7 +234,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Typhoon-OCR-7B":
         processor = processor_l
         model = model_l
-    elif model_name == "
+    elif model_name == "Thyme-RL":
         processor = processor_n
         model = model_n
     else:
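Since generate_image and generate_video now carry identical elif ladders, a dictionary registry would keep the two dispatch sites in sync. A hypothetical refactor sketch, using only names that appear in this diff (the Space itself does not do this):

# Hypothetical: map each radio-button name to its (processor, model) pair once,
# then share the lookup between generate_image and generate_video.
MODEL_REGISTRY = {
    "Typhoon-OCR-7B": (processor_l, model_l),
    "Thyme-RL": (processor_n, model_n),
    # ... the remaining model pairs registered the same way
}

def resolve_model(model_name: str):
    try:
        return MODEL_REGISTRY[model_name]
    except KeyError:
        raise ValueError(f"Unknown model choice: {model_name}")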
@@ -299,6 +299,7 @@ def generate_video(model_name: str, text: str, video_path: str,
 # Define examples for image and video inference
 image_examples = [
     ["Reconstruct the doc [table] as it is.", "images/0.png"],
+    ["Describe the image!", "images/8.png"],
     ["OCR the image", "images/2.jpg"],
     ["Convert this page to docling", "images/1.png"],
     ["Convert this page to docling", "images/3.png"],
@@ -367,7 +368,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         formatted_output = gr.Markdown(label="(Result.md)")
 
         model_choice = gr.Radio(
-            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Typhoon-OCR-7B", "SmolDocling-256M-preview"
+            choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
             label="Select Model",
             value="Nanonets-OCR-s"
         )
@@ -377,7 +378,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
     gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
     gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
-    gr.Markdown("> [
+    gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
@@ -393,4 +394,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=
+    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
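For readers unfamiliar with the new launch line: as I understand current Gradio behavior, the keyword arguments break down as annotated below. This is a restatement for orientation, not authoritative Gradio documentation:

demo.queue(max_size=50)   # reject new requests once 50 jobs are already queued
demo.launch(
    share=True,           # also serve the app through a public *.gradio.live link
    mcp_server=True,      # expose the app's API as an MCP server endpoint
    ssr_mode=False,       # disable server-side rendering of the frontend
    show_error=True,      # surface Python exceptions in the browser UI
)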