Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,6 +19,7 @@ from qwen_vl_utils import process_vision_info
|
|
| 19 |
MAX_MAX_NEW_TOKENS = 2048
|
| 20 |
DEFAULT_MAX_NEW_TOKENS = 1024
|
| 21 |
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
|
|
|
|
| 22 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 23 |
|
| 24 |
# Load Camel-Doc-OCR-062825
|
|
@@ -116,7 +117,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 116 |
{"type": "text", "text": text},
|
| 117 |
]
|
| 118 |
}]
|
| 119 |
-
|
| 120 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 121 |
inputs = processor(
|
| 122 |
text=[prompt_full],
|
|
@@ -126,12 +126,10 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 126 |
truncation=False,
|
| 127 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 128 |
).to(device)
|
| 129 |
-
|
| 130 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 131 |
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
|
| 132 |
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
|
| 133 |
thread.start()
|
| 134 |
-
|
| 135 |
buffer = ""
|
| 136 |
for new_text in streamer:
|
| 137 |
buffer += new_text
|
|
@@ -177,7 +175,6 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 177 |
image, timestamp = frame
|
| 178 |
messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
|
| 179 |
messages[1]["content"].append({"type": "image", "image": image})
|
| 180 |
-
|
| 181 |
inputs = processor.apply_chat_template(
|
| 182 |
messages,
|
| 183 |
tokenize=True,
|
|
@@ -187,7 +184,6 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 187 |
truncation=False,
|
| 188 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 189 |
).to(device)
|
| 190 |
-
|
| 191 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 192 |
generation_kwargs = {
|
| 193 |
**inputs,
|
|
@@ -201,7 +197,6 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 201 |
}
|
| 202 |
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
|
| 203 |
thread.start()
|
| 204 |
-
|
| 205 |
buffer = ""
|
| 206 |
for new_text in streamer:
|
| 207 |
buffer += new_text
|
|
@@ -213,9 +208,10 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 213 |
image_examples = [
|
| 214 |
["convert this page to doc [text] precisely for markdown.", "images/1.png"],
|
| 215 |
["convert this page to doc [table] precisely for markdown.", "images/2.png"],
|
| 216 |
-
["explain the movie shot in detail.", "images/3.png"],
|
| 217 |
["fill the correct numbers.", "images/4.png"]
|
| 218 |
]
|
|
|
|
| 219 |
video_examples = [
|
| 220 |
["explain the ad video in detail.", "videos/1.mp4"],
|
| 221 |
["explain the video in detail.", "videos/2.mp4"]
|
|
@@ -235,96 +231,10 @@ css = """
|
|
| 235 |
border-radius: 10px;
|
| 236 |
padding: 20px;
|
| 237 |
}
|
| 238 |
-
/* From Uiverse.io by Subaashbala */
|
| 239 |
-
button {
|
| 240 |
-
display: flex;
|
| 241 |
-
justify-content: space-around;
|
| 242 |
-
align-items: center;
|
| 243 |
-
padding: 1em 0em 1em 1em;
|
| 244 |
-
background-color: yellow;
|
| 245 |
-
cursor: pointer;
|
| 246 |
-
box-shadow: 4px 6px 0px black;
|
| 247 |
-
border: 4px solid;
|
| 248 |
-
border-radius: 15px;
|
| 249 |
-
position: relative;
|
| 250 |
-
overflow: hidden;
|
| 251 |
-
z-index: 100;
|
| 252 |
-
transition: box-shadow 250ms, transform 250ms, filter 50ms;
|
| 253 |
-
}
|
| 254 |
-
button:hover {
|
| 255 |
-
transform: translate(2px, 2px);
|
| 256 |
-
box-shadow: 2px 3px 0px black;
|
| 257 |
-
}
|
| 258 |
-
button:active {
|
| 259 |
-
filter: saturate(0.75);
|
| 260 |
-
}
|
| 261 |
-
button::after {
|
| 262 |
-
content: "";
|
| 263 |
-
position: absolute;
|
| 264 |
-
inset: 0;
|
| 265 |
-
background-color: pink;
|
| 266 |
-
z-index: -1;
|
| 267 |
-
transform: translateX(-100%);
|
| 268 |
-
transition: transform 250ms;
|
| 269 |
-
}
|
| 270 |
-
button:hover::after {
|
| 271 |
-
transform: translateX(0);
|
| 272 |
-
}
|
| 273 |
-
.bgContainer {
|
| 274 |
-
position: relative;
|
| 275 |
-
display: flex;
|
| 276 |
-
justify-content: start;
|
| 277 |
-
align-items: center;
|
| 278 |
-
overflow: hidden;
|
| 279 |
-
max-width: 35%; /* adjust this if the button text is not proper */
|
| 280 |
-
font-size: 2em;
|
| 281 |
-
font-weight: 600;
|
| 282 |
-
}
|
| 283 |
-
.bgContainer span {
|
| 284 |
-
position: relative;
|
| 285 |
-
transform: translateX(-100%);
|
| 286 |
-
transition: all 250ms;
|
| 287 |
-
}
|
| 288 |
-
.button:hover .bgContainer > span {
|
| 289 |
-
transform: translateX(0);
|
| 290 |
-
}
|
| 291 |
-
.arrowContainer {
|
| 292 |
-
padding: 1em;
|
| 293 |
-
margin-inline-end: 1em;
|
| 294 |
-
border: 4px solid;
|
| 295 |
-
border-radius: 50%;
|
| 296 |
-
background-color: pink;
|
| 297 |
-
position: relative;
|
| 298 |
-
overflow: hidden;
|
| 299 |
-
transition: transform 250ms, background-color 250ms;
|
| 300 |
-
z-index: 100;
|
| 301 |
-
}
|
| 302 |
-
.arrowContainer::after {
|
| 303 |
-
content: "";
|
| 304 |
-
position: absolute;
|
| 305 |
-
inset: 0;
|
| 306 |
-
border-radius: inherit;
|
| 307 |
-
background-color: yellow;
|
| 308 |
-
transform: translateX(-100%);
|
| 309 |
-
z-index: -1;
|
| 310 |
-
transition: transform 250ms ease-in-out;
|
| 311 |
-
}
|
| 312 |
-
button:hover .arrowContainer::after {
|
| 313 |
-
transform: translateX(0);
|
| 314 |
-
}
|
| 315 |
-
button:hover .arrowContainer {
|
| 316 |
-
transform: translateX(5px);
|
| 317 |
-
}
|
| 318 |
-
button:active .arrowContainer {
|
| 319 |
-
transform: translateX(8px);
|
| 320 |
-
}
|
| 321 |
-
.arrowContainer svg {
|
| 322 |
-
vertical-align: middle;
|
| 323 |
-
}
|
| 324 |
"""
|
| 325 |
|
| 326 |
# Create the Gradio Interface
|
| 327 |
-
with gr.Blocks(css=css) as demo:
|
| 328 |
gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
|
| 329 |
with gr.Row():
|
| 330 |
with gr.Column():
|
|
@@ -345,24 +255,30 @@ with gr.Blocks(css=css) as demo:
|
|
| 345 |
examples=video_examples,
|
| 346 |
inputs=[video_query, video_upload]
|
| 347 |
)
|
|
|
|
| 348 |
with gr.Accordion("Advanced options", open=False):
|
| 349 |
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
|
| 350 |
temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
|
| 351 |
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
|
| 352 |
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
|
| 353 |
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
|
|
|
| 354 |
with gr.Column():
|
| 355 |
with gr.Column(elem_classes="canvas-output"):
|
| 356 |
gr.Markdown("## Output")
|
| 357 |
output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
|
| 358 |
-
|
|
|
|
| 359 |
markdown_output = gr.Markdown(label="(Result.md)")
|
|
|
|
| 360 |
model_choice = gr.Radio(
|
| 361 |
choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
|
| 362 |
label="Select Model",
|
| 363 |
value="Camel-Doc-OCR-062825"
|
| 364 |
)
|
|
|
|
| 365 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
|
|
|
|
| 366 |
# Define the submit button actions
|
| 367 |
image_submit.click(fn=generate_image,
|
| 368 |
inputs=[
|
|
|
|
| 19 |
MAX_MAX_NEW_TOKENS = 2048
|
| 20 |
DEFAULT_MAX_NEW_TOKENS = 1024
|
| 21 |
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
|
| 22 |
+
|
| 23 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 24 |
|
| 25 |
# Load Camel-Doc-OCR-062825
|
|
|
|
| 117 |
{"type": "text", "text": text},
|
| 118 |
]
|
| 119 |
}]
|
|
|
|
| 120 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 121 |
inputs = processor(
|
| 122 |
text=[prompt_full],
|
|
|
|
| 126 |
truncation=False,
|
| 127 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 128 |
).to(device)
|
|
|
|
| 129 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 130 |
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
|
| 131 |
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
|
| 132 |
thread.start()
|
|
|
|
| 133 |
buffer = ""
|
| 134 |
for new_text in streamer:
|
| 135 |
buffer += new_text
|
|
|
|
| 175 |
image, timestamp = frame
|
| 176 |
messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
|
| 177 |
messages[1]["content"].append({"type": "image", "image": image})
|
|
|
|
| 178 |
inputs = processor.apply_chat_template(
|
| 179 |
messages,
|
| 180 |
tokenize=True,
|
|
|
|
| 184 |
truncation=False,
|
| 185 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 186 |
).to(device)
|
|
|
|
| 187 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 188 |
generation_kwargs = {
|
| 189 |
**inputs,
|
|
|
|
| 197 |
}
|
| 198 |
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
|
| 199 |
thread.start()
|
|
|
|
| 200 |
buffer = ""
|
| 201 |
for new_text in streamer:
|
| 202 |
buffer += new_text
|
|
|
|
| 208 |
image_examples = [
|
| 209 |
["convert this page to doc [text] precisely for markdown.", "images/1.png"],
|
| 210 |
["convert this page to doc [table] precisely for markdown.", "images/2.png"],
|
| 211 |
+
["explain the movie shot in detail.", "images/3.png"],
|
| 212 |
["fill the correct numbers.", "images/4.png"]
|
| 213 |
]
|
| 214 |
+
|
| 215 |
video_examples = [
|
| 216 |
["explain the ad video in detail.", "videos/1.mp4"],
|
| 217 |
["explain the video in detail.", "videos/2.mp4"]
|
|
|
|
| 231 |
border-radius: 10px;
|
| 232 |
padding: 20px;
|
| 233 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
"""
|
| 235 |
|
| 236 |
# Create the Gradio Interface
|
| 237 |
+
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
| 238 |
gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
|
| 239 |
with gr.Row():
|
| 240 |
with gr.Column():
|
|
|
|
| 255 |
examples=video_examples,
|
| 256 |
inputs=[video_query, video_upload]
|
| 257 |
)
|
| 258 |
+
|
| 259 |
with gr.Accordion("Advanced options", open=False):
|
| 260 |
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
|
| 261 |
temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
|
| 262 |
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
|
| 263 |
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
|
| 264 |
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
| 265 |
+
|
| 266 |
with gr.Column():
|
| 267 |
with gr.Column(elem_classes="canvas-output"):
|
| 268 |
gr.Markdown("## Output")
|
| 269 |
output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
|
| 270 |
+
|
| 271 |
+
with gr.Accordion("(Result.md)", open=False):
|
| 272 |
markdown_output = gr.Markdown(label="(Result.md)")
|
| 273 |
+
|
| 274 |
model_choice = gr.Radio(
|
| 275 |
choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
|
| 276 |
label="Select Model",
|
| 277 |
value="Camel-Doc-OCR-062825"
|
| 278 |
)
|
| 279 |
+
|
| 280 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
|
| 281 |
+
|
| 282 |
# Define the submit button actions
|
| 283 |
image_submit.click(fn=generate_image,
|
| 284 |
inputs=[
|