Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Community

prithivMLmods committed on Jul 29

Commit

b6e3398

verified ·

1 Parent(s): 8ac376e

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -95

app.py CHANGED Viewed

@@ -19,6 +19,7 @@ from qwen_vl_utils import process_vision_info
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Camel-Doc-OCR-062825
@@ -116,7 +117,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             {"type": "text", "text": text},
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt_full],
@@ -126,12 +126,10 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
@@ -177,7 +175,6 @@ def generate_video(model_name: str, text: str, video_path: str,
         image, timestamp = frame
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -187,7 +184,6 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -201,7 +197,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
@@ -213,9 +208,10 @@ def generate_video(model_name: str, text: str, video_path: str,
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 video_examples = [
     ["explain the ad video in detail.", "videos/1.mp4"],
     ["explain the video in detail.", "videos/2.mp4"]
@@ -235,96 +231,10 @@ css = """
     border-radius: 10px;
     padding: 20px;
 }
-/* From Uiverse.io by Subaashbala */
-button {
-  display: flex;
-  justify-content: space-around;
-  align-items: center;
-  padding: 1em 0em 1em 1em;
-  background-color: yellow;
-  cursor: pointer;
-  box-shadow: 4px 6px 0px black;
-  border: 4px solid;
-  border-radius: 15px;
-  position: relative;
-  overflow: hidden;
-  z-index: 100;
-  transition: box-shadow 250ms, transform 250ms, filter 50ms;
-}
-button:hover {
-  transform: translate(2px, 2px);
-  box-shadow: 2px 3px 0px black;
-}
-button:active {
-  filter: saturate(0.75);
-}
-button::after {
-  content: "";
-  position: absolute;
-  inset: 0;
-  background-color: pink;
-  z-index: -1;
-  transform: translateX(-100%);
-  transition: transform 250ms;
-}
-button:hover::after {
-  transform: translateX(0);
-}
-.bgContainer {
-  position: relative;
-  display: flex;
-  justify-content: start;
-  align-items: center;
-  overflow: hidden;
-  max-width: 35%; /* adjust this if the button text is not proper */
-  font-size: 2em;
-  font-weight: 600;
-}
-.bgContainer span {
-  position: relative;
-  transform: translateX(-100%);
-  transition: all 250ms;
-}
-.button:hover .bgContainer > span {
-  transform: translateX(0);
-}
-.arrowContainer {
-  padding: 1em;
-  margin-inline-end: 1em;
-  border: 4px solid;
-  border-radius: 50%;
-  background-color: pink;
-  position: relative;
-  overflow: hidden;
-  transition: transform 250ms, background-color 250ms;
-  z-index: 100;
-}
-.arrowContainer::after {
-  content: "";
-  position: absolute;
-  inset: 0;
-  border-radius: inherit;
-  background-color: yellow;
-  transform: translateX(-100%);
-  z-index: -1;
-  transition: transform 250ms ease-in-out;
-}
-button:hover .arrowContainer::after {
-  transform: translateX(0);
-}
-button:hover .arrowContainer {
-  transform: translateX(5px);
-}
-button:active .arrowContainer {
-  transform: translateX(8px);
-}
-.arrowContainer svg {
-  vertical-align: middle;
-}
 """
 # Create the Gradio Interface
-with gr.Blocks(css=css) as demo:
     gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
@@ -345,24 +255,30 @@ with gr.Blocks(css=css) as demo:
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
-                with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
                 choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[

 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Camel-Doc-OCR-062825
             {"type": "text", "text": text},
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt_full],
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         image, timestamp = frame
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
     }
     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     for new_text in streamer:
         buffer += new_text
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 video_examples = [
     ["explain the ad video in detail.", "videos/1.mp4"],
     ["explain the video in detail.", "videos/2.mp4"]
     border-radius: 10px;
     padding: 20px;
 }
 """
 # Create the Gradio Interface
+with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Multimodal OCR Comparator](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
         with gr.Column():
                         examples=video_examples,
                         inputs=[video_query, video_upload]
                     )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+                with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.md)")
             model_choice = gr.Radio(
                 choices=["Camel-Doc-OCR-062825", "MonkeyOCR-pro-1.2B", "Megalodon-OCR-Sync-0713", "Qwen2-VL-OCR-2B"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
     # Define the submit button actions
     image_submit.click(fn=generate_image,
                        inputs=[