prithivMLmods committed
Commit be8b851 · verified · 1 Parent(s): 16e37bd

Update app.py

Files changed (1): app.py +7 -7
app.py CHANGED
@@ -87,7 +87,7 @@ model_md3 = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.bfloat16,
     device_map={"": "cuda"},
 )
-# FIXED: Added trust_remote_code=True to the tokenizer loading
+# FIX: Added trust_remote_code=True to resolve the loading error
 tokenizer_md3 = AutoTokenizer.from_pretrained(MODEL_ID_MD3, trust_remote_code=True)
 
 
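For context, the tokenizer fix follows the usual transformers pattern: a repository that ships custom modeling or tokenizer code must be loaded with trust_remote_code=True on both the model and the tokenizer, otherwise from_pretrained raises a ValueError asking for that flag. A minimal sketch of the full loading step, assuming the MODEL_ID_MD3 constant defined earlier in app.py (the repo id shown here is an assumption):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID_MD3 = "moondream/moondream3-preview"  # assumed value; defined earlier in app.py

# Both the model and the tokenizer need trust_remote_code=True, since the
# repository ships its own modeling/tokenizer code.
model_md3 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_MD3,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
tokenizer_md3 = AutoTokenizer.from_pretrained(MODEL_ID_MD3, trust_remote_code=True)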
 
@@ -183,11 +183,11 @@ def process_document_stream(
 
     # --- Special Handling for Moondream3 ---
     if model_name == "Moondream3":
-        # Moondream3 has a different inference method
-        enc_image = model_md3.encode_image(image)
+        # Moondream3 uses a different prompt structure and doesn't stream by default in this implementation
+        prompt_full = f"<image>\n\nQuestion: {prompt_input}\n\nAnswer:"
         answer = model_md3.answer_question(
-            enc_image,
-            prompt_input,
+            model_md3.encode_image(image),
+            prompt_full,
             tokenizer=tokenizer_md3
         )
         yield answer, answer
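The rewritten branch builds the Question/Answer prompt explicitly and passes the encoded image inline, yielding the whole answer at once instead of streaming token by token. A sketch of the resulting call path, using the same encode_image/answer_question interface the hunk above relies on (the wrapper function name is hypothetical):

def ask_moondream3(image, prompt_input):
    # The model expects an <image> placeholder followed by a
    # Question/Answer scaffold rather than a chat template.
    prompt_full = f"<image>\n\nQuestion: {prompt_input}\n\nAnswer:"
    # encode_image() computes the image features once; answer_question()
    # decodes the full answer in one call (no streaming here).
    return model_md3.answer_question(
        model_md3.encode_image(image),
        prompt_full,
        tokenizer=tokenizer_md3,
    )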
@@ -287,8 +287,8 @@ def create_gradio_interface():
         raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
         with gr.Row():
             examples = gr.Examples(
-                examples=[["examples/1.png"], ["examples/2.png"], ["examples/3.png"],
-                          ["examples/4.png"], ["examples/5.png"], ["examples/6.png"]],
+                examples=["examples/1.png", "examples/2.png", "examples/3.png",
+                          "examples/4.png", "examples/5.png", "examples/6.png"],
                 inputs=image_input, label="Examples"
             )
             gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
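The examples change is a simplification rather than a bug fix: gr.Examples accepts a flat list of values when there is exactly one input component, while the nested list-of-lists form is only required when each example spans multiple inputs. A minimal sketch, assuming image_input is a single gr.Image component as in app.py (the component definition shown is an assumption):

import gradio as gr

with gr.Blocks() as demo:
    image_input = gr.Image(type="pil", label="Image")  # assumed component definition
    # With a single input component, a flat list of example values works;
    # [["examples/1.png"], ...] would be equivalent.
    gr.Examples(
        examples=["examples/1.png", "examples/2.png", "examples/3.png"],
        inputs=image_input,
        label="Examples",
    )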
 