Update app.py
app.py CHANGED

@@ -37,6 +37,7 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply

 os.system('pip install backoff')
+
 # Global constants and helper functions

 MAX_SEED = np.iinfo(np.int32).max

@@ -323,6 +324,14 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()

+# ------------------------------------------------------------------------------
+# New Gemma3-4b Multimodal Feature (Image & Text)
+# ------------------------------------------------------------------------------
+from transformers import AutoProcessor as Gemma3AutoProcessor, Gemma3ForConditionalGeneration
+gemma3_model_id = "google/gemma-3-4b-it"
+gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(gemma3_model_id, device_map="auto").eval()
+gemma3_processor = Gemma3AutoProcessor.from_pretrained(gemma3_model_id)
+
 # Asynchronous text-to-speech

 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
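For context, here is a minimal, non-streaming sketch of how the Gemma3-4b model and processor loaded above can be exercised on their own, outside the Gradio app. The torch.bfloat16 load dtype, the max_new_tokens value, and the local test.jpg image are assumptions for illustration, not part of this commit; the chat-template and generate calls mirror the ones in the @gemma3-4b branch added further down.

# Standalone sanity check of the Gemma3-4b image+text pipeline (illustrative sketch only).
import torch
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16  # assumed dtype; the commit loads with defaults
).eval()
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "image", "image": Image.open("test.jpg")},  # hypothetical local image
        {"type": "text", "text": "Describe this image in detail."},
    ]},
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)

# Decode only the newly generated tokens, skipping the echoed prompt.
print(processor.decode(output_ids[0][input_len:], skip_special_tokens=True))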
@@ -464,7 +473,7 @@ def detect_objects(image: np.ndarray):

     return Image.fromarray(annotated_image)

-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, @phi4, and now @gemma3-4b commands

 @spaces.GPU
 def generate(

@@ -484,7 +493,8 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    -
+    - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
+    - **"@gemma3-4b": triggers multimodal (image/text) processing using the Gemma3-4b model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])

@@ -644,6 +654,48 @@ def generate(
             yield buffer
         return

+    # --- Gemma3-4b Multimodal branch (Image/Text) with Streaming ---
+    if text.strip().lower().startswith("@gemma3-4b"):
+        question = text[len("@gemma3-4b"):].strip()
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}]
+            },
+            {
+                "role": "user",
+                "content": []
+            }
+        ]
+        if files:
+            try:
+                # If file is already a PIL Image, use it; otherwise try opening it.
+                if isinstance(files[0], Image.Image):
+                    image = files[0]
+                else:
+                    image = Image.open(files[0])
+                messages[1]["content"].append({"type": "image", "image": image})
+            except Exception as e:
+                yield f"Error processing image: {str(e)}"
+                return
+        messages[1]["content"].append({"type": "text", "text": question})
+        inputs = gemma3_processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True,
+            return_dict=True, return_tensors="pt"
+        ).to(gemma3_model.device, dtype=torch.bfloat16)
+        input_len = inputs["input_ids"].shape[-1]
+        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False}
+        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing Gemma3-4b Multimodal")
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
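The branch above streams partial output to the chat UI rather than waiting for generate() to finish. The threaded streaming pattern it relies on is sketched in isolation below; model, processor, and inputs are assumed to be prepared as in the earlier sketch, and the print call stands in for the Gradio yield.

# Sketch of the TextIteratorStreamer pattern used by the @gemma3-4b branch (assumptions as above).
from threading import Thread
from transformers import TextIteratorStreamer

# The streamer receives token ids from generate() and yields decoded text fragments.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": 256, "do_sample": False}

# generate() blocks until it finishes, so it runs in a background thread
# while the foreground loop consumes the streamer incrementally.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:  # iteration ends when generation completes
    buffer += new_text
    print(new_text, end="", flush=True)
thread.join()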
@@ -744,6 +796,7 @@ demo = gr.ChatInterface(
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@gemma3-4b Describe this image in detail."]
     ],
     cache_examples=False,
     type="messages",

@@ -754,7 +807,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
+        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @gemma3-4b, @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,