Update app.py
app.py CHANGED

@@ -37,6 +37,7 @@ from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
 from diffusers.utils import export_to_ply

 os.system('pip install backoff')
+
 # Global constants and helper functions

 MAX_SEED = np.iinfo(np.int32).max

@@ -323,6 +324,14 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()

+# ------------------------------------------------------------------------------
+# New Gemma3-4b Multimodal Feature (Image & Text)
+# ------------------------------------------------------------------------------
+from transformers import AutoProcessor as Gemma3AutoProcessor, Gemma3ForConditionalGeneration
+gemma3_model_id = "google/gemma-3-4b-it"
+gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(gemma3_model_id, device_map="auto").eval()
+gemma3_processor = Gemma3AutoProcessor.from_pretrained(gemma3_model_id)
+
 # Asynchronous text-to-speech

 async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
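For context, here is a minimal, non-streaming sketch of how the Gemma3-4b model and processor loaded above can be exercised on their own, outside the Gradio app. The torch.bfloat16 load dtype, the max_new_tokens value, and the local test.jpg image are assumptions for illustration, not part of this commit; the chat-template and generate calls mirror the ones in the @gemma3-4b branch added further down.

# Standalone sanity check of the Gemma3-4b image+text pipeline (illustrative sketch only).
import torch
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

model_id = "google/gemma-3-4b-it"
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16  # assumed dtype; the commit loads with defaults
).eval()
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [
        {"type": "image", "image": Image.open("test.jpg")},  # hypothetical local image
        {"type": "text", "text": "Describe this image in detail."},
    ]},
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt"
).to(model.device, dtype=torch.bfloat16)
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)

# Decode only the newly generated tokens, skipping the echoed prompt.
print(processor.decode(output_ids[0][input_len:], skip_special_tokens=True))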
@@ -464,7 +473,7 @@ def detect_objects(image: np.ndarray):

     return Image.fromarray(annotated_image)

-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @
+# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, @phi4, and now @gemma3-4b commands

 @spaces.GPU
 def generate(

@@ -484,7 +493,8 @@ def generate(
     - "@web": triggers a web search or webpage visit.
     - "@rAgent": initiates a reasoning chain using Llama mode.
     - "@yolo": triggers object detection using YOLO.
-    -
+    - "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.
+    - **"@gemma3-4b": triggers multimodal (image/text) processing using the Gemma3-4b model.**
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])

@@ -644,6 +654,48 @@ def generate(
             yield buffer
         return

+    # --- Gemma3-4b Multimodal branch (Image/Text) with Streaming ---
+    if text.strip().lower().startswith("@gemma3-4b"):
+        question = text[len("@gemma3-4b"):].strip()
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful assistant."}]
+            },
+            {
+                "role": "user",
+                "content": []
+            }
+        ]
+        if files:
+            try:
+                # If file is already a PIL Image, use it; otherwise try opening it.
+                if isinstance(files[0], Image.Image):
+                    image = files[0]
+                else:
+                    image = Image.open(files[0])
+                messages[1]["content"].append({"type": "image", "image": image})
+            except Exception as e:
+                yield f"Error processing image: {str(e)}"
+                return
+        messages[1]["content"].append({"type": "text", "text": question})
+        inputs = gemma3_processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True,
+            return_dict=True, return_tensors="pt"
+        ).to(gemma3_model.device, dtype=torch.bfloat16)
+        input_len = inputs["input_ids"].shape[-1]
+        streamer = TextIteratorStreamer(gemma3_processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens, "do_sample": False}
+        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing Gemma3-4b Multimodal")
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer
+        return
+
     # --- Text and TTS branch ---
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
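The branch above streams partial output to the chat UI rather than waiting for generate() to finish. The threaded streaming pattern it relies on is sketched in isolation below; model, processor, and inputs are assumed to be prepared as in the earlier sketch, and the print call stands in for the Gradio yield.

# Sketch of the TextIteratorStreamer pattern used by the @gemma3-4b branch (assumptions as above).
from threading import Thread
from transformers import TextIteratorStreamer

# The streamer receives token ids from generate() and yields decoded text fragments.
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": 256, "do_sample": False}

# generate() blocks until it finishes, so it runs in a background thread
# while the foreground loop consumes the streamer incrementally.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:  # iteration ends when generation completes
    buffer += new_text
    print(new_text, end="", flush=True)
thread.join()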
@@ -744,6 +796,7 @@ demo = gr.ChatInterface(
         ["@rAgent Explain how a binary search algorithm works."],
         ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
         ["@tts1 Explain Tower of Hanoi"],
+        ["@gemma3-4b Describe this image in detail."]
     ],
     cache_examples=False,
     type="messages",

@@ -754,7 +807,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", "audio"],
         file_count="multiple",
-        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
+        placeholder=" @tts1, @tts2, @image, @3d, @phi4 [image, audio], @gemma3-4b, @rAgent, @web, @yolo, default [plain text]"
     ),
     stop_btn="Stop Generation",
     multimodal=True,