Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -234,9 +234,6 @@ def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, t
|
|
| 234 |
response += token
|
| 235 |
yield response
|
| 236 |
|
| 237 |
-
# ------------------------------------------------------------------------------
|
| 238 |
-
# New Phi-4 Multimodal Feature (Image & Audio)
|
| 239 |
-
# ------------------------------------------------------------------------------
|
| 240 |
# Define prompt structure for Phi-4
|
| 241 |
phi4_user_prompt = '<|user|>'
|
| 242 |
phi4_assistant_prompt = '<|assistant|>'
|
|
@@ -253,15 +250,8 @@ phi4_model = AutoModelForCausalLM.from_pretrained(
|
|
| 253 |
_attn_implementation="eager",
|
| 254 |
)
|
| 255 |
|
| 256 |
-
grpo_model_name = "prithivMLmods/SmolLM2-360M-Grpo-r999"
|
| 257 |
-
grpo_device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 258 |
-
grpo_tokenizer = AutoTokenizer.from_pretrained(grpo_model_name)
|
| 259 |
-
grpo_model = AutoModelForCausalLM.from_pretrained(grpo_model_name).to(grpo_device)
|
| 260 |
-
|
| 261 |
-
|
| 262 |
DESCRIPTION = """
|
| 263 |
-
# Agent Dino 🌠
|
| 264 |
-
"""
|
| 265 |
|
| 266 |
css = '''
|
| 267 |
h1 {
|
|
@@ -450,7 +440,7 @@ def detect_objects(image: np.ndarray):
|
|
| 450 |
|
| 451 |
return Image.fromarray(annotated_image)
|
| 452 |
|
| 453 |
-
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo,
|
| 454 |
|
| 455 |
@spaces.GPU
|
| 456 |
def generate(
|
|
@@ -470,8 +460,7 @@ def generate(
|
|
| 470 |
- "@web": triggers a web search or webpage visit.
|
| 471 |
- "@rAgent": initiates a reasoning chain using Llama mode.
|
| 472 |
- "@yolo": triggers object detection using YOLO.
|
| 473 |
-
- "@phi4": triggers multimodal (image/audio) processing using the Phi-4 model
|
| 474 |
-
- **"@grpo": triggers text generation using the GRPO model with a text streamer.**
|
| 475 |
"""
|
| 476 |
text = input_dict["text"]
|
| 477 |
files = input_dict.get("files", [])
|
|
@@ -630,37 +619,6 @@ def generate(
|
|
| 630 |
yield buffer
|
| 631 |
return
|
| 632 |
|
| 633 |
-
# --- GRPO Text Generation branch ---
|
| 634 |
-
if text.strip().lower().startswith("@grpo"):
|
| 635 |
-
prompt = text[len("@grpo"):].strip()
|
| 636 |
-
yield "📝 Generating text with @grpo..."
|
| 637 |
-
messages = [
|
| 638 |
-
{"role": "system", "content": "Please respond in this specific format ONLY:\n<thinking>\n input your reasoning behind your answer in between these reasoning tags.\n</thinking>\n<answer>\nyour answer in between these answer tags.\n</answer>\n"},
|
| 639 |
-
{"role": "user", "content": prompt}
|
| 640 |
-
]
|
| 641 |
-
# Use the GRPO tokenizer's chat template if available, otherwise simply join the messages.
|
| 642 |
-
input_text = grpo_tokenizer.apply_chat_template(messages, tokenize=False) if hasattr(grpo_tokenizer, "apply_chat_template") else "\n".join([msg["content"] for msg in messages])
|
| 643 |
-
inputs = grpo_tokenizer.encode(input_text, return_tensors="pt").to(grpo_model.device)
|
| 644 |
-
streamer = TextIteratorStreamer(grpo_tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 645 |
-
generation_kwargs = {
|
| 646 |
-
"input_ids": inputs,
|
| 647 |
-
"max_new_tokens": 100,
|
| 648 |
-
"temperature": 0.2,
|
| 649 |
-
"top_p": 0.9,
|
| 650 |
-
"do_sample": True,
|
| 651 |
-
"use_cache": False,
|
| 652 |
-
"streamer": streamer,
|
| 653 |
-
}
|
| 654 |
-
thread = Thread(target=grpo_model.generate, kwargs=generation_kwargs)
|
| 655 |
-
thread.start()
|
| 656 |
-
buffer = ""
|
| 657 |
-
yield "🤔 Thinking..."
|
| 658 |
-
for new_text in streamer:
|
| 659 |
-
buffer += new_text
|
| 660 |
-
time.sleep(0.01)
|
| 661 |
-
yield buffer
|
| 662 |
-
return
|
| 663 |
-
|
| 664 |
# --- Text and TTS branch ---
|
| 665 |
tts_prefix = "@tts"
|
| 666 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
@@ -756,10 +714,9 @@ demo = gr.ChatInterface(
|
|
| 756 |
["@tts2 What causes rainbows to form?"],
|
| 757 |
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
| 758 |
[{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
|
| 759 |
-
["@
|
| 760 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
| 761 |
["@tts1 Explain Tower of Hanoi"],
|
| 762 |
-
["@grpo If there are 12 cookies in a dozen and you have 5 dozen, how many cookies do you have?"],
|
| 763 |
],
|
| 764 |
cache_examples=False,
|
| 765 |
type="messages",
|
|
@@ -770,7 +727,7 @@ demo = gr.ChatInterface(
|
|
| 770 |
label="Query Input",
|
| 771 |
file_types=["image", "audio"],
|
| 772 |
file_count="multiple",
|
| 773 |
-
placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo,
|
| 774 |
),
|
| 775 |
stop_btn="Stop Generation",
|
| 776 |
multimodal=True,
|
|
|
|
| 234 |
response += token
|
| 235 |
yield response
|
| 236 |
|
|
|
|
|
|
|
|
|
|
| 237 |
# Define prompt structure for Phi-4
|
| 238 |
phi4_user_prompt = '<|user|>'
|
| 239 |
phi4_assistant_prompt = '<|assistant|>'
|
|
|
|
| 250 |
_attn_implementation="eager",
|
| 251 |
)
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
DESCRIPTION = """
|
| 254 |
+
# Agent Dino 🌠"""
|
|
|
|
| 255 |
|
| 256 |
css = '''
|
| 257 |
h1 {
|
|
|
|
| 440 |
|
| 441 |
return Image.fromarray(annotated_image)
|
| 442 |
|
| 443 |
+
# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
|
| 444 |
|
| 445 |
@spaces.GPU
|
| 446 |
def generate(
|
|
|
|
| 460 |
- "@web": triggers a web search or webpage visit.
|
| 461 |
- "@rAgent": initiates a reasoning chain using Llama mode.
|
| 462 |
- "@yolo": triggers object detection using YOLO.
|
| 463 |
+
- **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
|
|
|
|
| 464 |
"""
|
| 465 |
text = input_dict["text"]
|
| 466 |
files = input_dict.get("files", [])
|
|
|
|
| 619 |
yield buffer
|
| 620 |
return
|
| 621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
# --- Text and TTS branch ---
|
| 623 |
tts_prefix = "@tts"
|
| 624 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
|
|
|
|
| 714 |
["@tts2 What causes rainbows to form?"],
|
| 715 |
[{"text": "Summarize the letter", "files": ["examples/1.png"]}],
|
| 716 |
[{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
|
| 717 |
+
["@ragent Explain how a binary search algorithm works."],
|
| 718 |
["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
|
| 719 |
["@tts1 Explain Tower of Hanoi"],
|
|
|
|
| 720 |
],
|
| 721 |
cache_examples=False,
|
| 722 |
type="messages",
|
|
|
|
| 727 |
label="Query Input",
|
| 728 |
file_types=["image", "audio"],
|
| 729 |
file_count="multiple",
|
| 730 |
+
placeholder="@tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
|
| 731 |
),
|
| 732 |
stop_btn="Stop Generation",
|
| 733 |
multimodal=True,
|