Spaces:

prithivMLmods
/

Qwen3-VL-Outpost

Running on Zero

File size: 13,848 Bytes

a0a9007
6e2c6fa
 
 
ab0c591
6e2c6fa
a8b1c40
c11107a
a8b1c40
 
6e2c6fa
a8b1c40
a0a9007
6e2c6fa
a8b1c40
09dd649
a8b1c40
 
c11107a
6e2c6fa
a8b1c40
 
 
6e2c6fa
c11107a
 
 
1d7b167
 
 
 
 
 
 
 
 
 
 
 
 
c11107a
 
1d7b167
 
c11107a
 
 
 
1d7b167
c11107a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d7b167
c11107a
1d7b167
 
c11107a
 
 
 
 
 
 
 
1d7b167
c11107a
 
 
 
 
 
 
 
 
 
1d7b167
a8b1c40
1a17d9f
6e2c6fa
 
a8b1c40
6e2c6fa
a8b1c40
6e2c6fa
a8b1c40
 
 
6e2c6fa
 
 
a8b1c40
 
6e2c6fa
a8b1c40
 
 
6e2c6fa
 
 
a8b1c40
a5d07a8
c11107a
 
7c0a5ab
c11107a
6e2c6fa
 
 
7c0a5ab
 
c11107a
 
d3e8957
c11107a
d3e8957
 
 
 
 
1c3a854
 
 
 
 
 
 
 
 
323e41c
 
6e2c6fa
 
323e41c
6e2c6fa
 
 
 
c11107a
 
6e2c6fa
 
 
 
 
 
 
 
 
 
a0a9007
6e2c6fa
 
 
 
 
 
 
a8b1c40
6e2c6fa
a8b1c40
6e2c6fa
c11107a
6e2c6fa
c11107a
 
 
 
 
1c3a854
 
6e2c6fa
 
 
a0a9007
6e2c6fa
 
 
a0a9007
c11107a
6e2c6fa
 
c11107a
d2cfb56
 
6e2c6fa
 
 
 
 
 
 
 
 
 
a0a9007
a8b1c40
6e2c6fa
 
 
 
 
 
a8b1c40
6e2c6fa
a8b1c40
6e2c6fa
c11107a
6e2c6fa
c11107a
 
 
 
 
1c3a854
 
a8b1c40
6e2c6fa
c947ff2
a8b1c40
6e2c6fa
 
466e3e5
 
c11107a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e2c6fa
d418457
a8b1c40
c11107a
 
 
a8b1c40
c947ff2
ab0c591
 
 
 
c11107a
ab0c591
6e2c6fa
 
c11107a
6e2c6fa
 
d3e8957
 
 
 
6e2c6fa
 
 
 
 
 
 
 
 
 
c11107a
 
6e2c6fa
c11107a
 
6e2c6fa
 
 
 
1d7b167
c11107a
6e2c6fa
c11107a
6e2c6fa
 
 
1a17d9f
c11107a
 
 
6e2c6fa
1d7b167
1a17d9f
c11107a
 
 
6e2c6fa
 
 
 
 
 
 
c11107a
 
1a17d9f
c11107a
 
6e2c6fa
 
1c3a854
6e2c6fa
c11107a
d3e8957
 
6e2c6fa
 
 
 
 
 
 
 
 
 
09dd649
a8b1c40
c11107a

import os
import random
import uuid
import json
import time
import asyncio
from threading import Thread
from typing import Iterable

import gradio as gr
import spaces
import torch
import numpy as np
from PIL import Image
import cv2

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    TextIteratorStreamer,
)
from transformers.image_utils import load_image
from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes

colors.steel_blue = colors.Color(
    name="steel_blue",
    c50="#EBF3F8",
    c100="#D3E5F0",
    c200="#A8CCE1",
    c300="#7DB3D2",
    c400="#529AC3",
    c500="#4682B4",
    c600="#3E72A0",
    c700="#36638C",
    c800="#2E5378",
    c900="#264364",
    c950="#1E3450",
)


class SteelBlueTheme(Soft):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.steel_blue,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        super().set(
            background_fill_primary="*primary_50",
            background_fill_primary_dark="*primary_900",
            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
            button_primary_text_color="white",
            button_primary_text_color_hover="white",
            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
            button_secondary_text_color="black",
            button_secondary_text_color_hover="white",
            button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
            button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
            button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
            button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
            slider_color="*secondary_500",
            slider_color_dark="*secondary_600",
            block_title_text_weight="600",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_primary_shadow="*shadow_drop_lg",
            button_large_padding="11px",
            color_accent_soft="*primary_100",
            block_label_background_fill="*primary_200",
        )

steel_blue_theme = SteelBlueTheme()

MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load Qwen2.5-VL-7B-Instruct
MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_M,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Qwen2.5-VL-3B-Instruct
MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_X,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Qwen3-VL-4B-Instruct
MODEL_ID_Q = "Qwen/Qwen3-VL-4B-Instruct"
processor_q = AutoProcessor.from_pretrained(MODEL_ID_Q, trust_remote_code=True)
model_q = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID_Q,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Qwen3-VL-8B-Instruct
MODEL_ID_Y = "Qwen/Qwen3-VL-8B-Instruct"
processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
model_y = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID_Y,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Qwen3-VL-4B-Thinking
MODEL_ID_T = "Qwen/Qwen3-VL-4B-Thinking"
processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
model_t = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID_T,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

def downsample_video(video_path):
    """
    Downsamples the video to evenly spaced frames.
    Each frame is returned as a PIL image along with its timestamp.
    """
    vidcap = cv2.VideoCapture(video_path)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    frames = []
    # Use a maximum of 10 frames to avoid excessive memory usage
    frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
    for i in frame_indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            timestamp = round(i / fps, 2)
            frames.append((pil_image, timestamp))
    vidcap.release()
    return frames

@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generates responses using the selected model for image input.
    """
    if model_name == "Qwen2.5-VL-7B-Instruct":
        processor, model = processor_m, model_m
    elif model_name == "Qwen2.5-VL-3B-Instruct":
        processor, model = processor_x, model_x
    elif model_name == "Qwen3-VL-4B-Instruct":
        processor, model = processor_q, model_q
    elif model_name == "Qwen3-VL-8B-Instruct":
        processor, model = processor_y, model_y
    elif model_name == "Qwen3-VL-4B-Thinking":
        processor, model = processor_t, model_t
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full], images=[image], return_tensors="pt", padding=True,
        truncation=True, 
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer, buffer

@spaces.GPU
def generate_video(model_name: str, text: str, video_path: str,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generates responses using the selected model for video input.
    """
    if model_name == "Qwen2.5-VL-7B-Instruct":
        processor, model = processor_m, model_m
    elif model_name == "Qwen2.5-VL-3B-Instruct":
        processor, model = processor_x, model_x
    elif model_name == "Qwen3-VL-4B-Instruct":
        processor, model = processor_q, model_q
    elif model_name == "Qwen3-VL-8B-Instruct":
        processor, model = processor_y, model_y
    elif model_name == "Qwen3-VL-4B-Thinking":
        processor, model = processor_t, model_t
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if video_path is None:
        yield "Please upload a video.", "Please upload a video."
        return

    frames_with_ts = downsample_video(video_path)
    if not frames_with_ts:
        yield "Could not process video.", "Could not process video."
        return

    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
    images_for_processor = []
    for frame, timestamp in frames_with_ts:
        messages[0]["content"].append({"type": "image"})
        images_for_processor.append(frame)

    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
        truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
        "do_sample": True, "temperature": temperature, "top_p": top_p,
        "top_k": top_k, "repetition_penalty": repetition_penalty,
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer, buffer


# Define examples for image and video inference
image_examples = [
    ["Explain the content in detail.", "images/D.jpg"],
    ["Explain the content (ocr).", "images/O.jpg"],
    ["What is the core meaning of the poem?", "images/S.jpg"],
    ["Provide a detailed caption for the image.", "images/A.jpg"],
    ["Explain the pie-chart in detail.", "images/2.jpg"],
    ["Jsonify Data.", "images/1.jpg"],
]

video_examples = [
    ["Explain the ad in detail", "videos/1.mp4"],
    ["Identify the main actions in the video", "videos/2.mp4"],
]

css = """
#main-title h1 {
    font-size: 2.3em !important;
}
#output-title h2 {
    font-size: 2.1em !important;
}
"""

# Create the Gradio Interface
with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
    gr.Markdown("# **Qwen3-VL-Outpost**", elem_id="main-title")
    with gr.Row():
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                    image_upload = gr.Image(type="pil", label="Upload Image", height=290)
                    image_submit = gr.Button("Submit", variant="primary")
                    gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                    
                with gr.TabItem("Video Inference"):
                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                    video_upload = gr.Video(label="Upload Video", height=290)
                    video_submit = gr.Button("Submit", variant="primary")
                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
                    
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
                
        with gr.Column(scale=3):
            gr.Markdown("## Output", elem_id="output-title")
            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
            with gr.Accordion("(Result.md)", open=False):
                markdown_output = gr.Markdown()
                
            model_choice = gr.Radio(
                choices=["Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct", "Qwen3-VL-4B-Thinking", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Instruct"],
                label="Select Model",
                value="Qwen3-VL-4B-Instruct"
            )

    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[output, markdown_output]
    )
    video_submit.click(
        fn=generate_video,
        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[output, markdown_output]
    )

if __name__ == "__main__":
    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)