Spaces:

baohuynhbk14
/

Qwen3-VL-Demo

Running on Zero

App Files Files Community

baohuynhbk14 commited on 8 days ago

Commit

1d1d107

verified ·

1 Parent(s): 88a3550

Update app.py

Browse files

Files changed (1) hide show

app.py +531 -0

app.py CHANGED Viewed

	@@ -0,0 +1,531 @@

+import os
+import random
+import uuid
+import json
+import time
+import asyncio
+from threading import Thread
+# --- Thêm từ Script 1 ---
+from io import BytesIO
+from typing import Optional, Tuple, Dict, Any, Iterable
+import fitz  # Thư viện PyMuPDF
+# --- Kết thúc thêm ---
+import gradio as gr
+import spaces
+import torch
+import numpy as np
+from PIL import Image
+import cv2
+from transformers import (
+    Qwen2_5_VLForConditionalGeneration,
+    Qwen3VLForConditionalGeneration,
+    AutoTokenizer,
+    AutoProcessor,
+    TextIteratorStreamer,
+)
+from transformers.image_utils import load_image
+from gradio.themes import Soft
+from gradio.themes.utils import colors, fonts, sizes
+# --- Theme and CSS Definition (Từ Script 2) ---
+colors.steel_blue = colors.Color(
+    name="steel_blue",
+    c50="#EBF3F8",
+    c100="#D3E5F0",
+    c200="#A8CCE1",
+    c300="#7DB3D2",
+    c400="#529AC3",
+    c500="#4682B4",  # SteelBlue base color
+    c600="#3E72A0",
+    c700="#36638C",
+    c800="#2E5378",
+    c900="#264364",
+    c950="#1E3450",
+)
+class SteelBlueTheme(Soft):
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.gray,
+        secondary_hue: colors.Color | str = colors.steel_blue,
+        neutral_hue: colors.Color | str = colors.slate,
+        text_size: sizes.Size | str = sizes.text_lg,
+        font: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
+        ),
+        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
+        ),
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        super().set(
+            background_fill_primary="*primary_50",
+            background_fill_primary_dark="*primary_900",
+            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
+            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
+            button_primary_text_color="white",
+            button_primary_text_color_hover="white",
+            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
+            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
+            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
+            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
+            button_secondary_text_color="black",
+            button_secondary_text_color_hover="white",
+            button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
+            button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
+            button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
+            button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
+            slider_color="*secondary_500",
+            slider_color_dark="*secondary_600",
+            block_title_text_weight="600",
+            block_border_width="3px",
+            block_shadow="*shadow_drop_lg",
+            button_primary_shadow="*shadow_drop_lg",
+            button_large_padding="11px",
+            color_accent_soft="*primary_100",
+            block_label_background_fill="*primary_200",
+        )
+steel_blue_theme = SteelBlueTheme()
+# --- Cấu hình và Tải Model (Từ Script 2) ---
+MAX_MAX_NEW_TOKENS = 4096
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load Qwen2.5-VL-7B-Instruct
+MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_M,
+    trust_remote_code=True,
+    torch_dtype=torch.float16).to(device).eval()
+# Load Qwen2.5-VL-3B-Instruct
+MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_X,
+    trust_remote_code=True,
+    torch_dtype=torch.float16).to(device).eval()
+# Load Qwen3-VL-4B-Instruct
+MODEL_ID_Q = "Qwen/Qwen3-VL-4B-Instruct"
+processor_q = AutoProcessor.from_pretrained(MODEL_ID_Q, trust_remote_code=True)
+model_q = Qwen3VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_Q,
+    trust_remote_code=True,
+    torch_dtype=torch.float16).to(device).eval()
+# Load Qwen3-VL-8B-Instruct
+MODEL_ID_Y = "Qwen/Qwen3-VL-8B-Instruct"
+processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
+model_y = Qwen3VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_Y,
+    trust_remote_code=True,
+    torch_dtype=torch.float16).to(device).eval()
+# Load Qwen3-VL-2B-Instruct
+MODEL_ID_L = "Qwen/Qwen3-VL-2B-Instruct"
+processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
+model_l = Qwen3VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_L,
+    trust_remote_code=True,
+    torch_dtype=torch.float16).to(device).eval()
+# Load Qwen3-VL-2B-Thinking
+MODEL_ID_J = "Qwen/Qwen3-VL-2B-Thinking"
+processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True)
+model_j = Qwen3VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_J,
+    trust_remote_code=True,
+    torch_dtype=torch.float16).to(device).eval()
+# Load Qwen3-VL-4B-Thinking
+MODEL_ID_T = "Qwen/Qwen3-VL-4B-Thinking"
+processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
+model_t = Qwen3VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_T,
+    trust_remote_code=True,
+    torch_dtype=torch.float16).to(device).eval()
+# --- Các hàm hỗ trợ PDF (Từ Script 1) ---
+def convert_pdf_to_images(file_path: str, dpi: int = 200):
+    if not file_path:
+        return []
+    images = []
+    pdf_document = fitz.open(file_path)
+    zoom = dpi / 72.0
+    mat = fitz.Matrix(zoom, zoom)
+    for page_num in range(len(pdf_document)):
+        page = pdf_document.load_page(page_num)
+        pix = page.get_pixmap(matrix=mat)
+        img_data = pix.tobytes("png")
+        images.append(Image.open(BytesIO(img_data)))
+    pdf_document.close()
+    return images
+def get_initial_pdf_state() -> Dict[str, Any]:
+    return {"pages": [], "total_pages": 0, "current_page_index": 0}
+def load_and_preview_pdf(file_path: Optional[str]) -> Tuple[Optional[Image.Image], Dict[str, Any], str]:
+    state = get_initial_pdf_state()
+    if not file_path:
+        return None, state, '<div style="text-align:center;">No file loaded</div>'
+    try:
+        pages = convert_pdf_to_images(file_path)
+        if not pages:
+            return None, state, '<div style="text-align:center;">Could not load file</div>'
+        state["pages"] = pages
+        state["total_pages"] = len(pages)
+        page_info_html = f'<div style="text-align:center;">Page 1 / {state["total_pages"]}</div>'
+        return pages[0], state, page_info_html
+    except Exception as e:
+        return None, state, f'<div style="text-align:center;">Failed to load preview: {e}</div>'
+def navigate_pdf_page(direction: str, state: Dict[str, Any]):
+    if not state or not state["pages"]:
+        return None, state, '<div style="text-align:center;">No file loaded</div>'
+    current_index = state["current_page_index"]
+    total_pages = state["total_pages"]
+    if direction == "prev":
+        new_index = max(0, current_index - 1)
+    elif direction == "next":
+        new_index = min(total_pages - 1, current_index + 1)
+    else:
+        new_index = current_index
+    state["current_page_index"] = new_index
+    image_preview = state["pages"][new_index]
+    page_info_html = f'<div style="text-align:center;">Page {new_index + 1} / {total_pages}</div>'
+    return image_preview, state, page_info_html
+# --- Hàm hỗ trợ Video (Từ Script 2) ---
+def downsample_video(video_path):
+    """
+    Downsamples the video to evenly spaced frames.
+    Each frame is returned as a PIL image along with its timestamp.
+    """
+    vidcap = cv2.VideoCapture(video_path)
+    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    frames = []
+    # Use a maximum of 10 frames to avoid excessive memory usage
+    frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
+    for i in frame_indices:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        success, image = vidcap.read()
+        if success:
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            pil_image = Image.fromarray(image)
+            timestamp = round(i / fps, 2)
+            frames.append((pil_image, timestamp))
+    vidcap.release()
+    return frames
+# --- Các hàm Generate (Từ Script 2, với `generate_pdf` được thêm vào) ---
+@spaces.GPU
+def generate_image(model_name: str, text: str, image: Image.Image,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """
+    Generates responses using the selected model for image input.
+    """
+    if model_name == "Qwen2.5-VL-7B-Instruct":
+        processor, model = processor_m, model_m
+    elif model_name == "Qwen2.5-VL-3B-Instruct":
+        processor, model = processor_x, model_x
+    elif model_name == "Qwen3-VL-4B-Instruct":
+        processor, model = processor_q, model_q
+    elif model_name == "Qwen3-VL-8B-Instruct":
+        processor, model = processor_y, model_y
+    elif model_name == "Qwen3-VL-4B-Thinking":
+        processor, model = processor_t, model_t
+    elif model_name == "Qwen3-VL-2B-Instruct":
+        processor, model = processor_l, model_l
+    elif model_name == "Qwen3-VL-2B-Thinking":
+        processor, model = processor_j, model_j
+    else:
+        yield "Invalid model selected.", "Invalid model selected."
+        return
+    if image is None:
+        yield "Please upload an image.", "Please upload an image."
+        return
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(
+        text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        time.sleep(0.01)
+        yield buffer, buffer
+@spaces.GPU
+def generate_video(model_name: str, text: str, video_path: str,
+                   max_new_tokens: int = 1024,
+                   temperature: float = 0.6,
+                   top_p: float = 0.9,
+                   top_k: int = 50,
+                   repetition_penalty: float = 1.2):
+    """
+    Generates responses using the selected model for video input.
+    """
+    if model_name == "Qwen2.5-VL-7B-Instruct":
+        processor, model = processor_m, model_m
+    elif model_name == "Qwen2.5-VL-3B-Instruct":
+        processor, model = processor_x, model_x
+    elif model_name == "Qwen3-VL-4B-Instruct":
+        processor, model = processor_q, model_q
+    elif model_name == "Qwen3-VL-8B-Instruct":
+        processor, model = processor_y, model_y
+    elif model_name == "Qwen3-VL-4B-Thinking":
+        processor, model = processor_t, model_t
+    elif model_name == "Qwen3-VL-2B-Instruct":
+        processor, model = processor_l, model_l
+    elif model_name == "Qwen3-VL-2B-Thinking":
+        processor, model = processor_j, model_j
+    else:
+        yield "Invalid model selected.", "Invalid model selected."
+        return
+    if video_path is None:
+        yield "Please upload a video.", "Please upload a video."
+        return
+    frames_with_ts = downsample_video(video_path)
+    if not frames_with_ts:
+        yield "Could not process video.", "Could not process video."
+        return
+    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+    images_for_processor = []
+    for frame, timestamp in frames_with_ts:
+        messages[0]["content"].append({"type": "image"})
+        images_for_processor.append(frame)
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(
+        text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True).to(device)
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {
+        **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
+        "do_sample": True, "temperature": temperature, "top_p": top_p,
+        "top_k": top_k, "repetition_penalty": repetition_penalty,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+        time.sleep(0.01)
+        yield buffer, buffer
+# --- Hàm generate_pdf (MỚI - Từ Script 1 và ĐÃ CHỈNH SỬA) ---
+@spaces.GPU
+def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
+                 max_new_tokens: int = 2048,
+                 temperature: float = 0.6,
+                 top_p: float = 0.9,
+                 top_k: int = 50,
+                 repetition_penalty: float = 1.2):
+    # --- Thêm logic chọn model ---
+    if model_name == "Qwen2.5-VL-7B-Instruct":
+        processor, model = processor_m, model_m
+    elif model_name == "Qwen2.5-VL-3B-Instruct":
+        processor, model = processor_x, model_x
+    elif model_name == "Qwen3-VL-4B-Instruct":
+        processor, model = processor_q, model_q
+    elif model_name == "Qwen3-VL-8B-Instruct":
+        processor, model = processor_y, model_y
+    elif model_name == "Qwen3-VL-4B-Thinking":
+        processor, model = processor_t, model_t
+    elif model_name == "Qwen3-VL-2B-Instruct":
+        processor, model = processor_l, model_l
+    elif model_name == "Qwen3-VL-2B-Thinking":
+        processor, model = processor_j, model_j
+    else:
+        yield "Invalid model selected.", "Invalid model selected."
+        return
+    # --- Kết thúc logic chọn model ---
+    if not state or not state["pages"]:
+        yield "Please upload a PDF file first.", "Please upload a PDF file first."
+        return
+    page_images = state["pages"]
+    full_response = ""
+    for i, image in enumerate(page_images):
+        page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
+        yield full_response + page_header, full_response + page_header
+        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+        # Sử dụng processor đã chọn
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty
+        }
+        # Sử dụng model đã chọn
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        page_buffer = ""
+        for new_text in streamer:
+            page_buffer += new_text
+            yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
+            time.sleep(0.01)
+        full_response += page_header + page_buffer + "\n\n"
+# --- Định nghĩa Examples (Kết hợp từ 2 script) ---
+image_examples = [
+    ["Explain the content in detail.", "images/D.jpg"],
+    ["Explain the content (ocr).", "images/O.jpg"],
+    ["What is the core meaning of the poem?", "images/S.jpg"],
+    ["Provide a detailed caption for the image.", "images/A.jpg"],
+]
+video_examples = [
+    ["Explain the ad in detail", "videos/1.mp4"],
+    ["Identify the main actions in the video", "videos/2.mp4"],
+]
+# Thêm từ Script 1
+pdf_examples = [
+    ["Extract the content precisely.", "examples/pdfs/doc1.pdf"],
+    ["Analyze and provide a short report.", "examples/pdfs/doc2.pdf"]
+]
+css = """
+#main-title h1 {
+    font-size: 2.3em !important;
+}
+#output-title h2 {
+    font-size: 2.1em !important;
+}
+"""
+# --- Giao diện Gradio (Từ Script 2, đã thêm Tab PDF) ---
+with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    # Thêm từ Script 1
+    pdf_state = gr.State(value=get_initial_pdf_state())
+    gr.Markdown("# **Qwen3-VL-Outpost**", elem_id="main-title")
+    with gr.Row():
+        with gr.Column(scale=2):
+            with gr.Tabs():
+                with gr.TabItem("Image Inference"):
+                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    image_upload = gr.Image(type="pil", label="Upload Image", height=290)
+                    image_submit = gr.Button("Submit", variant="primary")
+                    gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
+                with gr.TabItem("Video Inference"):
+                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                    video_upload = gr.Video(label="Upload Video", height=290)
+                    video_submit = gr.Button("Submit", variant="primary")
+                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
+                # --- Tab PDF MỚI (Từ Script 1) ---
+                with gr.TabItem("PDF Inference"):
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            pdf_query = gr.Textbox(label="Query Input", placeholder="e.g., 'Summarize this document'")
+                            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
+                            pdf_submit = gr.Button("Submit", variant="primary")
+                        with gr.Column(scale=1):
+                            pdf_preview_img = gr.Image(label="PDF Preview", height=290)
+                            with gr.Row():
+                                prev_page_btn = gr.Button("◀ Previous")
+                                page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
+                                next_page_btn = gr.Button("Next ▶")
+                    gr.Examples(examples=pdf_examples, inputs=[pdf_query, pdf_upload])
+                # --- Kết thúc Tab PDF ---
+            with gr.Accordion("Advanced options", open=False):
+                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+        with gr.Column(scale=3):
+            gr.Markdown("## Output", elem_id="output-title")
+            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
+            with gr.Accordion("(Result.md)", open=False):
+                markdown_output = gr.Markdown(latex_delimiters=[
+                                {"left": "$$", "right": "$$", "display": True},
+                                {"left": "$", "right": "$", "display": False}
+                            ])
+            model_choice = gr.Radio(
+                choices=["Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct", "Qwen3-VL-2B-Instruct", "Qwen3-VL-2B-Thinking", "Qwen3-VL-4B-Thinking", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Instruct"],
+                label="Select Model",
+                value="Qwen3-VL-4B-Instruct"
+            )
+    # --- Event Handlers (Đã thêm các sự kiện PDF) ---
+    image_submit.click(
+        fn=generate_image,
+        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[output, markdown_output]
+    )
+    video_submit.click(
+        fn=generate_video,
+        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[output, markdown_output]
+    )
+    # --- Thêm sự kiện cho PDF ---
+    pdf_submit.click(
+        fn=generate_pdf,
+        # Thêm 'model_choice' vào inputs
+        inputs=[model_choice, pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[output, markdown_output]
+    )
+    pdf_upload.change(
+        fn=load_and_preview_pdf,
+        inputs=[pdf_upload],
+        outputs=[pdf_preview_img, pdf_state, page_info]
+    )
+    prev_page_btn.click(
+        fn=lambda s: navigate_pdf_page("prev", s),
+        inputs=[pdf_state],
+        outputs=[pdf_preview_img, pdf_state, page_info]
+    )
+    next_page_btn.click(
+        fn=lambda s: navigate_pdf_page("next", s),
+        inputs=[pdf_state],
+        outputs=[pdf_preview_img, pdf_state, page_info]
+    )
+    # --- Kết thúc thêm sự kiện PDF ---
+if __name__ == "__main__":
+    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)