Spaces:

baohuynhbk14
/

Qwen3-VL-Demo

Running on Zero

App Files Files Community

baohuynhbk14 committed 4 days ago

Commit

728997f

verified ·

1 Parent(s): 54c12ba

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -101

app.py CHANGED Viewed

@@ -25,6 +25,12 @@ from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
@@ -94,7 +100,7 @@ model_t = Qwen3VLForConditionalGeneration.from_pretrained(
     trust_remote_code=True,
     torch_dtype=torch.bfloat16).to(device).eval()
-def convert_pdf_to_images(file_path: str, dpi: int = 200):
     if not file_path:
         return []
     images = []
@@ -204,7 +210,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         time.sleep(0.01)
         yield buffer, buffer
-@spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
@@ -264,74 +270,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield buffer, buffer
-@spaces.GPU
-# def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
-#                  max_new_tokens: int = 2048,
-#                  temperature: float = 0.6,
-#                  top_p: float = 0.9,
-#                  top_k: int = 50,
-#                  repetition_penalty: float = 1.2):
-#     # if model_name == "Qwen2.5-VL-7B-Instruct":
-#     #     processor, model = processor_m, model_m
-#     # elif model_name == "Qwen2.5-VL-3B-Instruct":
-#     #     processor, model = processor_x, model_x
-#     if model_name == "Qwen3-VL-4B-Instruct":
-#         processor, model = processor_q, model_q
-#     elif model_name == "Qwen3-VL-8B-Instruct":
-#         processor, model = processor_y, model_y
-#     # elif model_name == "Qwen3-VL-8B-Thinking":
-#     #     processor, model = processor_z, model_z
-#     elif model_name == "Qwen3-VL-4B-Thinking":
-#         processor, model = processor_t, model_t
-#     elif model_name == "Qwen3-VL-2B-Instruct":
-#         processor, model = processor_l, model_l
-#     elif model_name == "Qwen3-VL-2B-Thinking":
-#         processor, model = processor_j, model_j
-#     else:
-#         yield "Invalid model selected.", "Invalid model selected."
-#         return
-#     if not state or not state["pages"]:
-#         yield "Please upload a PDF file first.", "Please upload a PDF file first."
-#         return
-#     page_images = state["pages"]
-#     full_response = ""
-#     for i, image in enumerate(page_images):
-#         page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
-#         yield full_response + page_header, full_response + page_header
-#         messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
-#         # Sử dụng processor đã chọn
-#         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-#         inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
-#         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-#         generation_kwargs = {
-#             **inputs,
-#             "streamer": streamer,
-#             "max_new_tokens": max_new_tokens,
-#             # "do_sample": True,
-#             # "temperature": temperature,
-#             # "top_p": top_p,
-#             # "top_k": top_k,
-#             # "repetition_penalty": repetition_penalty
-#         }
-#         # Sử dụng model đã chọn
-#         thread = Thread(target=model.generate, kwargs=generation_kwargs)
-#         thread.start()
-#         page_buffer = ""
-#         for new_text in streamer:
-#             page_buffer += new_text
-#             yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
-#             time.sleep(0.01)
-#         full_response += page_header + page_buffer + "\n\n"
-@spaces.GPU
 def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                  max_new_tokens: int = 2048,
                  temperature: float = 0.6,
@@ -339,10 +278,16 @@ def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                  top_k: int = 50,
                  repetition_penalty: float = 1.2):
     if model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
     elif model_name == "Qwen3-VL-4B-Thinking":
         processor, model = processor_t, model_t
     elif model_name == "Qwen3-VL-2B-Instruct":
@@ -358,44 +303,105 @@ def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
         return
     page_images = state["pages"]
-    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
-    images_for_processor = []
-    for frame in page_images:
-        messages[0]["content"].append({"type": "image"})
-        images_for_processor.append(frame)
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt_full],
-        images=images_for_processor,  # Truyền cả list ảnh
-        return_tensors="pt",
-        padding=True
-    ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "") # Thêm dòng này giống video
-        yield buffer, buffer
-        time.sleep(0.01)
 image_examples = [
     ["Explain the content in detail.", "images/force.jpg"],

 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
+import shlex
+import subprocess
+subprocess.run(shlex.split("pip install flash-attn  --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
     trust_remote_code=True,
     torch_dtype=torch.bfloat16).to(device).eval()
+def convert_pdf_to_images(file_path: str, dpi: int = 128):
     if not file_path:
         return []
     images = []
         time.sleep(0.01)
         yield buffer, buffer
+@spaces.GPU(duration=120)
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
         yield buffer, buffer
+@spaces.GPU(duration=120)
 def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                  max_new_tokens: int = 2048,
                  temperature: float = 0.6,
                  top_k: int = 50,
                  repetition_penalty: float = 1.2):
+    # if model_name == "Qwen2.5-VL-7B-Instruct":
+    #     processor, model = processor_m, model_m
+    # elif model_name == "Qwen2.5-VL-3B-Instruct":
+    #     processor, model = processor_x, model_x
     if model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
+    # elif model_name == "Qwen3-VL-8B-Thinking":
+    #     processor, model = processor_z, model_z
     elif model_name == "Qwen3-VL-4B-Thinking":
         processor, model = processor_t, model_t
     elif model_name == "Qwen3-VL-2B-Instruct":
         return
     page_images = state["pages"]
+    full_response = ""
+    for i, image in enumerate(page_images):
+        page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
+        yield full_response + page_header, full_response + page_header
+        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+        # Sử dụng processor đã chọn
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty
+        }
+        # Sử dụng model đã chọn
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        page_buffer = ""
+        for new_text in streamer:
+            page_buffer += new_text
+            yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
+            time.sleep(0.01)
+        full_response += page_header + page_buffer + "\n\n"
+# @spaces.GPU
+# def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
+#                  max_new_tokens: int = 2048,
+#                  temperature: float = 0.6,
+#                  top_p: float = 0.9,
+#                  top_k: int = 50,
+#                  repetition_penalty: float = 1.2):
+#     if model_name == "Qwen3-VL-4B-Instruct":
+#         processor, model = processor_q, model_q
+#     elif model_name == "Qwen3-VL-8B-Instruct":
+#         processor, model = processor_y, model_y
+#     elif model_name == "Qwen3-VL-4B-Thinking":
+#         processor, model = processor_t, model_t
+#     elif model_name == "Qwen3-VL-2B-Instruct":
+#         processor, model = processor_l, model_l
+#     elif model_name == "Qwen3-VL-2B-Thinking":
+#         processor, model = processor_j, model_j
+#     else:
+#         yield "Invalid model selected.", "Invalid model selected."
+#         return
+#     if not state or not state["pages"]:
+#         yield "Please upload a PDF file first.", "Please upload a PDF file first."
+#         return
+#     page_images = state["pages"]
+#     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+#     images_for_processor = []
+#     for frame in page_images:
+#         messages[0]["content"].append({"type": "image"})
+#         images_for_processor.append(frame)
+#     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+#     inputs = processor(
+#         text=[prompt_full],
+#         images=images_for_processor,  # Truyền cả list ảnh
+#         return_tensors="pt",
+#         padding=True
+#     ).to(device)
+#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+#     generation_kwargs = {
+#         **inputs,
+#         "streamer": streamer,
+#         "max_new_tokens": max_new_tokens,
+#         "do_sample": True,
+#         "temperature": temperature,
+#         "top_p": top_p,
+#         "top_k": top_k,
+#         "repetition_penalty": repetition_penalty
+#     }
+#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+#     thread.start()
+#     buffer = ""
+#     for new_text in streamer:
+#         buffer += new_text
+#         buffer = buffer.replace("<|im_end|>", "") # Thêm dòng này giống video
+#         yield buffer, buffer
+#         time.sleep(0.01)
 image_examples = [
     ["Explain the content in detail.", "images/force.jpg"],