baohuynhbk14 committed on
Commit
1d1d107
·
verified ·
1 Parent(s): 88a3550

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +531 -0
app.py CHANGED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import uuid
4
+ import json
5
+ import time
6
+ import asyncio
7
+ from threading import Thread
8
+ # --- Added from Script 1 ---
9
+ from io import BytesIO
10
+ from typing import Optional, Tuple, Dict, Any, Iterable
11
+ import fitz # PyMuPDF library
12
+ # --- End of additions ---
13
+ import gradio as gr
14
+ import spaces
15
+ import torch
16
+ import numpy as np
17
+ from PIL import Image
18
+ import cv2
19
+ from transformers import (
20
+ Qwen2_5_VLForConditionalGeneration,
21
+ Qwen3VLForConditionalGeneration,
22
+ AutoTokenizer,
23
+ AutoProcessor,
24
+ TextIteratorStreamer,
25
+ )
26
+ from transformers.image_utils import load_image
27
+ from gradio.themes import Soft
28
+ from gradio.themes.utils import colors, fonts, sizes
29
+
30
# --- Theme and CSS definition (from Script 2) ---
# Shade table for the custom "steel_blue" palette, lightest (c50) to darkest (c950).
_STEEL_BLUE_SHADES = {
    "c50": "#EBF3F8",
    "c100": "#D3E5F0",
    "c200": "#A8CCE1",
    "c300": "#7DB3D2",
    "c400": "#529AC3",
    "c500": "#4682B4",  # SteelBlue base color
    "c600": "#3E72A0",
    "c700": "#36638C",
    "c800": "#2E5378",
    "c900": "#264364",
    "c950": "#1E3450",
}

# Attach the palette to gradio's colors module so it can be used as a hue.
colors.steel_blue = colors.Color(name="steel_blue", **_STEEL_BLUE_SHADES)
45
+
46
class SteelBlueTheme(Soft):
    """Variant of the Soft theme: gray primary hue, steel-blue secondary hue."""

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.steel_blue,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        """Initialise the base theme, then apply the fixed style overrides below."""
        base_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**base_options)

        # Overrides applied on top of the base theme; "*name" values refer to
        # theme variables rather than literal CSS values.
        style_overrides = {
            "background_fill_primary": "*primary_50",
            "background_fill_primary_dark": "*primary_900",
            "body_background_fill": "linear-gradient(135deg, *primary_200, *primary_100)",
            "body_background_fill_dark": "linear-gradient(135deg, *primary_900, *primary_800)",
            "button_primary_text_color": "white",
            "button_primary_text_color_hover": "white",
            "button_primary_background_fill": "linear-gradient(90deg, *secondary_500, *secondary_600)",
            "button_primary_background_fill_hover": "linear-gradient(90deg, *secondary_600, *secondary_700)",
            "button_primary_background_fill_dark": "linear-gradient(90deg, *secondary_600, *secondary_800)",
            "button_primary_background_fill_hover_dark": "linear-gradient(90deg, *secondary_500, *secondary_500)",
            "button_secondary_text_color": "black",
            "button_secondary_text_color_hover": "white",
            "button_secondary_background_fill": "linear-gradient(90deg, *primary_300, *primary_300)",
            "button_secondary_background_fill_hover": "linear-gradient(90deg, *primary_400, *primary_400)",
            "button_secondary_background_fill_dark": "linear-gradient(90deg, *primary_500, *primary_600)",
            "button_secondary_background_fill_hover_dark": "linear-gradient(90deg, *primary_500, *primary_500)",
            "slider_color": "*secondary_500",
            "slider_color_dark": "*secondary_600",
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_primary_shadow": "*shadow_drop_lg",
            "button_large_padding": "11px",
            "color_accent_soft": "*primary_100",
            "block_label_background_fill": "*primary_200",
        }
        super().set(**style_overrides)


# Shared theme instance handed to gr.Blocks.
steel_blue_theme = SteelBlueTheme()
98
+
99
# --- Configuration and model loading (from Script 2) ---
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _load_checkpoint(model_cls, model_id):
    """Return ``(processor, model)`` for ``model_id``.

    The processor is loaded first; the model is loaded in float16, moved to
    ``device`` and switched to eval mode.
    """
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    loaded = model_cls.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )
    return processor, loaded.to(device).eval()


# Qwen2.5-VL-7B-Instruct
MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
processor_m, model_m = _load_checkpoint(Qwen2_5_VLForConditionalGeneration, MODEL_ID_M)

# Qwen2.5-VL-3B-Instruct
MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
processor_x, model_x = _load_checkpoint(Qwen2_5_VLForConditionalGeneration, MODEL_ID_X)

# Qwen3-VL-4B-Instruct
MODEL_ID_Q = "Qwen/Qwen3-VL-4B-Instruct"
processor_q, model_q = _load_checkpoint(Qwen3VLForConditionalGeneration, MODEL_ID_Q)

# Qwen3-VL-8B-Instruct
MODEL_ID_Y = "Qwen/Qwen3-VL-8B-Instruct"
processor_y, model_y = _load_checkpoint(Qwen3VLForConditionalGeneration, MODEL_ID_Y)

# Qwen3-VL-2B-Instruct
MODEL_ID_L = "Qwen/Qwen3-VL-2B-Instruct"
processor_l, model_l = _load_checkpoint(Qwen3VLForConditionalGeneration, MODEL_ID_L)

# Qwen3-VL-2B-Thinking
MODEL_ID_J = "Qwen/Qwen3-VL-2B-Thinking"
processor_j, model_j = _load_checkpoint(Qwen3VLForConditionalGeneration, MODEL_ID_J)

# Qwen3-VL-4B-Thinking
MODEL_ID_T = "Qwen/Qwen3-VL-4B-Thinking"
processor_t, model_t = _load_checkpoint(Qwen3VLForConditionalGeneration, MODEL_ID_T)
160
+
161
# --- PDF helper functions (from Script 1) ---
def convert_pdf_to_images(file_path: str, dpi: int = 200):
    """Render every page of a PDF file to a PIL image.

    Args:
        file_path: Path of the PDF to open. A falsy value returns ``[]``
            without touching the filesystem.
        dpi: Rendering resolution; pages are rendered with a zoom factor of
            ``dpi / 72`` on both axes.

    Returns:
        A list with one PIL image per page, in page order.

    Raises:
        Whatever ``fitz`` raises when the file cannot be opened or rendered.
    """
    if not file_path:
        return []
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)
    images = []
    pdf_document = fitz.open(file_path)
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)
            # Round-trip through PNG bytes to hand the pixmap to PIL.
            img_data = pix.tobytes("png")
            images.append(Image.open(BytesIO(img_data)))
    finally:
        # Close the document even when a page fails to render.
        pdf_document.close()
    return images
176
+
177
def get_initial_pdf_state() -> Dict[str, Any]:
    """Build a fresh, empty PDF viewer state (no pages, positioned on index 0)."""
    return dict(pages=[], total_pages=0, current_page_index=0)
179
+
180
def load_and_preview_pdf(file_path: Optional[str]) -> Tuple[Optional[Image.Image], Dict[str, Any], str]:
    """Load a PDF and return what the preview widgets need.

    Args:
        file_path: Path of the uploaded PDF, or a falsy value when nothing is
            uploaded.

    Returns:
        ``(first_page_image, state, page_info_html)``. On any failure the image
        is ``None``, the state is a fresh empty state and the HTML carries the
        reason.
    """
    from html import escape  # local import: only used on the error path

    state = get_initial_pdf_state()
    if not file_path:
        return None, state, '<div style="text-align:center;">No file loaded</div>'
    try:
        pages = convert_pdf_to_images(file_path)
    except Exception as e:
        # UI boundary: report the failure instead of raising. The message is
        # escaped because it is embedded in HTML and may contain '<' or '&'.
        return None, state, f'<div style="text-align:center;">Failed to load preview: {escape(str(e))}</div>'
    if not pages:
        return None, state, '<div style="text-align:center;">Could not load file</div>'
    state["pages"] = pages
    state["total_pages"] = len(pages)
    page_info_html = f'<div style="text-align:center;">Page 1 / {state["total_pages"]}</div>'
    return pages[0], state, page_info_html
194
+
195
def navigate_pdf_page(direction: str, state: Dict[str, Any]):
    """Move the PDF preview one page backwards or forwards.

    ``direction`` is ``"prev"`` or ``"next"``; any other value keeps the
    current page. The index stops at the first and last page. ``state`` is
    updated in place and returned together with the page image and the
    "Page i / n" HTML label.
    """
    if not (state and state["pages"]):
        return None, state, '<div style="text-align:center;">No file loaded</div>'

    position = state["current_page_index"]
    page_count = state["total_pages"]

    # One mover per supported direction; unknown directions leave the index alone.
    movers = {
        "prev": lambda idx: max(0, idx - 1),
        "next": lambda idx: min(page_count - 1, idx + 1),
    }
    target = movers.get(direction, lambda idx: idx)(position)

    state["current_page_index"] = target
    label = f'<div style="text-align:center;">Page {target + 1} / {page_count}</div>'
    return state["pages"][target], state, label
210
+
211
# --- Video helper (from Script 2) ---
def downsample_video(video_path, max_frames: int = 10):
    """
    Downsample a video to evenly spaced frames.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Upper bound on the number of sampled frames (default 10,
            to avoid excessive memory usage).

    Returns:
        A list of ``(PIL image, timestamp_in_seconds)`` tuples for the frames
        that could be read. The list is empty when the capture reports no
        frames. The timestamp is 0.0 when the capture reports no usable FPS.
    """
    frames = []
    vidcap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # A non-positive count would make np.linspace raise (negative num)
        # or produce nothing useful, so stop early.
        if total_frames <= 0 or max_frames <= 0:
            return frames
        frame_indices = np.linspace(0, total_frames - 1, min(total_frames, max_frames), dtype=int)
        for i in frame_indices:
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            success, image = vidcap.read()
            if not success:
                continue
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            # Guard against fps == 0 (or NaN), which would give inf/nan timestamps.
            timestamp = round(float(i) / fps, 2) if fps > 0 else 0.0
            frames.append((pil_image, timestamp))
    finally:
        # Release the capture even if decoding or conversion fails.
        vidcap.release()
    return frames
233
+
234
# --- Generate functions (from Script 2, with `generate_pdf` added) ---
@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Stream a response from the selected model for a single image.

    Args:
        model_name: Display name of the model to use.
        text: User query placed after the image in the chat message.
        image: Image to analyse; ``None`` yields a prompt to upload one.
        max_new_tokens: Generation length limit.
        temperature, top_p, top_k, repetition_penalty: Sampling controls
            forwarded to ``model.generate``.

    Yields:
        ``(text_so_far, text_so_far)`` after each streamed chunk. For an
        unknown model or a missing image, a single error message pair.
    """
    registry = {
        "Qwen2.5-VL-7B-Instruct": (processor_m, model_m),
        "Qwen2.5-VL-3B-Instruct": (processor_x, model_x),
        "Qwen3-VL-4B-Instruct": (processor_q, model_q),
        "Qwen3-VL-8B-Instruct": (processor_y, model_y),
        "Qwen3-VL-4B-Thinking": (processor_t, model_t),
        "Qwen3-VL-2B-Instruct": (processor_l, model_l),
        "Qwen3-VL-2B-Thinking": (processor_j, model_j),
    }
    selected = registry.get(model_name)
    if selected is None:
        yield "Invalid model selected.", "Invalid model selected."
        return
    processor, model = selected
    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
        # Forward the sampling controls; they were previously accepted but
        # silently ignored, unlike in generate_video / generate_pdf.
        "do_sample": True, "temperature": temperature, "top_p": top_p,
        "top_k": top_k, "repetition_penalty": repetition_penalty,
    }
    # generate() blocks, so it runs in a worker thread while this generator
    # drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer, buffer
    # Let generate() finish completely before this function returns.
    thread.join()
278
+
279
@spaces.GPU
def generate_video(model_name: str, text: str, video_path: str,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Stream a response from the selected model for a video.

    The video is reduced to sampled frames by ``downsample_video``; the frames
    are passed to the model as images following the text query.

    Args:
        model_name: Display name of the model to use.
        text: User query.
        video_path: Path of the uploaded video; ``None`` yields a prompt to
            upload one.
        max_new_tokens: Generation length limit.
        temperature, top_p, top_k, repetition_penalty: Sampling controls
            forwarded to ``model.generate``.

    Yields:
        ``(text_so_far, text_so_far)`` after each streamed chunk, or a single
        error message pair when the model is unknown, no video is given, or no
        frame could be extracted.
    """
    registry = {
        "Qwen2.5-VL-7B-Instruct": (processor_m, model_m),
        "Qwen2.5-VL-3B-Instruct": (processor_x, model_x),
        "Qwen3-VL-4B-Instruct": (processor_q, model_q),
        "Qwen3-VL-8B-Instruct": (processor_y, model_y),
        "Qwen3-VL-4B-Thinking": (processor_t, model_t),
        "Qwen3-VL-2B-Instruct": (processor_l, model_l),
        "Qwen3-VL-2B-Thinking": (processor_j, model_j),
    }
    selected = registry.get(model_name)
    if selected is None:
        yield "Invalid model selected.", "Invalid model selected."
        return
    processor, model = selected
    if video_path is None:
        yield "Please upload a video.", "Please upload a video."
        return
    frames_with_ts = downsample_video(video_path)
    if not frames_with_ts:
        yield "Could not process video.", "Could not process video."
        return
    # Timestamps are not used in the prompt; only the frames are sent.
    images_for_processor = [frame for frame, _ in frames_with_ts]
    content = [{"type": "text", "text": text}]
    content.extend({"type": "image"} for _ in images_for_processor)
    messages = [{"role": "user", "content": content}]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
        "do_sample": True, "temperature": temperature, "top_p": top_p,
        "top_k": top_k, "repetition_penalty": repetition_penalty,
    }
    # generate() blocks, so it runs in a worker thread while this generator
    # drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer, buffer
    # Let generate() finish completely before this function returns.
    thread.join()
335
+
336
# --- generate_pdf (NEW - from Script 1, modified) ---
@spaces.GPU
def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                 max_new_tokens: int = 2048,
                 temperature: float = 0.6,
                 top_p: float = 0.9,
                 top_k: int = 50,
                 repetition_penalty: float = 1.2):
    """
    Stream a response from the selected model for every page of a loaded PDF.

    Each page image in ``state["pages"]`` is sent to the model separately with
    the same query. Results are concatenated, each preceded by a
    ``--- Page i/n ---`` header.

    Args:
        model_name: Display name of the model to use.
        text: User query applied to every page.
        state: PDF state dict; its ``"pages"`` entry holds the page images.
        max_new_tokens: Generation length limit per page.
        temperature, top_p, top_k, repetition_penalty: Sampling controls
            forwarded to ``model.generate``.

    Yields:
        ``(text_so_far, text_so_far)`` covering all pages processed so far, or
        a single error message pair when the model is unknown or no PDF is
        loaded.
    """
    # --- Model selection ---
    registry = {
        "Qwen2.5-VL-7B-Instruct": (processor_m, model_m),
        "Qwen2.5-VL-3B-Instruct": (processor_x, model_x),
        "Qwen3-VL-4B-Instruct": (processor_q, model_q),
        "Qwen3-VL-8B-Instruct": (processor_y, model_y),
        "Qwen3-VL-4B-Thinking": (processor_t, model_t),
        "Qwen3-VL-2B-Instruct": (processor_l, model_l),
        "Qwen3-VL-2B-Thinking": (processor_j, model_j),
    }
    selected = registry.get(model_name)
    if selected is None:
        yield "Invalid model selected.", "Invalid model selected."
        return
    processor, model = selected

    if not state or not state["pages"]:
        yield "Please upload a PDF file first.", "Please upload a PDF file first."
        return

    page_images = state["pages"]

    # The chat prompt depends only on the query, so build it once for all pages.
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    full_response = ""
    for i, image in enumerate(page_images):
        page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
        # Show the header immediately so progress is visible before tokens arrive.
        yield full_response + page_header, full_response + page_header

        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": max_new_tokens,
            "do_sample": True,
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "repetition_penalty": repetition_penalty
        }

        # generate() blocks, so it runs in a worker thread while this
        # generator drains the streamer.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        page_buffer = ""
        for new_text in streamer:
            page_buffer += new_text
            yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
            time.sleep(0.01)

        # Make sure this page's generate() call has fully finished before the
        # same model is used for the next page.
        thread.join()

        full_response += page_header + page_buffer + "\n\n"
403
+
404
# --- Example inputs (combined from both scripts) ---
# Each example row is [query text, path of the sample file].
image_examples = [
    [query, sample]
    for query, sample in (
        ("Explain the content in detail.", "images/D.jpg"),
        ("Explain the content (ocr).", "images/O.jpg"),
        ("What is the core meaning of the poem?", "images/S.jpg"),
        ("Provide a detailed caption for the image.", "images/A.jpg"),
    )
]
video_examples = [
    [query, sample]
    for query, sample in (
        ("Explain the ad in detail", "videos/1.mp4"),
        ("Identify the main actions in the video", "videos/2.mp4"),
    )
]
# Added from Script 1
pdf_examples = [
    [query, sample]
    for query, sample in (
        ("Extract the content precisely.", "examples/pdfs/doc1.pdf"),
        ("Analyze and provide a short report.", "examples/pdfs/doc2.pdf"),
    )
]

# Font-size overrides for the elements with ids "main-title" and "output-title".
css = """
#main-title h1 {
font-size: 2.3em !important;
}
#output-title h2 {
font-size: 2.1em !important;
}
"""
429
+
430
# --- Gradio interface (from Script 2, with the PDF tab added) ---
# NOTE(review): the nesting below was reconstructed from a flattened diff view
# in which indentation was lost -- confirm the layout against the original file.
with gr.Blocks(css=css, theme=steel_blue_theme) as demo:

    # Added from Script 1: state object holding the loaded PDF pages and the
    # current preview index.
    pdf_state = gr.State(value=get_initial_pdf_state())

    gr.Markdown("# **Qwen3-VL-Outpost**", elem_id="main-title")
    with gr.Row():
        # Left column: one tab per input type, plus the generation settings.
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                    image_upload = gr.Image(type="pil", label="Upload Image", height=290)
                    image_submit = gr.Button("Submit", variant="primary")
                    gr.Examples(examples=image_examples, inputs=[image_query, image_upload])

                with gr.TabItem("Video Inference"):
                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                    video_upload = gr.Video(label="Upload Video", height=290)
                    video_submit = gr.Button("Submit", variant="primary")
                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])

                # --- NEW PDF tab (from Script 1) ---
                with gr.TabItem("PDF Inference"):
                    with gr.Row():
                        with gr.Column(scale=1):
                            pdf_query = gr.Textbox(label="Query Input", placeholder="e.g., 'Summarize this document'")
                            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
                            pdf_submit = gr.Button("Submit", variant="primary")
                        with gr.Column(scale=1):
                            pdf_preview_img = gr.Image(label="PDF Preview", height=290)
                            with gr.Row():
                                prev_page_btn = gr.Button("◀ Previous")
                                page_info = gr.HTML('<div style="text-align:center;">No file loaded</div>')
                                next_page_btn = gr.Button("Next ▶")
                    gr.Examples(examples=pdf_examples, inputs=[pdf_query, pdf_upload])
                # --- End of PDF tab ---

            # Generation settings passed to all three generate_* handlers.
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)

        # Right column: streamed output (raw text and rendered Markdown) and
        # the model selector.
        with gr.Column(scale=3):
            gr.Markdown("## Output", elem_id="output-title")
            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
            with gr.Accordion("(Result.md)", open=False):
                markdown_output = gr.Markdown(latex_delimiters=[
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False}
                ])

            model_choice = gr.Radio(
                choices=["Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct", "Qwen3-VL-2B-Instruct", "Qwen3-VL-2B-Thinking", "Qwen3-VL-4B-Thinking", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Instruct"],
                label="Select Model",
                value="Qwen3-VL-4B-Instruct"
            )

    # --- Event handlers (PDF events added) ---
    # Every submit handler writes the same text to both output widgets.
    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[output, markdown_output]
    )

    video_submit.click(
        fn=generate_video,
        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[output, markdown_output]
    )

    # --- PDF events ---
    pdf_submit.click(
        fn=generate_pdf,
        # 'model_choice' is included in the inputs; the pages come from
        # pdf_state rather than from the upload widget.
        inputs=[model_choice, pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=[output, markdown_output]
    )

    # Changing the uploaded file refreshes the preview, state and page label.
    pdf_upload.change(
        fn=load_and_preview_pdf,
        inputs=[pdf_upload],
        outputs=[pdf_preview_img, pdf_state, page_info]
    )

    prev_page_btn.click(
        fn=lambda s: navigate_pdf_page("prev", s),
        inputs=[pdf_state],
        outputs=[pdf_preview_img, pdf_state, page_info]
    )

    next_page_btn.click(
        fn=lambda s: navigate_pdf_page("next", s),
        inputs=[pdf_state],
        outputs=[pdf_preview_img, pdf_state, page_info]
    )
    # --- End of PDF events ---

# Launch with a request queue of at most 50 entries when run as a script.
if __name__ == "__main__":
    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)