baohuynhbk14 committed · verified
Commit 728997f · 1 Parent(s): 54c12ba

Update app.py

Files changed (1)
  1. app.py +107 -101
app.py CHANGED
@@ -25,6 +25,12 @@ from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
+import shlex
+import subprocess
+
+subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
+
+
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
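Note: this hunk bootstraps flash-attn at import time; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE makes the pip install skip compiling the CUDA extension (and the `os.environ | {...}` dict-union form needs Python 3.9+). The diff does not show how the installed package is consumed afterwards; a minimal, hypothetical sketch of the usual opt-in when loading a transformers model:

# Hypothetical sketch: the typical way to route a transformers model through
# the flash-attn package installed above. Whether app.py passes this flag is
# not visible in this diff; the model id is assumed from the UI names.
import torch
from transformers import Qwen3VLForConditionalGeneration

model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct",
    torch_dtype=torch.bfloat16,               # flash-attn requires fp16/bf16
    attn_implementation="flash_attention_2",  # raises if flash-attn is missing
).to("cuda").eval()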
@@ -94,7 +100,7 @@ model_t = Qwen3VLForConditionalGeneration.from_pretrained(
     trust_remote_code=True,
     torch_dtype=torch.bfloat16).to(device).eval()
 
-def convert_pdf_to_images(file_path: str, dpi: int = 200):
+def convert_pdf_to_images(file_path: str, dpi: int = 128):
     if not file_path:
         return []
     images = []
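Note: the default render resolution for PDF pages drops from 200 to 128 dpi, about a (200/128)^2 ≈ 2.4x reduction in pixels per page, which cuts the vision-token count and GPU memory needed per page. The function body is not shown past the guard; a minimal sketch of such a converter, assuming PyMuPDF (fitz) as the rendering backend (the backend app.py actually uses is not visible in this diff):

import fitz  # PyMuPDF
from PIL import Image

def convert_pdf_to_images(file_path: str, dpi: int = 128):
    if not file_path:
        return []
    images = []
    doc = fitz.open(file_path)
    zoom = dpi / 72  # PDF user space is defined at 72 dpi
    for page in doc:
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    doc.close()
    return images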
@@ -204,7 +210,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         time.sleep(0.01)
         yield buffer, buffer
 
-@spaces.GPU
+@spaces.GPU(duration=120)
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
                    temperature: float = 0.6,
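Note: on ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of the decorated call, and the bare decorator uses the platform's default time budget (60 s at the time of writing); duration=120 requests a longer window so long video or multi-page runs are not reclaimed mid-stream. A usage sketch (the body is a placeholder):

import spaces

@spaces.GPU(duration=120)  # hold the borrowed GPU for up to 120 s per call
def long_generation(prompt: str) -> str:
    # the real model.generate(...) call would run here on the allocated GPU
    return prompt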
@@ -264,74 +270,7 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield buffer, buffer
 
 
-@spaces.GPU
-# def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
-#                  max_new_tokens: int = 2048,
-#                  temperature: float = 0.6,
-#                  top_p: float = 0.9,
-#                  top_k: int = 50,
-#                  repetition_penalty: float = 1.2):
-
-#     # if model_name == "Qwen2.5-VL-7B-Instruct":
-#     #     processor, model = processor_m, model_m
-#     # elif model_name == "Qwen2.5-VL-3B-Instruct":
-#     #     processor, model = processor_x, model_x
-#     if model_name == "Qwen3-VL-4B-Instruct":
-#         processor, model = processor_q, model_q
-#     elif model_name == "Qwen3-VL-8B-Instruct":
-#         processor, model = processor_y, model_y
-#     # elif model_name == "Qwen3-VL-8B-Thinking":
-#     #     processor, model = processor_z, model_z
-#     elif model_name == "Qwen3-VL-4B-Thinking":
-#         processor, model = processor_t, model_t
-#     elif model_name == "Qwen3-VL-2B-Instruct":
-#         processor, model = processor_l, model_l
-#     elif model_name == "Qwen3-VL-2B-Thinking":
-#         processor, model = processor_j, model_j
-#     else:
-#         yield "Invalid model selected.", "Invalid model selected."
-#         return
-
-#     if not state or not state["pages"]:
-#         yield "Please upload a PDF file first.", "Please upload a PDF file first."
-#         return
-
-#     page_images = state["pages"]
-#     full_response = ""
-#     for i, image in enumerate(page_images):
-#         page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
-#         yield full_response + page_header, full_response + page_header
-
-#         messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
-#         # Use the selected processor
-#         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-#         inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
-#         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
-#         generation_kwargs = {
-#             **inputs,
-#             "streamer": streamer,
-#             "max_new_tokens": max_new_tokens,
-#             # "do_sample": True,
-#             # "temperature": temperature,
-#             # "top_p": top_p,
-#             # "top_k": top_k,
-#             # "repetition_penalty": repetition_penalty
-#         }
-
-#         # Use the selected model
-#         thread = Thread(target=model.generate, kwargs=generation_kwargs)
-#         thread.start()
-
-#         page_buffer = ""
-#         for new_text in streamer:
-#             page_buffer += new_text
-#             yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
-#             time.sleep(0.01)
-
-#         full_response += page_header + page_buffer + "\n\n"
-
-@spaces.GPU
+@spaces.GPU(duration=120)
 def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                  max_new_tokens: int = 2048,
                  temperature: float = 0.6,
@@ -339,10 +278,16 @@ def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                  top_k: int = 50,
                  repetition_penalty: float = 1.2):
 
+    # if model_name == "Qwen2.5-VL-7B-Instruct":
+    #     processor, model = processor_m, model_m
+    # elif model_name == "Qwen2.5-VL-3B-Instruct":
+    #     processor, model = processor_x, model_x
     if model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
+    # elif model_name == "Qwen3-VL-8B-Thinking":
+    #     processor, model = processor_z, model_z
    elif model_name == "Qwen3-VL-4B-Thinking":
         processor, model = processor_t, model_t
     elif model_name == "Qwen3-VL-2B-Instruct":
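Note: this hunk only re-homes the commented-out Qwen2.5 and 8B-Thinking branches; the live mapping from dropdown label to (processor, model) pair stays an if/elif ladder. A hypothetical table-driven equivalent, reusing the module-level pairs app.py already defines (processor_q/model_q and friends), would keep labels and pairs in one place:

# Hypothetical refactor of the dispatch above; app.py itself keeps the
# if/elif ladder. The processor_*/model_* names are app.py's own globals,
# and this fragment would sit inside the generate_pdf generator.
MODEL_REGISTRY = {
    "Qwen3-VL-4B-Instruct": (processor_q, model_q),
    "Qwen3-VL-8B-Instruct": (processor_y, model_y),
    "Qwen3-VL-4B-Thinking": (processor_t, model_t),
    "Qwen3-VL-2B-Instruct": (processor_l, model_l),
    "Qwen3-VL-2B-Thinking": (processor_j, model_j),
}

try:
    processor, model = MODEL_REGISTRY[model_name]
except KeyError:
    yield "Invalid model selected.", "Invalid model selected."
    return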
@@ -358,44 +303,105 @@ def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
         return
 
     page_images = state["pages"]
-
-    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
-    images_for_processor = []
-    for frame in page_images:
-        messages[0]["content"].append({"type": "image"})
-        images_for_processor.append(frame)
-
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    inputs = processor(
-        text=[prompt_full],
-        images=images_for_processor,  # Pass the whole list of images
-        return_tensors="pt",
-        padding=True
-    ).to(device)
-
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty
-    }
-
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")  # Add this line, same as for video
-        yield buffer, buffer
-        time.sleep(0.01)
+    full_response = ""
+    for i, image in enumerate(page_images):
+        page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
+        yield full_response + page_header, full_response + page_header
+
+        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+        # Use the selected processor
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty
+        }
+
+        # Use the selected model
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        page_buffer = ""
+        for new_text in streamer:
+            page_buffer += new_text
+            yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
+            time.sleep(0.01)
+
+        full_response += page_header + page_buffer + "\n\n"
+
+# @spaces.GPU
+# def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
+#                  max_new_tokens: int = 2048,
+#                  temperature: float = 0.6,
+#                  top_p: float = 0.9,
+#                  top_k: int = 50,
+#                  repetition_penalty: float = 1.2):
+
+#     if model_name == "Qwen3-VL-4B-Instruct":
+#         processor, model = processor_q, model_q
+#     elif model_name == "Qwen3-VL-8B-Instruct":
+#         processor, model = processor_y, model_y
+#     elif model_name == "Qwen3-VL-4B-Thinking":
+#         processor, model = processor_t, model_t
+#     elif model_name == "Qwen3-VL-2B-Instruct":
+#         processor, model = processor_l, model_l
+#     elif model_name == "Qwen3-VL-2B-Thinking":
+#         processor, model = processor_j, model_j
+#     else:
+#         yield "Invalid model selected.", "Invalid model selected."
+#         return
+
+#     if not state or not state["pages"]:
+#         yield "Please upload a PDF file first.", "Please upload a PDF file first."
+#         return
+
+#     page_images = state["pages"]
+
+#     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+#     images_for_processor = []
+#     for frame in page_images:
+#         messages[0]["content"].append({"type": "image"})
+#         images_for_processor.append(frame)
+
+#     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+#     inputs = processor(
+#         text=[prompt_full],
+#         images=images_for_processor,  # Pass the whole list of images
+#         return_tensors="pt",
+#         padding=True
+#     ).to(device)
+
+#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+
+#     generation_kwargs = {
+#         **inputs,
+#         "streamer": streamer,
+#         "max_new_tokens": max_new_tokens,
+#         "do_sample": True,
+#         "temperature": temperature,
+#         "top_p": top_p,
+#         "top_k": top_k,
+#         "repetition_penalty": repetition_penalty
+#     }
+
+#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+#     thread.start()
+
+#     buffer = ""
+#     for new_text in streamer:
+#         buffer += new_text
+#         buffer = buffer.replace("<|im_end|>", "")  # Add this line, same as for video
+#         yield buffer, buffer
+#         time.sleep(0.01)
 
 image_examples = [
     ["Explain the content in detail.", "images/force.jpg"],
 