	Update app.py
app.py CHANGED
@@ -270,6 +270,73 @@ def generate_video(model_name: str, text: str, video_path: str,
             yield buffer, buffer
 
 
+# @spaces.GPU(duration=120)
+# def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
+#                  max_new_tokens: int = 2048,
+#                  temperature: float = 0.6,
+#                  top_p: float = 0.9,
+#                  top_k: int = 50,
+#                  repetition_penalty: float = 1.2):
+
+#     # if model_name == "Qwen2.5-VL-7B-Instruct":
+#     #     processor, model = processor_m, model_m
+#     # elif model_name == "Qwen2.5-VL-3B-Instruct":
+#     #     processor, model = processor_x, model_x
+#     if model_name == "Qwen3-VL-4B-Instruct":
+#         processor, model = processor_q, model_q
+#     elif model_name == "Qwen3-VL-8B-Instruct":
+#         processor, model = processor_y, model_y
+#     # elif model_name == "Qwen3-VL-8B-Thinking":
+#     #     processor, model = processor_z, model_z
+#     elif model_name == "Qwen3-VL-4B-Thinking":
+#         processor, model = processor_t, model_t
+#     elif model_name == "Qwen3-VL-2B-Instruct":
+#         processor, model = processor_l, model_l
+#     elif model_name == "Qwen3-VL-2B-Thinking":
+#         processor, model = processor_j, model_j
+#     else:
+#         yield "Invalid model selected.", "Invalid model selected."
+#         return
+
+#     if not state or not state["pages"]:
+#         yield "Please upload a PDF file first.", "Please upload a PDF file first."
+#         return
+
+#     page_images = state["pages"]
+#     full_response = ""
+#     for i, image in enumerate(page_images):
+#         page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
+#         yield full_response + page_header, full_response + page_header
+
+#         messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+#         # Use the selected processor
+#         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+#         inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
+#         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+
+#         generation_kwargs = {
+#             **inputs,
+#             "streamer": streamer,
+#             "max_new_tokens": max_new_tokens,
+#             "do_sample": True,
+#             "temperature": temperature,
+#             "top_p": top_p,
+#             "top_k": top_k,
+#             "repetition_penalty": repetition_penalty
+#         }
+
+#         # Use the selected model
+#         thread = Thread(target=model.generate, kwargs=generation_kwargs)
+#         thread.start()
+
+#         page_buffer = ""
+#         for new_text in streamer:
+#             page_buffer += new_text
+#             yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
+#             time.sleep(0.01)
+
+#         full_response += page_header + page_buffer + "\n\n"
+
 @spaces.GPU(duration=120)
 def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                  max_new_tokens: int = 2048,
@@ -278,16 +345,10 @@ def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
                  top_k: int = 50,
                  repetition_penalty: float = 1.2):
 
-    # if model_name == "Qwen2.5-VL-7B-Instruct":
-    #     processor, model = processor_m, model_m
-    # elif model_name == "Qwen2.5-VL-3B-Instruct":
-    #     processor, model = processor_x, model_x
     if model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
-    # elif model_name == "Qwen3-VL-8B-Thinking":
-    #     processor, model = processor_z, model_z
     elif model_name == "Qwen3-VL-4B-Thinking":
         processor, model = processor_t, model_t
     elif model_name == "Qwen3-VL-2B-Instruct":
@@ -303,105 +364,44 @@ def generate_pdf(model_name: str, text: str, state: Dict[str, Any],
         return
 
     page_images = state["pages"]
-    full_response = ""
-    for i, image in enumerate(page_images):
-        page_header = f"--- Page {i+1}/{len(page_images)} ---\n"
-        yield full_response + page_header, full_response + page_header
-
-        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
-        # Use the selected processor
-        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "temperature": temperature,
-            "top_p": top_p,
-            "top_k": top_k,
-            "repetition_penalty": repetition_penalty
-        }
-
-        # Use the selected model
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        page_buffer = ""
-        for new_text in streamer:
-            page_buffer += new_text
-            yield full_response + page_header + page_buffer, full_response + page_header + page_buffer
-            time.sleep(0.01)
-
-        full_response += page_header + page_buffer + "\n\n"
 
-
-
-
-
-
-#                  top_k: int = 50,
-#                  repetition_penalty: float = 1.2):
-
-#     if model_name == "Qwen3-VL-4B-Instruct":
-#         processor, model = processor_q, model_q
-#     elif model_name == "Qwen3-VL-8B-Instruct":
-#         processor, model = processor_y, model_y
-#     elif model_name == "Qwen3-VL-4B-Thinking":
-#         processor, model = processor_t, model_t
-#     elif model_name == "Qwen3-VL-2B-Instruct":
-#         processor, model = processor_l, model_l
-#     elif model_name == "Qwen3-VL-2B-Thinking":
-#         processor, model = processor_j, model_j
-#     else:
-#         yield "Invalid model selected.", "Invalid model selected."
-#         return
-
-#     if not state or not state["pages"]:
-#         yield "Please upload a PDF file first.", "Please upload a PDF file first."
-#         return
-
-#     page_images = state["pages"]
-
-#     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
-#     images_for_processor = []
-#     for frame in page_images:
-#         messages[0]["content"].append({"type": "image"})
-#         images_for_processor.append(frame)
+    messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+    images_for_processor = []
+    for frame in page_images:
+        messages[0]["content"].append({"type": "image"})
+        images_for_processor.append(frame)
 
-
+    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-
-
-
-
-
-
+    inputs = processor(
+        text=[prompt_full],
+        images=images_for_processor,  # pass the whole list of page images
+        return_tensors="pt",
+        padding=True
+    ).to(device)
 
-
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
 
-
-
-
-
-
-
-
-
-
-
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_new_tokens,
+        "do_sample": True,
+        "temperature": temperature,
+        "top_p": top_p,
+        "top_k": top_k,
+        "repetition_penalty": repetition_penalty
+    }
 
-
-
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
 
-
-
-
-
-
-
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")  # added this line, same as the video generator
+        yield buffer, buffer
+        time.sleep(0.01)
 
 image_examples = [
     ["Explain the content in detail.", "images/force.jpg"],
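For reference, the rewritten generate_pdf switches from one generation per page to a single pass: it appends one {"type": "image"} placeholder per page to a single user turn, hands the matching list of images to the processor in one call, and streams one answer over the whole document. Below is a minimal standalone sketch of that pattern. The checkpoint id, the AutoModelForImageTextToText loader, and the stream_over_pages helper are illustrative assumptions, not code from this Space; passing the processor to TextIteratorStreamer mirrors what app.py itself does.

# Sketch only: model id and loader are assumptions; any Qwen-VL-style
# checkpoint with a chat template and image inputs should behave the same.
from threading import Thread
from transformers import AutoModelForImageTextToText, AutoProcessor, TextIteratorStreamer

MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"  # assumed checkpoint, for illustration
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, device_map="auto")

def stream_over_pages(page_images, prompt_text, max_new_tokens=2048):
    # One image placeholder per page, all inside a single user message.
    content = [{"type": "text", "text": prompt_text}]
    content += [{"type": "image"} for _ in page_images]
    messages = [{"role": "user", "content": content}]

    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt_full], images=list(page_images),
                       return_tensors="pt", padding=True).to(model.device)

    # model.generate blocks, so it runs on a worker thread while the
    # caller drains the streamer chunk by chunk.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}).start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

The trade-off is context length: every page now has to fit in the model's window at once, whereas the old per-page loop only ever held one page plus the running transcript.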
