Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Files Community

prithivMLmods committed on May 30

Commit

83c1dff

verified ·

1 Parent(s): 2a06976

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -11

app.py CHANGED Viewed

@@ -24,7 +24,7 @@ from transformers.image_utils import load_image
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 # Increase or disable input truncation to avoid token mismatches
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -34,7 +34,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
-).to(device).eval()
 def downsample_video(video_path):
     """
@@ -80,15 +80,14 @@ def generate_image(text: str, image: Image.Image,
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    # Use max-length padding and enable truncation
     inputs = processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
-        padding="max_length",
-        truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
@@ -120,20 +119,19 @@ def generate_video(text: str, video_path: str,
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
     # Append each frame with its timestamp.
-    for image, timestamp in frames:
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
-    # Enable truncation in template application
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_dict=True,
         return_tensors="pt",
-        truncation=True,
         max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -165,6 +163,7 @@ video_examples = [
     ["Identify the main actions in the video", "videos/2.mp4"]
 ]
 css = """
 .submit-btn {
     background-color: #2980b9 !important;

 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 # Increase or disable input truncation to avoid token mismatches
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.float16
+).to("cuda").eval()
 def downsample_video(video_path):
     """
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
+        padding=True,
+        truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
     # Append each frame with its timestamp.
+    for frame in frames:
+        image, timestamp = frame
         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_dict=True,
         return_tensors="pt",
+        truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
     ["Identify the main actions in the video", "videos/2.mp4"]
 ]
 css = """
 .submit-btn {
     background-color: #2980b9 !important;