Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -24,7 +24,7 @@ from transformers.image_utils import load_image
|
|
| 24 |
MAX_MAX_NEW_TOKENS = 2048
|
| 25 |
DEFAULT_MAX_NEW_TOKENS = 1024
|
| 26 |
# Increase or disable input truncation to avoid token mismatches
|
| 27 |
-
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "
|
| 28 |
|
| 29 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 30 |
|
|
@@ -34,7 +34,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
|
| 34 |
MODEL_ID,
|
| 35 |
trust_remote_code=True,
|
| 36 |
torch_dtype=torch.float16
|
| 37 |
-
).to(
|
| 38 |
|
| 39 |
def downsample_video(video_path):
|
| 40 |
"""
|
|
@@ -80,15 +80,14 @@ def generate_image(text: str, image: Image.Image,
|
|
| 80 |
]
|
| 81 |
}]
|
| 82 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 83 |
-
# Use max-length padding and enable truncation
|
| 84 |
inputs = processor(
|
| 85 |
text=[prompt_full],
|
| 86 |
images=[image],
|
| 87 |
return_tensors="pt",
|
| 88 |
-
padding=
|
| 89 |
-
truncation=
|
| 90 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 91 |
-
).to(
|
| 92 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 93 |
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
|
| 94 |
thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
|
|
@@ -120,20 +119,19 @@ def generate_video(text: str, video_path: str,
|
|
| 120 |
{"role": "user", "content": [{"type": "text", "text": text}]}
|
| 121 |
]
|
| 122 |
# Append each frame with its timestamp.
|
| 123 |
-
for
|
|
|
|
| 124 |
messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
|
| 125 |
messages[1]["content"].append({"type": "image", "image": image})
|
| 126 |
-
|
| 127 |
-
# Enable truncation in template application
|
| 128 |
inputs = processor.apply_chat_template(
|
| 129 |
messages,
|
| 130 |
tokenize=True,
|
| 131 |
add_generation_prompt=True,
|
| 132 |
return_dict=True,
|
| 133 |
return_tensors="pt",
|
| 134 |
-
truncation=
|
| 135 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 136 |
-
).to(
|
| 137 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 138 |
generation_kwargs = {
|
| 139 |
**inputs,
|
|
@@ -165,6 +163,7 @@ video_examples = [
|
|
| 165 |
["Identify the main actions in the video", "videos/2.mp4"]
|
| 166 |
]
|
| 167 |
|
|
|
|
| 168 |
css = """
|
| 169 |
.submit-btn {
|
| 170 |
background-color: #2980b9 !important;
|
|
|
|
| 24 |
MAX_MAX_NEW_TOKENS = 2048
|
| 25 |
DEFAULT_MAX_NEW_TOKENS = 1024
|
| 26 |
# Increase or disable input truncation to avoid token mismatches
|
| 27 |
+
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "8192"))
|
| 28 |
|
| 29 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 30 |
|
|
|
|
| 34 |
MODEL_ID,
|
| 35 |
trust_remote_code=True,
|
| 36 |
torch_dtype=torch.float16
|
| 37 |
+
).to("cuda").eval()
|
| 38 |
|
| 39 |
def downsample_video(video_path):
|
| 40 |
"""
|
|
|
|
| 80 |
]
|
| 81 |
}]
|
| 82 |
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
|
|
| 83 |
inputs = processor(
|
| 84 |
text=[prompt_full],
|
| 85 |
images=[image],
|
| 86 |
return_tensors="pt",
|
| 87 |
+
padding=True,
|
| 88 |
+
truncation=False,
|
| 89 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 90 |
+
).to("cuda")
|
| 91 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 92 |
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
|
| 93 |
thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
|
|
|
|
| 119 |
{"role": "user", "content": [{"type": "text", "text": text}]}
|
| 120 |
]
|
| 121 |
# Append each frame with its timestamp.
|
| 122 |
+
for frame in frames:
|
| 123 |
+
image, timestamp = frame
|
| 124 |
messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
|
| 125 |
messages[1]["content"].append({"type": "image", "image": image})
|
|
|
|
|
|
|
| 126 |
inputs = processor.apply_chat_template(
|
| 127 |
messages,
|
| 128 |
tokenize=True,
|
| 129 |
add_generation_prompt=True,
|
| 130 |
return_dict=True,
|
| 131 |
return_tensors="pt",
|
| 132 |
+
truncation=False,
|
| 133 |
max_length=MAX_INPUT_TOKEN_LENGTH
|
| 134 |
+
).to("cuda")
|
| 135 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 136 |
generation_kwargs = {
|
| 137 |
**inputs,
|
|
|
|
| 163 |
["Identify the main actions in the video", "videos/2.mp4"]
|
| 164 |
]
|
| 165 |
|
| 166 |
+
|
| 167 |
css = """
|
| 168 |
.submit-btn {
|
| 169 |
background-color: #2980b9 !important;
|