yuhangzang committed
Commit 8327c64 · 1 Parent(s): b4bcbcf
Files changed (1): app.py +65 -48
app.py CHANGED
@@ -25,16 +25,14 @@ def load_model():
     device = get_device()
     dtype = select_dtype(device)
 
+    # Use device_map="auto" for proper GPU allocation with spaces.GPU decorator
     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID,
         torch_dtype=dtype,
-        device_map="auto" if device == "cuda" else None,
+        device_map="auto",
         trust_remote_code=True,
     )
 
-    if device != "cuda":
-        model.to(device)
-
     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
     return model, processor
 
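Note: dropping the manual CPU fallback is consistent with the new comment, which ties model loading to the spaces.GPU decorator used on HuggingFace ZeroGPU Spaces, where a GPU is attached only for the duration of a decorated call. A minimal sketch of that pairing, assuming the Space wraps its inference entry point like this (the function name is illustrative, not from this commit):

import spaces  # HuggingFace Spaces SDK; available on ZeroGPU hardware

@spaces.GPU  # attaches a GPU for the duration of this call
def run_inference(inputs):
    # MODEL was loaded with device_map="auto", so accelerate has already
    # placed the weights; generation runs on the attached GPU.
    return MODEL.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)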
@@ -48,47 +46,64 @@ def generate_caption(image: Image.Image):
     if image is None:
         return "", 0
 
-    device = MODEL.device
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image"},
-                {"type": "text", "text": DEFAULT_PROMPT},
-            ],
-        }
-    ]
-
-    prompt_text = PROCESSOR.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-
-    inputs = PROCESSOR(
-        text=[prompt_text],
-        images=[image],
-        return_tensors="pt",
-    ).to(device)
-
-    generated_ids = MODEL.generate(
-        **inputs,
-        max_new_tokens=MAX_NEW_TOKENS,
-        do_sample=False,
-    )
-
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = PROCESSOR.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    caption = output_text[0].strip()
-
-    input_ids = inputs.get("input_ids")
-    input_length = input_ids.shape[-1] if input_ids is not None else 0
-    total_length = generated_ids.shape[-1]
-    num_generated_tokens = max(total_length - input_length, 0)
-
-    return caption, int(num_generated_tokens)
+    try:
+        # Validate image
+        if not isinstance(image, Image.Image):
+            return "Error: Invalid image format", 0
+
+        # Check image size (resize if too large)
+        max_size = 4096
+        if image.width > max_size or image.height > max_size:
+            # Resize in place to prevent OOM; thumbnail() preserves aspect ratio
+            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
+
+        device = MODEL.device
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": DEFAULT_PROMPT},
+                ],
+            }
+        ]
+
+        prompt_text = PROCESSOR.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        inputs = PROCESSOR(
+            text=[prompt_text],
+            images=[image],
+            return_tensors="pt",
+        ).to(device)
+
+        generated_ids = MODEL.generate(
+            **inputs,
+            max_new_tokens=MAX_NEW_TOKENS,
+            do_sample=False,
+        )
+
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = PROCESSOR.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        caption = output_text[0].strip()
+
+        input_ids = inputs.get("input_ids")
+        input_length = input_ids.shape[-1] if input_ids is not None else 0
+        total_length = generated_ids.shape[-1]
+        num_generated_tokens = max(total_length - input_length, 0)
+
+        return caption, int(num_generated_tokens)
+
+    except torch.cuda.OutOfMemoryError:
+        torch.cuda.empty_cache()
+        return "Error: Out of GPU memory. Please try with a smaller image.", 0
+    except Exception as e:
+        return f"Error generating caption: {str(e)}", 0
 
 
 with gr.Blocks(title="CapRL Image Captioning") as demo:
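The new size guard relies on PIL's Image.thumbnail, which resizes in place, never enlarges, and preserves aspect ratio, so oversized uploads are capped without distortion. A standalone sketch (not part of the commit) of the behavior the guard depends on:

from PIL import Image

img = Image.new("RGB", (8000, 2000))
img.thumbnail((4096, 4096), Image.Resampling.LANCZOS)
print(img.size)  # (4096, 1024): longest side capped at 4096, aspect ratio kept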
@@ -109,19 +124,21 @@ with gr.Blocks(title="CapRL Image Captioning") as demo:
             image_input = gr.Image(type="pil", label="Input Image")
             generate_button = gr.Button("Generate Caption")
         with gr.Column():
-            caption_output = gr.Textbox(label="Caption", lines=6, show_copy_button=True)
+            caption_output = gr.Textbox(label="Caption", lines=6)
             token_output = gr.Number(label="Generated Tokens", precision=0)
 
     generate_button.click(
         fn=generate_caption,
         inputs=image_input,
         outputs=[caption_output, token_output],
+        show_progress=True,
     )
 
     image_input.upload(
         fn=generate_caption,
         inputs=image_input,
         outputs=[caption_output, token_output],
+        show_progress=True,
     )
 
     gr.Examples(
@@ -133,7 +150,7 @@ with gr.Blocks(title="CapRL Image Captioning") as demo:
         inputs=image_input,
         outputs=[caption_output, token_output],
         fn=generate_caption,
-        cache_examples=False,
+        cache_examples=True,
         label="📸 Example Images"
     )
 
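Flipping cache_examples to True changes startup behavior: Gradio runs generate_caption once per example image at launch and replays the cached caption and token count on click, trading a slower launch for instant example responses. A minimal illustration of the same wiring (the example path is hypothetical):

gr.Examples(
    examples=["examples/sample.jpg"],  # hypothetical path
    inputs=image_input,
    outputs=[caption_output, token_output],
    fn=generate_caption,
    cache_examples=True,  # precompute outputs at launch, replay on click
)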
@@ -147,7 +164,7 @@ with gr.Blocks(title="CapRL Image Captioning") as demo:
     year={2025}
 }"""
 
-    gr.Code(value=citation_text, language="bibtex", label="BibTeX Citation", show_copy_button=True)
+    gr.Code(value=citation_text, language="bibtex", label="BibTeX Citation")
 
 
 demo.launch()
 