yuhangzang committed on
Commit a72baa1 · 1 Parent(s): 3870541
Files changed (2)
  1. README.md +0 -9
  2. app.py +23 -11
README.md CHANGED
@@ -12,12 +12,3 @@ short_description: Generate captions for images with CapRL
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
-Citation:
-
-@article{xing2025caprl,
-  title={CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
-  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
-  journal={arXiv preprint arXiv:2509.22647},
-  year={2025}
-}
app.py CHANGED
@@ -6,7 +6,7 @@ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 MODEL_ID = "internlm/CapRL-3B"
 DEFAULT_PROMPT = "Describe the image in detail."
-MAX_NEW_TOKENS = 128
+MAX_NEW_TOKENS = 1024
 
 
 def get_device() -> str:
@@ -69,25 +69,23 @@ def generate_caption(image: Image.Image):
         return_tensors="pt",
     ).to(device)
 
-    output_ids = MODEL.generate(
+    generated_ids = MODEL.generate(
         **inputs,
         max_new_tokens=MAX_NEW_TOKENS,
         do_sample=False,
     )
 
-    generated_text = PROCESSOR.batch_decode(
-        output_ids, skip_special_tokens=True
-    )[0]
-    processed_outputs = PROCESSOR.post_process_generation(
-        generated_text,
-        messages,
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = PROCESSOR.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
-
-    caption = processed_outputs[0].get("generated_text", generated_text).strip()
+    caption = output_text[0].strip()
 
     input_ids = inputs.get("input_ids")
     input_length = input_ids.shape[-1] if input_ids is not None else 0
-    total_length = output_ids.shape[-1]
+    total_length = generated_ids.shape[-1]
    num_generated_tokens = max(total_length - input_length, 0)
 
     return caption, int(num_generated_tokens)
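
The new decode path is the standard Qwen2.5-VL recipe: `generate()` returns each prompt concatenated with its continuation, so the prompt's input ids are sliced off the front of the matching output row before decoding. A minimal end-to-end sketch of the same pattern, assuming loading and chat-template code along the lines of what app.py defines outside this hunk (only the import line, MODEL_ID, and the generation arguments are confirmed by the diff; the image path is hypothetical):

```python
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "internlm/CapRL-3B"

# Assumed loading code; app.py's actual MODEL/PROCESSOR setup is not shown in this diff.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID)

image = Image.open("example.jpg")  # hypothetical input image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image in detail."},
        ],
    }
]

# Render the chat template, then tokenize the text and image together.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[image], return_tensors="pt")

generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)

# Each output row is prompt + continuation; trim the prompt so batch_decode
# sees only the newly generated tokens.
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
caption = processor.batch_decode(trimmed, skip_special_tokens=True)[0].strip()
print(caption)
```

With a single unpadded sequence, `generated_ids.shape[-1] - inputs.input_ids.shape[-1]` equals `len(trimmed[0])`, which is exactly the `num_generated_tokens` the function returns.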
@@ -95,6 +93,20 @@ def generate_caption(image: Image.Image):
 
 with gr.Blocks(title="CapRL Image Captioning") as demo:
     gr.Markdown("# CapRL Image Captioning\nUpload an image to generate a caption with CapRL-3B.")
+    gr.Markdown(
+        """### Citation
+If you find this project useful, please kindly cite:
+
+```
+@article{xing2025caprl,
+  title={CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
+  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
+  journal={arXiv preprint arXiv:2509.22647},
+  year={2025}
+}
+```
+"""
+    )
 
     with gr.Row():
         with gr.Column():
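
Since `generate_caption` now returns a `(caption, num_generated_tokens)` tuple, a quick way to exercise the new signature is to call it directly where it is defined (the image path below is hypothetical):

```python
from PIL import Image

# Hypothetical smoke test, e.g. appended to app.py temporarily before demo.launch().
caption, num_tokens = generate_caption(Image.open("example.jpg"))
print(f"{num_tokens} generated tokens")
print(caption)
```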