yuhangzang committed on
Commit a72baa1 · 1 Parent(s): 3870541
Files changed (2)
  1. README.md +0 -9
  2. app.py +23 -11
README.md CHANGED
@@ -12,12 +12,3 @@ short_description: Generate captions for images with CapRL
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
-
-Citation:
-
-@article{xing2025caprl,
-  title={CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
-  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
-  journal={arXiv preprint arXiv:2509.22647},
-  year={2025}
-}
app.py CHANGED
@@ -6,7 +6,7 @@ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 MODEL_ID = "internlm/CapRL-3B"
 DEFAULT_PROMPT = "Describe the image in detail."
-MAX_NEW_TOKENS = 128
+MAX_NEW_TOKENS = 1024
 
 
 def get_device() -> str:
@@ -69,25 +69,23 @@ def generate_caption(image: Image.Image):
         return_tensors="pt",
     ).to(device)
 
-    output_ids = MODEL.generate(
+    generated_ids = MODEL.generate(
         **inputs,
         max_new_tokens=MAX_NEW_TOKENS,
         do_sample=False,
     )
 
-    generated_text = PROCESSOR.batch_decode(
-        output_ids, skip_special_tokens=True
-    )[0]
-    processed_outputs = PROCESSOR.post_process_generation(
-        generated_text,
-        messages,
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = PROCESSOR.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
-
-    caption = processed_outputs[0].get("generated_text", generated_text).strip()
+    caption = output_text[0].strip()
 
     input_ids = inputs.get("input_ids")
     input_length = input_ids.shape[-1] if input_ids is not None else 0
-    total_length = output_ids.shape[-1]
+    total_length = generated_ids.shape[-1]
    num_generated_tokens = max(total_length - input_length, 0)
 
     return caption, int(num_generated_tokens)
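
The new decode path is the standard Qwen2.5-VL recipe: `generate()` returns each prompt concatenated with its continuation, so the prompt's input ids are sliced off the front of the matching output row before decoding. A minimal end-to-end sketch of the same pattern, assuming loading and chat-template code along the lines of what app.py defines outside this hunk (only the import line, MODEL_ID, and the generation arguments are confirmed by the diff; the image path is hypothetical):

```python
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "internlm/CapRL-3B"

# Assumed loading code; app.py's actual MODEL/PROCESSOR setup is not shown in this diff.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID)

image = Image.open("example.jpg")  # hypothetical input image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image in detail."},
        ],
    }
]

# Render the chat template, then tokenize the text and image together.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[image], return_tensors="pt")

generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)

# Each output row is prompt + continuation; trim the prompt so batch_decode
# sees only the newly generated tokens.
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
caption = processor.batch_decode(trimmed, skip_special_tokens=True)[0].strip()
print(caption)
```

With a single unpadded sequence, `generated_ids.shape[-1] - inputs.input_ids.shape[-1]` equals `len(trimmed[0])`, which is exactly the `num_generated_tokens` the function returns.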
@@ -95,6 +93,20 @@ def generate_caption(image: Image.Image):
 
 with gr.Blocks(title="CapRL Image Captioning") as demo:
     gr.Markdown("# CapRL Image Captioning\nUpload an image to generate a caption with CapRL-3B.")
+    gr.Markdown(
+        """### Citation
+If you find this project useful, please kindly cite:
+
+```
+@article{xing2025caprl,
+  title={CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
+  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
+  journal={arXiv preprint arXiv:2509.22647},
+  year={2025}
+}
+```
+"""
+    )
 
     with gr.Row():
         with gr.Column():
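
Since `generate_caption` now returns a `(caption, num_generated_tokens)` tuple, a quick way to exercise the new signature is to call it directly where it is defined (the image path below is hypothetical):

```python
from PIL import Image

# Hypothetical smoke test, e.g. appended to app.py temporarily before demo.launch().
caption, num_tokens = generate_caption(Image.open("example.jpg"))
print(f"{num_tokens} generated tokens")
print(caption)
```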