import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "internlm/CapRL-3B"
DEFAULT_PROMPT = "Describe the image in detail."
MAX_NEW_TOKENS = 4096


def get_device() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"


def select_dtype(device: str):
    if device == "cuda":
        if torch.cuda.is_bf16_supported():
            return torch.bfloat16
        return torch.float16
    return torch.float32


def load_model():
    device = get_device()
    dtype = select_dtype(device)

    # Use device_map="auto" for proper GPU allocation with spaces.GPU decorator
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        device_map="auto",
        trust_remote_code=True,
    )

    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, processor


MODEL, PROCESSOR = load_model()


@spaces.GPU
@torch.inference_mode()
def generate_caption(image: Image.Image):
    if image is None:
        return "", 0

    try:
        # Validate image
        if not isinstance(image, Image.Image):
            return "Error: Invalid image format", 0

        # Check image size (warn if too large)
        max_size = 4096
        if image.width > max_size or image.height > max_size:
            # Resize if too large to prevent OOM
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        device = MODEL.device
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": DEFAULT_PROMPT},
                ],
            }
        ]

        prompt_text = PROCESSOR.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        inputs = PROCESSOR(
            text=[prompt_text],
            images=[image],
            return_tensors="pt",
        ).to(device)

        generated_ids = MODEL.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = PROCESSOR.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        caption = output_text[0].strip()

        input_ids = inputs.get("input_ids")
        input_length = input_ids.shape[-1] if input_ids is not None else 0
        total_length = generated_ids.shape[-1]
        num_generated_tokens = max(total_length - input_length, 0)

        return caption, int(num_generated_tokens)

    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()
        return "Error: Out of GPU memory. Please try with a smaller image.", 0
    except Exception as e:
        return f"Error generating caption: {str(e)}", 0


with gr.Blocks(title="CapRL Image Captioning") as demo:
    gr.Markdown("# 🎨 CapRL for Image Captioning")
    gr.Markdown("### CapRL: Stimulating Dense Image Caption Capabilities via Reinforcement Learning")
    gr.Markdown("✨ Upload an image to generate a detailed caption with CapRL-3B! ✨")
    gr.Markdown(
        """
📖 <a href="https://arxiv.org/abs/2509.22647">Paper</a> | 🏠 <a href="https://github.com/InternLM/CapRL">Github</a> | 🤗 <a href="https://huggingface.co/internlm/CapRL-3B">CapRL-3B Model</a> | 🤗 <a href="https://huggingface.co/yuhangzang/CapRL-InternVL3.5-8B">CapRL-InternVL3.5-8B Model</a> |
🤗 <a href="https://huggingface.co/datasets/internlm/CapRL-2M">CapRL-2M Dataset</a>

🤗 <a href="https://huggingface.co/collections/long-xing1/caprl-68d64ac32ded31596c36e189">CapRL Collection</a> | 📰 <a href="https://huggingface.co/papers/2509.22647">Daily Paper</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-GGUF">CapRL-3B-GGUF</a> | 💾 <a href="https://huggingface.co/mradermacher/CapRL-3B-i1-GGUF">CapRL-3B-i1-GGUF</a>
"""
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Input Image")
            generate_button = gr.Button("Generate Caption")
        with gr.Column():
            caption_output = gr.Textbox(label="Caption", lines=6)
            token_output = gr.Number(label="Generated Tokens", precision=0)

    generate_button.click(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress=True,
    )

    image_input.upload(
        fn=generate_caption,
        inputs=image_input,
        outputs=[caption_output, token_output],
        show_progress=True,
    )

    gr.Examples(
        examples=[
            ["./examples/example_chinese.png"],
            ["./examples/example_receipt.jpg"],
            ["./examples/example_table.png"],
        ],
        inputs=image_input,
        outputs=[caption_output, token_output],
        fn=generate_caption,
        cache_examples=True,
        label="📸 Example Images"
    )

    gr.Markdown("### Citation")
    gr.Markdown("If you find this project useful, please kindly cite:")

    citation_text = """@article{xing2025caprl,
  title={{CapRL}: Stimulating Dense Image Caption Capabilities via Reinforcement Learning},
  author={Xing, Long and Dong, Xiaoyi and Zang, Yuhang and Cao, Yuhang and Liang, Jianze and Huang, Qidong and Wang, Jiaqi and Wu, Feng and Lin, Dahua},
  journal={arXiv preprint arXiv:2509.22647},
  year={2025}
}"""

    gr.Code(value=citation_text, language="markdown", label="BibTeX Citation")


demo.launch()