import gradio as gr
from PIL import Image
import os
import torch
import json
import spaces
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Load model and processor
print("Loading Qwen3-VL-30B-A3B-Instruct model...")
# Qwen3-VL checkpoints use a different architecture from Qwen2-VL, so they cannot be
# loaded with Qwen2VLForConditionalGeneration. AutoModelForImageTextToText resolves the
# correct class from the checkpoint config (requires a transformers release with Qwen3-VL support).
model = AutoModelForImageTextToText.from_pretrained(
    "Qwen/Qwen3-VL-30B-A3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
print("Model loaded successfully!")

EXTRACTION_PROMPT = """Extract all metadata from this library catalog card and return it as valid JSON with the following fields:
- title: The main title or name on the card
- author: Author, creator, or associated person/organization
- date: Any dates mentioned (publication, creation, or coverage dates)
- call_number: Library classification or call number
- physical_description: Details about the physical item (size, extent, format)
- subjects: Subject headings or topics
- notes: Any additional notes or information

Return ONLY the JSON object, nothing else. If a field is not present on the card, use null for that field."""
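
# Illustrative example of the shape of output the prompt asks for. The values below are
# made up for documentation purposes and are not taken from any real catalog card:
#
# {
#   "title": "Letters, 1862-1865",
#   "author": "Smith, John, 1830-1901",
#   "date": "1862-1865",
#   "call_number": "MS 1234",
#   "physical_description": "1 box (0.5 linear ft.)",
#   "subjects": ["United States -- History -- Civil War, 1861-1865"],
#   "notes": null
# }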

@spaces.GPU
def extract_metadata(image):
    """Extract structured metadata from catalog card image."""
    if image is None:
        return "Please upload an image."

    try:
        # Ensure image is PIL Image
        if not isinstance(image, Image.Image):
            image = Image.open(image).convert("RGB")

        # Format messages for Qwen3-VL
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": EXTRACTION_PROMPT}
                ]
            }
        ]

        # Prepare inputs
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )
        inputs = inputs.to(model.device)

        # Generate
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False  # greedy decoding; a temperature setting would be ignored here
            )

        # Trim input tokens from output
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Decode output
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]

        # Try to parse as JSON for pretty formatting
        try:
            json_data = json.loads(output_text)
            return json.dumps(json_data, indent=2)
        except json.JSONDecodeError:
            # If not valid JSON, return as-is
            return output_text

    except Exception as e:
        return f"Error during extraction: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Library Card Metadata Extractor") as demo:
    gr.Markdown("# πŸ“‡ Library Card Metadata Extractor")
    gr.Markdown(
        "Extract structured metadata from library catalog cards using **Qwen3-VL-30B**. "
        "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
        "call numbers, and more.\n\n"
        "This demo works with catalog cards from libraries and archives, such as the "
        "[Rubenstein Manuscript Catalog](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
        "and [Boston Public Library Card Catalog](https://huggingface.co/datasets/biglam/bpl-card-catalog)."
    )

    gr.Markdown("---")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“€ Upload Catalog Card")
            image_input = gr.Image(
                label="Library Catalog Card",
                type="pil"
            )
            submit_btn = gr.Button("πŸ” Extract Metadata", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### πŸ“‹ Extracted Metadata (JSON)")
            output = gr.Code(
                label="Metadata",
                language="json",
                lines=15
            )

    submit_btn.click(
        fn=extract_metadata,
        inputs=image_input,
        outputs=output
    )

    gr.Markdown("---")

    # Examples
    gr.Markdown("## 🎯 Try Examples")
    gr.Examples(
        examples=[
            ["examples/rubenstein_0.jpg"],
            ["examples/rubenstein_1.jpg"],
            ["examples/rubenstein_2.jpg"],
            ["examples/bpl_0.jpg"],
            ["examples/bpl_1.jpg"],
            ["examples/bpl_2.jpg"],
        ],
        inputs=image_input,
        outputs=output,
        fn=extract_metadata,
        cache_examples=False
    )

    gr.Markdown("---")

    # Footer
    gr.Markdown(
        "<center>\n\n"
        "Built for the GLAM community using [Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) | "
        "Example cards from [Rubenstein](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
        "and [BPL](https://huggingface.co/datasets/biglam/bpl-card-catalog) collections\n\n"
        "</center>"
    )

if __name__ == "__main__":
    print("Launching demo...")
    demo.launch()