import os

# Faster Hugging Face Hub downloads; must be set before transformers (and
# huggingface_hub) are imported for the setting to take effect.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import gradio as gr
from PIL import Image
import torch
import json
import spaces
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load model and processor
print("Loading Qwen3-VL-30B-A3B-Instruct model...")
# AutoModelForImageTextToText resolves to the correct Qwen3-VL model class
# (requires a transformers version with Qwen3-VL support).
model = AutoModelForImageTextToText.from_pretrained(
    "Qwen/Qwen3-VL-30B-A3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
print("Model loaded successfully!")
EXTRACTION_PROMPT = """Extract all metadata from this library catalog card and return it as valid JSON with the following fields:
- title: The main title or name on the card
- author: Author, creator, or associated person/organization
- date: Any dates mentioned (publication, creation, or coverage dates)
- call_number: Library classification or call number
- physical_description: Details about the physical item (size, extent, format)
- subjects: Subject headings or topics
- notes: Any additional notes or information
Return ONLY the JSON object, nothing else. If a field is not present on the card, use null for that field."""
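
# @spaces.GPU requests a GPU for each call when running on a ZeroGPU Space;
# on hardware with a dedicated GPU the decorator has no effect.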
@spaces.GPU
def extract_metadata(image):
    """Extract structured metadata from catalog card image."""
    if image is None:
        return "Please upload an image."

    try:
        # Ensure image is a PIL Image
        if not isinstance(image, Image.Image):
            image = Image.open(image).convert("RGB")
        # Format messages for Qwen3-VL
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": EXTRACTION_PROMPT},
                ],
            }
        ]
        # Prepare inputs
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)
        # Generate (greedy decoding; temperature is ignored when
        # do_sample=False, so it is omitted here)
        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
            )

        # Trim input tokens from output
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        # Decode output
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        # Try to parse as JSON for pretty formatting; if the model wraps the
        # JSON in markdown fences or adds extra text, fall back to raw output
        try:
            json_data = json.loads(output_text)
            return json.dumps(json_data, indent=2)
        except json.JSONDecodeError:
            # Not valid JSON, return as-is
            return output_text

    except Exception as e:
        return f"Error during extraction: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Library Card Metadata Extractor") as demo:
    gr.Markdown("# πŸ“‡ Library Card Metadata Extractor")
    gr.Markdown(
        "Extract structured metadata from library catalog cards using **Qwen3-VL-30B**. "
        "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
        "call numbers, and more.\n\n"
        "This demo works with catalog cards from libraries and archives, such as the "
        "[Rubenstein Manuscript Catalog](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
        "and [Boston Public Library Card Catalog](https://huggingface.co/datasets/biglam/bpl-card-catalog)."
    )
    gr.Markdown("---")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“€ Upload Catalog Card")
            image_input = gr.Image(
                label="Library Catalog Card",
                type="pil",
            )
            submit_btn = gr.Button("πŸ” Extract Metadata", variant="primary", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### πŸ“‹ Extracted Metadata (JSON)")
            output = gr.Code(
                label="Metadata",
                language="json",
                lines=15,
            )

    submit_btn.click(
        fn=extract_metadata,
        inputs=image_input,
        outputs=output,
    )
gr.Markdown("---")
# Examples
gr.Markdown("## 🎯 Try Examples")
gr.Examples(
examples=[
["examples/rubenstein_0.jpg"],
["examples/rubenstein_1.jpg"],
["examples/rubenstein_2.jpg"],
["examples/bpl_0.jpg"],
["examples/bpl_1.jpg"],
["examples/bpl_2.jpg"],
],
inputs=image_input,
outputs=output,
fn=extract_metadata,
cache_examples=False
)
gr.Markdown("---")
# Footer
gr.Markdown(
"<center>\n\n"
"Built for the GLAM community using [Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) | "
"Example cards from [Rubenstein](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
"and [BPL](https://huggingface.co/datasets/biglam/bpl-card-catalog) collections\n\n"
"</center>"
)
if __name__ == "__main__":
    print("Launching demo...")
    demo.launch()