import gradio as gr from PIL import Image import os import torch import json import spaces from transformers import AutoModelForImageTextToText, AutoProcessor from qwen_vl_utils import process_vision_info os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" # Load model and processor print("Loading Qwen3-VL-30B-A3B-Instruct model...") model = AutoModelForImageTextToText.from_pretrained( "Qwen/Qwen3-VL-30B-A3B-Instruct", torch_dtype=torch.bfloat16, device_map="auto" ) processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct") print("Model loaded successfully!") EXTRACTION_PROMPT = """Extract metadata from this library catalog card as JSON. Library catalog cards contain bibliographic information about materials and filing/access information. Extract whatever fields are present: CORE BIBLIOGRAPHIC FIELDS: - title: Full title of the work - author: Main author/creator (person or organization) - editor: Editor if different from author - contributor: Other contributors (translators, illustrators, etc.) - publication_date: Date(s) of publication - publisher: Publisher name - publication_place: Place of publication - physical_description: Physical details (volumes, pages, size, illustrations) - series: Series information if part of a series - edition: Edition statement - contents: Description of contents, volumes, or parts CATALOGING/ACCESS FIELDS: - call_number: Library classification number - subject_headings: Subject terms (often numbered list) - added_entries: Additional access points for co-authors, editors, etc. (often with Roman numerals) - notes: Any additional notes CARD-SPECIFIC: - filing_heading: The heading under which this card is filed (often at top, may be in all caps) - card_sequence: If this is a continuation card (e.g., "Card 2", "Card 3") Return ONLY valid JSON. Use null for fields not present on the card. Use arrays [] for repeating fields like subject_headings and added_entries.""" @spaces.GPU def extract_metadata(image): """Extract structured metadata from catalog card image.""" if image is None: return "Please upload an image." try: # Ensure image is PIL Image if not isinstance(image, Image.Image): image = Image.open(image).convert("RGB") # Format messages for Qwen3-VL messages = [ { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": EXTRACTION_PROMPT}, ], } ] # Prepare inputs text = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) image_inputs, video_inputs = process_vision_info(messages) inputs = processor( text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", ) inputs = inputs.to(model.device) # Generate with torch.inference_mode(): generated_ids = model.generate( **inputs, max_new_tokens=512, temperature=0.1, do_sample=False ) # Trim input tokens from output generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] # Decode output output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False, )[0] # Try to parse as JSON for pretty formatting try: json_data = json.loads(output_text) return json.dumps(json_data, indent=2) except json.JSONDecodeError: # If not valid JSON, return as-is return output_text except Exception as e: return f"Error during extraction: {str(e)}" # Create Gradio interface with gr.Blocks(title="Library Card Metadata Extractor") as demo: gr.Markdown("# 📇 Library Card Metadata Extractor") gr.Markdown( "Extract structured metadata from library catalog cards using **Qwen/Qwen3-VL-30B-A3B-Instruct**. " "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, " "call numbers, and more.\n\n" "This demo works with catalog cards from libraries and archives, such as the " "[Rubenstein Manuscript Catalog](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) " "and [Boston Public Library Card Catalog](https://huggingface.co/datasets/biglam/bpl-card-catalog)." ) gr.Markdown("---") with gr.Row(): with gr.Column(scale=1): gr.Markdown("### 📤 Upload Catalog Card") image_input = gr.Image(label="Library Catalog Card", type="pil") submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg") with gr.Column(scale=1): gr.Markdown("### 📋 Extracted Metadata (JSON)") output = gr.Code(label="Metadata", language="json", lines=15) submit_btn.click(fn=extract_metadata, inputs=image_input, outputs=output) gr.Markdown("---") # Examples gr.Markdown("## 🎯 Try Examples") gr.Examples( examples=[ ["examples/bpl_0.jpg"], ["examples/bpl_2.jpg"], ["examples/bpl_4.jpg"], ["examples/bpl_6.jpg"], ["examples/bpl_8.jpg"], ["examples/bpl_9.jpg"], ["examples/bpl_12.jpg"], ["examples/bpl_15.jpg"], ["examples/bpl_22.jpg"], ], inputs=image_input, outputs=output, fn=extract_metadata, cache_examples=False, ) gr.Markdown("---") # Footer gr.Markdown( "