Spaces:

davanstrien
/

vllm-index-card-extractor

Running on Zero

App Files Files Community

vllm-index-card-extractor / app.py

davanstrien HF Staff

Upload 9 files

a85cd29 verified about 2 months ago

raw

history blame

5.68 kB

	import json
	import os

	import gradio as gr
	import spaces
	import torch
	from PIL import Image
	from qwen_vl_utils import process_vision_info
	from transformers import (
	    AutoModelForImageTextToText,
	    AutoProcessor,
	    Qwen2VLForConditionalGeneration,
	)

	# NOTE(review): huggingface_hub appears to read this flag when it is first
	# imported, and the imports above already pull it in -- confirm whether this
	# assignment needs to move ahead of those imports to take effect.
	os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

	# Single source of truth for the checkpoint so model and processor cannot drift.
	MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

	# Load model and processor.
	# The checkpoint is a Qwen3-VL mixture-of-experts model, not a Qwen2-VL one, so
	# it must not be instantiated through Qwen2VLForConditionalGeneration. The Auto
	# class resolves the architecture from the checkpoint's own config.
	print("Loading Qwen3-VL-30B-A3B-Instruct model...")
	model = AutoModelForImageTextToText.from_pretrained(
	    MODEL_ID,
	    torch_dtype=torch.bfloat16,
	    device_map="auto",
	)
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	print("Model loaded successfully!")

	# Instruction text sent to the model alongside the card image. It lists the
	# metadata fields to extract and asks for a bare JSON object, with null for
	# any field that is absent from the card.
	EXTRACTION_PROMPT = """Extract all metadata from this library catalog card and return it as valid JSON with the following fields:
	- title: The main title or name on the card
	- author: Author, creator, or associated person/organization
	- date: Any dates mentioned (publication, creation, or coverage dates)
	- call_number: Library classification or call number
	- physical_description: Details about the physical item (size, extent, format)
	- subjects: Subject headings or topics
	- notes: Any additional notes or information

	Return ONLY the JSON object, nothing else. If a field is not present on the card, use null for that field."""

	def _strip_code_fence(text):
	    """Return *text* without a surrounding Markdown code fence, if it has one."""
	    cleaned = text.strip()
	    if not cleaned.startswith("```"):
	        return cleaned
	    # Drop the opening fence line, which may carry a language tag such as "json".
	    _, _, cleaned = cleaned.partition("\n")
	    cleaned = cleaned.rstrip()
	    if cleaned.endswith("```"):
	        cleaned = cleaned[:-3]
	    return cleaned.strip()


	@spaces.GPU
	def extract_metadata(image):
	    """Extract structured metadata from a catalog card image.

	    Args:
	        image: A PIL image, or any value ``PIL.Image.open`` accepts (such as
	            a file path). ``None`` means no image was supplied.

	    Returns:
	        A string. Pretty-printed JSON when the model's reply parses as JSON
	        (optionally wrapped in a Markdown code fence), the raw reply when it
	        does not, ``"Please upload an image."`` when *image* is ``None``, or
	        an ``"Error during extraction: ..."`` message if anything raised.
	    """
	    if image is None:
	        return "Please upload an image."

	    try:
	        # Ensure image is a PIL Image
	        if not isinstance(image, Image.Image):
	            image = Image.open(image).convert("RGB")

	        # Format messages for Qwen3-VL
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": image},
	                    {"type": "text", "text": EXTRACTION_PROMPT},
	                ],
	            }
	        ]

	        # Prepare inputs
	        text = processor.apply_chat_template(
	            messages, tokenize=False, add_generation_prompt=True
	        )
	        # NOTE(review): Qwen3-VL guidance mentions passing image_patch_size=16
	        # to process_vision_info -- confirm against the installed
	        # qwen_vl_utils version before changing this call.
	        image_inputs, video_inputs = process_vision_info(messages)

	        inputs = processor(
	            text=[text],
	            images=image_inputs,
	            videos=video_inputs,
	            padding=True,
	            return_tensors="pt",
	        )
	        inputs = inputs.to(model.device)

	        # Generate. Decoding is greedy (do_sample=False), so no temperature is
	        # passed: it would be ignored and only trigger a warning.
	        with torch.inference_mode():
	            generated_ids = model.generate(
	                **inputs,
	                max_new_tokens=512,
	                do_sample=False,
	            )

	        # Trim the prompt tokens so only newly generated tokens are decoded
	        generated_ids_trimmed = [
	            out_ids[len(in_ids):]
	            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	        ]

	        # Decode output
	        output_text = processor.batch_decode(
	            generated_ids_trimmed,
	            skip_special_tokens=True,
	            clean_up_tokenization_spaces=False,
	        )[0]

	        # Try to parse as JSON for pretty formatting. A Markdown code fence
	        # around the reply is stripped first so it still parses.
	        try:
	            json_data = json.loads(_strip_code_fence(output_text))
	        except json.JSONDecodeError:
	            # If not valid JSON, return the model's reply as-is
	            return output_text
	        # ensure_ascii=False keeps accented and non-Latin characters readable
	        # instead of emitting \uXXXX escapes.
	        return json.dumps(json_data, indent=2, ensure_ascii=False)

	    except Exception as e:
	        # UI boundary: report the failure in the output panel instead of raising.
	        return f"Error during extraction: {str(e)}"

	# Create Gradio interface: an upload column and a JSON output column, followed
	# by clickable example cards and a footer.
	with gr.Blocks(title="Library Card Metadata Extractor") as demo:
	    gr.Markdown("# 📇 Library Card Metadata Extractor")
	    gr.Markdown(
	        "Extract structured metadata from library catalog cards using Qwen3-VL-30B. "
	        "Upload an image of a catalog card and get JSON-formatted metadata including title, author, dates, "
	        "call numbers, and more.\n\n"
	        "This demo works with catalog cards from libraries and archives, such as the "
	        "[Rubenstein Manuscript Catalog](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
	        "and [Boston Public Library Card Catalog](https://huggingface.co/datasets/biglam/bpl-card-catalog)."
	    )

	    gr.Markdown("---")

	    with gr.Row():
	        with gr.Column(scale=1):
	            gr.Markdown("### 📤 Upload Catalog Card")
	            image_input = gr.Image(
	                label="Library Catalog Card",
	                type="pil",
	            )
	            submit_btn = gr.Button("🔍 Extract Metadata", variant="primary", size="lg")

	        with gr.Column(scale=1):
	            gr.Markdown("### 📋 Extracted Metadata (JSON)")
	            output = gr.Code(
	                label="Metadata",
	                language="json",
	                lines=15,
	            )

	    # Run extraction on the uploaded image and show the result in the code panel.
	    submit_btn.click(
	        fn=extract_metadata,
	        inputs=image_input,
	        outputs=output,
	    )

	    gr.Markdown("---")

	    # Examples
	    gr.Markdown("## 🎯 Try Examples")
	    gr.Examples(
	        examples=[
	            ["examples/rubenstein_0.jpg"],
	            ["examples/rubenstein_1.jpg"],
	            ["examples/rubenstein_2.jpg"],
	            ["examples/bpl_0.jpg"],
	            ["examples/bpl_1.jpg"],
	            ["examples/bpl_2.jpg"],
	        ],
	        inputs=image_input,
	        outputs=output,
	        fn=extract_metadata,
	        cache_examples=False,
	    )

	    gr.Markdown("---")

	    # Footer. The separator is a plain "|": the previous "\|" was an invalid
	    # string escape sequence and left a stray backslash in the literal.
	    gr.Markdown(
	        "<center>\n\n"
	        "Built for the GLAM community using [Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) | "
	        "Example cards from [Rubenstein](https://huggingface.co/datasets/biglam/rubenstein-manuscript-catalog) "
	        "and [BPL](https://huggingface.co/datasets/biglam/bpl-card-catalog) collections\n\n"
	        "</center>"
	    )

	def _launch_demo():
	    """Announce start-up on stdout, then start the Gradio app."""
	    print("Launching demo...")
	    demo.launch()


	# Only start the server when this file is executed as a script.
	if __name__ == "__main__":
	    _launch_demo()