# PaperShow / Paper2Poster / PosterAgent / create_dataset.py
# ZaynZhu
# Clean version without large assets
# 7c08dc3
from datasets import load_dataset
import os
import subprocess
from PIL import Image
import json
def generate_meta_json(base_dir='Paper2Poster-data'):
    """Write a ``meta.json`` holding the poster's pixel size into each folder.

    Every immediate subdirectory of ``base_dir`` is inspected. When it
    contains a ``poster.png``, the image's width and height are written as
    ``{"width": ..., "height": ...}`` to ``meta.json`` in that same folder.
    Progress, missing posters and per-folder failures are reported on stdout;
    a failure in one folder does not stop the scan.

    Args:
        base_dir: Directory whose subfolders are scanned.
    """
    for folder_name in os.listdir(base_dir):
        sample_dir = os.path.join(base_dir, folder_name)
        if not os.path.isdir(sample_dir):
            # Plain files directly under base_dir are skipped without a message.
            continue

        poster_file = os.path.join(sample_dir, 'poster.png')
        if not os.path.exists(poster_file):
            print(f"No poster.png found in folder '{folder_name}'.")
            continue

        try:
            # Image.size is a (width, height) tuple.
            with Image.open(poster_file) as poster:
                poster_width, poster_height = poster.size

            meta_file = os.path.join(sample_dir, 'meta.json')
            with open(meta_file, 'w') as out:
                json.dump({'width': poster_width, 'height': poster_height}, out)

            print(f"Metadata for '{folder_name}' saved successfully.")
        except Exception as e:
            # Best effort: report the problem and move on to the next folder.
            print(f"Error processing image in folder '{folder_name}': {e}")
def _download(url, output_path):
    """Fetch ``url`` into ``output_path`` with ``wget``.

    Args:
        url: Address of the file to download.
        output_path: Local path the file is written to.

    Returns:
        True when ``wget`` exits successfully, False otherwise.
    """
    try:
        subprocess.run(['wget', url, '-O', output_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading {url}: {e}")
        # `wget -O` creates/truncates the target before transferring, so a
        # failed download leaves an empty or partial file behind. Remove it so
        # later steps do not mistake it for a real paper or poster.
        if os.path.exists(output_path):
            os.remove(output_path)
        return False
    print(f"Downloaded {url} into {output_path}")
    return True


if __name__ == "__main__":
    # Download every sample of the Paper2Poster dataset into
    # <data_dir>/<title>/ (QA json, paper PDF, poster PNG), then record each
    # poster's dimensions in a meta.json next to it.
    data_dir = 'Paper2Poster-data'
    dataset = load_dataset("Paper2Poster/Paper2Poster", split="train")
    os.makedirs(data_dir, exist_ok=True)

    for data in dataset:
        paper_title = data['title']
        paper_url = data['paper_url']
        poster_url = data['image_url']
        qa = data['qa']

        # NOTE(review): the title is used verbatim as a directory name. A title
        # containing a path separator would create nested directories, which
        # generate_meta_json (one level deep) would not find — confirm the
        # dataset has no such titles.
        paper_dir = os.path.join(data_dir, paper_title)
        os.makedirs(paper_dir, exist_ok=True)

        # The 'qa' field is a JSON string; parse it before opening the output
        # file so a malformed value does not leave an empty o3_qa.json behind.
        qa_dict = json.loads(qa)
        qa_path = os.path.join(paper_dir, 'o3_qa.json')
        with open(qa_path, 'w') as f:
            json.dump(qa_dict, f, indent=4)
        print(f"Saved QA for {paper_title} into {qa_path}")

        # Download the two files independently: a failure fetching the paper
        # must not prevent the poster from being fetched, and each failure is
        # reported against the URL that actually failed.
        _download(paper_url, os.path.join(paper_dir, 'paper.pdf'))
        _download(poster_url, os.path.join(paper_dir, 'poster.png'))

    generate_meta_json(data_dir)