|
|
import json
import os
import shutil
import subprocess

from datasets import load_dataset
from PIL import Image
|
|
|
|
|
def generate_meta_json(base_dir='Paper2Poster-data'):
    """Write a ``meta.json`` holding the poster dimensions into each subfolder.

    Every entry of *base_dir* that is a directory is checked for a
    ``poster.png``. When one exists, its pixel width and height are read with
    PIL and stored as ``{"width": ..., "height": ...}`` in ``meta.json`` inside
    the same subfolder. Progress and problems are reported with ``print``.

    Args:
        base_dir: Directory whose immediate subdirectories are processed.

    Returns:
        None. Results are written to disk as a side effect.
    """
    for folder_name in os.listdir(base_dir):
        paper_dir = os.path.join(base_dir, folder_name)

        # Entries that are not directories are skipped without any message.
        if not os.path.isdir(paper_dir):
            continue

        poster_file = os.path.join(paper_dir, 'poster.png')
        if not os.path.exists(poster_file):
            print(f"No poster.png found in folder '{folder_name}'.")
            continue

        try:
            # Only the size is needed, so the image is closed right away.
            with Image.open(poster_file) as poster:
                poster_width, poster_height = poster.size

            meta_file = os.path.join(paper_dir, 'meta.json')
            with open(meta_file, 'w') as meta_out:
                json.dump({'width': poster_width, 'height': poster_height}, meta_out)

            print(f"Metadata for '{folder_name}' saved successfully.")
        except Exception as e:
            # Best effort: one unreadable poster must not stop the other folders.
            print(f"Error processing image in folder '{folder_name}': {e}")
|
|
|
|
|
def sanitize_folder_name(title):
    """Return *title* with path separators replaced so it names a single folder.

    A title containing ``/`` (or the platform's separator) would otherwise be
    split into nested directories when joined onto the output root, and
    ``generate_meta_json`` only looks one level deep for ``poster.png``.

    Args:
        title: Paper title taken from a dataset record.

    Returns:
        The title with every path separator replaced by ``_``. Titles without
        separators are returned unchanged.
    """
    separators = {'/', os.sep}
    if os.altsep:
        separators.add(os.altsep)
    for separator in separators:
        title = title.replace(separator, '_')
    return title


def download_file(url, output_path):
    """Download *url* to *output_path* with ``wget``.

    The data is first written to ``<output_path>.part`` and only moved into
    place after ``wget`` exits successfully. ``wget -O`` creates/truncates its
    target before any data arrives, so writing directly to *output_path* would
    leave an empty file behind on failure (and destroy a previously good one).

    Args:
        url: Address to fetch.
        output_path: Final location of the downloaded file.

    Returns:
        True if the file was downloaded and moved into place, False if
        ``wget`` exited with a non-zero status.
    """
    partial_path = output_path + '.part'
    try:
        subprocess.run(['wget', url, '-O', partial_path], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading {url}: {e}")
        # Drop the empty/partial file so later steps do not mistake it for data.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

    os.replace(partial_path, output_path)
    print(f"Downloaded {url} into {output_path}")
    return True


if __name__ == "__main__":
    # Fail early with a clear message rather than a FileNotFoundError
    # traceback from the first subprocess call.
    if shutil.which('wget') is None:
        raise SystemExit("wget is required to download the papers and posters, but it was not found on PATH.")

    output_root = 'Paper2Poster-data'
    dataset = load_dataset("Paper2Poster/Paper2Poster", split="train")
    os.makedirs(output_root, exist_ok=True)

    for data in dataset:
        paper_title = data['title']
        paper_url = data['paper_url']
        poster_url = data['image_url']
        qa = data['qa']

        # One folder per paper; separators in the title must not create nesting.
        paper_dir = os.path.join(output_root, sanitize_folder_name(paper_title))
        os.makedirs(paper_dir, exist_ok=True)

        paper_output_path = os.path.join(paper_dir, 'paper.pdf')
        poster_output_path = os.path.join(paper_dir, 'poster.png')
        qa_path = os.path.join(paper_dir, 'o3_qa.json')

        # The 'qa' field is parsed as a JSON string and re-saved pretty-printed.
        qa_dict = json.loads(qa)
        with open(qa_path, 'w', encoding='utf-8') as f:
            json.dump(qa_dict, f, indent=4)
        print(f"Saved QA for {paper_title} into {qa_path}")

        # Each download is attempted independently so that a failing paper URL
        # does not prevent the poster from being fetched (and vice versa).
        download_file(paper_url, paper_output_path)
        download_file(poster_url, poster_output_path)

    # Record poster dimensions once all downloads have been attempted.
    generate_meta_json(output_root)