Spaces:

Dataset-Tools
/

pdf-to-page-images-dataset

Running

App Files Files Community

davanstrien HF Staff committed on Sep 19, 2024

Commit

3c15d19

1 Parent(s): 0ef7a85

update options

Browse files

Files changed (1) hide show

app.py +69 -9

app.py CHANGED Viewed

@@ -3,11 +3,15 @@ import random
 import shutil
 import tempfile
 import zipfile
 import gradio as gr
-from huggingface_hub import HfApi
 from pdf2image import convert_from_path
 from PyPDF2 import PdfReader
 def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
@@ -48,10 +52,25 @@ def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
     return all_images, f"Saved {len(all_images)} images to temporary directory"
 def process_pdfs(
     pdf_files,
     sample_size,
     hf_repo,
     oauth_token: gr.OAuthToken | None,
     progress=gr.Progress(),
 ):
@@ -81,12 +100,15 @@ def process_pdfs(
         progress(0, desc="Starting PDF processing")
         images, message = pdf_to_images(pdf_files, sample_size, images_dir)
-        # Create a zip file of the images
-        zip_path = os.path.join(temp_dir, "converted_images.zip")
-        with zipfile.ZipFile(zip_path, "w") as zipf:
-            progress(0, desc="Zipping images")
-            for image in progress.tqdm(images, desc="Zipping images"):
-                zipf.write(image, os.path.basename(image))
         if hf_repo:
             try:
@@ -94,6 +116,7 @@ def process_pdfs(
                 hf_api.create_repo(
                     hf_repo,
                     repo_type="dataset",
                 )
                 hf_api.upload_folder(
                     folder_path=images_dir,
@@ -101,7 +124,41 @@ def process_pdfs(
                     repo_type="dataset",
                     path_in_repo="images",
                 )
-                message += f"\nUploaded images to Hugging Face repo: {hf_repo}/images"
             except Exception as e:
                 message += f"\nFailed to upload to Hugging Face: {str(e)}"
@@ -140,6 +197,9 @@ with gr.Blocks() as demo:
             placeholder="username/repo-name",
             info="Enter the Hugging Face repository name in the format 'username/repo-name'",
         )
     with gr.Accordion("View converted images", open=False):
         output_gallery = gr.Gallery(label="Converted Images")
     status_text = gr.Markdown(label="Status")
@@ -148,7 +208,7 @@ with gr.Blocks() as demo:
     submit_button = gr.Button("Convert PDFs to page images")
     submit_button.click(
         process_pdfs,
-        inputs=[pdf_files, sample_size, hf_repo],
         outputs=[output_gallery, download_button, status_text],
     )

 import shutil
 import tempfile
 import zipfile
+from datetime import datetime
 import gradio as gr
+from huggingface_hub import HfApi, DatasetCard, DatasetCardData
 from pdf2image import convert_from_path
 from PyPDF2 import PdfReader
+from dataset_card_template import DATASET_CARD_TEMPLATE
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 def pdf_to_images(pdf_files, sample_size, temp_dir, progress=gr.Progress()):
     return all_images, f"Saved {len(all_images)} images to temporary directory"
def get_size_category(num_images):
    """Return the Hugging Face Hub ``size_categories`` tag for a dataset size.

    The returned string is one of the bucket labels the Hub accepts in
    dataset card metadata (``n<1K``, ``1K<n<10K``, ..., ``100B<n<1T``,
    ``n>1T``). Each bucket includes its lower bound and excludes its upper
    bound, so exactly 1000 images maps to ``1K<n<10K``.

    Args:
        num_images: Number of images (examples) in the dataset.

    Returns:
        The size-category tag as a string.
    """
    # (exclusive upper bound, tag) pairs in ascending order. The previous
    # implementation stopped at "n>1M", which is not a tag the Hub recognises;
    # the buckets continue in powers of ten up to one trillion.
    buckets = (
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
        (1_000_000_000, "100M<n<1B"),
        (10_000_000_000, "1B<n<10B"),
        (100_000_000_000, "10B<n<100B"),
        (1_000_000_000_000, "100B<n<1T"),
    )
    for upper_bound, tag in buckets:
        if num_images < upper_bound:
            return tag
    return "n>1T"
 def process_pdfs(
     pdf_files,
     sample_size,
     hf_repo,
+    create_zip,
+    private_repo,
     oauth_token: gr.OAuthToken | None,
     progress=gr.Progress(),
 ):
         progress(0, desc="Starting PDF processing")
         images, message = pdf_to_images(pdf_files, sample_size, images_dir)
+        zip_path = None
+        if create_zip:
+            # Create a zip file of the images
+            zip_path = os.path.join(temp_dir, "converted_images.zip")
+            with zipfile.ZipFile(zip_path, "w") as zipf:
+                progress(0, desc="Zipping images")
+                for image in progress.tqdm(images, desc="Zipping images"):
+                    zipf.write(image, os.path.basename(image))
+            message += f"\nCreated zip file with {len(images)} images"
         if hf_repo:
             try:
                 hf_api.create_repo(
                     hf_repo,
                     repo_type="dataset",
+                    private=private_repo,
                 )
                 hf_api.upload_folder(
                     folder_path=images_dir,
                     repo_type="dataset",
                     path_in_repo="images",
                 )
+                # Determine size category
+                size_category = get_size_category(len(images))
+                # Create DatasetCardData instance
+                card_data = DatasetCardData(
+                    tags=["created-with-pdfs-to-page-images-converter", "pdf-to-image"],
+                    size_categories=[size_category],
+                )
+                # Create and populate the dataset card
+                card = DatasetCard.from_template(
+                    card_data,
+                    template_path=None,  # Use default template
+                    hf_repo=hf_repo,
+                    num_images=len(images),
+                    num_pdfs=len(pdf_files),
+                    sample_size=sample_size if sample_size > 0 else "All pages",
+                    creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                )
+                # Add our custom content to the card
+                card.text = DATASET_CARD_TEMPLATE.format(
+                    hf_repo=hf_repo,
+                    num_images=len(images),
+                    num_pdfs=len(pdf_files),
+                    sample_size=sample_size if sample_size > 0 else "All pages",
+                    creation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    size_category=size_category,
+                )
+                repo_url = f"https://huggingface.co/datasets/{hf_repo}"
+                message += f"\nUploaded dataset card to Hugging Face repo: [{hf_repo}]({repo_url})"
+                card.push_to_hub(hf_repo)
             except Exception as e:
                 message += f"\nFailed to upload to Hugging Face: {str(e)}"
             placeholder="username/repo-name",
             info="Enter the Hugging Face repository name in the format 'username/repo-name'",
         )
+    with gr.Row():
+        create_zip = gr.Checkbox(label="Create ZIP file of images?", value=False)
+        private_repo = gr.Checkbox(label="Make repository private?", value=False)
     with gr.Accordion("View converted images", open=False):
         output_gallery = gr.Gallery(label="Converted Images")
     status_text = gr.Markdown(label="Status")
     submit_button = gr.Button("Convert PDFs to page images")
     submit_button.click(
         process_pdfs,
+        inputs=[pdf_files, sample_size, hf_repo, create_zip, private_repo],
         outputs=[output_gallery, download_button, status_text],
     )