space_to_dataset_saver

Build error

App Files Files Community

Wauplin HF Staff committed on Jun 30, 2023

Commit

65e637b

1 Parent(s): 1a0a79f

leaner ParquetScheduler

Browse files

Files changed (1) hide show

app_parquet.py +77 -40

app_parquet.py CHANGED Viewed

@@ -6,13 +6,14 @@ import shutil
 import tempfile
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List
 import gradio as gr
 import pyarrow as pa
 import pyarrow.parquet as pq
 from gradio_client import Client
 from huggingface_hub import CommitScheduler
 #######################
 # Parquet scheduler   #
@@ -21,54 +22,95 @@ from huggingface_hub import CommitScheduler
 class ParquetScheduler(CommitScheduler):
     def append(self, row: Dict[str, Any]) -> None:
         with self.lock:
-            if not hasattr(self, "rows") or self.rows is None:
-                self.rows = []
-            self.rows.append(row)
-    def set_schema(self, schema: Dict[str, Dict[str, str]]) -> None:
-        """
-        Define a schema to help `datasets` load the generated library.
-        This method is optional and can be called once just after the scheduler had been created. If it is not called,
-        the schema is automatically inferred before pushing the data to the Hub.
-        See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
-        possible values.
-        Example:
-        ```py
-        scheduler.set_schema({
-            "prompt": {"_type": "Value", "dtype": "string"},
-            "negative_prompt": {"_type": "Value", "dtype": "string"},
-            "guidance_scale": {"_type": "Value", "dtype": "int64"},
-            "image": {"_type": "Image"},
-        })
-        ```
-        """
-        self._schema = schema
     def push_to_hub(self):
         # Check for new rows to push
         with self.lock:
-            rows = getattr(self, "rows", None)
-            self.rows = None
         if not rows:
             return
         print(f"Got {len(rows)} item(s) to commit.")
         # Load images + create 'features' config for datasets library
-        hf_features: Dict[str, Dict] = getattr(self, "_schema", None) or {}
         path_to_cleanup: List[Path] = []
         for row in rows:
             for key, value in row.items():
                 # Infer schema (for `datasets` library)
-                if key not in hf_features:
-                    hf_features[key] = _infer_schema(key, value)
                 # Load binary files if necessary
-                if hf_features[key]["_type"] in ("Image", "Audio"):
                     # It's an image or audio: we load the bytes and remember to cleanup the file
                     file_path = Path(value)
                     if file_path.is_file():
@@ -80,7 +122,7 @@ class ParquetScheduler(CommitScheduler):
         # Complete rows if needed
         for row in rows:
-            for feature in hf_features:
                 if feature not in row:
                     row[feature] = None
@@ -89,7 +131,7 @@ class ParquetScheduler(CommitScheduler):
         # Add metadata (used by datasets library)
         table = table.replace_schema_metadata(
-            {"huggingface": json.dumps({"info": {"features": hf_features}})}
         )
         # Write to parquet file
@@ -142,12 +184,7 @@ def _infer_schema(key: str, value: Any) -> Dict[str, str]:
 PARQUET_DATASET_DIR = Path("parquet_dataset")
 PARQUET_DATASET_DIR.mkdir(parents=True, exist_ok=True)
-scheduler = ParquetScheduler(
-    repo_id="example-space-to-dataset-parquet",
-    repo_type="dataset",
-    folder_path=PARQUET_DATASET_DIR,
-    path_in_repo="data",
-)
 client = Client("stabilityai/stable-diffusion")

 import tempfile
 import uuid
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
 import gradio as gr
 import pyarrow as pa
 import pyarrow.parquet as pq
 from gradio_client import Client
 from huggingface_hub import CommitScheduler
+from huggingface_hub.hf_api import HfApi
 #######################
 # Parquet scheduler   #
 class ParquetScheduler(CommitScheduler):
+    """
+    Usage: configure the scheduler with a repo id. Once started, you can add data to be uploaded to the Hub. 1 `.append`
+    call will result in 1 row in your final dataset.
+    ```py
+    # Start scheduler
+    >>> scheduler = ParquetScheduler(repo_id="my-parquet-dataset")
+    # Append some data to be uploaded
+    >>> scheduler.append({...})
+    >>> scheduler.append({...})
+    >>> scheduler.append({...})
+    ```
+    The scheduler will automatically infer the schema from the data it pushes.
+    Optionally, you can manually set the schema yourself:
+    ```py
+    >>> scheduler = ParquetScheduler(
+    ...     repo_id="my-parquet-dataset",
+    ...     schema={
+    ...         "prompt": {"_type": "Value", "dtype": "string"},
+    ...         "negative_prompt": {"_type": "Value", "dtype": "string"},
+    ...         "guidance_scale": {"_type": "Value", "dtype": "int64"},
+    ...         "image": {"_type": "Image"},
+    ...     },
+    ... )
+    See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
+    possible values.
+    """
+    def __init__(
+        self,
+        *,
+        repo_id: str,
+        schema: Optional[Dict[str, Dict[str, str]]] = None,
+        every: Union[int, float] = 5,
+        path_in_repo: Optional[str] = "data",
+        repo_type: Optional[str] = "dataset",
+        revision: Optional[str] = None,
+        private: bool = False,
+        token: Optional[str] = None,
+        allow_patterns: Union[List[str], str, None] = None,
+        ignore_patterns: Union[List[str], str, None] = None,
+        hf_api: Optional[HfApi] = None,
+    ) -> None:
+        super().__init__(
+            repo_id=repo_id,
+            folder_path="dummy",  # not used by the scheduler
+            every=every,
+            path_in_repo=path_in_repo,
+            repo_type=repo_type,
+            revision=revision,
+            private=private,
+            token=token,
+            allow_patterns=allow_patterns,
+            ignore_patterns=ignore_patterns,
+            hf_api=hf_api,
+        )
+        self._rows: List[Dict[str, Any]] = []
+        self._schema = schema
     def append(self, row: Dict[str, Any]) -> None:
+        """Add a new item to be uploaded."""
         with self.lock:
+            self._rows.append(row)
     def push_to_hub(self):
         # Check for new rows to push
         with self.lock:
+            rows = self._rows
+            self._rows = []
         if not rows:
             return
         print(f"Got {len(rows)} item(s) to commit.")
         # Load images + create 'features' config for datasets library
+        schema: Dict[str, Dict] = self._schema or {}
         path_to_cleanup: List[Path] = []
         for row in rows:
             for key, value in row.items():
                 # Infer schema (for `datasets` library)
+                if key not in schema:
+                    schema[key] = _infer_schema(key, value)
                 # Load binary files if necessary
+                if schema[key]["_type"] in ("Image", "Audio"):
                     # It's an image or audio: we load the bytes and remember to cleanup the file
                     file_path = Path(value)
                     if file_path.is_file():
         # Complete rows if needed
         for row in rows:
+            for feature in schema:
                 if feature not in row:
                     row[feature] = None
         # Add metadata (used by datasets library)
         table = table.replace_schema_metadata(
+            {"huggingface": json.dumps({"info": {"features": schema}})}
         )
         # Write to parquet file
 PARQUET_DATASET_DIR = Path("parquet_dataset")
 PARQUET_DATASET_DIR.mkdir(parents=True, exist_ok=True)
+scheduler = ParquetScheduler(repo_id="example-space-to-dataset-parquet")
 client = Client("stabilityai/stable-diffusion")