Spaces:

synthetic-data-universe
/

synth

Sleeping

App Files Community

edbeeching committed on Sep 16

Commit

f00ab9d

1 Parent(s): 30e16b4

polishing

Browse files

Files changed (1) hide show

app.py +35 -41

app.py CHANGED Viewed

@@ -276,7 +276,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
     # check the output dataset is valid and accessible with the provided token
     try:
-        output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
         raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
     except Exception:
         pass  # dataset does not exist, which is expected
@@ -335,7 +335,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
     return request
-def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=None):
     """Load dataset information and return choices for dropdowns"""
     if not dataset_name.strip():
         return (
@@ -542,8 +542,8 @@ def main():
     cache_all_model_params()
     print("Model parameter caching complete.")
-    with gr.Blocks(title="Synthetic Data Generation") as demo:
-        gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=200)
         gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
         # Add sign-in button at the top
@@ -561,75 +561,69 @@ def main():
         with main_interface:
             with gr.Group():
                 with gr.Row():
-                    gr.Markdown("# Synthetic Data Generation Request")
                 with gr.Row():
                     gr.Markdown("""
-                    🚀 **Generate high-quality synthetic data using state-of-the-art language models!** Perfect for training datasets, data augmentation, and research experiments.
-                    ✨ **Features:**
-                    - 🆓 **Free for PRO users** - Uses idle GPUs on the HF science cluster
-                    - 🤖 **20+ Popular Models** - Including Qwen, Llama, Mistral, and more
-                    - ⚡ **Fast Processing** - Optimized for batch generation
-                    - 📊 **Up to 10K samples** - For PRO users (100 for free users)
-                    ⚠️ **Important:** All generated datasets are **PUBLIC** and available under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).
                     """)
-            with gr.Accordion("📖 Complete Usage Guide", open=False):
                 with gr.Row():
                     gr.Markdown("""
-                    **🔄 Step-by-Step Process:**
-                    1. **📂 Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
-                    2. **📊 Load Info**: Click "📊 Load Dataset Info" to populate configs, columns, and splits
-                    3. **🤖 Choose Model**: Select from 20+ popular instruction-tuned models
-                    4. **⚙️ Configure**: Set generation parameters (temperature, tokens, etc.)
-                    5. **🚀 Submit**: Click submit and monitor progress in the Statistics tab
-                    **💡 Pro Tips:**
                     - Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
                     - Start with fewer samples to test your prompt before scaling up
                     - Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
                     """)
                     gr.Markdown("""
-                    **📋 Requirements & Limits:**
-                    - ✅ Input dataset must be **publicly accessible** on HF Hub
-                    - ✅ Model must be **publicly accessible** (not gated)
-                    - 📊 **Sample Limits:**
-                      - 🆓 Free users: 100 samples max
-                      - ⭐ PRO users: 10,000 samples max
-                    - 🔤 **Token Limit:** 8,192 generated tokens per sample
-                    - ⏱️ **Processing Time:** Varies by model size and queue status
-                    **🔒 Privacy & Usage:**
                     - All outputs are **PUBLIC** on Hugging Face Hub
                     - Datasets appear under `synthetic-data-universe` organization
                     - Perfect for research, training data, and open-source projects
                     """)
-            with gr.Accordion("💡 Examples & Use Cases", open=False):
                 gr.Markdown("""
-                **🎯 Popular Use Cases:**
-                **📚 Educational Content Generation**
                 - Input: Questions dataset → Output: Detailed explanations and answers
                 - Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
                 - Temperature: 0.3-0.5 for factual accuracy
-                **💬 Conversational Data**
                 - Input: Conversation starters → Output: Multi-turn dialogues
                 - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
                 - Temperature: 0.7-0.9 for natural variety
-                **🔧 Code Generation**
                 - Input: Problem descriptions → Output: Code solutions with explanations
                 - Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
                 - Temperature: 0.1-0.3 for accurate code
-                **📖 Creative Writing**
                 - Input: Story prompts → Output: Creative narratives
                 - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
                 - Temperature: 0.8-1.2 for creativity
-                **📊 Example Dataset Names to Try:**
                 ```
                 simplescaling/s1K-1.1        # Simple Q&A pairs
                 HuggingFaceH4/ultrachat_200k # Conversations
@@ -638,7 +632,7 @@ def main():
                 """)
             with gr.Tabs():
-                with gr.TabItem("Generate Synthetic Data"):
                     with gr.Row():
                         with gr.Column():
                             with gr.Group():
@@ -692,7 +686,7 @@ def main():
                     output_status = gr.Textbox(label="Status", interactive=False)
                 with gr.TabItem("Statistics Dashboard"):
-                    gr.Markdown("## Generation Requests Statistics")
                     gr.Markdown("📊 View recent synthetic data generation requests and their status.")
                     with gr.Row():

     # check the output dataset is valid and accessible with the provided token
     try:
+        get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
         raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
     except Exception:
         pass  # dataset does not exist, which is expected
     return request
+def load_dataset_info(dataset_name, model_name, oauth_token=None):
     """Load dataset information and return choices for dropdowns"""
     if not dataset_name.strip():
         return (
     cache_all_model_params()
     print("Model parameter caching complete.")
+    with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
+        gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
         gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
         # Add sign-in button at the top
         with main_interface:
             with gr.Group():
                 with gr.Row():
+                    gr.Markdown("# DataForge - Synthetic Data Generation")
                 with gr.Row():
                     gr.Markdown("""
+                    **DataForge** - Scalable synthetic data generation framework built on DataTrove. Supports distributed Slurm processing with 20+ models.
+                    **Free for PRO users** (10K samples) • **100 samples** for free users • All datasets are **PUBLIC** under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe)
                     """)
+            with gr.Accordion("Complete Usage Guide", open=False):
                 with gr.Row():
                     gr.Markdown("""
+                    **Step-by-Step Process:**
+                    1. **Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
+                    2. **Load Info**: Click "Load Dataset Info" to populate configs, columns, and splits
+                    3. **Choose Model**: Select from 20+ popular instruction-tuned models
+                    4. **Configure**: Set generation parameters (temperature, tokens, etc.)
+                    5. **Submit**: Click submit and monitor progress in the Statistics tab
+                    **Pro Tips:**
                     - Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
                     - Start with fewer samples to test your prompt before scaling up
                     - Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
                     """)
                     gr.Markdown("""
+                    **Requirements & Limits:**
+                    - Input dataset must be **publicly accessible** on HF Hub
+                    - Model must be **publicly accessible** (not gated)
+                    - **Sample Limits:**
+                      - Free users: 100 samples max
+                      - PRO users: 10,000 samples max
+                    - **Token Limit:** 8,192 generated tokens per sample
+                    - **Processing Time:** Varies by model size and queue status
+                    **Privacy & Usage:**
                     - All outputs are **PUBLIC** on Hugging Face Hub
                     - Datasets appear under `synthetic-data-universe` organization
                     - Perfect for research, training data, and open-source projects
                     """)
+            with gr.Accordion("Examples & Use Cases", open=False):
                 gr.Markdown("""
+                **Popular Use Cases:**
+                **Educational Content Generation**
                 - Input: Questions dataset → Output: Detailed explanations and answers
                 - Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
                 - Temperature: 0.3-0.5 for factual accuracy
+                **Conversational Data**
                 - Input: Conversation starters → Output: Multi-turn dialogues
                 - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
                 - Temperature: 0.7-0.9 for natural variety
+                **Code Generation**
                 - Input: Problem descriptions → Output: Code solutions with explanations
                 - Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
                 - Temperature: 0.1-0.3 for accurate code
+                **Creative Writing**
                 - Input: Story prompts → Output: Creative narratives
                 - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
                 - Temperature: 0.8-1.2 for creativity
+                **Example Dataset Names to Try:**
                 ```
                 simplescaling/s1K-1.1        # Simple Q&A pairs
                 HuggingFaceH4/ultrachat_200k # Conversations
                 """)
             with gr.Tabs():
+                with gr.TabItem("Generate Data"):
                     with gr.Row():
                         with gr.Column():
                             with gr.Group():
                     output_status = gr.Textbox(label="Status", interactive=False)
                 with gr.TabItem("Statistics Dashboard"):
+                    gr.Markdown("## DataForge Generation Statistics")
                     gr.Markdown("📊 View recent synthetic data generation requests and their status.")
                     with gr.Row():