Spaces:

synthetic-data-universe
/

synth

Sleeping

App Files Community

edbeeching committed on Sep 16

Commit

3e3c42b

1 Parent(s): c80506e

polish 3

Browse files

Files changed (1) hide show

app.py +18 -58

app.py CHANGED Viewed

@@ -544,71 +544,30 @@ def main():
     with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
         gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
-        gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
-        # Add sign-in button at the top
         with gr.Row():
             gr.Markdown("")  # Empty space for alignment
             login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
             gr.Markdown("")  # Empty space for alignment
         signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
         main_interface = gr.Column(visible=False)
-        # Store the current oauth token for use in submit_request
-        current_oauth_token = gr.State(None)
-        with main_interface:
-            with gr.Group():
-                with gr.Row():
-                    gr.Markdown("# DataForge - Synthetic Data Generation")
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.Markdown("""
-                        **DataForge** - Scalable synthetic data generation framework built on DataTrove. Supports distributed Slurm processing with 20+ models.
-                        **Free for PRO users** (10K samples) • **100 samples** for free users • All datasets are **PUBLIC** under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe)
-                        """)
-                    with gr.Column(scale=1):
-                        with gr.Accordion("Usage Guide", open=False):
-                            gr.Markdown("""
-                            **Step-by-Step Process:**
-                            1. **Load Dataset**: Enter a HF dataset name
-                            2. **Load Info**: Click "Load Dataset Info"
-                            3. **Choose Model**: Select from 20+ models
-                            4. **Configure**: Set generation parameters
-                            5. **Submit**: Monitor progress in Statistics tab
-                            **Requirements:**
-                            - Input dataset must be public on HF Hub
-                            - Model must be publicly accessible
-                            - Free users: 100 samples max, PRO: 10K max
-                            - Token limit: 8,192 per sample
-                            """)
-                        with gr.Accordion("Examples", open=False):
-                            gr.Markdown("""
-                            **Popular Use Cases:**
-                            **Educational**: Q&A datasets
-                            - Models: Qwen3-4B, Phi-3.5-mini
-                            - Temperature: 0.3-0.5
-                            **Conversational**: Multi-turn dialogues
-                            - Models: Llama-3.2-3B, Mistral-7B
-                            - Temperature: 0.7-0.9
-                            **Code**: Problem → Solution
-                            - Models: Qwen2.5-Coder, DeepSeek-Coder
-                            - Temperature: 0.1-0.3
-                            **Example datasets to try:**
-                            ```
-                            simplescaling/s1K-1.1
-                            HuggingFaceH4/ultrachat_200k
-                            iamtarun/python_code_instructions_18k_alpaca
-                            ```
-                            """)
             with gr.Tabs():
                 with gr.TabItem("Generate Data"):
                     with gr.Row():
@@ -658,8 +617,9 @@ def main():
                                     with gr.Column():
                                         input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                         num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
                     submit_btn = gr.Button("Submit Generation Request", variant="primary")
                     output_status = gr.Textbox(label="Status", interactive=False)

     with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
         gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
+        # Store the current oauth token for use in submit_request
+        current_oauth_token = gr.State(None)
+        # Title
+        gr.Markdown("# DataForge")
+        # Main description
+        gr.Markdown("""
+        This tool allows you to **generate synthetic data from existing datasets**: you get expanded training data from your prompts, super useful for all your **fine-tuning/research/data augmentation** needs!
+        """)
+        # PRO sentence
+        gr.Markdown("**🎯 FREE for HuggingFace PRO users (10,000 samples) • 100 samples for free users**")
+        # Sign in button
         with gr.Row():
             gr.Markdown("")  # Empty space for alignment
             login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
             gr.Markdown("")  # Empty space for alignment
         signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
         main_interface = gr.Column(visible=False)
+        with main_interface:
             with gr.Tabs():
                 with gr.TabItem("Generate Data"):
                     with gr.Row():
                                     with gr.Column():
                                         input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                         num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
+                    gr.Markdown("**All generated datasets will be publicly available under the [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) organization.**")
                     submit_btn = gr.Button("Submit Generation Request", variant="primary")
                     output_status = gr.Textbox(label="Status", interactive=False)