Spaces:

synthetic-data-universe
/

synth

Sleeping

App Files Community

edbeeching committed on Sep 16

Commit

684d1a6

1 Parent(s): 3e3c42b

fixing descriptions

Browse files

Files changed (1) hide show

app.py +52 -12

app.py CHANGED Viewed

@@ -546,25 +546,65 @@ def main():
         gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
         # Store the current oauth token for use in submit_request
         current_oauth_token = gr.State(None)
-        # Title
-        gr.Markdown("# DataForge")
         # Main description
         gr.Markdown("""
-        This tool allows you to **generate synthetic data from existing datasets**: you get expanded training data from your prompts, super useful for all your **fine-tuning/research/data augmentation** needs!
-        """)
-        # PRO sentence
-        gr.Markdown("**🎯 FREE for HuggingFace PRO users (10,000 samples) • 100 samples for free users**")
-        # Sign in button
         with gr.Row():
-            gr.Markdown("")  # Empty space for alignment
-            login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
-            gr.Markdown("")  # Empty space for alignment
-        signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
         main_interface = gr.Column(visible=False)
         with main_interface:

         gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
         # Store the current oauth token for use in submit_request
         current_oauth_token = gr.State(None)
+        with gr.Row():
+            gr.Markdown("")  # Empty space for alignment
+            login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
+            gr.Markdown("")  # Empty space for alignment
+        signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
         # Main description
         gr.Markdown("""
+        This tool allows you to **generate synthetic data from existing datasets**, for all your **fine-tuning/research/data augmentation** needs!
+        DataForge is built on top of [DataTrove](https://github.com/huggingface/datatrove), our backend data generation script is open-source and available on [GitHub](https://github.com/huggingface/dataforge). DataForge is **FREE** for HuggingFace PRO users (10,000 samples) • 100 samples for free users.
+        """)
+        # Usage guide and examples (right below description)
         with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Accordion("Usage Guide", open=False):
+                    gr.Markdown("""
+                    **Step-by-Step Process:**
+                    1. **Load Dataset**: Enter a HF dataset name
+                    2. **Load Info**: Click "Load Dataset Info"
+                    3. **Choose Model**: Select from 20+ models
+                    4. **Configure**: Set generation parameters
+                    5. **Submit**: Monitor progress in Statistics tab
+                    **Requirements:**
+                    - Input dataset must be public on HF Hub
+                    - Model must be publicly accessible
+                    - Free users: 100 samples max, PRO: 10K max
+                    - Token limit: 8,192 per sample
+                    """)
+            with gr.Column(scale=1):
+                with gr.Accordion("Examples", open=False):
+                    gr.Markdown("""
+                    **Popular Use Cases:**
+                    **Educational**: Q&A datasets
+                    - Models: Qwen3-4B, Phi-3.5-mini
+                    - Temperature: 0.3-0.5
+                    **Conversational**: Multi-turn dialogues
+                    - Models: Llama-3.2-3B, Mistral-7B
+                    - Temperature: 0.7-0.9
+                    **Code**: Problem → Solution
+                    - Models: Qwen2.5-Coder, DeepSeek-Coder
+                    - Temperature: 0.1-0.3
+                    **Example datasets to try:**
+                    ```
+                    simplescaling/s1K-1.1
+                    HuggingFaceH4/ultrachat_200k
+                    iamtarun/python_code_instructions_18k_alpaca
+                    ```
+                    """)
+        # Main interface container (created hidden)
         main_interface = gr.Column(visible=False)
         with main_interface: