edbeeching committed on
Commit
c80506e
Β·
1 Parent(s): f00ab9d

polishing 2

Browse files
Files changed (1) hide show
  1. app.py +47 -69
app.py CHANGED
@@ -561,75 +561,53 @@ def main():
561
  with main_interface:
562
  with gr.Group():
563
  with gr.Row():
564
- gr.Markdown("# DataForge - Synthetic Data Generation")
565
  with gr.Row():
566
- gr.Markdown("""
567
- **DataForge** - Scalable synthetic data generation framework built on DataTrove. Supports distributed Slurm processing with 20+ models.
568
-
569
- **Free for PRO users** (10K samples) β€’ **100 samples** for free users β€’ All datasets are **PUBLIC** under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe)
570
- """)
571
- with gr.Accordion("Complete Usage Guide", open=False):
572
- with gr.Row():
573
- gr.Markdown("""
574
- **Step-by-Step Process:**
575
- 1. **Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
576
- 2. **Load Info**: Click "Load Dataset Info" to populate configs, columns, and splits
577
- 3. **Choose Model**: Select from 20+ popular instruction-tuned models
578
- 4. **Configure**: Set generation parameters (temperature, tokens, etc.)
579
- 5. **Submit**: Click submit and monitor progress in the Statistics tab
580
-
581
- **Pro Tips:**
582
- - Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
583
- - Start with fewer samples to test your prompt before scaling up
584
- - Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
585
- """)
586
- gr.Markdown("""
587
- **Requirements & Limits:**
588
- - Input dataset must be **publicly accessible** on HF Hub
589
- - Model must be **publicly accessible** (not gated)
590
- - **Sample Limits:**
591
- - Free users: 100 samples max
592
- - PRO users: 10,000 samples max
593
- - **Token Limit:** 8,192 generated tokens per sample
594
- - **Processing Time:** Varies by model size and queue status
595
-
596
- **Privacy & Usage:**
597
- - All outputs are **PUBLIC** on Hugging Face Hub
598
- - Datasets appear under `synthetic-data-universe` organization
599
- - Perfect for research, training data, and open-source projects
600
- """)
601
-
602
- with gr.Accordion("Examples & Use Cases", open=False):
603
- gr.Markdown("""
604
- **Popular Use Cases:**
605
-
606
- **Educational Content Generation**
607
- - Input: Questions dataset β†’ Output: Detailed explanations and answers
608
- - Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
609
- - Temperature: 0.3-0.5 for factual accuracy
610
-
611
- **Conversational Data**
612
- - Input: Conversation starters β†’ Output: Multi-turn dialogues
613
- - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
614
- - Temperature: 0.7-0.9 for natural variety
615
-
616
- **Code Generation**
617
- - Input: Problem descriptions β†’ Output: Code solutions with explanations
618
- - Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
619
- - Temperature: 0.1-0.3 for accurate code
620
-
621
- **Creative Writing**
622
- - Input: Story prompts β†’ Output: Creative narratives
623
- - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
624
- - Temperature: 0.8-1.2 for creativity
625
-
626
- **Example Dataset Names to Try:**
627
- ```
628
- simplescaling/s1K-1.1 # Simple Q&A pairs
629
- HuggingFaceH4/ultrachat_200k # Conversations
630
- iamtarun/python_code_instructions_18k_alpaca # Code tasks
631
- ```
632
- """)
633
 
634
  with gr.Tabs():
635
  with gr.TabItem("Generate Data"):
@@ -647,7 +625,7 @@ def main():
647
  )
648
  # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
649
  with gr.Row():
650
- system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.", info="Sets the AI's role/behavior. Leave empty for default model behavior.")
651
  gr.Markdown("### Generation Parameters")
652
  with gr.Row():
653
  with gr.Column():
 
561
  with main_interface:
562
  with gr.Group():
563
  with gr.Row():
564
+ gr.Markdown("# DataForge - Synthetic Data Generation")
565
  with gr.Row():
566
+ with gr.Column(scale=1):
567
+ gr.Markdown("""
568
+ **DataForge** - Scalable synthetic data generation framework built on DataTrove. Supports distributed Slurm processing with 20+ models.
569
+
570
+ **Free for PRO users** (10K samples) β€’ **100 samples** for free users β€’ All datasets are **PUBLIC** under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe)
571
+ """)
572
+ with gr.Column(scale=1):
573
+ with gr.Accordion("Usage Guide", open=False):
574
+ gr.Markdown("""
575
+ **Step-by-Step Process:**
576
+ 1. **Load Dataset**: Enter a HF dataset name
577
+ 2. **Load Info**: Click "Load Dataset Info"
578
+ 3. **Choose Model**: Select from 20+ models
579
+ 4. **Configure**: Set generation parameters
580
+ 5. **Submit**: Monitor progress in Statistics tab
581
+
582
+ **Requirements:**
583
+ - Input dataset must be public on HF Hub
584
+ - Model must be publicly accessible
585
+ - Free users: 100 samples max, PRO: 10K max
586
+ - Token limit: 8,192 per sample
587
+ """)
588
+ with gr.Accordion("Examples", open=False):
589
+ gr.Markdown("""
590
+ **Popular Use Cases:**
591
+
592
+ **Educational**: Q&A datasets
593
+ - Models: Qwen3-4B, Phi-3.5-mini
594
+ - Temperature: 0.3-0.5
595
+
596
+ **Conversational**: Multi-turn dialogues
597
+ - Models: Llama-3.2-3B, Mistral-7B
598
+ - Temperature: 0.7-0.9
599
+
600
+ **Code**: Problem β†’ Solution
601
+ - Models: Qwen2.5-Coder, DeepSeek-Coder
602
+ - Temperature: 0.1-0.3
603
+
604
+ **Example datasets to try:**
605
+ ```
606
+ simplescaling/s1K-1.1
607
+ HuggingFaceH4/ultrachat_200k
608
+ iamtarun/python_code_instructions_18k_alpaca
609
+ ```
610
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
 
612
  with gr.Tabs():
613
  with gr.TabItem("Generate Data"):
 
625
  )
626
  # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
627
  with gr.Row():
628
+ system_prompt = gr.Textbox(label="System Prompt (Optional)", placeholder="Optional system prompt... e.g., You are a helpful assistant.", info="Sets the AI's role/behavior. Leave empty for default model behavior.")
629
  gr.Markdown("### Generation Parameters")
630
  with gr.Row():
631
  with gr.Column():