edbeeching committed on
Commit
f00ab9d
·
1 Parent(s): 30e16b4
Files changed (1) hide show
  1. app.py +35 -41
app.py CHANGED
@@ -276,7 +276,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
276
 
277
  # check the output dataset is valid and accessible with the provided token
278
  try:
279
- output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
280
  raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
281
  except Exception:
282
  pass # dataset does not exist, which is expected
@@ -335,7 +335,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
335
  return request
336
 
337
 
338
- def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=None):
339
  """Load dataset information and return choices for dropdowns"""
340
  if not dataset_name.strip():
341
  return (
@@ -542,8 +542,8 @@ def main():
542
  cache_all_model_params()
543
  print("Model parameter caching complete.")
544
 
545
- with gr.Blocks(title="Synthetic Data Generation") as demo:
546
- gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=200)
547
  gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
548
 
549
  # Add sign-in button at the top
@@ -561,75 +561,69 @@ def main():
561
  with main_interface:
562
  with gr.Group():
563
  with gr.Row():
564
- gr.Markdown("# Synthetic Data Generation Request")
565
  with gr.Row():
566
  gr.Markdown("""
567
- 🚀 **Generate high-quality synthetic data using state-of-the-art language models!** Perfect for training datasets, data augmentation, and research experiments.
568
 
569
- ✨ **Features:**
570
- - 🆓 **Free for PRO users** - Uses idle GPUs on the HF science cluster
571
- - 🤖 **20+ Popular Models** - Including Qwen, Llama, Mistral, and more
572
- - ⚡ **Fast Processing** - Optimized for batch generation
573
- - 📊 **Up to 10K samples** - For PRO users (100 for free users)
574
-
575
- ⚠️ **Important:** All generated datasets are **PUBLIC** and available under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).
576
  """)
577
- with gr.Accordion("📖 Complete Usage Guide", open=False):
578
  with gr.Row():
579
  gr.Markdown("""
580
- **🔄 Step-by-Step Process:**
581
- 1. **📂 Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
582
- 2. **📊 Load Info**: Click "📊 Load Dataset Info" to populate configs, columns, and splits
583
- 3. **🤖 Choose Model**: Select from 20+ popular instruction-tuned models
584
- 4. **βš™οΈ Configure**: Set generation parameters (temperature, tokens, etc.)
585
- 5. **🚀 Submit**: Click submit and monitor progress in the Statistics tab
586
-
587
- **💡 Pro Tips:**
588
  - Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
589
  - Start with fewer samples to test your prompt before scaling up
590
  - Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
591
  """)
592
  gr.Markdown("""
593
- **📋 Requirements & Limits:**
594
- - ✅ Input dataset must be **publicly accessible** on HF Hub
595
- - ✅ Model must be **publicly accessible** (not gated)
596
- - 📊 **Sample Limits:**
597
- - 🆓 Free users: 100 samples max
598
- - ⭐ PRO users: 10,000 samples max
599
- - 🔤 **Token Limit:** 8,192 generated tokens per sample
600
- - ⏱️ **Processing Time:** Varies by model size and queue status
601
-
602
- **🔒 Privacy & Usage:**
603
  - All outputs are **PUBLIC** on Hugging Face Hub
604
  - Datasets appear under `synthetic-data-universe` organization
605
  - Perfect for research, training data, and open-source projects
606
  """)
607
 
608
- with gr.Accordion("💡 Examples & Use Cases", open=False):
609
  gr.Markdown("""
610
- **🎯 Popular Use Cases:**
611
 
612
- **📚 Educational Content Generation**
613
  - Input: Questions dataset → Output: Detailed explanations and answers
614
  - Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
615
  - Temperature: 0.3-0.5 for factual accuracy
616
 
617
- **💬 Conversational Data**
618
  - Input: Conversation starters → Output: Multi-turn dialogues
619
  - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
620
  - Temperature: 0.7-0.9 for natural variety
621
 
622
- **🔧 Code Generation**
623
  - Input: Problem descriptions → Output: Code solutions with explanations
624
  - Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
625
  - Temperature: 0.1-0.3 for accurate code
626
 
627
- **📖 Creative Writing**
628
  - Input: Story prompts → Output: Creative narratives
629
  - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
630
  - Temperature: 0.8-1.2 for creativity
631
 
632
- **📊 Example Dataset Names to Try:**
633
  ```
634
  simplescaling/s1K-1.1 # Simple Q&A pairs
635
  HuggingFaceH4/ultrachat_200k # Conversations
@@ -638,7 +632,7 @@ def main():
638
  """)
639
 
640
  with gr.Tabs():
641
- with gr.TabItem("Generate Synthetic Data"):
642
  with gr.Row():
643
  with gr.Column():
644
  with gr.Group():
@@ -692,7 +686,7 @@ def main():
692
  output_status = gr.Textbox(label="Status", interactive=False)
693
 
694
  with gr.TabItem("Statistics Dashboard"):
695
- gr.Markdown("## Generation Requests Statistics")
696
  gr.Markdown("📊 View recent synthetic data generation requests and their status.")
697
 
698
  with gr.Row():
 
276
 
277
  # check the output dataset is valid and accessible with the provided token
278
  try:
279
+ get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
280
  raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
281
  except Exception:
282
  pass # dataset does not exist, which is expected
 
335
  return request
336
 
337
 
338
+ def load_dataset_info(dataset_name, model_name, oauth_token=None):
339
  """Load dataset information and return choices for dropdowns"""
340
  if not dataset_name.strip():
341
  return (
 
542
  cache_all_model_params()
543
  print("Model parameter caching complete.")
544
 
545
+ with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
546
+ gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
547
  gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
548
 
549
  # Add sign-in button at the top
 
561
  with main_interface:
562
  with gr.Group():
563
  with gr.Row():
564
+ gr.Markdown("# DataForge - Synthetic Data Generation")
565
  with gr.Row():
566
  gr.Markdown("""
567
+ **DataForge** - Scalable synthetic data generation framework built on DataTrove. Supports distributed Slurm processing with 20+ models.
568
 
569
+ **Free for PRO users** (10K samples) • **100 samples** for free users • All datasets are **PUBLIC** under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe)
 
 
 
 
 
 
570
  """)
571
+ with gr.Accordion("Complete Usage Guide", open=False):
572
  with gr.Row():
573
  gr.Markdown("""
574
+ **Step-by-Step Process:**
575
+ 1. **Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
576
+ 2. **Load Info**: Click "Load Dataset Info" to populate configs, columns, and splits
577
+ 3. **Choose Model**: Select from 20+ popular instruction-tuned models
578
+ 4. **Configure**: Set generation parameters (temperature, tokens, etc.)
579
+ 5. **Submit**: Click submit and monitor progress in the Statistics tab
580
+
581
+ **Pro Tips:**
582
  - Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
583
  - Start with fewer samples to test your prompt before scaling up
584
  - Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
585
  """)
586
  gr.Markdown("""
587
+ **Requirements & Limits:**
588
+ - Input dataset must be **publicly accessible** on HF Hub
589
+ - Model must be **publicly accessible** (not gated)
590
+ - **Sample Limits:**
591
+ - Free users: 100 samples max
592
+ - PRO users: 10,000 samples max
593
+ - **Token Limit:** 8,192 generated tokens per sample
594
+ - **Processing Time:** Varies by model size and queue status
595
+
596
+ **Privacy & Usage:**
597
  - All outputs are **PUBLIC** on Hugging Face Hub
598
  - Datasets appear under `synthetic-data-universe` organization
599
  - Perfect for research, training data, and open-source projects
600
  """)
601
 
602
+ with gr.Accordion("Examples & Use Cases", open=False):
603
  gr.Markdown("""
604
+ **Popular Use Cases:**
605
 
606
+ **Educational Content Generation**
607
  - Input: Questions dataset → Output: Detailed explanations and answers
608
  - Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
609
  - Temperature: 0.3-0.5 for factual accuracy
610
 
611
+ **Conversational Data**
612
  - Input: Conversation starters → Output: Multi-turn dialogues
613
  - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
614
  - Temperature: 0.7-0.9 for natural variety
615
 
616
+ **Code Generation**
617
  - Input: Problem descriptions → Output: Code solutions with explanations
618
  - Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
619
  - Temperature: 0.1-0.3 for accurate code
620
 
621
+ **Creative Writing**
622
  - Input: Story prompts → Output: Creative narratives
623
  - Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
624
  - Temperature: 0.8-1.2 for creativity
625
 
626
+ **Example Dataset Names to Try:**
627
  ```
628
  simplescaling/s1K-1.1 # Simple Q&A pairs
629
  HuggingFaceH4/ultrachat_200k # Conversations
 
632
  """)
633
 
634
  with gr.Tabs():
635
+ with gr.TabItem("Generate Data"):
636
  with gr.Row():
637
  with gr.Column():
638
  with gr.Group():
 
686
  output_status = gr.Textbox(label="Status", interactive=False)
687
 
688
  with gr.TabItem("Statistics Dashboard"):
689
+ gr.Markdown("## DataForge Generation Statistics")
690
  gr.Markdown("📊 View recent synthetic data generation requests and their status.")
691
 
692
  with gr.Row():