edbeeching committed on
Commit
3e3c42b
·
1 Parent(s): c80506e
Files changed (1) hide show
  1. app.py +18 -58
app.py CHANGED
@@ -544,71 +544,30 @@ def main():
544
 
545
  with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
546
  gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
547
- gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
 
548
 
549
- # Add sign-in button at the top
 
 
 
 
 
 
 
 
 
 
 
550
  with gr.Row():
551
  gr.Markdown("") # Empty space for alignment
552
  login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
553
  gr.Markdown("") # Empty space for alignment
554
-
555
  signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
556
  main_interface = gr.Column(visible=False)
557
-
558
- # Store the current oauth token for use in submit_request
559
- current_oauth_token = gr.State(None)
560
-
561
- with main_interface:
562
- with gr.Group():
563
- with gr.Row():
564
- gr.Markdown("# DataForge - Synthetic Data Generation")
565
- with gr.Row():
566
- with gr.Column(scale=1):
567
- gr.Markdown("""
568
- **DataForge** - Scalable synthetic data generation framework built on DataTrove. Supports distributed Slurm processing with 20+ models.
569
-
570
- **Free for PRO users** (10K samples) • **100 samples** for free users • All datasets are **PUBLIC** under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe)
571
- """)
572
- with gr.Column(scale=1):
573
- with gr.Accordion("Usage Guide", open=False):
574
- gr.Markdown("""
575
- **Step-by-Step Process:**
576
- 1. **Load Dataset**: Enter a HF dataset name
577
- 2. **Load Info**: Click "Load Dataset Info"
578
- 3. **Choose Model**: Select from 20+ models
579
- 4. **Configure**: Set generation parameters
580
- 5. **Submit**: Monitor progress in Statistics tab
581
-
582
- **Requirements:**
583
- - Input dataset must be public on HF Hub
584
- - Model must be publicly accessible
585
- - Free users: 100 samples max, PRO: 10K max
586
- - Token limit: 8,192 per sample
587
- """)
588
- with gr.Accordion("Examples", open=False):
589
- gr.Markdown("""
590
- **Popular Use Cases:**
591
-
592
- **Educational**: Q&A datasets
593
- - Models: Qwen3-4B, Phi-3.5-mini
594
- - Temperature: 0.3-0.5
595
-
596
- **Conversational**: Multi-turn dialogues
597
- - Models: Llama-3.2-3B, Mistral-7B
598
- - Temperature: 0.7-0.9
599
-
600
- **Code**: Problem → Solution
601
- - Models: Qwen2.5-Coder, DeepSeek-Coder
602
- - Temperature: 0.1-0.3
603
-
604
- **Example datasets to try:**
605
- ```
606
- simplescaling/s1K-1.1
607
- HuggingFaceH4/ultrachat_200k
608
- iamtarun/python_code_instructions_18k_alpaca
609
- ```
610
- """)
611
 
 
612
  with gr.Tabs():
613
  with gr.TabItem("Generate Data"):
614
  with gr.Row():
@@ -658,8 +617,9 @@ def main():
658
  with gr.Column():
659
  input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
660
  num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
661
-
662
 
 
 
663
  submit_btn = gr.Button("Submit Generation Request", variant="primary")
664
  output_status = gr.Textbox(label="Status", interactive=False)
665
 
 
544
 
545
  with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
546
  gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
547
+ # Store the current oauth token for use in submit_request
548
+ current_oauth_token = gr.State(None)
549
 
550
+ # Title
551
+ gr.Markdown("# DataForge")
552
+
553
+ # Main description
554
+ gr.Markdown("""
555
+ This tool allows you to **generate synthetic data from existing datasets**: you get expanded training data from your prompts, super useful for all your **fine-tuning/research/data augmentation** needs!
556
+ """)
557
+
558
+ # PRO sentence
559
+ gr.Markdown("**🎯 FREE for HuggingFace PRO users (10,000 samples) • 100 samples for free users**")
560
+
561
+ # Sign in button
562
  with gr.Row():
563
  gr.Markdown("") # Empty space for alignment
564
  login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
565
  gr.Markdown("") # Empty space for alignment
566
+
567
  signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
568
  main_interface = gr.Column(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
 
570
+ with main_interface:
571
  with gr.Tabs():
572
  with gr.TabItem("Generate Data"):
573
  with gr.Row():
 
617
  with gr.Column():
618
  input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
619
  num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
 
620
 
621
+
622
+ gr.Markdown("**All generated datasets will be publicly available under the [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) organization.**")
623
  submit_btn = gr.Button("Submit Generation Request", variant="primary")
624
  output_status = gr.Textbox(label="Status", interactive=False)
625