Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
f00ab9d
1
Parent(s):
30e16b4
polishing
Browse files
app.py
CHANGED
|
@@ -276,7 +276,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
|
|
| 276 |
|
| 277 |
# check the output dataset is valid and accessible with the provided token
|
| 278 |
try:
|
| 279 |
-
|
| 280 |
raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
|
| 281 |
except Exception:
|
| 282 |
pass # dataset does not exist, which is expected
|
|
@@ -335,7 +335,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
|
|
| 335 |
return request
|
| 336 |
|
| 337 |
|
| 338 |
-
def load_dataset_info(dataset_name, model_name, oauth_token=None
|
| 339 |
"""Load dataset information and return choices for dropdowns"""
|
| 340 |
if not dataset_name.strip():
|
| 341 |
return (
|
|
@@ -542,8 +542,8 @@ def main():
|
|
| 542 |
cache_all_model_params()
|
| 543 |
print("Model parameter caching complete.")
|
| 544 |
|
| 545 |
-
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 546 |
-
gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=
|
| 547 |
gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
|
| 548 |
|
| 549 |
# Add sign-in button at the top
|
|
@@ -561,75 +561,69 @@ def main():
|
|
| 561 |
with main_interface:
|
| 562 |
with gr.Group():
|
| 563 |
with gr.Row():
|
| 564 |
-
gr.Markdown("# Synthetic Data Generation
|
| 565 |
with gr.Row():
|
| 566 |
gr.Markdown("""
|
| 567 |
-
|
| 568 |
|
| 569 |
-
|
| 570 |
-
- π **Free for PRO users** - Uses idle GPUs on the HF science cluster
|
| 571 |
-
- π€ **20+ Popular Models** - Including Qwen, Llama, Mistral, and more
|
| 572 |
-
- ⚡ **Fast Processing** - Optimized for batch generation
|
| 573 |
-
- π **Up to 10K samples** - For PRO users (100 for free users)
|
| 574 |
-
|
| 575 |
-
⚠️ **Important:** All generated datasets are **PUBLIC** and available under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).
|
| 576 |
""")
|
| 577 |
-
with gr.Accordion("
|
| 578 |
with gr.Row():
|
| 579 |
gr.Markdown("""
|
| 580 |
-
|
| 581 |
-
1.
|
| 582 |
-
2.
|
| 583 |
-
3.
|
| 584 |
-
4.
|
| 585 |
-
5.
|
| 586 |
-
|
| 587 |
-
|
| 588 |
- Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
|
| 589 |
- Start with fewer samples to test your prompt before scaling up
|
| 590 |
- Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
|
| 591 |
""")
|
| 592 |
gr.Markdown("""
|
| 593 |
-
|
| 594 |
-
-
|
| 595 |
-
-
|
| 596 |
-
-
|
| 597 |
-
-
|
| 598 |
-
-
|
| 599 |
-
-
|
| 600 |
-
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
- All outputs are **PUBLIC** on Hugging Face Hub
|
| 604 |
- Datasets appear under `synthetic-data-universe` organization
|
| 605 |
- Perfect for research, training data, and open-source projects
|
| 606 |
""")
|
| 607 |
|
| 608 |
-
with gr.Accordion("
|
| 609 |
gr.Markdown("""
|
| 610 |
-
|
| 611 |
|
| 612 |
-
|
| 613 |
- Input: Questions dataset → Output: Detailed explanations and answers
|
| 614 |
- Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
|
| 615 |
- Temperature: 0.3-0.5 for factual accuracy
|
| 616 |
|
| 617 |
-
|
| 618 |
- Input: Conversation starters → Output: Multi-turn dialogues
|
| 619 |
- Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
|
| 620 |
- Temperature: 0.7-0.9 for natural variety
|
| 621 |
|
| 622 |
-
|
| 623 |
- Input: Problem descriptions → Output: Code solutions with explanations
|
| 624 |
- Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
|
| 625 |
- Temperature: 0.1-0.3 for accurate code
|
| 626 |
|
| 627 |
-
|
| 628 |
- Input: Story prompts → Output: Creative narratives
|
| 629 |
- Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
|
| 630 |
- Temperature: 0.8-1.2 for creativity
|
| 631 |
|
| 632 |
-
|
| 633 |
```
|
| 634 |
simplescaling/s1K-1.1 # Simple Q&A pairs
|
| 635 |
HuggingFaceH4/ultrachat_200k # Conversations
|
|
@@ -638,7 +632,7 @@ def main():
|
|
| 638 |
""")
|
| 639 |
|
| 640 |
with gr.Tabs():
|
| 641 |
-
with gr.TabItem("Generate
|
| 642 |
with gr.Row():
|
| 643 |
with gr.Column():
|
| 644 |
with gr.Group():
|
|
@@ -692,7 +686,7 @@ def main():
|
|
| 692 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 693 |
|
| 694 |
with gr.TabItem("Statistics Dashboard"):
|
| 695 |
-
gr.Markdown("## Generation
|
| 696 |
gr.Markdown("π View recent synthetic data generation requests and their status.")
|
| 697 |
|
| 698 |
with gr.Row():
|
|
|
|
| 276 |
|
| 277 |
# check the output dataset is valid and accessible with the provided token
|
| 278 |
try:
|
| 279 |
+
get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
|
| 280 |
raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
|
| 281 |
except Exception:
|
| 282 |
pass # dataset does not exist, which is expected
|
|
|
|
| 335 |
return request
|
| 336 |
|
| 337 |
|
| 338 |
+
def load_dataset_info(dataset_name, model_name, oauth_token=None):
|
| 339 |
"""Load dataset information and return choices for dropdowns"""
|
| 340 |
if not dataset_name.strip():
|
| 341 |
return (
|
|
|
|
| 542 |
cache_all_model_params()
|
| 543 |
print("Model parameter caching complete.")
|
| 544 |
|
| 545 |
+
with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
|
| 546 |
+
gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
|
| 547 |
gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
|
| 548 |
|
| 549 |
# Add sign-in button at the top
|
|
|
|
| 561 |
with main_interface:
|
| 562 |
with gr.Group():
|
| 563 |
with gr.Row():
|
| 564 |
+
gr.Markdown("# DataForge - Synthetic Data Generation")
|
| 565 |
with gr.Row():
|
| 566 |
gr.Markdown("""
|
| 567 |
+
**DataForge** - Scalable synthetic data generation framework built on DataTrove. Supports distributed Slurm processing with 20+ models.
|
| 568 |
|
| 569 |
+
**Free for PRO users** (10K samples) • **100 samples** for free users • All datasets are **PUBLIC** under [synthetic-data-universe](https://huggingface.co/synthetic-data-universe)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 570 |
""")
|
| 571 |
+
with gr.Accordion("Complete Usage Guide", open=False):
|
| 572 |
with gr.Row():
|
| 573 |
gr.Markdown("""
|
| 574 |
+
**Step-by-Step Process:**
|
| 575 |
+
1. **Load Dataset**: Enter a Hugging Face dataset name (e.g., `simplescaling/s1K-1.1`)
|
| 576 |
+
2. **Load Info**: Click "Load Dataset Info" to populate configs, columns, and splits
|
| 577 |
+
3. **Choose Model**: Select from 20+ popular instruction-tuned models
|
| 578 |
+
4. **Configure**: Set generation parameters (temperature, tokens, etc.)
|
| 579 |
+
5. **Submit**: Click submit and monitor progress in the Statistics tab
|
| 580 |
+
|
| 581 |
+
**Pro Tips:**
|
| 582 |
- Use temperature 0.7-1.0 for creative tasks, 0.1-0.3 for factual content
|
| 583 |
- Start with fewer samples to test your prompt before scaling up
|
| 584 |
- Check existing datasets in [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) for inspiration
|
| 585 |
""")
|
| 586 |
gr.Markdown("""
|
| 587 |
+
**Requirements & Limits:**
|
| 588 |
+
- Input dataset must be **publicly accessible** on HF Hub
|
| 589 |
+
- Model must be **publicly accessible** (not gated)
|
| 590 |
+
- **Sample Limits:**
|
| 591 |
+
- Free users: 100 samples max
|
| 592 |
+
- PRO users: 10,000 samples max
|
| 593 |
+
- **Token Limit:** 8,192 generated tokens per sample
|
| 594 |
+
- **Processing Time:** Varies by model size and queue status
|
| 595 |
+
|
| 596 |
+
**Privacy & Usage:**
|
| 597 |
- All outputs are **PUBLIC** on Hugging Face Hub
|
| 598 |
- Datasets appear under `synthetic-data-universe` organization
|
| 599 |
- Perfect for research, training data, and open-source projects
|
| 600 |
""")
|
| 601 |
|
| 602 |
+
with gr.Accordion("Examples & Use Cases", open=False):
|
| 603 |
gr.Markdown("""
|
| 604 |
+
**Popular Use Cases:**
|
| 605 |
|
| 606 |
+
**Educational Content Generation**
|
| 607 |
- Input: Questions dataset → Output: Detailed explanations and answers
|
| 608 |
- Models: `Qwen/Qwen3-4B-Instruct-2507` or `microsoft/Phi-3.5-mini-instruct`
|
| 609 |
- Temperature: 0.3-0.5 for factual accuracy
|
| 610 |
|
| 611 |
+
**Conversational Data**
|
| 612 |
- Input: Conversation starters → Output: Multi-turn dialogues
|
| 613 |
- Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
|
| 614 |
- Temperature: 0.7-0.9 for natural variety
|
| 615 |
|
| 616 |
+
**Code Generation**
|
| 617 |
- Input: Problem descriptions → Output: Code solutions with explanations
|
| 618 |
- Models: `Qwen/Qwen2.5-Coder-3B-Instruct` or `deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct`
|
| 619 |
- Temperature: 0.1-0.3 for accurate code
|
| 620 |
|
| 621 |
+
**Creative Writing**
|
| 622 |
- Input: Story prompts → Output: Creative narratives
|
| 623 |
- Models: `meta-llama/Llama-3.2-3B-Instruct` or `mistralai/Mistral-7B-Instruct-v0.3`
|
| 624 |
- Temperature: 0.8-1.2 for creativity
|
| 625 |
|
| 626 |
+
**Example Dataset Names to Try:**
|
| 627 |
```
|
| 628 |
simplescaling/s1K-1.1 # Simple Q&A pairs
|
| 629 |
HuggingFaceH4/ultrachat_200k # Conversations
|
|
|
|
| 632 |
""")
|
| 633 |
|
| 634 |
with gr.Tabs():
|
| 635 |
+
with gr.TabItem("Generate Data"):
|
| 636 |
with gr.Row():
|
| 637 |
with gr.Column():
|
| 638 |
with gr.Group():
|
|
|
|
| 686 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 687 |
|
| 688 |
with gr.TabItem("Statistics Dashboard"):
|
| 689 |
+
gr.Markdown("## DataForge Generation Statistics")
|
| 690 |
gr.Markdown("π View recent synthetic data generation requests and their status.")
|
| 691 |
|
| 692 |
with gr.Row():
|