Spaces:

synthetic-data-universe
/

synth

Sleeping

App Files Community

edbeeching committed on Sep 12

Commit

0fb1b95

1 Parent(s): 67c4080

vibing with claude

Browse files

Files changed (1) hide show

app.py +103 -66

app.py CHANGED Viewed

@@ -49,7 +49,8 @@ class GenerationStatus(Enum):
     FAILED = "FAILED"
-MAX_SAMPLES = 10000  # max number of samples in the input dataset
 MAX_TOKENS = 8192
 MAX_MODEL_PARAMS = 20_000_000_000  # 20 billion parameters (for now)
@@ -79,7 +80,7 @@ class GenerationRequest:
     private: bool = False
     num_retries: int = 0
-def validate_request(request: GenerationRequest) -> GenerationRequest:
     # checks that the request is valid
     # - input dataset exists and can be accessed with the provided token
     try:
@@ -101,8 +102,13 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
-    if request.num_output_examples > MAX_SAMPLES:
-        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {MAX_SAMPLES}.")
     # check the prompt column exists in the dataset
     if request.prompt_column not in input_dataset_info.features:
@@ -195,16 +201,16 @@ def add_request_to_db(request: GenerationRequest):
         raise Exception("Failed to add request to database")
 def main():
     with gr.Blocks(title="Synthetic Data Generation") as demo:
-        gr.HTML("<h3 style='text-align:center'>Hugging Face PRO users can use the Synthetic generation service. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Subscribe to PRO</a></h3>", elem_id="sub_title")
         pro_message = gr.Markdown(visible=False)
         main_interface = gr.Column(visible=False)
         with main_interface:
             with gr.Group():
                 with gr.Row():
@@ -214,7 +220,7 @@ def main():
                     Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
                     Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
                     """)
-            with gr.Group():
                 with gr.Row():
                     gr.Markdown("""
                     **How it works:**
@@ -232,54 +238,81 @@ def main():
                     - Maximum of 8192 generated tokens
                     """)
-            with gr.Group():
-                gr.Markdown("##  Dataset information")
-                with gr.Column():
-                    with gr.Row():
-                        input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
-                        input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
-                        input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
-                        prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
-                    with gr.Column():
-                        with gr.Row():
-                            output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
-                            num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES, step=1)
-            with gr.Group():
-                gr.Markdown("##  Model information")
-                with gr.Column():
-                    with gr.Row():
-                        model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
-                        model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
-                    # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
-            with gr.Group():
-                gr.Markdown("##  Generation Parameters")
-                with gr.Row():
-                    with gr.Column():
-                        with gr.Row():
-                            max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
-                            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
                         with gr.Row():
-                            top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
-                            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
                         with gr.Row():
-                            system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
-            with gr.Group():
-                gr.Markdown("##  User Information, for notification when your job is completed (still TODO)")
-                with gr.Row():
-                    with gr.Column():
                         with gr.Row():
-                            email = gr.Textbox(label="Email", placeholder="your.email@example.com")
-                        # with gr.Row():
-                            # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
-                            # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
-            submit_btn = gr.Button("Submit Generation Request", variant="primary")
-            output_status = gr.Textbox(label="Status", interactive=False)
             def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
-                                max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples):
                 MASTER_ORG = "synthetic-data-universe/"
                 model_token = False # This is currently not supported
@@ -312,7 +345,7 @@ def main():
                     )
                     # check the input dataset exists and can be accessed with the provided token
-                    request = validate_request(request)
                     add_request_to_db(request)
                     return "Request submitted successfully!"
@@ -322,25 +355,29 @@ def main():
             submit_btn.click(
                 submit_request,
                 inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
-                        model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples],
                 outputs=output_status
             )
-        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
-            if not profile: return gr.update(visible=False), gr.update(visible=False)
-            if verify_pro_status(oauth_token): return gr.update(visible=True), gr.update(visible=False)
             else:
-                message = (
-                    "## ✨ Exclusive Access for PRO Users\n\n"
-                    "Thank you for your interest! This app is available exclusively for our Hugging Face **PRO** members.\n\n"
-                    "To unlock this and many other cool stuff, please consider upgrading your account.\n\n"
-                    "### [**Become a PRO Today!**](http://huggingface.co/subscribe/pro?source=synthetic-data-universe)"
-                )
-                return gr.update(visible=False), gr.update(visible=True, value=message)
         login_button = gr.LoginButton() # this is required or AUTH will not work
-        demo.load(control_access, inputs=None, outputs=[main_interface, pro_message])
         demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
 if __name__ == "__main__":

     FAILED = "FAILED"
+MAX_SAMPLES_PRO = 10000  # max number of samples for PRO/Enterprise users
+MAX_SAMPLES_FREE = 100   # max number of samples for free users
 MAX_TOKENS = 8192
 MAX_MODEL_PARAMS = 20_000_000_000  # 20 billion parameters (for now)
     private: bool = False
     num_retries: int = 0
+def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
     # checks that the request is valid
     # - input dataset exists and can be accessed with the provided token
     try:
+    # Check user tier and apply appropriate limits
+    is_pro = verify_pro_status(oauth_token)
+    max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
+    if request.num_output_examples > max_samples:
+        user_tier = "PRO/Enterprise" if is_pro else "free"
+        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")
     # check the prompt column exists in the dataset
     if request.prompt_column not in input_dataset_info.features:
         raise Exception("Failed to add request to database")
 def main():
     with gr.Blocks(title="Synthetic Data Generation") as demo:
+        gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. PRO users get 10k samples, free users get 100 samples. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
         pro_message = gr.Markdown(visible=False)
         main_interface = gr.Column(visible=False)
+        # Store the current oauth token for use in submit_request
+        current_oauth_token = gr.State(None)
         with main_interface:
             with gr.Group():
                 with gr.Row():
                     Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
                     Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
                     """)
+            with gr.Accordion("How it works", open=False):
                 with gr.Row():
                     gr.Markdown("""
                     **How it works:**
                     - Maximum of 8192 generated tokens
                     """)
+            with gr.Tabs():
+                with gr.TabItem("Generate Synthetic Data"):
+                    with gr.Group():
+                        gr.Markdown("##  Model information")
+                        with gr.Column():
+                            with gr.Row():
+                                model_name_or_path = gr.Dropdown(
+                                    choices=[
+                                        "microsoft/Phi-3.5-mini-instruct",
+                                        "Qwen/Qwen2.5-7B-Instruct",
+                                        "meta-llama/Llama-3.2-8B-Instruct",
+                                        "mistralai/Mistral-7B-Instruct-v0.3",
+                                        "google/gemma-2-9b-it",
+                                        "microsoft/DialoGPT-medium",
+                                        "HuggingFaceH4/zephyr-7b-beta",
+                                        "teknium/OpenHermes-2.5-Mistral-7B",
+                                        "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+                                        "01-ai/Yi-34B-Chat"
+                                    ],
+                                    label="Select Model",
+                                    value="microsoft/Phi-3.5-mini-instruct",
+                                    info="Choose from popular instruction-tuned models under 40B parameters"
+                                )
+                            # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
+                    with gr.Group():
+                        gr.Markdown("##  Dataset information")
+                        # Dynamic user limit info
+                        user_limit_info = gr.Markdown(value="", visible=True)
                         with gr.Row():
+                            with gr.Column():
+                                input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
+                                prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
+                            with gr.Column():
+                                output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
+                                num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_PRO, step=1)
+                    with gr.Accordion("Advanced Options", open=False):
                         with gr.Row():
+                            input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
+                            input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
+                            model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
+                        with gr.Group():
+                            gr.Markdown("### Generation Parameters")
+                            with gr.Row():
+                                with gr.Column():
+                                    with gr.Row():
+                                        max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
+                                        temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
+                                    with gr.Row():
+                                        top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
+                                        top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
+                                    with gr.Row():
+                                        system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
+                    with gr.Group():
+                        gr.Markdown("##  User Information, for notification when your job is completed (still TODO)")
                         with gr.Row():
+                            with gr.Column():
+                                with gr.Row():
+                                    email = gr.Textbox(label="Email", placeholder="your.email@example.com")
+                                # with gr.Row():
+                                    # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
+                                    # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
+                    submit_btn = gr.Button("Submit Generation Request", variant="primary")
+                    output_status = gr.Textbox(label="Status", interactive=False)
+                with gr.TabItem("Coming Soon"):
+                    gr.Markdown("## New features coming soon!")
+                    gr.Markdown("This tab will contain additional functionality in future updates.")
             def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
+                                max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
                 MASTER_ORG = "synthetic-data-universe/"
                 model_token = False # This is currently not supported
                     )
                     # check the input dataset exists and can be accessed with the provided token
+                    request = validate_request(request, oauth_token)
                     add_request_to_db(request)
                     return "Request submitted successfully!"
             submit_btn.click(
                 submit_request,
                 inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
+                        model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples, current_oauth_token],
                 outputs=output_status
             )
+        def update_user_limits(oauth_token):
+            if oauth_token is None:
+                return ""
+            is_pro = verify_pro_status(oauth_token)
+            if is_pro:
+                return "✨ **PRO User**: You can generate up to 10,000 samples per request."
             else:
+                return "👤 **Free User**: You can generate up to 100 samples per request. [Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) for 10,000 samples."
+        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
+            if not profile: return gr.update(visible=False), gr.update(visible=False), None, ""
+            # Allow all users but show different messaging, and store the token
+            limit_msg = update_user_limits(oauth_token)
+            return gr.update(visible=True), gr.update(visible=False), oauth_token, limit_msg
         login_button = gr.LoginButton() # this is required or AUTH will not work
+        demo.load(control_access, inputs=None, outputs=[main_interface, pro_message, current_oauth_token, user_limit_info])
         demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
 if __name__ == "__main__":