edbeeching committed
Commit eb54763 · 1 Parent(s): 7580ee9
Files changed (1)
  app.py  +78 -55
app.py CHANGED
@@ -198,7 +198,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
     return request
 
 
-def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
+def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=None, ):
     """Load dataset information and return choices for dropdowns"""
     if not dataset_name.strip():
         return (
@@ -212,7 +212,7 @@ def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
 
     try:
         # Get dataset info
-        dataset_infos = get_dataset_infos(dataset_name, token=dataset_token)
+        dataset_infos = get_dataset_infos(dataset_name)
 
         if not dataset_infos:
             raise Exception("No configs found for this dataset")
@@ -254,9 +254,28 @@ def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
         # Set slider maximum to the minimum of dataset samples and user limit
         slider_max = min(dataset_sample_count, user_max_samples) if dataset_sample_count > 0 else user_max_samples
 
-        # Generate a suggested output dataset name
+        # Generate a suggested output dataset name with model name and timestamp
         dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
-        suggested_output_name = f"{dataset_base_name}-synthetic"
+
+        # Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "Qwen3-4B-Instruct-2507")
+        model_short_name = model_name.split('/')[-1]
+
+        # Create a compact timestamp (YYMMDD-HHMM format)
+        from datetime import datetime
+        timestamp = datetime.now().strftime("%y%m%d-%H%M")
+
+        # Build the output name: MODEL-dataset-timestamp
+        suggested_output_name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
+
+        # Limit to 86 characters
+        if len(suggested_output_name) > 86:
+            # Truncate dataset name to fit within limit
+            available_for_dataset = 86 - len(model_short_name) - len(timestamp) - 2  # -2 for the hyphens
+            if available_for_dataset > 0:
+                dataset_base_name = dataset_base_name[:available_for_dataset]
+                suggested_output_name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
+            else:
+                suggested_output_name = f"{model_short_name}-{timestamp}"
 
         status_msg = f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), {len(split_choices)} split(s), and {len(column_choices)} column(s)."
         if dataset_sample_count > 0:
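The naming logic introduced in the hunk above can be exercised on its own. Below is a minimal standalone sketch of the same behaviour; the helper name suggest_output_name and its max_len parameter are illustrative and do not exist in app.py:

from datetime import datetime


def suggest_output_name(dataset_name: str, model_name: str, max_len: int = 86) -> str:
    """Build MODEL-dataset-timestamp, truncating the dataset part to respect max_len."""
    dataset_base_name = dataset_name.split("/")[-1]
    model_short_name = model_name.split("/")[-1]
    timestamp = datetime.now().strftime("%y%m%d-%H%M")  # compact YYMMDD-HHMM stamp

    name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
    if len(name) > max_len:
        # Shorten the dataset part first; if even that does not fit, drop it entirely.
        available = max_len - len(model_short_name) - len(timestamp) - 2  # the two joining hyphens
        if available > 0:
            name = f"{model_short_name}-{dataset_base_name[:available]}-{timestamp}"
        else:
            name = f"{model_short_name}-{timestamp}"
    return name


# suggest_output_name("simplescaling/s1K-1.1", "Qwen/Qwen3-4B-Instruct-2507")
# -> "Qwen3-4B-Instruct-2507-s1K-1.1-<YYMMDD-HHMM>"

The timestamp is what distinguishes repeated runs of the same model/dataset pair, which the previous fixed "-synthetic" suffix could not do.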
@@ -411,60 +430,64 @@ def main():
 
         with gr.Tabs():
             with gr.TabItem("Generate Synthetic Data"):
-                with gr.Group():
-                    gr.Markdown("## Model information")
-                    with gr.Column():
-                        with gr.Row():
-                            model_name_or_path = gr.Dropdown(
-                                choices=SUPPORTED_MODELS,
-                                label="Select Model",
-                                value="Qwen/Qwen3-4B-Instruct-2507",
-                                info="Choose from popular instruction-tuned models under 40B parameters"
-                            )
-                            # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
-                with gr.Group():
-                    gr.Markdown("## Dataset information")
-                    # Dynamic user limit info - default to anonymous user
-                    user_limit_info = gr.Markdown(value="👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
-                    with gr.Row():
-                        with gr.Column():
-                            input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
-                            load_info_btn = gr.Button("📊 Load Dataset Info", size="sm", variant="secondary")
-                            load_info_status = gr.Markdown("", visible=True)
-
-                        with gr.Column():
-                            output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
-
-                    with gr.Row():
-                        with gr.Column():
-                            input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
-                            prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
-
-                        with gr.Column():
-                            input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
-                            num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
-
-                gr.Markdown("### Generation Parameters")
                 with gr.Row():
                     with gr.Column():
-                        with gr.Row():
-                            max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
-                            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
-                        with gr.Row():
-                            top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
-                            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
-                        with gr.Row():
-                            system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
+                        with gr.Group():
+                            gr.Markdown("## Model information")
+                            with gr.Column():
+                                with gr.Row():
+                                    model_name_or_path = gr.Dropdown(
+                                        choices=SUPPORTED_MODELS,
+                                        label="Select Model",
+                                        value="Qwen/Qwen3-4B-Instruct-2507",
+                                        info="Choose from popular instruction-tuned models under 40B parameters"
+                                    )
+                                    # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
+                            gr.Markdown("### Generation Parameters")
+                            with gr.Row():
+                                system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
+                            with gr.Row():
+                                with gr.Column():
+                                    with gr.Row():
+                                        max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
+                                        temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
+                                    with gr.Row():
+                                        top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
+                                        top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
 
-                with gr.Group():
-                    gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
-                    with gr.Row():
-                        with gr.Column():
+                    with gr.Column():
+                        with gr.Group():
+                            gr.Markdown("## Dataset information")
+                            # Dynamic user limit info - default to anonymous user
+                            user_limit_info = gr.Markdown(value="👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
                             with gr.Row():
-                                email = gr.Textbox(label="Email", placeholder="your.email@example.com")
-                                # with gr.Row():
-                                # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
-                                # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
+                                with gr.Column():
+                                    input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
+                                    load_info_btn = gr.Button("📊 Load Dataset Info", size="sm", variant="secondary")
+                                    load_info_status = gr.Markdown("", visible=True)
+
+                                with gr.Column():
+                                    output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
+
+                            with gr.Row():
+                                with gr.Column():
+                                    input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
+                                    prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
+
+                                with gr.Column():
+                                    input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
+                                    num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
+
+
+                            # with gr.Group():
+                            # gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
+                            # with gr.Row():
+                            # with gr.Column():
+                            # with gr.Row():
+                            # email = gr.Textbox(label="Email", placeholder="your.email@example.com")
+                            # # with gr.Row():
+                            # # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
+                            # # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
 
                 submit_btn = gr.Button("Submit Generation Request", variant="primary")
                 output_status = gr.Textbox(label="Status", interactive=False)
@@ -610,7 +633,7 @@ def main():
         # Wire up the Load Dataset Info button
         load_info_btn.click(
             load_dataset_info,
-            inputs=[input_dataset_name, gr.State(None), current_oauth_token],
+            inputs=[input_dataset_name, model_name_or_path, current_oauth_token],
             outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
         )
 
 
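One further note on the second hunk: dropping token=dataset_token means the lookup no longer receives an explicit per-dataset token, so private input datasets would have to rely on whatever credentials are already configured in the environment. For context, a rough sketch of how the values returned by datasets.get_dataset_infos can be turned into the config/split/column choices and the sample count used above; the helper name summarize_dataset and the first-config/"train"-split defaults are assumptions for illustration, not code from app.py:

from typing import Optional

from datasets import get_dataset_infos


def summarize_dataset(dataset_name: str, config: Optional[str] = None, split: str = "train"):
    """Illustrative only: derive dropdown choices from datasets.get_dataset_infos()."""
    infos = get_dataset_infos(dataset_name)  # maps config name -> DatasetInfo
    if not infos:
        raise Exception("No configs found for this dataset")

    config_choices = list(infos.keys())
    info = infos[config or config_choices[0]]

    split_choices = list(info.splits.keys()) if info.splits else []
    column_choices = list(info.features.keys()) if info.features else []
    sample_count = info.splits[split].num_examples if info.splits and split in info.splits else 0
    return config_choices, split_choices, column_choices, sample_count


# configs, splits, columns, n = summarize_dataset("simplescaling/s1K-1.1")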