edbeeching committed
Commit eb54763 · 1 Parent(s): 7580ee9
Files changed (1)
  app.py  +78 -55
app.py CHANGED
@@ -198,7 +198,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
     return request
 
 
-def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
+def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=None, ):
     """Load dataset information and return choices for dropdowns"""
     if not dataset_name.strip():
         return (
@@ -212,7 +212,7 @@ def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
 
     try:
         # Get dataset info
-        dataset_infos = get_dataset_infos(dataset_name, token=dataset_token)
+        dataset_infos = get_dataset_infos(dataset_name)
 
         if not dataset_infos:
             raise Exception("No configs found for this dataset")
@@ -254,9 +254,28 @@ def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
         # Set slider maximum to the minimum of dataset samples and user limit
         slider_max = min(dataset_sample_count, user_max_samples) if dataset_sample_count > 0 else user_max_samples
 
-        # Generate a suggested output dataset name
+        # Generate a suggested output dataset name with model name and timestamp
         dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
-        suggested_output_name = f"{dataset_base_name}-synthetic"
+
+        # Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "Qwen3-4B-Instruct-2507")
+        model_short_name = model_name.split('/')[-1]
+
+        # Create a compact timestamp (YYMMDD-HHMM format)
+        from datetime import datetime
+        timestamp = datetime.now().strftime("%y%m%d-%H%M")
+
+        # Build the output name: MODEL-dataset-timestamp
+        suggested_output_name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
+
+        # Limit to 86 characters
+        if len(suggested_output_name) > 86:
+            # Truncate dataset name to fit within limit
+            available_for_dataset = 86 - len(model_short_name) - len(timestamp) - 2  # -2 for the hyphens
+            if available_for_dataset > 0:
+                dataset_base_name = dataset_base_name[:available_for_dataset]
+                suggested_output_name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
+            else:
+                suggested_output_name = f"{model_short_name}-{timestamp}"
 
         status_msg = f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), {len(split_choices)} split(s), and {len(column_choices)} column(s)."
         if dataset_sample_count > 0:
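The naming logic introduced in the hunk above can be exercised on its own. Below is a minimal standalone sketch of the same behaviour; the helper name suggest_output_name and its max_len parameter are illustrative and do not exist in app.py:

from datetime import datetime


def suggest_output_name(dataset_name: str, model_name: str, max_len: int = 86) -> str:
    """Build MODEL-dataset-timestamp, truncating the dataset part to respect max_len."""
    dataset_base_name = dataset_name.split("/")[-1]
    model_short_name = model_name.split("/")[-1]
    timestamp = datetime.now().strftime("%y%m%d-%H%M")  # compact YYMMDD-HHMM stamp

    name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
    if len(name) > max_len:
        # Shorten the dataset part first; if even that does not fit, drop it entirely.
        available = max_len - len(model_short_name) - len(timestamp) - 2  # the two joining hyphens
        if available > 0:
            name = f"{model_short_name}-{dataset_base_name[:available]}-{timestamp}"
        else:
            name = f"{model_short_name}-{timestamp}"
    return name


# suggest_output_name("simplescaling/s1K-1.1", "Qwen/Qwen3-4B-Instruct-2507")
# -> "Qwen3-4B-Instruct-2507-s1K-1.1-<YYMMDD-HHMM>"

The timestamp is what distinguishes repeated runs of the same model/dataset pair, which the previous fixed "-synthetic" suffix could not do.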
@@ -411,60 +430,64 @@ def main():
 
         with gr.Tabs():
             with gr.TabItem("Generate Synthetic Data"):
-                with gr.Group():
-                    gr.Markdown("## Model information")
-                    with gr.Column():
-                        with gr.Row():
-                            model_name_or_path = gr.Dropdown(
-                                choices=SUPPORTED_MODELS,
-                                label="Select Model",
-                                value="Qwen/Qwen3-4B-Instruct-2507",
-                                info="Choose from popular instruction-tuned models under 40B parameters"
-                            )
-                            # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
-                with gr.Group():
-                    gr.Markdown("## Dataset information")
-                    # Dynamic user limit info - default to anonymous user
-                    user_limit_info = gr.Markdown(value="👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
-                    with gr.Row():
-                        with gr.Column():
-                            input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
-                            load_info_btn = gr.Button("📊 Load Dataset Info", size="sm", variant="secondary")
-                            load_info_status = gr.Markdown("", visible=True)
-
-                        with gr.Column():
-                            output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
-
-                    with gr.Row():
-                        with gr.Column():
-                            input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
-                            prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
-
-                        with gr.Column():
-                            input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
-                            num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
-
-                gr.Markdown("### Generation Parameters")
                 with gr.Row():
                     with gr.Column():
-                        with gr.Row():
-                            max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
-                            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
-                        with gr.Row():
-                            top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
-                            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
-                        with gr.Row():
-                            system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
+                        with gr.Group():
+                            gr.Markdown("## Model information")
+                            with gr.Column():
+                                with gr.Row():
+                                    model_name_or_path = gr.Dropdown(
+                                        choices=SUPPORTED_MODELS,
+                                        label="Select Model",
+                                        value="Qwen/Qwen3-4B-Instruct-2507",
+                                        info="Choose from popular instruction-tuned models under 40B parameters"
+                                    )
+                                    # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
+                            gr.Markdown("### Generation Parameters")
+                            with gr.Row():
+                                system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
+                            with gr.Row():
+                                with gr.Column():
+                                    with gr.Row():
+                                        max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
+                                        temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
+                                    with gr.Row():
+                                        top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
+                                        top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
 
-                with gr.Group():
-                    gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
-                    with gr.Row():
-                        with gr.Column():
+                    with gr.Column():
+                        with gr.Group():
+                            gr.Markdown("## Dataset information")
+                            # Dynamic user limit info - default to anonymous user
+                            user_limit_info = gr.Markdown(value="👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
                             with gr.Row():
-                                email = gr.Textbox(label="Email", placeholder="your.email@example.com")
-                                # with gr.Row():
-                                # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
-                                # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
+                                with gr.Column():
+                                    input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
+                                    load_info_btn = gr.Button("📊 Load Dataset Info", size="sm", variant="secondary")
+                                    load_info_status = gr.Markdown("", visible=True)
+
+                                with gr.Column():
+                                    output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
+
+                            with gr.Row():
+                                with gr.Column():
+                                    input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
+                                    prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
+
+                                with gr.Column():
+                                    input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
+                                    num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
+
+
+                            # with gr.Group():
+                            # gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
+                            # with gr.Row():
+                            # with gr.Column():
+                            # with gr.Row():
+                            # email = gr.Textbox(label="Email", placeholder="your.email@example.com")
+                            # # with gr.Row():
+                            # # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
+                            # # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
 
                 submit_btn = gr.Button("Submit Generation Request", variant="primary")
                 output_status = gr.Textbox(label="Status", interactive=False)
@@ -610,7 +633,7 @@ def main():
         # Wire up the Load Dataset Info button
         load_info_btn.click(
             load_dataset_info,
-            inputs=[input_dataset_name, gr.State(None), current_oauth_token],
+            inputs=[input_dataset_name, model_name_or_path, current_oauth_token],
             outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
         )
 
 
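One further note on the second hunk: dropping token=dataset_token means the lookup no longer receives an explicit per-dataset token, so private input datasets would have to rely on whatever credentials are already configured in the environment. For context, a rough sketch of how the values returned by datasets.get_dataset_infos can be turned into the config/split/column choices and the sample count used above; the helper name summarize_dataset and the first-config/"train"-split defaults are assumptions for illustration, not code from app.py:

from typing import Optional

from datasets import get_dataset_infos


def summarize_dataset(dataset_name: str, config: Optional[str] = None, split: str = "train"):
    """Illustrative only: derive dropdown choices from datasets.get_dataset_infos()."""
    infos = get_dataset_infos(dataset_name)  # maps config name -> DatasetInfo
    if not infos:
        raise Exception("No configs found for this dataset")

    config_choices = list(infos.keys())
    info = infos[config or config_choices[0]]

    split_choices = list(info.splits.keys()) if info.splits else []
    column_choices = list(info.features.keys()) if info.features else []
    sample_count = info.splits[split].num_examples if info.splits and split in info.splits else 0
    return config_choices, split_choices, column_choices, sample_count


# configs, splits, columns, n = summarize_dataset("simplescaling/s1K-1.1")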