Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
eb54763
1
Parent(s):
7580ee9
reorg
Browse files
app.py
CHANGED
|
@@ -198,7 +198,7 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
|
|
| 198 |
return request
|
| 199 |
|
| 200 |
|
| 201 |
-
def load_dataset_info(dataset_name,
|
| 202 |
"""Load dataset information and return choices for dropdowns"""
|
| 203 |
if not dataset_name.strip():
|
| 204 |
return (
|
|
@@ -212,7 +212,7 @@ def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
|
|
| 212 |
|
| 213 |
try:
|
| 214 |
# Get dataset info
|
| 215 |
-
dataset_infos = get_dataset_infos(dataset_name
|
| 216 |
|
| 217 |
if not dataset_infos:
|
| 218 |
raise Exception("No configs found for this dataset")
|
|
@@ -254,9 +254,28 @@ def load_dataset_info(dataset_name, dataset_token=None, oauth_token=None):
|
|
| 254 |
# Set slider maximum to the minimum of dataset samples and user limit
|
| 255 |
slider_max = min(dataset_sample_count, user_max_samples) if dataset_sample_count > 0 else user_max_samples
|
| 256 |
|
| 257 |
-
# Generate a suggested output dataset name
|
| 258 |
dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
status_msg = f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), {len(split_choices)} split(s), and {len(column_choices)} column(s)."
|
| 262 |
if dataset_sample_count > 0:
|
|
@@ -411,60 +430,64 @@ def main():
|
|
| 411 |
|
| 412 |
with gr.Tabs():
|
| 413 |
with gr.TabItem("Generate Synthetic Data"):
|
| 414 |
-
with gr.Group():
|
| 415 |
-
gr.Markdown("## Model information")
|
| 416 |
-
with gr.Column():
|
| 417 |
-
with gr.Row():
|
| 418 |
-
model_name_or_path = gr.Dropdown(
|
| 419 |
-
choices=SUPPORTED_MODELS,
|
| 420 |
-
label="Select Model",
|
| 421 |
-
value="Qwen/Qwen3-4B-Instruct-2507",
|
| 422 |
-
info="Choose from popular instruction-tuned models under 40B parameters"
|
| 423 |
-
)
|
| 424 |
-
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 425 |
-
with gr.Group():
|
| 426 |
-
gr.Markdown("## Dataset information")
|
| 427 |
-
# Dynamic user limit info - default to anonymous user
|
| 428 |
-
user_limit_info = gr.Markdown(value="👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
|
| 429 |
-
with gr.Row():
|
| 430 |
-
with gr.Column():
|
| 431 |
-
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 432 |
-
load_info_btn = gr.Button("🔍 Load Dataset Info", size="sm", variant="secondary")
|
| 433 |
-
load_info_status = gr.Markdown("", visible=True)
|
| 434 |
-
|
| 435 |
-
with gr.Column():
|
| 436 |
-
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
|
| 437 |
-
|
| 438 |
-
with gr.Row():
|
| 439 |
-
with gr.Column():
|
| 440 |
-
input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 441 |
-
prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 442 |
-
|
| 443 |
-
with gr.Column():
|
| 444 |
-
input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 445 |
-
num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
|
| 446 |
-
|
| 447 |
-
gr.Markdown("### Generation Parameters")
|
| 448 |
with gr.Row():
|
| 449 |
with gr.Column():
|
| 450 |
-
with gr.
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
|
|
|
| 463 |
with gr.Row():
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 470 |
output_status = gr.Textbox(label="Status", interactive=False)
|
|
@@ -610,7 +633,7 @@ def main():
|
|
| 610 |
# Wire up the Load Dataset Info button
|
| 611 |
load_info_btn.click(
|
| 612 |
load_dataset_info,
|
| 613 |
-
inputs=[input_dataset_name,
|
| 614 |
outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
|
| 615 |
)
|
| 616 |
|
|
|
|
| 198 |
return request
|
| 199 |
|
| 200 |
|
| 201 |
+
def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=None, ):
|
| 202 |
"""Load dataset information and return choices for dropdowns"""
|
| 203 |
if not dataset_name.strip():
|
| 204 |
return (
|
|
|
|
| 212 |
|
| 213 |
try:
|
| 214 |
# Get dataset info
|
| 215 |
+
dataset_infos = get_dataset_infos(dataset_name)
|
| 216 |
|
| 217 |
if not dataset_infos:
|
| 218 |
raise Exception("No configs found for this dataset")
|
|
|
|
| 254 |
# Set slider maximum to the minimum of dataset samples and user limit
|
| 255 |
slider_max = min(dataset_sample_count, user_max_samples) if dataset_sample_count > 0 else user_max_samples
|
| 256 |
|
| 257 |
+
# Generate a suggested output dataset name with model name and timestamp
|
| 258 |
dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name
|
| 259 |
+
|
| 260 |
+
# Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "Qwen3-4B-Instruct-2507")
|
| 261 |
+
model_short_name = model_name.split('/')[-1]
|
| 262 |
+
|
| 263 |
+
# Create a compact timestamp (YYMMDD-HHMM format)
|
| 264 |
+
from datetime import datetime
|
| 265 |
+
timestamp = datetime.now().strftime("%y%m%d-%H%M")
|
| 266 |
+
|
| 267 |
+
# Build the output name: MODEL-dataset-timestamp
|
| 268 |
+
suggested_output_name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
|
| 269 |
+
|
| 270 |
+
# Limit to 86 characters
|
| 271 |
+
if len(suggested_output_name) > 86:
|
| 272 |
+
# Truncate dataset name to fit within limit
|
| 273 |
+
available_for_dataset = 86 - len(model_short_name) - len(timestamp) - 2 # -2 for the hyphens
|
| 274 |
+
if available_for_dataset > 0:
|
| 275 |
+
dataset_base_name = dataset_base_name[:available_for_dataset]
|
| 276 |
+
suggested_output_name = f"{model_short_name}-{dataset_base_name}-{timestamp}"
|
| 277 |
+
else:
|
| 278 |
+
suggested_output_name = f"{model_short_name}-{timestamp}"
|
| 279 |
|
| 280 |
status_msg = f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), {len(split_choices)} split(s), and {len(column_choices)} column(s)."
|
| 281 |
if dataset_sample_count > 0:
|
|
|
|
| 430 |
|
| 431 |
with gr.Tabs():
|
| 432 |
with gr.TabItem("Generate Synthetic Data"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
with gr.Row():
|
| 434 |
with gr.Column():
|
| 435 |
+
with gr.Group():
|
| 436 |
+
gr.Markdown("## Model information")
|
| 437 |
+
with gr.Column():
|
| 438 |
+
with gr.Row():
|
| 439 |
+
model_name_or_path = gr.Dropdown(
|
| 440 |
+
choices=SUPPORTED_MODELS,
|
| 441 |
+
label="Select Model",
|
| 442 |
+
value="Qwen/Qwen3-4B-Instruct-2507",
|
| 443 |
+
info="Choose from popular instruction-tuned models under 40B parameters"
|
| 444 |
+
)
|
| 445 |
+
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 446 |
+
gr.Markdown("### Generation Parameters")
|
| 447 |
+
with gr.Row():
|
| 448 |
+
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 449 |
+
with gr.Row():
|
| 450 |
+
with gr.Column():
|
| 451 |
+
with gr.Row():
|
| 452 |
+
max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
|
| 453 |
+
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 454 |
+
with gr.Row():
|
| 455 |
+
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 456 |
+
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 457 |
|
| 458 |
+
with gr.Column():
|
| 459 |
+
with gr.Group():
|
| 460 |
+
gr.Markdown("## Dataset information")
|
| 461 |
+
# Dynamic user limit info - default to anonymous user
|
| 462 |
+
user_limit_info = gr.Markdown(value="👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
|
| 463 |
with gr.Row():
|
| 464 |
+
with gr.Column():
|
| 465 |
+
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 466 |
+
load_info_btn = gr.Button("🔍 Load Dataset Info", size="sm", variant="secondary")
|
| 467 |
+
load_info_status = gr.Markdown("", visible=True)
|
| 468 |
+
|
| 469 |
+
with gr.Column():
|
| 470 |
+
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
|
| 471 |
+
|
| 472 |
+
with gr.Row():
|
| 473 |
+
with gr.Column():
|
| 474 |
+
input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 475 |
+
prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 476 |
+
|
| 477 |
+
with gr.Column():
|
| 478 |
+
input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 479 |
+
num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
# with gr.Group():
|
| 483 |
+
# gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
|
| 484 |
+
# with gr.Row():
|
| 485 |
+
# with gr.Column():
|
| 486 |
+
# with gr.Row():
|
| 487 |
+
# email = gr.Textbox(label="Email", placeholder="your.email@example.com")
|
| 488 |
+
# # with gr.Row():
|
| 489 |
+
# # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
|
| 490 |
+
# # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
|
| 491 |
|
| 492 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 493 |
output_status = gr.Textbox(label="Status", interactive=False)
|
|
|
|
| 633 |
# Wire up the Load Dataset Info button
|
| 634 |
load_info_btn.click(
|
| 635 |
load_dataset_info,
|
| 636 |
+
inputs=[input_dataset_name, model_name_or_path, current_oauth_token],
|
| 637 |
outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
|
| 638 |
)
|
| 639 |
|