edbeeching committed on
Commit
0fb1b95
·
1 Parent(s): 67c4080

vibing with claude

Browse files
Files changed (1) hide show
  1. app.py +103 -66
app.py CHANGED
@@ -49,7 +49,8 @@ class GenerationStatus(Enum):
49
  FAILED = "FAILED"
50
 
51
 
52
- MAX_SAMPLES = 10000 # max number of samples in the input dataset
 
53
  MAX_TOKENS = 8192
54
  MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
55
 
@@ -79,7 +80,7 @@ class GenerationRequest:
79
  private: bool = False
80
  num_retries: int = 0
81
 
82
- def validate_request(request: GenerationRequest) -> GenerationRequest:
83
  # checks that the request is valid
84
  # - input dataset exists and can be accessed with the provided token
85
  try:
@@ -101,8 +102,13 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
101
 
102
 
103
 
104
- if request.num_output_examples > MAX_SAMPLES:
105
- raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {MAX_SAMPLES}.")
 
 
 
 
 
106
 
107
  # check the prompt column exists in the dataset
108
  if request.prompt_column not in input_dataset_info.features:
@@ -195,16 +201,16 @@ def add_request_to_db(request: GenerationRequest):
195
  raise Exception("Failed to add request to database")
196
 
197
 
198
-
199
-
200
-
201
-
202
  def main():
203
  with gr.Blocks(title="Synthetic Data Generation") as demo:
204
- gr.HTML("<h3 style='text-align:center'>Hugging Face PRO users can use the Synthetic generation service. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Subscribe to PRO</a></h3>", elem_id="sub_title")
205
 
206
  pro_message = gr.Markdown(visible=False)
207
  main_interface = gr.Column(visible=False)
 
 
 
 
208
  with main_interface:
209
  with gr.Group():
210
  with gr.Row():
@@ -214,7 +220,7 @@ def main():
214
  Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
215
  Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
216
  """)
217
- with gr.Group():
218
  with gr.Row():
219
  gr.Markdown("""
220
  **How it works:**
@@ -232,54 +238,81 @@ def main():
232
  - Maximum of 8192 generated tokens
233
  """)
234
 
235
- with gr.Group():
236
- gr.Markdown("## Dataset information")
237
- with gr.Column():
238
- with gr.Row():
239
- input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
240
- input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
241
- input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
242
- prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
243
-
244
- with gr.Column():
245
- with gr.Row():
246
- output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
247
- num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES, step=1)
248
- with gr.Group():
249
- gr.Markdown("## Model information")
250
- with gr.Column():
251
- with gr.Row():
252
- model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
253
- model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
254
- # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
255
- with gr.Group():
256
- gr.Markdown("## Generation Parameters")
257
- with gr.Row():
258
- with gr.Column():
259
- with gr.Row():
260
- max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
261
- temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
 
262
  with gr.Row():
263
- top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
264
- top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
 
 
 
 
 
 
 
265
  with gr.Row():
266
- system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
267
-
268
- with gr.Group():
269
- gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
270
- with gr.Row():
271
- with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  with gr.Row():
273
- email = gr.Textbox(label="Email", placeholder="your.email@example.com")
274
- # with gr.Row():
275
- # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
276
- # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
277
-
278
- submit_btn = gr.Button("Submit Generation Request", variant="primary")
279
- output_status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
280
 
281
  def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
282
- max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples):
283
 
284
  MASTER_ORG = "synthetic-data-universe/"
285
  model_token = False # This is currently not supported
@@ -312,7 +345,7 @@ def main():
312
  )
313
 
314
  # check the input dataset exists and can be accessed with the provided token
315
- request = validate_request(request)
316
  add_request_to_db(request)
317
 
318
  return "Request submitted successfully!"
@@ -322,25 +355,29 @@ def main():
322
  submit_btn.click(
323
  submit_request,
324
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
325
- model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples],
326
  outputs=output_status
327
  )
328
 
329
- def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
330
- if not profile: return gr.update(visible=False), gr.update(visible=False)
331
- if verify_pro_status(oauth_token): return gr.update(visible=True), gr.update(visible=False)
 
 
 
 
332
  else:
333
- message = (
334
- "## ✨ Exclusive Access for PRO Users\n\n"
335
- "Thank you for your interest! This app is available exclusively for our Hugging Face **PRO** members.\n\n"
336
- "To unlock this and many other cool stuff, please consider upgrading your account.\n\n"
337
- "### [**Become a PRO Today!**](http://huggingface.co/subscribe/pro?source=synthetic-data-universe)"
338
- )
339
- return gr.update(visible=False), gr.update(visible=True, value=message)
340
 
341
  login_button = gr.LoginButton() # this is required or AUTH will not work
342
 
343
- demo.load(control_access, inputs=None, outputs=[main_interface, pro_message])
344
  demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
345
 
346
  if __name__ == "__main__":
 
49
  FAILED = "FAILED"
50
 
51
 
52
+ MAX_SAMPLES_PRO = 10000 # max number of samples for PRO/Enterprise users
53
+ MAX_SAMPLES_FREE = 100 # max number of samples for free users
54
  MAX_TOKENS = 8192
55
  MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
56
 
 
80
  private: bool = False
81
  num_retries: int = 0
82
 
83
+ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
84
  # checks that the request is valid
85
  # - input dataset exists and can be accessed with the provided token
86
  try:
 
102
 
103
 
104
 
105
+ # Check user tier and apply appropriate limits
106
+ is_pro = verify_pro_status(oauth_token)
107
+ max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
108
+
109
+ if request.num_output_examples > max_samples:
110
+ user_tier = "PRO/Enterprise" if is_pro else "free"
111
+ raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")
112
 
113
  # check the prompt column exists in the dataset
114
  if request.prompt_column not in input_dataset_info.features:
 
201
  raise Exception("Failed to add request to database")
202
 
203
 
 
 
 
 
204
  def main():
205
  with gr.Blocks(title="Synthetic Data Generation") as demo:
206
+ gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. PRO users get 10k samples, free users get 100 samples. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
207
 
208
  pro_message = gr.Markdown(visible=False)
209
  main_interface = gr.Column(visible=False)
210
+
211
+ # Store the current oauth token for use in submit_request
212
+ current_oauth_token = gr.State(None)
213
+
214
  with main_interface:
215
  with gr.Group():
216
  with gr.Row():
 
220
  Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
221
  Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
222
  """)
223
+ with gr.Accordion("How it works", open=False):
224
  with gr.Row():
225
  gr.Markdown("""
226
  **How it works:**
 
238
  - Maximum of 8192 generated tokens
239
  """)
240
 
241
+ with gr.Tabs():
242
+ with gr.TabItem("Generate Synthetic Data"):
243
+ with gr.Group():
244
+ gr.Markdown("## Model information")
245
+ with gr.Column():
246
+ with gr.Row():
247
+ model_name_or_path = gr.Dropdown(
248
+ choices=[
249
+ "microsoft/Phi-3.5-mini-instruct",
250
+ "Qwen/Qwen2.5-7B-Instruct",
251
+ "meta-llama/Llama-3.2-8B-Instruct",
252
+ "mistralai/Mistral-7B-Instruct-v0.3",
253
+ "google/gemma-2-9b-it",
254
+ "microsoft/DialoGPT-medium",
255
+ "HuggingFaceH4/zephyr-7b-beta",
256
+ "teknium/OpenHermes-2.5-Mistral-7B",
257
+ "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
258
+ "01-ai/Yi-34B-Chat"
259
+ ],
260
+ label="Select Model",
261
+ value="microsoft/Phi-3.5-mini-instruct",
262
+ info="Choose from popular instruction-tuned models under 40B parameters"
263
+ )
264
+ # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
265
+ with gr.Group():
266
+ gr.Markdown("## Dataset information")
267
+ # Dynamic user limit info
268
+ user_limit_info = gr.Markdown(value="", visible=True)
269
  with gr.Row():
270
+ with gr.Column():
271
+ input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
272
+ prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
273
+
274
+ with gr.Column():
275
+ output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
276
+ num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_PRO, step=1)
277
+
278
+ with gr.Accordion("Advanced Options", open=False):
279
  with gr.Row():
280
+ input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
281
+ input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
282
+ model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
283
+
284
+ with gr.Group():
285
+ gr.Markdown("### Generation Parameters")
286
+ with gr.Row():
287
+ with gr.Column():
288
+ with gr.Row():
289
+ max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
290
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
291
+ with gr.Row():
292
+ top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
293
+ top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
294
+ with gr.Row():
295
+ system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
296
+
297
+ with gr.Group():
298
+ gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
299
  with gr.Row():
300
+ with gr.Column():
301
+ with gr.Row():
302
+ email = gr.Textbox(label="Email", placeholder="your.email@example.com")
303
+ # with gr.Row():
304
+ # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
305
+ # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
306
+
307
+ submit_btn = gr.Button("Submit Generation Request", variant="primary")
308
+ output_status = gr.Textbox(label="Status", interactive=False)
309
+
310
+ with gr.TabItem("Coming Soon"):
311
+ gr.Markdown("## New features coming soon!")
312
+ gr.Markdown("This tab will contain additional functionality in future updates.")
313
 
314
  def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
315
+ max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
316
 
317
  MASTER_ORG = "synthetic-data-universe/"
318
  model_token = False # This is currently not supported
 
345
  )
346
 
347
  # check the input dataset exists and can be accessed with the provided token
348
+ request = validate_request(request, oauth_token)
349
  add_request_to_db(request)
350
 
351
  return "Request submitted successfully!"
 
355
  submit_btn.click(
356
  submit_request,
357
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
358
+ model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples, current_oauth_token],
359
  outputs=output_status
360
  )
361
 
362
def update_user_limits(oauth_token):
    """Return a markdown banner describing this user's per-request sample limit.

    An unauthenticated visitor (no token) gets an empty banner; otherwise the
    text depends on whether `verify_pro_status` reports a PRO subscription.
    """
    if oauth_token is None:
        return ""
    # PRO/Enterprise accounts get the larger cap; everyone else sees the
    # free-tier limit plus an upgrade link.
    if verify_pro_status(oauth_token):
        return "✨ **PRO User**: You can generate up to 10,000 samples per request."
    return "👤 **Free User**: You can generate up to 100 samples per request. [Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) for 10,000 samples."
371
+
372
def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
    """Toggle UI visibility based on login state.

    Returns four values for (main_interface, pro_message, current_oauth_token,
    user_limit_info): logged-out visitors get everything hidden and the stored
    token/banner cleared; any logged-in user sees the main interface with a
    tier-appropriate limit message.
    """
    if not profile:
        # Not logged in: hide both panels, drop any stored token and banner.
        return gr.update(visible=False), gr.update(visible=False), None, ""
    # Allow all users but show different messaging, and store the token
    banner = update_user_limits(oauth_token)
    return gr.update(visible=True), gr.update(visible=False), oauth_token, banner
377
 
378
  login_button = gr.LoginButton() # this is required or AUTH will not work
379
 
380
+ demo.load(control_access, inputs=None, outputs=[main_interface, pro_message, current_oauth_token, user_limit_info])
381
  demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
382
 
383
  if __name__ == "__main__":