edbeeching committed on
Commit
ab5bf76
·
1 Parent(s): 5d89dcd
Files changed (1) hide show
  1. app.py +170 -114
app.py CHANGED
@@ -7,6 +7,8 @@ from supabase.client import ClientOptions
7
  from enum import Enum
8
  from datasets import get_dataset_infos
9
  from transformers import AutoConfig
 
 
10
 
11
  """
12
  Still TODO:
@@ -16,6 +18,30 @@ from transformers import AutoConfig
16
  """
17
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class GenerationStatus(Enum):
20
  PENDING = "PENDING"
21
  RUNNING = "RUNNING"
@@ -64,17 +90,23 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
64
  # check that the input dataset split exists
65
  if request.input_dataset_split not in input_dataset_info.splits:
66
  raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")
67
-
68
- # check that the number of samples is less than MAX_SAMPLES
69
- if input_dataset_info.splits[request.input_dataset_split].num_examples > MAX_SAMPLES:
70
  request.num_output_examples = input_dataset_info.splits[request.input_dataset_split].num_examples
71
- raise Exception(f"Dataset split {request.input_dataset_split} in dataset {request.input_dataset_name} exceeds max sample limit of {MAX_SAMPLES}.")
 
 
 
 
 
 
 
72
 
73
  # check the prompt column exists in the dataset
74
  if request.prompt_column not in input_dataset_info.features:
75
  raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(input_dataset_info.features.keys())}")
76
 
77
-
78
  # check the models exists
79
  try:
80
  model_config = AutoConfig.from_pretrained(request.model_name_or_path, revision=request.model_revision, token=request.model_token)
@@ -146,127 +178,151 @@ def add_request_to_db(request: GenerationRequest):
146
  raise Exception("Failed to add request to database")
147
 
148
 
149
- def create_gradio_interface():
150
- with gr.Blocks(title="Synthetic Data Generation") as interface:
151
- with gr.Group():
152
- with gr.Row():
153
- gr.Markdown("# Synthetic Data Generation Request")
154
- with gr.Row():
155
- gr.Markdown("""
156
- Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
157
- """)
158
- with gr.Group():
159
- with gr.Row():
160
- gr.Markdown("""
161
- **How it works:**
162
- 1. Provide an input dataset with prompts
163
- 2. Select a public language model for generation
164
- 3. Configure generation parameters
165
- 4. Submit your request and receive generated data
166
- """)
167
- gr.Markdown("""
168
-
169
- **Requirements:**
170
- - Input dataset must be publicly accessible
171
- - Output dataset repository must exist and you must have write access
172
- - Model must be accessible (public or with valid token)
173
- - Maximum 10,000 samples per dataset
174
- - Maximum of 8192 generation tokens
175
- """)
176
 
177
- with gr.Group():
178
- gr.Markdown("## Dataset information")
179
- with gr.Column():
 
180
  with gr.Row():
181
- input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
182
- input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
183
- input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
184
- prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
185
- with gr.Column():
186
- output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
187
- with gr.Group():
188
- gr.Markdown("## Model information")
189
- with gr.Column():
190
  with gr.Row():
191
- model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
192
- model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
193
- # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
194
- with gr.Group():
195
- gr.Markdown("## Generation Parameters")
196
- with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
197
  with gr.Column():
198
  with gr.Row():
199
- max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
200
- temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
201
- with gr.Row():
202
- top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
203
- top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
204
- with gr.Row():
205
- system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
206
-
207
- with gr.Group():
208
- gr.Markdown("## User Information, for notification when your job is completed")
209
- with gr.Row():
210
  with gr.Column():
211
  with gr.Row():
212
- email = gr.Textbox(label="Email", placeholder="your.email@example.com")
213
- # with gr.Row():
214
- # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
215
- # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- submit_btn = gr.Button("Submit Generation Request", variant="primary")
218
- output_status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
219
 
220
- def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
221
- max_tok, temp, top_k_val, top_p_val, email_addr):
222
 
223
- MASTER_ORG = "synthetic-data-universe/"
224
- model_token = None # This is currently not supported
225
- input_dataset_token = None # This is currently not supported
226
- output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")
227
- try:
228
- request = GenerationRequest(
229
- id="", # Will be generated when adding to the database
230
- created_at="", # Will be set when adding to the database
231
- status=GenerationStatus.PENDING,
232
- input_dataset_name=input_dataset_name,
233
- input_dataset_split=input_split,
234
- input_dataset_config=input_dataset_config,
235
- output_dataset_name=MASTER_ORG + output_dataset_name,
236
- prompt_column=prompt_col,
237
- model_name_or_path=model_name,
238
- model_revision=model_rev,
239
- model_token=model_token if model_token else None,
240
- system_prompt=sys_prompt if sys_prompt else None,
241
- max_tokens=int(max_tok),
242
- temperature=temp,
243
- top_k=int(top_k_val),
244
- top_p=top_p_val,
245
- input_dataset_token=input_dataset_token if input_dataset_token else None,
246
- output_dataset_token=output_dataset_token,
247
- num_output_examples=0, # will be set after validating the input dataset
248
- username="user",
249
- email=email_addr
250
- )
251
 
252
- # check the input dataset exists and can be accessed with the provided token
253
- request = validate_request(request)
254
- add_request_to_db(request)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- return "Request submitted successfully!"
257
- except Exception as e:
258
- return f"Error: {str(e)}"
 
 
 
 
 
 
 
259
 
260
- submit_btn.click(
261
- submit_request,
262
- inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
263
- model_revision, system_prompt, max_tokens, temperature, top_k, top_p,
264
- email],
265
- outputs=output_status
266
- )
267
-
268
- return interface
 
 
 
 
 
269
 
270
  if __name__ == "__main__":
271
- app = create_gradio_interface()
272
- app.launch()
 
7
  from enum import Enum
8
  from datasets import get_dataset_infos
9
  from transformers import AutoConfig
10
+ from huggingface_hub import whoami
11
+ from typing import Optional, List, Tuple, Union
12
 
13
  """
14
  Still TODO:
 
18
  """
19
 
20
 
21
def verify_pro_status(token: "Optional[Union[gr.OAuthToken, str]]") -> bool:
    """Return True if the token belongs to a PRO user or an enterprise-org member.

    Args:
        token: Either a ``gr.OAuthToken`` (its ``.token`` attribute is used) or
            a raw token string. ``None``, an empty string, or any other type
            yields ``False``.

    Returns:
        ``True`` when the ``whoami`` payload has a truthy ``isPro`` flag or at
        least one entry in ``orgs`` with a truthy ``isEnterprise`` flag;
        ``False`` otherwise, including when the lookup fails.
    """
    if not token:
        return False

    # Plain strings are checked first so this path never needs gradio.
    if isinstance(token, str):
        token_str = token
    elif isinstance(token, gr.OAuthToken):
        token_str = token.token
    else:
        return False

    # Best effort: any failure while querying or reading the payload is
    # reported and treated as "not PRO" rather than propagated to the UI.
    try:
        user_info = whoami(token=token_str)
        # `orgs` may be missing or null in the payload; normalize to a list.
        orgs = user_info.get("orgs") or []
        return bool(
            user_info.get("isPro", False)
            or any(org.get("isEnterprise", False) for org in orgs)
        )
    except Exception as e:
        print(f"Could not verify user's PRO/Enterprise status: {e}")
        return False
42
+
43
+
44
+
45
  class GenerationStatus(Enum):
46
  PENDING = "PENDING"
47
  RUNNING = "RUNNING"
 
90
  # check that the input dataset split exists
91
  if request.input_dataset_split not in input_dataset_info.splits:
92
  raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")
93
+
94
+ # if num_output_examples is 0, set it to the number of examples in the input dataset split
95
+ if request.num_output_examples == 0:
96
  request.num_output_examples = input_dataset_info.splits[request.input_dataset_split].num_examples
97
+ else:
98
+ if request.num_output_examples > input_dataset_info.splits[request.input_dataset_split].num_examples:
99
+ raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the number of examples in the input dataset split {input_dataset_info.splits[request.input_dataset_split].num_examples}.")
100
+ request.input_dataset_split = f"{request.input_dataset_split}[:{request.num_output_examples}]"
101
+
102
+
103
+ if request.num_output_examples > MAX_SAMPLES:
104
+ raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {MAX_SAMPLES}.")
105
 
106
  # check the prompt column exists in the dataset
107
  if request.prompt_column not in input_dataset_info.features:
108
  raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(input_dataset_info.features.keys())}")
109
 
 
110
  # check the models exists
111
  try:
112
  model_config = AutoConfig.from_pretrained(request.model_name_or_path, revision=request.model_revision, token=request.model_token)
 
178
  raise Exception("Failed to add request to database")
179
 
180
 
181
+
182
+
183
+
184
+
185
def main():
    """Build the Synthetic Data Generation Gradio app and launch it.

    The form lives inside a column that starts hidden. On page load
    ``control_access`` shows it only when ``verify_pro_status`` accepts the
    logged-in user's OAuth token; otherwise an upgrade message is shown.
    Submitting the form builds a ``GenerationRequest``, runs it through
    ``validate_request`` and stores it with ``add_request_to_db``.
    """
    with gr.Blocks(title="Synthetic Data Generation") as demo:
        gr.HTML(
            "<h3 style='text-align:center'>Hugging Face PRO users can use the Synthetic generation service. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Subscribe to PRO</a></h3>",
            elem_id="sub_title",
        )

        # Both start hidden; control_access() reveals exactly one of them.
        pro_message = gr.Markdown(visible=False)
        main_interface = gr.Column(visible=False)
        with main_interface:
            with gr.Group():
                with gr.Row():
                    gr.Markdown("# Synthetic Data Generation Request")
                with gr.Row():
                    gr.Markdown("""
                        Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
                        Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
                        """)
            with gr.Group():
                with gr.Row():
                    gr.Markdown("""
                        **How it works:**
                        1. Provide an input dataset with prompts
                        2. Select a public language model for generation
                        3. Configure generation parameters
                        4. Submit your request.
                        """)
                    gr.Markdown("""

                        **Requirements:**
                        - Input dataset must be publicly accessible (for now)
                        - Model must be accessible (public and not gated, for now)
                        - Maximum 10,000 samples per dataset (for now)
                        - Maximum of 8192 generation tokens (for now)
                        """)

            with gr.Group():
                gr.Markdown("## Dataset information")
                with gr.Column():
                    with gr.Row():
                        input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
                        input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
                        input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
                        prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")

                with gr.Column():
                    with gr.Row():
                        output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
                        num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES, step=1)
            with gr.Group():
                gr.Markdown("## Model information")
                with gr.Column():
                    with gr.Row():
                        model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
                        model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
                        # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
            with gr.Group():
                gr.Markdown("## Generation Parameters")
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
                            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
                        with gr.Row():
                            top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
                            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
                        with gr.Row():
                            system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")

            with gr.Group():
                gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            email = gr.Textbox(label="Email", placeholder="your.email@example.com")
                        # with gr.Row():
                        #     input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
                        #     output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")

            submit_btn = gr.Button("Submit Generation Request", variant="primary")
            output_status = gr.Textbox(label="Status", interactive=False)

        def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
                           max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples,
                           oauth_token: Optional[gr.OAuthToken] = None):
            """Validate the form values and queue a generation request.

            ``oauth_token`` is not wired to a component: Gradio injects it from
            the login session because of its type hint, the same way it does
            for ``control_access``. Returns the status text shown to the user;
            errors are reported as ``"Error: ..."`` instead of being raised.
            """
            # Hiding the form is only cosmetic, so the PRO check is repeated
            # here where the request is actually accepted.
            if not verify_pro_status(oauth_token):
                return "Error: this service is only available to signed-in Hugging Face PRO users."

            MASTER_ORG = "synthetic-data-universe/"
            model_token = None  # This is currently not supported
            input_dataset_token = None  # This is currently not supported
            output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")

            try:
                request = GenerationRequest(
                    id="",  # Will be generated when adding to the database
                    created_at="",  # Will be set when adding to the database
                    status=GenerationStatus.PENDING,
                    input_dataset_name=input_dataset_name,
                    input_dataset_split=input_split,
                    input_dataset_config=input_dataset_config,
                    output_dataset_name=MASTER_ORG + output_dataset_name,
                    prompt_column=prompt_col,
                    model_name_or_path=model_name,
                    model_revision=model_rev,
                    model_token=model_token,
                    system_prompt=sys_prompt if sys_prompt else None,
                    max_tokens=int(max_tok),
                    temperature=temp,
                    top_k=int(top_k_val),
                    top_p=top_p_val,
                    input_dataset_token=input_dataset_token,
                    output_dataset_token=output_dataset_token,
                    # 0 means "use the whole split"; validate_request resolves it.
                    # Cast like the other sliders: the value is interpolated
                    # into a split slice and must be a whole number.
                    num_output_examples=int(num_output_samples),
                    username="user",
                    email=email_addr,
                )

                # check the input dataset exists and can be accessed with the provided token
                request = validate_request(request)
                add_request_to_db(request)

                return "Request submitted successfully!"
            except Exception as e:
                return f"Error: {str(e)}"

        submit_btn.click(
            submit_request,
            inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
                    model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples],
            outputs=output_status,
        )

        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
            """Choose what to show on page load.

            Returns a pair of updates for ``(main_interface, pro_message)``:
            both hidden when nobody is logged in, the form when the user
            passes ``verify_pro_status``, otherwise the upgrade message.
            """
            if not profile:
                return gr.update(visible=False), gr.update(visible=False)
            if verify_pro_status(oauth_token):
                return gr.update(visible=True), gr.update(visible=False)

            message = (
                "## ✨ Exclusive Access for PRO Users\n\n"
                "Thank you for your interest! This app is available exclusively for our Hugging Face **PRO** members.\n\n"
                "To unlock this and many other cool stuff, please consider upgrading your account.\n\n"
                "### [**Become a PRO Today!**](http://huggingface.co/subscribe/pro?source=synthetic-data-universe)"
            )
            return gr.update(visible=False), gr.update(visible=True, value=message)

        demo.load(control_access, inputs=None, outputs=[main_interface, pro_message])
    demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)


if __name__ == "__main__":
    main()