edbeeching committed
Commit 588fd75 · 1 Parent(s): b50b8c1

iteration 2

Files changed (1)
  1. app.py +176 -18
app.py CHANGED
@@ -9,6 +9,8 @@ from datasets import get_dataset_infos
 from transformers import AutoConfig
 from huggingface_hub import whoami
 from typing import Optional, List, Tuple, Union
+# import threading # DISABLED - was part of hanging database test functions
+# import time # DISABLED - was part of hanging database test functions

 """
 Still TODO:
@@ -17,6 +19,16 @@ from typing import Optional, List, Tuple, Union
 - validate max model params
 """

+SUPPORTED_MODELS = [
+    "Qwen/Qwen3-4B-Instruct-2507",
+    "Qwen/Qwen3-30B-A3B-Instruct-2507",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B-Instruct",
+    "baidu/ERNIE-4.5-21B-A3B-Thinking",
+    "LLM360/K2-Think",
+    "openai/gpt-oss-20b",
+]
+

 def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
     """Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
 
@@ -127,8 +139,34 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
     try:
         output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
         raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
-    except Exception as e:
+    except Exception:
         pass # dataset does not exist, which is expected
+
+    # check the output dataset name doesn't already exist in the database
+    try:
+        url = os.getenv("SUPABASE_URL")
+        key = os.getenv("SUPABASE_KEY")
+
+        if url and key:
+            supabase = create_client(
+                url,
+                key,
+                options=ClientOptions(
+                    postgrest_client_timeout=10,
+                    storage_client_timeout=10,
+                    schema="public",
+                )
+            )
+
+            existing_request = supabase.table("gen-requests").select("id").eq("output_dataset_name", request.output_dataset_name).execute()
+            if existing_request.data:
+                raise Exception(f"Output dataset {request.output_dataset_name} is already being generated or has been requested. Please choose a different name.")
+    except Exception as e:
+        # If it's our custom exception about dataset already existing, re-raise it
+        if "already being generated" in str(e):
+            raise e
+        # Otherwise, ignore database connection errors and continue
+        pass

     # check the models exists
     try:
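The duplicate-name check above and the `get_generation_stats_safe()` helper added in the next hunk each build an identical short-timeout Supabase client from `SUPABASE_URL`/`SUPABASE_KEY`. A minimal sketch of how that setup could be shared between the two call sites; the helper name is illustrative and not part of the commit, and the `ClientOptions` import path may differ by supabase-py version:

```python
import os

from supabase import create_client
from supabase.lib.client_options import ClientOptions  # import path may vary by supabase-py version


def get_supabase_client(timeout: int = 10):
    """Return a Supabase client with short timeouts, or None if env vars are missing."""
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    if not url or not key:
        return None
    return create_client(
        url,
        key,
        options=ClientOptions(
            postgrest_client_timeout=timeout,
            storage_client_timeout=timeout,
            schema="public",
        ),
    )
```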
 
@@ -205,6 +243,44 @@ def add_request_to_db(request: GenerationRequest):
         raise Exception("Failed to add request to database")


+
+
+def get_generation_stats_safe():
+    """Safely fetch generation request statistics with proper error handling"""
+    try:
+        url = os.getenv("SUPABASE_URL")
+        key = os.getenv("SUPABASE_KEY")
+
+        if not url or not key:
+            raise Exception("Missing SUPABASE_URL or SUPABASE_KEY environment variables")
+
+        supabase = create_client(
+            url,
+            key,
+            options=ClientOptions(
+                postgrest_client_timeout=10,
+                storage_client_timeout=10,
+                schema="public",
+            )
+        )
+
+        # Fetch data excluding sensitive token fields
+        response = supabase.table("gen-requests").select(
+            "id, created_at, status, input_dataset_name, input_dataset_config, "
+            "input_dataset_split, output_dataset_name, prompt_column, "
+            "model_name_or_path, model_revision, max_tokens, temperature, "
+            "top_k, top_p, username, num_output_examples, private"
+        ).order("created_at", desc=True).limit(50).execute()
+
+        return {"status": "success", "data": response.data}
+
+    except Exception as e:
+        return {"status": "error", "message": str(e), "data": []}
+
+
+# Old commented code removed - replaced with DatabaseManager and get_generation_stats_safe()
+
+
 def main():
     with gr.Blocks(title="Synthetic Data Generation") as demo:
         gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign in for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
306
  Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
307
  Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
308
  """)
309
+ with gr.Accordion("More Information", open=False):
310
  with gr.Row():
311
  gr.Markdown("""
312
  **How it works:**
 
331
  with gr.Column():
332
  with gr.Row():
333
  model_name_or_path = gr.Dropdown(
334
+ choices=SUPPORTED_MODELS,
 
 
 
 
 
 
 
 
 
 
 
335
  label="Select Model",
336
+ value="Qwen/Qwen3-4B-Instruct-2507",
337
  info="Choose from popular instruction-tuned models under 40B parameters"
338
  )
339
  # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
 
@@ -317,9 +382,102 @@ def main():
                 submit_btn = gr.Button("Submit Generation Request", variant="primary")
                 output_status = gr.Textbox(label="Status", interactive=False)

-            with gr.TabItem("Coming Soon"):
-                gr.Markdown("## New features coming soon!")
-                gr.Markdown("This tab will contain additional functionality in future updates.")
+            with gr.TabItem("Statistics Dashboard"):
+                gr.Markdown("## Generation Requests Statistics")
+                gr.Markdown("📊 View recent synthetic data generation requests and their status.")
+
+                with gr.Row():
+                    refresh_stats_btn = gr.Button("🔄 Refresh Statistics", size="sm", variant="secondary")
+                    clear_stats_btn = gr.Button("🗑️ Clear Display", size="sm")
+
+                stats_status = gr.Markdown("Click 'Refresh Statistics' to load recent generation requests.", visible=True)
+
+                stats_dataframe = gr.Dataframe(
+                    headers=["ID", "Created", "Status", "Input Dataset", "Output Dataset", "Model", "Samples", "User"],
+                    datatype=["str", "str", "str", "str", "str", "str", "number", "str"],
+                    interactive=False,
+                    wrap=True,
+                    value=[],
+                    label="Recent Generation Requests (Last 50)",
+                    visible=False
+                )
+
+                def load_statistics():
+                    """Load and format statistics data"""
+                    try:
+                        # Use the new safe database function
+                        result = get_generation_stats_safe()
+
+                        if result["status"] == "error":
+                            return (
+                                f"❌ **Error loading statistics**: {result['message']}",
+                                gr.update(visible=False),
+                                gr.update(visible=True)
+                            )
+
+                        data = result["data"]
+                        if not data:
+                            return (
+                                "📝 **No data found**: The database appears to be empty or the table doesn't exist yet.",
+                                gr.update(visible=False),
+                                gr.update(visible=True)
+                            )
+
+                        # Format data for display
+                        formatted_data = []
+                        for item in data:
+                            # Format timestamp
+                            created_at = item.get('created_at', 'Unknown')
+                            if created_at and created_at != 'Unknown':
+                                try:
+                                    from datetime import datetime
+                                    dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+                                    created_at = dt.strftime('%Y-%m-%d %H:%M')
+                                except:
+                                    pass
+
+                            formatted_data.append([
+                                str(item.get('id', ''))[:8] + "..." if len(str(item.get('id', ''))) > 8 else str(item.get('id', '')),
+                                created_at,
+                                item.get('status', 'Unknown'),
+                                (item.get('input_dataset_name', '')[:30] + "...") if len(item.get('input_dataset_name', '')) > 30 else item.get('input_dataset_name', ''),
+                                (item.get('output_dataset_name', '')[:30] + "...") if len(item.get('output_dataset_name', '')) > 30 else item.get('output_dataset_name', ''),
+                                (item.get('model_name_or_path', '')[:25] + "...") if len(item.get('model_name_or_path', '')) > 25 else item.get('model_name_or_path', ''),
+                                item.get('num_output_examples', 0),
+                                item.get('username', 'Anonymous')
+                            ])
+
+                        return (
+                            f"✅ **Statistics loaded successfully**: Found {len(formatted_data)} recent requests.",
+                            gr.update(value=formatted_data, visible=True),
+                            gr.update(visible=True)
+                        )
+
+                    except Exception as e:
+                        return (
+                            f"❌ **Unexpected error**: {str(e)}",
+                            gr.update(visible=False),
+                            gr.update(visible=True)
+                        )
+
+                def clear_statistics():
+                    """Clear the statistics display"""
+                    return (
+                        "Click 'Refresh Statistics' to load recent generation requests.",
+                        gr.update(value=[], visible=False),
+                        gr.update(visible=True)
+                    )
+
+                # Connect buttons to functions
+                refresh_stats_btn.click(
+                    load_statistics,
+                    outputs=[stats_status, stats_dataframe, stats_status]
+                )
+
+                clear_stats_btn.click(
+                    clear_statistics,
+                    outputs=[stats_status, stats_dataframe, stats_status]
+                )

         def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
                            max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
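The page header advertises 10k samples for PRO users versus 100 otherwise, and the diff's context shows both a `verify_pro_status()` helper and a `num_output_samples` field on the request form. A hedged sketch of how such a cap might be applied at submission time; the function below is illustrative, and the commit does not show how app.py actually enforces the limit:

```python
# Illustrative only: cap values are taken from the UI copy ("10k samples vs 100"),
# and verify_pro_status() is the helper shown earlier in this diff.
def clamp_num_samples(requested: int, oauth_token=None) -> int:
    """Clamp the requested sample count to the user's tier limit."""
    limit = 10_000 if verify_pro_status(oauth_token) else 100
    return min(int(requested), limit)
```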