Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
588fd75
1
Parent(s):
b50b8c1
iteration 2
Browse files
app.py
CHANGED
|
@@ -9,6 +9,8 @@ from datasets import get_dataset_infos
|
|
| 9 |
from transformers import AutoConfig
|
| 10 |
from huggingface_hub import whoami
|
| 11 |
from typing import Optional, List, Tuple, Union
|
|
|
|
|
|
|
| 12 |
|
| 13 |
"""
|
| 14 |
Still TODO:
|
|
@@ -17,6 +19,16 @@ from typing import Optional, List, Tuple, Union
|
|
| 17 |
- validate max model params
|
| 18 |
"""
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
|
| 22 |
"""Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
|
|
@@ -127,8 +139,34 @@ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.
|
|
| 127 |
try:
|
| 128 |
output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
|
| 129 |
raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
|
| 130 |
-
except Exception
|
| 131 |
pass # dataset does not exist, which is expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
# check the models exists
|
| 134 |
try:
|
|
@@ -205,6 +243,44 @@ def add_request_to_db(request: GenerationRequest):
|
|
| 205 |
raise Exception("Failed to add request to database")
|
| 206 |
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
def main():
|
| 209 |
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 210 |
gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign in for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
|
|
@@ -230,7 +306,7 @@ def main():
|
|
| 230 |
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 231 |
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
|
| 232 |
""")
|
| 233 |
-
with gr.Accordion("
|
| 234 |
with gr.Row():
|
| 235 |
gr.Markdown("""
|
| 236 |
**How it works:**
|
|
@@ -255,20 +331,9 @@ def main():
|
|
| 255 |
with gr.Column():
|
| 256 |
with gr.Row():
|
| 257 |
model_name_or_path = gr.Dropdown(
|
| 258 |
-
choices=
|
| 259 |
-
"microsoft/Phi-3.5-mini-instruct",
|
| 260 |
-
"Qwen/Qwen2.5-7B-Instruct",
|
| 261 |
-
"meta-llama/Llama-3.2-8B-Instruct",
|
| 262 |
-
"mistralai/Mistral-7B-Instruct-v0.3",
|
| 263 |
-
"google/gemma-2-9b-it",
|
| 264 |
-
"microsoft/DialoGPT-medium",
|
| 265 |
-
"HuggingFaceH4/zephyr-7b-beta",
|
| 266 |
-
"teknium/OpenHermes-2.5-Mistral-7B",
|
| 267 |
-
"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
|
| 268 |
-
"01-ai/Yi-34B-Chat"
|
| 269 |
-
],
|
| 270 |
label="Select Model",
|
| 271 |
-
value="
|
| 272 |
info="Choose from popular instruction-tuned models under 40B parameters"
|
| 273 |
)
|
| 274 |
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
|
@@ -317,9 +382,102 @@ def main():
|
|
| 317 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 318 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 319 |
|
| 320 |
-
with gr.TabItem("
|
| 321 |
-
gr.Markdown("##
|
| 322 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
|
| 325 |
max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
|
|
|
|
| 9 |
from transformers import AutoConfig
|
| 10 |
from huggingface_hub import whoami
|
| 11 |
from typing import Optional, List, Tuple, Union
|
| 12 |
+
# import threading # DISABLED - was part of hanging database test functions
|
| 13 |
+
# import time # DISABLED - was part of hanging database test functions
|
| 14 |
|
| 15 |
"""
|
| 16 |
Still TODO:
|
|
|
|
| 19 |
- validate max model params
|
| 20 |
"""
|
| 21 |
|
| 22 |
+
# Model identifiers passed as `choices` to the "Select Model" dropdown.
SUPPORTED_MODELS = [
    "Qwen/Qwen3-4B-Instruct-2507",
    "Qwen/Qwen3-30B-A3B-Instruct-2507",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "baidu/ERNIE-4.5-21B-A3B-Thinking",
    "LLM360/K2-Think",
    "openai/gpt-oss-20b",
]
|
| 31 |
+
|
| 32 |
|
| 33 |
def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
|
| 34 |
"""Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
|
|
|
|
| 139 |
try:
|
| 140 |
output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
|
| 141 |
raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
|
| 142 |
+
except Exception:
|
| 143 |
pass # dataset does not exist, which is expected
|
| 144 |
+
|
| 145 |
+
# check the output dataset name doesn't already exist in the database
|
| 146 |
+
try:
|
| 147 |
+
url = os.getenv("SUPABASE_URL")
|
| 148 |
+
key = os.getenv("SUPABASE_KEY")
|
| 149 |
+
|
| 150 |
+
if url and key:
|
| 151 |
+
supabase = create_client(
|
| 152 |
+
url,
|
| 153 |
+
key,
|
| 154 |
+
options=ClientOptions(
|
| 155 |
+
postgrest_client_timeout=10,
|
| 156 |
+
storage_client_timeout=10,
|
| 157 |
+
schema="public",
|
| 158 |
+
)
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
existing_request = supabase.table("gen-requests").select("id").eq("output_dataset_name", request.output_dataset_name).execute()
|
| 162 |
+
if existing_request.data:
|
| 163 |
+
raise Exception(f"Output dataset {request.output_dataset_name} is already being generated or has been requested. Please choose a different name.")
|
| 164 |
+
except Exception as e:
|
| 165 |
+
# If it's our custom exception about dataset already existing, re-raise it
|
| 166 |
+
if "already being generated" in str(e):
|
| 167 |
+
raise e
|
| 168 |
+
# Otherwise, ignore database connection errors and continue
|
| 169 |
+
pass
|
| 170 |
|
| 171 |
# check the models exists
|
| 172 |
try:
|
|
|
|
| 243 |
raise Exception("Failed to add request to database")
|
| 244 |
|
| 245 |
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def get_generation_stats_safe():
    """Fetch the 50 most recent generation requests without ever raising.

    Reads ``SUPABASE_URL`` and ``SUPABASE_KEY`` from the environment and
    queries the ``gen-requests`` table, newest first. Every failure is
    reported through the returned dictionary instead of an exception.

    Returns:
        dict: ``{"status": "success", "data": rows}`` on success, or
        ``{"status": "error", "message": text, "data": []}`` on failure.
        ``data`` is always a list.
    """
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")

    # Report missing configuration directly rather than raising an exception
    # that would only be caught again a few lines further down.
    if not url or not key:
        return {
            "status": "error",
            "message": "Missing SUPABASE_URL or SUPABASE_KEY environment variables",
            "data": [],
        }

    # Explicit column list: sensitive token fields are deliberately left out.
    columns = ", ".join((
        "id", "created_at", "status", "input_dataset_name",
        "input_dataset_config", "input_dataset_split", "output_dataset_name",
        "prompt_column", "model_name_or_path", "model_revision", "max_tokens",
        "temperature", "top_k", "top_p", "username", "num_output_examples",
        "private",
    ))

    try:
        supabase = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            ),
        )
        response = (
            supabase.table("gen-requests")
            .select(columns)
            .order("created_at", desc=True)
            .limit(50)
            .execute()
        )
        # Normalise a missing payload so callers can always iterate `data`.
        rows = response.data or []
    except Exception as e:
        # Deliberately broad: this is a best-effort read for a dashboard, so
        # any client or network failure becomes an error result.
        return {"status": "error", "message": str(e), "data": []}

    return {"status": "success", "data": rows}
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# Old commented code removed - replaced with DatabaseManager and get_generation_stats_safe()
|
| 282 |
+
|
| 283 |
+
|
| 284 |
def main():
|
| 285 |
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 286 |
gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign in for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
|
|
|
|
| 306 |
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 307 |
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
|
| 308 |
""")
|
| 309 |
+
with gr.Accordion("More Information", open=False):
|
| 310 |
with gr.Row():
|
| 311 |
gr.Markdown("""
|
| 312 |
**How it works:**
|
|
|
|
| 331 |
with gr.Column():
|
| 332 |
with gr.Row():
|
| 333 |
model_name_or_path = gr.Dropdown(
|
| 334 |
+
choices=SUPPORTED_MODELS,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
label="Select Model",
|
| 336 |
+
value="Qwen/Qwen3-4B-Instruct-2507",
|
| 337 |
info="Choose from popular instruction-tuned models under 40B parameters"
|
| 338 |
)
|
| 339 |
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
|
|
|
| 382 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 383 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 384 |
|
| 385 |
+
with gr.TabItem("Statistics Dashboard"):
|
| 386 |
+
gr.Markdown("## Generation Requests Statistics")
|
| 387 |
+
gr.Markdown("π View recent synthetic data generation requests and their status.")
|
| 388 |
+
|
| 389 |
+
with gr.Row():
|
| 390 |
+
refresh_stats_btn = gr.Button("π Refresh Statistics", size="sm", variant="secondary")
|
| 391 |
+
clear_stats_btn = gr.Button("ποΈ Clear Display", size="sm")
|
| 392 |
+
|
| 393 |
+
stats_status = gr.Markdown("Click 'Refresh Statistics' to load recent generation requests.", visible=True)
|
| 394 |
+
|
| 395 |
+
stats_dataframe = gr.Dataframe(
|
| 396 |
+
headers=["ID", "Created", "Status", "Input Dataset", "Output Dataset", "Model", "Samples", "User"],
|
| 397 |
+
datatype=["str", "str", "str", "str", "str", "str", "number", "str"],
|
| 398 |
+
interactive=False,
|
| 399 |
+
wrap=True,
|
| 400 |
+
value=[],
|
| 401 |
+
label="Recent Generation Requests (Last 50)",
|
| 402 |
+
visible=False
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
+
def load_statistics():
    """Load recent generation requests and format them for the dashboard.

    Calls ``get_generation_stats_safe()`` and turns each returned row into an
    eight-column list (ID, created, status, input dataset, output dataset,
    model, samples, user). Long text fields are truncated for display.

    Returns:
        tuple: ``(status_markdown, dataframe_update, status_update)``. The
        dataframe is hidden on error or when there is no data.
    """
    from datetime import datetime  # imported once per call, not once per row

    def _shorten(value, limit):
        """Return *value* as text, cut to *limit* characters plus "..." if longer."""
        # NULL columns arrive as None; treat them as empty instead of letting
        # len(None) abort the whole table.
        text = "" if value is None else str(value)
        return f"{text[:limit]}..." if len(text) > limit else text

    def _format_created(raw):
        """Render an ISO-8601 timestamp as YYYY-MM-DD HH:MM, else the raw text."""
        if not raw:
            return 'Unknown'
        try:
            parsed = datetime.fromisoformat(raw.replace('Z', '+00:00'))
        except (AttributeError, TypeError, ValueError):
            # Not a string, or not a format fromisoformat accepts: show it as is.
            return str(raw)
        return parsed.strftime('%Y-%m-%d %H:%M')

    try:
        result = get_generation_stats_safe()

        if result["status"] == "error":
            return (
                f"❌ **Error loading statistics**: {result['message']}",
                gr.update(visible=False),
                gr.update(visible=True),
            )

        data = result["data"]
        if not data:
            # NOTE(review): the leading glyph below is mojibake of a four-byte
            # emoji whose code point cannot be recovered here - confirm and
            # restore it from the upstream file.
            return (
                "π **No data found**: The database appears to be empty or the table doesn't exist yet.",
                gr.update(visible=False),
                gr.update(visible=True),
            )

        formatted_data = [
            [
                _shorten(item.get('id'), 8),
                _format_created(item.get('created_at')),
                item.get('status') or 'Unknown',
                _shorten(item.get('input_dataset_name'), 30),
                _shorten(item.get('output_dataset_name'), 30),
                _shorten(item.get('model_name_or_path'), 25),
                item.get('num_output_examples') or 0,
                item.get('username') or 'Anonymous',
            ]
            for item in data
        ]

        return (
            f"✅ **Statistics loaded successfully**: Found {len(formatted_data)} recent requests.",
            gr.update(value=formatted_data, visible=True),
            gr.update(visible=True),
        )

    except Exception as e:
        # UI boundary: surface anything unexpected in the status line rather
        # than letting the click handler fail.
        return (
            f"❌ **Unexpected error**: {str(e)}",
            gr.update(visible=False),
            gr.update(visible=True),
        )
|
| 462 |
+
|
| 463 |
+
def clear_statistics():
    """Reset the statistics display: prompt text shown, table emptied and hidden."""
    prompt_text = "Click 'Refresh Statistics' to load recent generation requests."
    emptied_table = gr.update(value=[], visible=False)
    status_shown = gr.update(visible=True)
    return prompt_text, emptied_table, status_shown
|
| 470 |
+
|
| 471 |
+
# Connect buttons to functions
|
| 472 |
+
refresh_stats_btn.click(
|
| 473 |
+
load_statistics,
|
| 474 |
+
outputs=[stats_status, stats_dataframe, stats_status]
|
| 475 |
+
)
|
| 476 |
+
|
| 477 |
+
clear_stats_btn.click(
|
| 478 |
+
clear_statistics,
|
| 479 |
+
outputs=[stats_status, stats_dataframe, stats_status]
|
| 480 |
+
)
|
| 481 |
|
| 482 |
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
|
| 483 |
max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
|