Spaces:
Sleeping
Sleeping
File size: 16,658 Bytes
b6d1901 ab5bf76 b6d1901 8854100 f32647d ac0089d 8854100 b6d1901 ab5bf76 b6d1901 f32647d b6d1901 f32647d b6d1901 8854100 b6d1901 f32647d b6d1901 5d89dcd b6d1901 ab5bf76 00f53b5 ab5bf76 b6d1901 8854100 b6d1901 5d89dcd b6d1901 8854100 f32647d 00f53b5 8854100 b6d1901 ab5bf76 b6d1901 ab5bf76 f32647d ab5bf76 f32647d ab5bf76 b6d1901 ab5bf76 b6d1901 ab5bf76 b6d1901 ab5bf76 b6d1901 ab5bf76 f32647d ab5bf76 b6d1901 ab5bf76 b6d1901 ab5bf76 b6d1901 ab5bf76 b6d1901 ab5bf76 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 |
import gradio as gr
from dataclasses import dataclass
import os
from supabase import create_client, Client
from supabase.client import ClientOptions
from enum import Enum
from datasets import get_dataset_infos
from transformers import AutoConfig
from huggingface_hub import whoami
from typing import Optional, List, Tuple, Union
"""
Still TODO:
- validate the user is PRO
- check the output dataset token is valid (hardcoded for now as a secret)
- validate max model params
"""
def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
    """Report whether the token's owner is a PRO user or in an enterprise org.

    Args:
        token: A Gradio OAuth token object, a raw token string, or None.

    Returns:
        A truthy value when ``whoami`` reports ``isPro`` for the user or
        ``isEnterprise`` for any of the user's orgs. ``False`` when the token
        is missing, of an unsupported type, or the lookup raises.
    """
    if not token:
        return False

    # Only OAuth token objects and plain strings are accepted.
    if not isinstance(token, (gr.OAuthToken, str)):
        return False
    raw_token = token.token if isinstance(token, gr.OAuthToken) else token

    try:
        account = whoami(token=raw_token)
        # Short-circuit: orgs are only inspected when the user is not PRO.
        return account.get("isPro", False) or any(
            org.get("isEnterprise", False) for org in account.get("orgs", [])
        )
    except Exception as e:
        # Best effort: any failure is treated as "not PRO".
        print(f"Could not verify user's PRO/Enterprise status: {e}")
        return False
class GenerationStatus(Enum):
    """Status values for a generation request.

    Each member's value is the string stored in the ``status`` column when a
    request is inserted into the database.
    """

    # New requests are created with this status by the submit handler.
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"
# Service limits.
# MAX_SAMPLES caps num_output_examples in validate_request and is the upper
# bound of the "Number of samples" slider in the form.
MAX_SAMPLES = 10000 # max number of samples in the input dataset
# MAX_TOKENS caps the requested max_tokens in validate_request and is the upper
# bound of the "Max Tokens" slider in the form.
MAX_TOKENS = 8192
# NOTE(review): not referenced elsewhere in this file as shown; model-size
# validation is still listed as TODO in the module notes.
MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
@dataclass
class GenerationRequest:
    """One synthetic-data generation job as submitted through the form.

    Instances are built by the submit handler, checked (and partly rewritten)
    by ``validate_request``, then inserted by ``add_request_to_db``.
    """

    # Bookkeeping. The submit handler passes empty strings for id/created_at;
    # neither is included in the row inserted by add_request_to_db.
    id: str
    created_at: str
    status: GenerationStatus

    # Input dataset location and the destination repository.
    input_dataset_name: str
    input_dataset_config: str
    input_dataset_split: str
    output_dataset_name: str
    prompt_column: str

    # Model used for generation.
    model_name_or_path: str
    model_revision: str
    model_token: str | None

    # Generation / sampling parameters.
    system_prompt: str | None
    max_tokens: int
    temperature: float
    top_k: int
    top_p: float

    # Access tokens for reading the input and writing the output dataset.
    input_dataset_token: str | None
    output_dataset_token: str

    # Requester details.
    username: str
    email: str

    # Number of examples to generate; 0 means "use the whole split" and is
    # replaced with the split size by validate_request.
    num_output_examples: int

    private: bool = False
    num_retries: int = 0
def validate_request(request: GenerationRequest) -> GenerationRequest:
    """Validate a generation request against the Hub and the service limits.

    The request is modified in place: when ``num_output_examples`` is 0 it is
    replaced with the size of the input split, otherwise the split name is
    rewritten to a slice (``"<split>[:<n>]"``).

    Args:
        request: The request to check.

    Returns:
        The same ``request`` object, after the in-place adjustments above.

    Raises:
        Exception: With a user-facing message when the dataset, config, split,
            prompt column or model cannot be resolved, or when a numeric
            parameter or the email address is out of range / malformed.
    """
    # --- input dataset: must exist and be readable with the provided token ---
    try:
        dataset_infos = get_dataset_infos(request.input_dataset_name, token=request.input_dataset_token)
    except Exception as e:
        raise Exception(f"Dataset {request.input_dataset_name} does not exist or cannot be accessed with the provided token.") from e

    # Report an unknown config explicitly instead of as "dataset does not exist".
    if request.input_dataset_config not in dataset_infos:
        raise Exception(f"Dataset config {request.input_dataset_config} does not exist in dataset {request.input_dataset_name}. Available configs: {list(dataset_infos.keys())}")
    input_dataset_info = dataset_infos[request.input_dataset_config]

    # Split/feature metadata may be absent; treat that as "nothing available".
    available_splits = input_dataset_info.splits or {}
    available_features = input_dataset_info.features or {}

    # check that the input dataset split exists
    if request.input_dataset_split not in available_splits:
        raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(available_splits.keys())}")
    split_size = available_splits[request.input_dataset_split].num_examples

    # A negative count would otherwise yield a slice such as "train[:-5]".
    if request.num_output_examples < 0:
        raise Exception("Requested number of output examples must be 0 (all examples) or a positive number.")

    # if num_output_examples is 0, set it to the number of examples in the input dataset split
    if request.num_output_examples == 0:
        request.num_output_examples = split_size
    else:
        if request.num_output_examples > split_size:
            raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the number of examples in the input dataset split {split_size}.")
        # Restrict the split to the first N examples.
        request.input_dataset_split = f"{request.input_dataset_split}[:{request.num_output_examples}]"

    if request.num_output_examples > MAX_SAMPLES:
        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {MAX_SAMPLES}.")

    # check the prompt column exists in the dataset
    if request.prompt_column not in available_features:
        raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(available_features.keys())}")

    # --- model: must exist and be readable with the provided token ---
    try:
        model_config = AutoConfig.from_pretrained(request.model_name_or_path, revision=request.model_revision, token=request.model_token)
    except Exception as e:
        print(e)
        raise Exception(f"Model {request.model_name_or_path} revision {request.model_revision} does not exist or cannot be accessed with the provided token.") from e

    # Not every model config defines max_position_embeddings; when it is
    # missing, only the global MAX_TOKENS limit below applies.
    max_positions = getattr(model_config, "max_position_embeddings", None)
    if max_positions is not None and max_positions < request.max_tokens:
        raise Exception(f"Model {request.model_name_or_path} max position embeddings {max_positions} is less than the requested max tokens {request.max_tokens}.")
    if request.max_tokens > MAX_TOKENS:
        raise Exception(f"Requested max tokens {request.max_tokens} exceeds the limit of {MAX_TOKENS}.")

    # check sampling parameters are valid
    if request.temperature < 0.0 or request.temperature > 2.0:
        raise Exception("Temperature must be between 0.0 and 2.0")
    if request.top_k < 1 or request.top_k > 100:
        raise Exception("Top K must be between 1 and 100")
    if request.top_p < 0.0 or request.top_p > 1.0:
        raise Exception("Top P must be between 0.0 and 1.0")

    # check valid email address TODO: use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
    if "@" not in request.email or "." not in request.email.split("@")[-1]:
        raise Exception("Invalid email address")

    return request
def add_request_to_db(request: GenerationRequest):
    """Insert a generation request as a new row in the ``gen-requests`` table.

    Connection settings are read from the ``SUPABASE_URL`` and ``SUPABASE_KEY``
    environment variables. ``id``, ``created_at`` and ``num_retries`` are not
    part of the inserted row.

    Args:
        request: The (already validated) request to store.

    Raises:
        Exception: ``"Failed to add request to database"`` on any failure,
            including missing connection settings. The underlying error is
            printed and attached as the exception's cause.
    """
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")

    try:
        # Fail with a clear cause rather than an opaque client error.
        if not url or not key:
            raise RuntimeError("SUPABASE_URL and SUPABASE_KEY must both be set")

        supabase: Client = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            )
        )

        data = {
            "status": request.status.value,
            "input_dataset_name": request.input_dataset_name,
            "input_dataset_config": request.input_dataset_config,
            "input_dataset_split": request.input_dataset_split,
            "output_dataset_name": request.output_dataset_name,
            "prompt_column": request.prompt_column,
            "model_name_or_path": request.model_name_or_path,
            "model_revision": request.model_revision,
            "model_token": request.model_token,
            "system_prompt": request.system_prompt,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
            "top_k": request.top_k,
            "top_p": request.top_p,
            "input_dataset_token": request.input_dataset_token,
            "output_dataset_token": request.output_dataset_token,
            "username": request.username,
            "email": request.email,
            "num_output_examples": request.num_output_examples,
            "private": request.private,
        }

        supabase.table("gen-requests").insert(data).execute()
    except Exception as e:
        # Keep the user-facing message generic, but log the real error so the
        # failure can be diagnosed (same approach as validate_request).
        print(f"Failed to add request to database: {e}")
        raise Exception("Failed to add request to database") from e
def main():
    """Build the Gradio request form, wire up its handlers, and launch the app.

    The form lives in a column that starts hidden. On page load,
    ``control_access`` shows it to signed-in PRO/enterprise users and shows an
    upgrade message to other signed-in users. ``submit_request`` repeats the
    PRO check itself, because hiding the form does not stop the handler from
    being called.

    NOTE(review): the nesting of the layout containers below was reconstructed
    from a listing that had lost its indentation - confirm against the
    deployed UI.
    """
    with gr.Blocks(title="Synthetic Data Generation") as demo:
        gr.HTML("<h3 style='text-align:center'>Hugging Face PRO users can use the Synthetic generation service. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Subscribe to PRO</a></h3>", elem_id="sub_title")

        # Both start hidden; control_access decides which one to reveal.
        pro_message = gr.Markdown(visible=False)
        main_interface = gr.Column(visible=False)

        with main_interface:
            with gr.Group():
                with gr.Row():
                    gr.Markdown("# Synthetic Data Generation Request")
                with gr.Row():
                    gr.Markdown("""
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
""")

            with gr.Group():
                with gr.Row():
                    gr.Markdown("""
**How it works:**
1. Provide an input dataset with prompts
2. Select a public language model for generation
3. Configure generation parameters
4. Submit your request.
""")
                    # Limits are taken from the module constants so this text
                    # cannot drift from what validate_request enforces.
                    gr.Markdown(f"""
**Requirements:**
- Input dataset must be publicly accessible (for now)
- Model must be accessible (public and not gated, for now)
- Maximum {MAX_SAMPLES:,} samples per dataset (for now)
- Maximum of {MAX_TOKENS} generation tokens (for now)
""")

            with gr.Group():
                gr.Markdown("## Dataset information")
                with gr.Column():
                    with gr.Row():
                        input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
                        input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
                        input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
                        prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
                with gr.Column():
                    with gr.Row():
                        output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
                        num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES, step=1)

            with gr.Group():
                gr.Markdown("## Model information")
                with gr.Column():
                    with gr.Row():
                        model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
                        model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
                        # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")

            with gr.Group():
                gr.Markdown("## Generation Parameters")
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
                            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
                        with gr.Row():
                            top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
                            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
                        with gr.Row():
                            system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")

            with gr.Group():
                gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
                with gr.Row():
                    with gr.Column():
                        with gr.Row():
                            email = gr.Textbox(label="Email", placeholder="your.email@example.com")
                        # with gr.Row():
                        #     input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
                        #     output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")

            submit_btn = gr.Button("Submit Generation Request", variant="primary")
            output_status = gr.Textbox(label="Status", interactive=False)

            def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
                               max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples,
                               oauth_token: Optional[gr.OAuthToken] = None):
                """Validate the form values and queue a generation request.

                ``oauth_token`` is not a form input: Gradio injects it based on
                the type hint, the same way it does for ``control_access``.

                Returns:
                    A status string for the "Status" textbox - either a success
                    message or ``"Error: <reason>"``.
                """
                # Enforce the PRO gate on the server side as well; hiding the
                # form alone does not stop this handler from being called.
                if not verify_pro_status(oauth_token):
                    return "Error: This service is only available to signed-in Hugging Face PRO users."

                MASTER_ORG = "synthetic-data-universe/"
                model_token = None  # This is currently not supported
                input_dataset_token = None  # This is currently not supported
                output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")

                try:
                    # An empty name would produce the repo id "synthetic-data-universe/".
                    if not output_dataset_name or not output_dataset_name.strip():
                        raise Exception("Output dataset name is required")

                    request = GenerationRequest(
                        id="",  # Will be generated when adding to the database
                        created_at="",  # Will be set when adding to the database
                        status=GenerationStatus.PENDING,
                        input_dataset_name=input_dataset_name,
                        input_dataset_split=input_split,
                        input_dataset_config=input_dataset_config,
                        output_dataset_name=MASTER_ORG + output_dataset_name,
                        prompt_column=prompt_col,
                        model_name_or_path=model_name,
                        model_revision=model_rev,
                        model_token=model_token if model_token else None,
                        system_prompt=sys_prompt if sys_prompt else None,
                        max_tokens=int(max_tok),
                        temperature=temp,
                        top_k=int(top_k_val),
                        top_p=top_p_val,
                        input_dataset_token=input_dataset_token if input_dataset_token else None,
                        output_dataset_token=output_dataset_token,
                        # Cast like the other integer sliders so the split slice
                        # built during validation is always "[:<int>]".
                        num_output_examples=int(num_output_samples),  # 0 is replaced with the split size during validation
                        username="user",
                        email=email_addr
                    )

                    # check the input dataset exists and can be accessed with the provided token
                    request = validate_request(request)
                    add_request_to_db(request)

                    return "Request submitted successfully!"
                except Exception as e:
                    return f"Error: {str(e)}"

            submit_btn.click(
                submit_request,
                inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
                        model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples],
                outputs=output_status
            )

        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
            """Return visibility updates for (main_interface, pro_message)."""
            # Not signed in: keep both the form and the upgrade message hidden.
            if not profile:
                return gr.update(visible=False), gr.update(visible=False)
            if verify_pro_status(oauth_token):
                return gr.update(visible=True), gr.update(visible=False)
            else:
                message = (
                    "## ✨ Exclusive Access for PRO Users\n\n"
                    "Thank you for your interest! This app is available exclusively for our Hugging Face **PRO** members.\n\n"
                    "To unlock this and many other cool stuff, please consider upgrading your account.\n\n"
                    "### [**Become a PRO Today!**](http://huggingface.co/subscribe/pro?source=synthetic-data-universe)"
                )
                return gr.update(visible=False), gr.update(visible=True, value=message)

        demo.load(control_access, inputs=None, outputs=[main_interface, pro_message])

    demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
# Build and launch the app only when this file is run as a script.
if __name__ == "__main__":
    main()
|