Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
ab5bf76
1
Parent(s):
5d89dcd
add auth
Browse files
app.py
CHANGED
|
@@ -7,6 +7,8 @@ from supabase.client import ClientOptions
|
|
| 7 |
from enum import Enum
|
| 8 |
from datasets import get_dataset_infos
|
| 9 |
from transformers import AutoConfig
|
|
|
|
|
|
|
| 10 |
|
| 11 |
"""
|
| 12 |
Still TODO:
|
|
@@ -16,6 +18,30 @@ from transformers import AutoConfig
|
|
| 16 |
"""
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
class GenerationStatus(Enum):
|
| 20 |
PENDING = "PENDING"
|
| 21 |
RUNNING = "RUNNING"
|
|
@@ -64,17 +90,23 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
|
|
| 64 |
# check that the input dataset split exists
|
| 65 |
if request.input_dataset_split not in input_dataset_info.splits:
|
| 66 |
raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")
|
| 67 |
-
|
| 68 |
-
#
|
| 69 |
-
if
|
| 70 |
request.num_output_examples = input_dataset_info.splits[request.input_dataset_split].num_examples
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
# check the prompt column exists in the dataset
|
| 74 |
if request.prompt_column not in input_dataset_info.features:
|
| 75 |
raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(input_dataset_info.features.keys())}")
|
| 76 |
|
| 77 |
-
|
| 78 |
# check the models exists
|
| 79 |
try:
|
| 80 |
model_config = AutoConfig.from_pretrained(request.model_name_or_path, revision=request.model_revision, token=request.model_token)
|
|
@@ -146,127 +178,151 @@ def add_request_to_db(request: GenerationRequest):
|
|
| 146 |
raise Exception("Failed to add request to database")
|
| 147 |
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 157 |
-
""")
|
| 158 |
-
with gr.Group():
|
| 159 |
-
with gr.Row():
|
| 160 |
-
gr.Markdown("""
|
| 161 |
-
**How it works:**
|
| 162 |
-
1. Provide an input dataset with prompts
|
| 163 |
-
2. Select a public language model for generation
|
| 164 |
-
3. Configure generation parameters
|
| 165 |
-
4. Submit your request and receive generated data
|
| 166 |
-
""")
|
| 167 |
-
gr.Markdown("""
|
| 168 |
-
|
| 169 |
-
**Requirements:**
|
| 170 |
-
- Input dataset must be publicly accessible
|
| 171 |
-
- Output dataset repository must exist and you must have write access
|
| 172 |
-
- Model must be accessible (public or with valid token)
|
| 173 |
-
- Maximum 10,000 samples per dataset
|
| 174 |
-
- Maximum of 8192 generation tokens
|
| 175 |
-
""")
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
| 180 |
with gr.Row():
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
gr.Markdown("## Model information")
|
| 189 |
-
with gr.Column():
|
| 190 |
with gr.Row():
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
with gr.Column():
|
| 198 |
with gr.Row():
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
with gr.
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
gr.
|
| 209 |
-
|
| 210 |
with gr.Column():
|
| 211 |
with gr.Row():
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
created_at="", # Will be set when adding to the database
|
| 231 |
-
status=GenerationStatus.PENDING,
|
| 232 |
-
input_dataset_name=input_dataset_name,
|
| 233 |
-
input_dataset_split=input_split,
|
| 234 |
-
input_dataset_config=input_dataset_config,
|
| 235 |
-
output_dataset_name=MASTER_ORG + output_dataset_name,
|
| 236 |
-
prompt_column=prompt_col,
|
| 237 |
-
model_name_or_path=model_name,
|
| 238 |
-
model_revision=model_rev,
|
| 239 |
-
model_token=model_token if model_token else None,
|
| 240 |
-
system_prompt=sys_prompt if sys_prompt else None,
|
| 241 |
-
max_tokens=int(max_tok),
|
| 242 |
-
temperature=temp,
|
| 243 |
-
top_k=int(top_k_val),
|
| 244 |
-
top_p=top_p_val,
|
| 245 |
-
input_dataset_token=input_dataset_token if input_dataset_token else None,
|
| 246 |
-
output_dataset_token=output_dataset_token,
|
| 247 |
-
num_output_examples=0, # will be set after validating the input dataset
|
| 248 |
-
username="user",
|
| 249 |
-
email=email_addr
|
| 250 |
-
)
|
| 251 |
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
if __name__ == "__main__":
|
| 271 |
-
|
| 272 |
-
app.launch()
|
|
|
|
| 7 |
from enum import Enum
|
| 8 |
from datasets import get_dataset_infos
|
| 9 |
from transformers import AutoConfig
|
| 10 |
+
from huggingface_hub import whoami
|
| 11 |
+
from typing import Optional, List, Tuple, Union
|
| 12 |
|
| 13 |
"""
|
| 14 |
Still TODO:
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
|
| 21 |
+
def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
    """Report whether the token's owner is a PRO user or in an enterprise org.

    Args:
        token: A Gradio OAuth token object, a raw token string, or ``None``.

    Returns:
        A truthy value when ``whoami`` reports ``isPro`` for the account, or
        ``True`` when at least one entry of its ``orgs`` list reports
        ``isEnterprise``. ``False`` for a missing/empty token, an unsupported
        token type, or any error raised while querying (the error is printed
        rather than propagated).
    """
    if not token:
        return False

    # Reduce the two accepted token shapes to a plain string.
    if isinstance(token, gr.OAuthToken):
        raw_token = token.token
    elif isinstance(token, str):
        raw_token = token
    else:
        return False

    try:
        account = whoami(token=raw_token)

        pro_flag = account.get("isPro", False)
        if pro_flag:
            return pro_flag

        # Not PRO personally: fall back to enterprise-org membership.
        for org in account.get("orgs", []):
            if org.get("isEnterprise", False):
                return True
        return False
    except Exception as e:
        # Best effort: a failed lookup is treated as "not entitled".
        print(f"Could not verify user's PRO/Enterprise status: {e}")
        return False
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
class GenerationStatus(Enum):
|
| 46 |
PENDING = "PENDING"
|
| 47 |
RUNNING = "RUNNING"
|
|
|
|
| 90 |
# check that the input dataset split exists
|
| 91 |
if request.input_dataset_split not in input_dataset_info.splits:
|
| 92 |
raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")
|
| 93 |
+
|
| 94 |
+
# if num_output_examples is 0, set it to the number of examples in the input dataset split
|
| 95 |
+
if request.num_output_examples == 0:
|
| 96 |
request.num_output_examples = input_dataset_info.splits[request.input_dataset_split].num_examples
|
| 97 |
+
else:
|
| 98 |
+
if request.num_output_examples > input_dataset_info.splits[request.input_dataset_split].num_examples:
|
| 99 |
+
raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the number of examples in the input dataset split {input_dataset_info.splits[request.input_dataset_split].num_examples}.")
|
| 100 |
+
request.input_dataset_split = f"{request.input_dataset_split}[:{request.num_output_examples}]"
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
if request.num_output_examples > MAX_SAMPLES:
|
| 104 |
+
raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {MAX_SAMPLES}.")
|
| 105 |
|
| 106 |
# check the prompt column exists in the dataset
|
| 107 |
if request.prompt_column not in input_dataset_info.features:
|
| 108 |
raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(input_dataset_info.features.keys())}")
|
| 109 |
|
|
|
|
| 110 |
# check the models exists
|
| 111 |
try:
|
| 112 |
model_config = AutoConfig.from_pretrained(request.model_name_or_path, revision=request.model_revision, token=request.model_token)
|
|
|
|
| 178 |
raise Exception("Failed to add request to database")
|
| 179 |
|
| 180 |
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def main():
|
| 186 |
+
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 187 |
+
gr.HTML("<h3 style='text-align:center'>Hugging Face PRO users can use the Synthetic generation service. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Subscribe to PRO</a></h3>", elem_id="sub_title")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
+
pro_message = gr.Markdown(visible=False)
|
| 190 |
+
main_interface = gr.Column(visible=False)
|
| 191 |
+
with main_interface:
|
| 192 |
+
with gr.Group():
|
| 193 |
with gr.Row():
|
| 194 |
+
gr.Markdown("# Synthetic Data Generation Request")
|
| 195 |
+
with gr.Row():
|
| 196 |
+
gr.Markdown("""
|
| 197 |
+
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 198 |
+
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
|
| 199 |
+
""")
|
| 200 |
+
with gr.Group():
|
|
|
|
|
|
|
| 201 |
with gr.Row():
|
| 202 |
+
gr.Markdown("""
|
| 203 |
+
**How it works:**
|
| 204 |
+
1. Provide an input dataset with prompts
|
| 205 |
+
2. Select a public language model for generation
|
| 206 |
+
3. Configure generation parameters
|
| 207 |
+
4. Submit your request.
|
| 208 |
+
""")
|
| 209 |
+
gr.Markdown("""
|
| 210 |
+
|
| 211 |
+
**Requirements:**
|
| 212 |
+
- Input dataset must be publicly accessible (for now)
|
| 213 |
+
- Model must be accessible (public and note gated, for now)
|
| 214 |
+
- Maximum 10,000 samples per dataset (for now)
|
| 215 |
+
- Maximum of 8192 generation tokens (for now)
|
| 216 |
+
""")
|
| 217 |
+
|
| 218 |
+
with gr.Group():
|
| 219 |
+
gr.Markdown("## Dataset information")
|
| 220 |
with gr.Column():
|
| 221 |
with gr.Row():
|
| 222 |
+
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 223 |
+
input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
|
| 224 |
+
input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
|
| 225 |
+
prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
|
| 226 |
+
|
| 227 |
+
with gr.Column():
|
| 228 |
+
with gr.Row():
|
| 229 |
+
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
|
| 230 |
+
num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES, step=1)
|
| 231 |
+
with gr.Group():
|
| 232 |
+
gr.Markdown("## Model information")
|
| 233 |
with gr.Column():
|
| 234 |
with gr.Row():
|
| 235 |
+
model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
|
| 236 |
+
model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
|
| 237 |
+
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 238 |
+
with gr.Group():
|
| 239 |
+
gr.Markdown("## Generation Parameters")
|
| 240 |
+
with gr.Row():
|
| 241 |
+
with gr.Column():
|
| 242 |
+
with gr.Row():
|
| 243 |
+
max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
|
| 244 |
+
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 245 |
+
with gr.Row():
|
| 246 |
+
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 247 |
+
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 248 |
+
with gr.Row():
|
| 249 |
+
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 250 |
|
| 251 |
+
with gr.Group():
|
| 252 |
+
gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
|
| 253 |
+
with gr.Row():
|
| 254 |
+
with gr.Column():
|
| 255 |
+
with gr.Row():
|
| 256 |
+
email = gr.Textbox(label="Email", placeholder="your.email@example.com")
|
| 257 |
+
# with gr.Row():
|
| 258 |
+
# input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
|
| 259 |
+
# output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
|
| 260 |
|
| 261 |
+
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 262 |
+
output_status = gr.Textbox(label="Status", interactive=False)
|
| 263 |
|
| 264 |
+
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
                   max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples,
                   oauth_token: Optional[gr.OAuthToken] = None):
    """Build, validate and store a generation request from the form values.

    Args:
        input_dataset_name: Name of the input dataset.
        input_split: Split of the input dataset to read.
        input_dataset_config: Config name of the input dataset.
        output_dataset_name: Dataset name; the ``synthetic-data-universe/``
            org prefix is prepended here.
        prompt_col: Column of the input dataset holding the prompts.
        model_name: Model name or path.
        model_rev: Model revision.
        sys_prompt: Optional system prompt; an empty value is stored as ``None``.
        max_tok: Maximum generation tokens (cast to ``int``).
        temp: Sampling temperature.
        top_k_val: Top-k value (cast to ``int``).
        top_p_val: Top-p value.
        email_addr: Email address stored with the request.
        num_output_samples: Number of samples to generate (cast to ``int``);
            ``0`` means the whole split.
        oauth_token: Signed-in user's OAuth token. It is not listed in the
            event's ``inputs``; Gradio supplies it based on the type
            annotation, and it is ``None`` when nobody is signed in.

    Returns:
        A status string for the UI: ``"Request submitted successfully!"`` on
        success, otherwise a message starting with ``"Error: "``.
    """
    # Hiding the form only restricts the UI; the handler must enforce the
    # PRO/enterprise requirement itself before doing any work.
    if not verify_pro_status(oauth_token):
        return "Error: This service is only available to Hugging Face PRO users. Please sign in with a PRO account."

    MASTER_ORG = "synthetic-data-universe/"
    model_token = None  # This is currently not supported
    input_dataset_token = None  # This is currently not supported
    output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")

    try:
        request = GenerationRequest(
            id="",  # Will be generated when adding to the database
            created_at="",  # Will be set when adding to the database
            status=GenerationStatus.PENDING,
            input_dataset_name=input_dataset_name,
            input_dataset_split=input_split,
            input_dataset_config=input_dataset_config,
            output_dataset_name=MASTER_ORG + output_dataset_name,
            prompt_column=prompt_col,
            model_name_or_path=model_name,
            model_revision=model_rev,
            model_token=model_token,
            system_prompt=sys_prompt if sys_prompt else None,
            max_tokens=int(max_tok),
            temperature=temp,
            top_k=int(top_k_val),
            top_p=top_p_val,
            input_dataset_token=input_dataset_token,
            output_dataset_token=output_dataset_token,
            # Cast like the other slider values: the count ends up inside a
            # split slice string, which must contain an integer.
            # 0 means "use the whole split"; validate_request resolves it.
            num_output_examples=int(num_output_samples),
            username="user",  # NOTE(review): fixed placeholder, not the signed-in user
            email=email_addr,
        )

        # validate_request raises on an invalid request, so nothing invalid
        # is written to the database.
        request = validate_request(request)
        add_request_to_db(request)

        return "Request submitted successfully!"
    except Exception as e:
        # Surface any failure in the status box instead of crashing the handler.
        return f"Error: {e}"
|
| 304 |
+
|
| 305 |
+
submit_btn.click(
|
| 306 |
+
submit_request,
|
| 307 |
+
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 308 |
+
model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples],
|
| 309 |
+
outputs=output_status
|
| 310 |
+
)
|
| 311 |
|
| 312 |
+
def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
    """Decide which of the two top-level components is visible on page load.

    Args:
        profile: OAuth profile of the visitor, or ``None`` when not signed in.
        oauth_token: OAuth token of the visitor, passed to ``verify_pro_status``.

    Returns:
        A pair of ``gr.update`` values, in the order
        ``(main_interface, pro_message)``.
    """
    # No profile: keep both the form and the upgrade notice hidden.
    if not profile:
        return gr.update(visible=False), gr.update(visible=False)

    # Entitled user: reveal the form, keep the notice hidden.
    if verify_pro_status(oauth_token):
        return gr.update(visible=True), gr.update(visible=False)

    # Signed in but not entitled: show the upgrade notice instead of the form.
    upgrade_notice = "".join([
        "## ✨ Exclusive Access for PRO Users\n\n",
        "Thank you for your interest! This app is available exclusively for our Hugging Face **PRO** members.\n\n",
        "To unlock this and many other cool stuff, please consider upgrading your account.\n\n",
        "### [**Become a PRO Today!**](http://huggingface.co/subscribe/pro?source=synthetic-data-universe)",
    ])
    return gr.update(visible=False), gr.update(visible=True, value=upgrade_notice)
|
| 323 |
+
|
| 324 |
+
demo.load(control_access, inputs=None, outputs=[main_interface, pro_message])
|
| 325 |
+
demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
|
| 326 |
|
| 327 |
if __name__ == "__main__":
|
| 328 |
+
main()
|
|
|