Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
0fb1b95
1
Parent(s):
67c4080
vibing with claude
Browse files
app.py
CHANGED
|
@@ -49,7 +49,8 @@ class GenerationStatus(Enum):
|
|
| 49 |
FAILED = "FAILED"
|
| 50 |
|
| 51 |
|
| 52 |
-
|
|
|
|
| 53 |
MAX_TOKENS = 8192
|
| 54 |
MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
|
| 55 |
|
|
@@ -79,7 +80,7 @@ class GenerationRequest:
|
|
| 79 |
private: bool = False
|
| 80 |
num_retries: int = 0
|
| 81 |
|
| 82 |
-
def validate_request(request: GenerationRequest) -> GenerationRequest:
|
| 83 |
# checks that the request is valid
|
| 84 |
# - input dataset exists and can be accessed with the provided token
|
| 85 |
try:
|
|
@@ -101,8 +102,13 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
|
|
| 101 |
|
| 102 |
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# check the prompt column exists in the dataset
|
| 108 |
if request.prompt_column not in input_dataset_info.features:
|
|
@@ -195,16 +201,16 @@ def add_request_to_db(request: GenerationRequest):
|
|
| 195 |
raise Exception("Failed to add request to database")
|
| 196 |
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
def main():
|
| 203 |
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 204 |
-
gr.HTML("<h3 style='text-align:center'>
|
| 205 |
|
| 206 |
pro_message = gr.Markdown(visible=False)
|
| 207 |
main_interface = gr.Column(visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
with main_interface:
|
| 209 |
with gr.Group():
|
| 210 |
with gr.Row():
|
|
@@ -214,7 +220,7 @@ def main():
|
|
| 214 |
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 215 |
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
|
| 216 |
""")
|
| 217 |
-
with gr.
|
| 218 |
with gr.Row():
|
| 219 |
gr.Markdown("""
|
| 220 |
**How it works:**
|
|
@@ -232,54 +238,81 @@ def main():
|
|
| 232 |
- Maximum of 8192 generated tokens
|
| 233 |
""")
|
| 234 |
|
| 235 |
-
with gr.
|
| 236 |
-
gr.
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
|
|
|
| 262 |
with gr.Row():
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
with gr.Row():
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
with gr.Row():
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
|
| 282 |
-
max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples):
|
| 283 |
|
| 284 |
MASTER_ORG = "synthetic-data-universe/"
|
| 285 |
model_token = False # This is currently not supported
|
|
@@ -312,7 +345,7 @@ def main():
|
|
| 312 |
)
|
| 313 |
|
| 314 |
# check the input dataset exists and can be accessed with the provided token
|
| 315 |
-
request = validate_request(request)
|
| 316 |
add_request_to_db(request)
|
| 317 |
|
| 318 |
return "Request submitted successfully!"
|
|
@@ -322,25 +355,29 @@ def main():
|
|
| 322 |
submit_btn.click(
|
| 323 |
submit_request,
|
| 324 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 325 |
-
model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples],
|
| 326 |
outputs=output_status
|
| 327 |
)
|
| 328 |
|
| 329 |
-
def
|
| 330 |
-
if
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
else:
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
|
| 341 |
login_button = gr.LoginButton() # this is required or AUTH will not work
|
| 342 |
|
| 343 |
-
demo.load(control_access, inputs=None, outputs=[main_interface, pro_message])
|
| 344 |
demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
|
| 345 |
|
| 346 |
if __name__ == "__main__":
|
|
|
|
| 49 |
FAILED = "FAILED"
|
| 50 |
|
| 51 |
|
| 52 |
+
MAX_SAMPLES_PRO = 10000 # max number of samples for PRO/Enterprise users
|
| 53 |
+
MAX_SAMPLES_FREE = 100 # max number of samples for free users
|
| 54 |
MAX_TOKENS = 8192
|
| 55 |
MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
|
| 56 |
|
|
|
|
| 80 |
private: bool = False
|
| 81 |
num_retries: int = 0
|
| 82 |
|
| 83 |
+
def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
|
| 84 |
# checks that the request is valid
|
| 85 |
# - input dataset exists and can be accessed with the provided token
|
| 86 |
try:
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
|
| 105 |
+
# Check user tier and apply appropriate limits
|
| 106 |
+
is_pro = verify_pro_status(oauth_token)
|
| 107 |
+
max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
|
| 108 |
+
|
| 109 |
+
if request.num_output_examples > max_samples:
|
| 110 |
+
user_tier = "PRO/Enterprise" if is_pro else "free"
|
| 111 |
+
raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")
|
| 112 |
|
| 113 |
# check the prompt column exists in the dataset
|
| 114 |
if request.prompt_column not in input_dataset_info.features:
|
|
|
|
| 201 |
raise Exception("Failed to add request to database")
|
| 202 |
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
def main():
|
| 205 |
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 206 |
+
gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. PRO users get 10k samples, free users get 100 samples. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
|
| 207 |
|
| 208 |
pro_message = gr.Markdown(visible=False)
|
| 209 |
main_interface = gr.Column(visible=False)
|
| 210 |
+
|
| 211 |
+
# Store the current oauth token for use in submit_request
|
| 212 |
+
current_oauth_token = gr.State(None)
|
| 213 |
+
|
| 214 |
with main_interface:
|
| 215 |
with gr.Group():
|
| 216 |
with gr.Row():
|
|
|
|
| 220 |
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 221 |
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
|
| 222 |
""")
|
| 223 |
+
with gr.Accordion("How it works", open=False):
|
| 224 |
with gr.Row():
|
| 225 |
gr.Markdown("""
|
| 226 |
**How it works:**
|
|
|
|
| 238 |
- Maximum of 8192 generated tokens
|
| 239 |
""")
|
| 240 |
|
| 241 |
+
with gr.Tabs():
|
| 242 |
+
with gr.TabItem("Generate Synthetic Data"):
|
| 243 |
+
with gr.Group():
|
| 244 |
+
gr.Markdown("## Model information")
|
| 245 |
+
with gr.Column():
|
| 246 |
+
with gr.Row():
|
| 247 |
+
model_name_or_path = gr.Dropdown(
|
| 248 |
+
choices=[
|
| 249 |
+
"microsoft/Phi-3.5-mini-instruct",
|
| 250 |
+
"Qwen/Qwen2.5-7B-Instruct",
|
| 251 |
+
"meta-llama/Llama-3.2-8B-Instruct",
|
| 252 |
+
"mistralai/Mistral-7B-Instruct-v0.3",
|
| 253 |
+
"google/gemma-2-9b-it",
|
| 254 |
+
"microsoft/DialoGPT-medium",
|
| 255 |
+
"HuggingFaceH4/zephyr-7b-beta",
|
| 256 |
+
"teknium/OpenHermes-2.5-Mistral-7B",
|
| 257 |
+
"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
|
| 258 |
+
"01-ai/Yi-34B-Chat"
|
| 259 |
+
],
|
| 260 |
+
label="Select Model",
|
| 261 |
+
value="microsoft/Phi-3.5-mini-instruct",
|
| 262 |
+
info="Choose from popular instruction-tuned models under 40B parameters"
|
| 263 |
+
)
|
| 264 |
+
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 265 |
+
with gr.Group():
|
| 266 |
+
gr.Markdown("## Dataset information")
|
| 267 |
+
# Dynamic user limit info
|
| 268 |
+
user_limit_info = gr.Markdown(value="", visible=True)
|
| 269 |
with gr.Row():
|
| 270 |
+
with gr.Column():
|
| 271 |
+
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 272 |
+
prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
|
| 273 |
+
|
| 274 |
+
with gr.Column():
|
| 275 |
+
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
|
| 276 |
+
num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_PRO, step=1)
|
| 277 |
+
|
| 278 |
+
with gr.Accordion("Advanced Options", open=False):
|
| 279 |
with gr.Row():
|
| 280 |
+
input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
|
| 281 |
+
input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
|
| 282 |
+
model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
|
| 283 |
+
|
| 284 |
+
with gr.Group():
|
| 285 |
+
gr.Markdown("### Generation Parameters")
|
| 286 |
+
with gr.Row():
|
| 287 |
+
with gr.Column():
|
| 288 |
+
with gr.Row():
|
| 289 |
+
max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
|
| 290 |
+
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 291 |
+
with gr.Row():
|
| 292 |
+
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 293 |
+
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 294 |
+
with gr.Row():
|
| 295 |
+
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 296 |
+
|
| 297 |
+
with gr.Group():
|
| 298 |
+
gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
|
| 299 |
with gr.Row():
|
| 300 |
+
with gr.Column():
|
| 301 |
+
with gr.Row():
|
| 302 |
+
email = gr.Textbox(label="Email", placeholder="your.email@example.com")
|
| 303 |
+
# with gr.Row():
|
| 304 |
+
# input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
|
| 305 |
+
# output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
|
| 306 |
+
|
| 307 |
+
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 308 |
+
output_status = gr.Textbox(label="Status", interactive=False)
|
| 309 |
+
|
| 310 |
+
with gr.TabItem("Coming Soon"):
|
| 311 |
+
gr.Markdown("## New features coming soon!")
|
| 312 |
+
gr.Markdown("This tab will contain additional functionality in future updates.")
|
| 313 |
|
| 314 |
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
|
| 315 |
+
max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
|
| 316 |
|
| 317 |
MASTER_ORG = "synthetic-data-universe/"
|
| 318 |
model_token = False # This is currently not supported
|
|
|
|
| 345 |
)
|
| 346 |
|
| 347 |
# check the input dataset exists and can be accessed with the provided token
|
| 348 |
+
request = validate_request(request, oauth_token)
|
| 349 |
add_request_to_db(request)
|
| 350 |
|
| 351 |
return "Request submitted successfully!"
|
|
|
|
| 355 |
submit_btn.click(
|
| 356 |
submit_request,
|
| 357 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 358 |
+
model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples, current_oauth_token],
|
| 359 |
outputs=output_status
|
| 360 |
)
|
| 361 |
|
| 362 |
+
def update_user_limits(oauth_token):
    """Build the Markdown banner describing the caller's per-request sample limit.

    Args:
        oauth_token: OAuth token of the logged-in user, or None when no
            token is available. It is passed unchanged to verify_pro_status.

    Returns:
        An empty string when oauth_token is None; otherwise a Markdown
        message for either PRO or free users.
    """
    if oauth_token is None:
        return ""

    # Derive the displayed limits from the same module-level constants that
    # validate_request enforces, so the banner cannot drift from the real caps.
    pro_limit = f"{MAX_SAMPLES_PRO:,}"
    free_limit = f"{MAX_SAMPLES_FREE:,}"

    if verify_pro_status(oauth_token):
        return f"✨ **PRO User**: You can generate up to {pro_limit} samples per request."
    return (
        f"👤 **Free User**: You can generate up to {free_limit} samples per request. "
        "[Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) "
        f"for {pro_limit} samples."
    )
|
| 371 |
+
|
| 372 |
+
def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
    """Decide what the page shows on load, based on the login state.

    The four returned values line up with the outputs wired at the
    demo.load call: main_interface, pro_message, current_oauth_token and
    user_limit_info.

    NOTE(review): demo.load passes inputs=None, so both arguments are
    presumably injected by Gradio from their type annotations — keep the
    annotations intact and confirm against the Gradio OAuth docs.

    Args:
        profile: Profile of the logged-in user; falsy when not logged in.
        oauth_token: OAuth token of the logged-in user, if any.

    Returns:
        A 4-tuple of (main interface update, pro message update,
        token to store, limit message).
    """
    if not profile:
        # Not logged in: hide everything, store no token, show no limit text.
        hide_main = gr.update(visible=False)
        hide_pro_message = gr.update(visible=False)
        return hide_main, hide_pro_message, None, ""

    # Allow all users but show different messaging, and store the token
    limit_msg = update_user_limits(oauth_token)
    show_main = gr.update(visible=True)
    hide_pro_message = gr.update(visible=False)
    return show_main, hide_pro_message, oauth_token, limit_msg
|
| 377 |
|
| 378 |
login_button = gr.LoginButton() # this is required or AUTH will not work
|
| 379 |
|
| 380 |
+
demo.load(control_access, inputs=None, outputs=[main_interface, pro_message, current_oauth_token, user_limit_info])
|
| 381 |
demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
|
| 382 |
|
| 383 |
if __name__ == "__main__":
|