Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from dataclasses import dataclass | |
| import os | |
| from supabase import create_client, Client | |
| from supabase.client import ClientOptions | |
| from enum import Enum | |
| from datasets import get_dataset_infos | |
| from transformers import AutoConfig | |
| from huggingface_hub import whoami | |
| from typing import Optional, List, Tuple, Union | |
| """ | |
| Still TODO: | |
| - validate the user is PRO | |
| - check the output dataset token is valid (hardcoded for now as a secret) | |
| - validate max model params | |
| """ | |
def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
    """Report whether the token's owner is a PRO user or in an enterprise org.

    Args:
        token: A Gradio OAuth token object, a raw token string, or None.

    Returns:
        The truthy ``isPro`` value from ``whoami`` when present; otherwise
        whether any entry in ``orgs`` has a truthy ``isEnterprise``. False is
        returned for a missing token, an unsupported token type, or when the
        ``whoami`` lookup raises for any reason.
    """
    if not token:
        return False

    # Reduce both accepted token shapes to the raw string whoami() expects.
    if isinstance(token, gr.OAuthToken):
        raw_token = token.token
    elif isinstance(token, str):
        raw_token = token
    else:
        return False

    try:
        account = whoami(token=raw_token)
        pro_flag = account.get("isPro", False)
        if pro_flag:
            return pro_flag
        # Not PRO personally: membership in any enterprise org also qualifies.
        return any(org.get("isEnterprise", False) for org in account.get("orgs", []))
    except Exception as e:
        # Best effort: a failed lookup downgrades the caller to the free tier.
        print(f"Could not verify user's PRO/Enterprise status: {e}")
        return False
class GenerationStatus(Enum):
    """Lifecycle states of a generation request.

    Each member's value is the upper-case string of its own name; that value
    is what gets written to the database row's ``status`` field.
    """

    PENDING = "PENDING"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"
# Per-request caps on the number of generated samples, selected by user tier.
MAX_SAMPLES_PRO = 10_000  # max number of samples for PRO/Enterprise users
MAX_SAMPLES_FREE = 100  # max number of samples for free users

# Upper bound on the requested max_tokens (also the UI slider maximum).
MAX_TOKENS = 8_192

# 20 billion parameters (for now). Not referenced by the code visible in this file.
MAX_MODEL_PARAMS = 20_000_000_000
@dataclass
class GenerationRequest:
    """One synthetic-data generation job as submitted from the UI.

    The ``@dataclass`` decorator is required: the submit handler builds
    instances with keyword arguments, which needs the generated ``__init__``.
    Fields without a default must all be supplied by the caller.
    """

    # Identity / bookkeeping. The submit handler passes "" for both and
    # notes they are expected to be filled in on the database side.
    id: str
    created_at: str
    status: GenerationStatus

    # Input dataset location. validate_request may rewrite the split to a
    # "<split>[:N]" slice when a subset is requested.
    input_dataset_name: str
    input_dataset_config: str
    input_dataset_split: str

    # Destination dataset and the input column holding the prompts.
    output_dataset_name: str
    prompt_column: str

    # Model selection.
    model_name_or_path: str
    model_revision: str
    model_token: str | None

    # Sampling / generation parameters.
    system_prompt: str | None
    max_tokens: int
    temperature: float
    top_k: int
    top_p: float

    # Hub access tokens for reading the input and writing the output.
    input_dataset_token: str | None
    output_dataset_token: str

    # Requester details.
    username: str
    email: str

    # 0 means "all examples"; validate_request resolves it to the real count.
    num_output_examples: int

    private: bool = False
    num_retries: int = 0
def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
    """Validate a generation request and normalise it in place.

    Checks, in order: the input dataset/config/split, the requested sample
    count against the split size and the caller's tier limit, the prompt
    column, the output dataset name and its non-existence, the model config
    and token limits, the sampling parameters, and the e-mail address.

    Args:
        request: The request to validate. ``num_output_examples`` of 0 is
            replaced by the split's example count; otherwise
            ``input_dataset_split`` is rewritten to a ``"<split>[:N]"`` slice.
        oauth_token: Token used to decide the tier. ``None`` (or any falsy
            value) is treated as the free tier.

    Returns:
        The same ``request`` object, possibly mutated as described above.

    Raises:
        Exception: With a user-facing message for the first check that fails.
    """
    # - input dataset exists and can be accessed with the provided token
    try:
        dataset_infos = get_dataset_infos(request.input_dataset_name, token=request.input_dataset_token)
    except Exception as e:
        raise Exception(f"Dataset {request.input_dataset_name} does not exist or cannot be accessed with the provided token.") from e

    # A wrong config name is a different problem from an inaccessible dataset,
    # so report it separately together with the valid choices.
    if request.input_dataset_config not in dataset_infos:
        raise Exception(f"Dataset config {request.input_dataset_config} does not exist in dataset {request.input_dataset_name}. Available configs: {list(dataset_infos.keys())}")
    input_dataset_info = dataset_infos[request.input_dataset_config]

    # check that the input dataset split exists
    if request.input_dataset_split not in input_dataset_info.splits:
        raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")
    split_num_examples = input_dataset_info.splits[request.input_dataset_split].num_examples

    # if num_output_examples is 0, set it to the number of examples in the input dataset split
    if request.num_output_examples == 0:
        request.num_output_examples = split_num_examples
    else:
        if request.num_output_examples > split_num_examples:
            raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the number of examples in the input dataset split {split_num_examples}.")
        # Only a prefix of the split is wanted: encode it as a slice.
        # NOTE(review): source indentation was lost; this line is assumed to
        # belong to the else-branch (subset requested) - confirm.
        request.input_dataset_split = f"{request.input_dataset_split}[:{request.num_output_examples}]"

    # Check user tier and apply appropriate limits
    # Anonymous users (oauth_token is None) are treated as free tier
    is_pro = verify_pro_status(oauth_token) if oauth_token else False
    max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
    if request.num_output_examples > max_samples:
        if oauth_token is None:
            user_tier = "anonymous"
        else:
            user_tier = "PRO/Enterprise" if is_pro else "free"
        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")

    # check the prompt column exists in the dataset
    if request.prompt_column not in input_dataset_info.features:
        raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(input_dataset_info.features.keys())}")

    # This is currently not supported, the output dataset will be created under the org 'synthetic-data-universe'
    # check output_dataset name is valid (exactly one '/': "<org>/<name>")
    if request.output_dataset_name.count("/") != 1:
        raise Exception("Output dataset name must be in the format 'dataset_name', e.g., 'my-dataset'. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")

    # The output dataset must NOT exist yet. The "already exists" error is
    # raised from the else-clause so that it cannot be swallowed by the
    # except that handles the expected lookup failure.
    try:
        get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
    except Exception:
        # Lookup failed: treated as "dataset does not exist", which is expected.
        # NOTE(review): any lookup error (e.g. network) is treated the same way.
        pass
    else:
        raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")

    # check the model exists (token=False: only public, ungated models resolve)
    try:
        model_config = AutoConfig.from_pretrained(
            request.model_name_or_path,
            revision=request.model_revision,
            force_download=True,
            token=False,
        )
    except Exception as e:
        print(e)
        raise Exception(f"Model {request.model_name_or_path} revision {request.model_revision} does not exist or cannot be accessed. The model may be private or gated, which is not supported at this time.") from e

    # check the model max position embeddings is greater than the requested max tokens and less than MAX_TOKENS
    if model_config.max_position_embeddings < request.max_tokens:
        raise Exception(f"Model {request.model_name_or_path} max position embeddings {model_config.max_position_embeddings} is less than the requested max tokens {request.max_tokens}.")
    if request.max_tokens > MAX_TOKENS:
        raise Exception(f"Requested max tokens {request.max_tokens} exceeds the limit of {MAX_TOKENS}.")

    # check sampling parameters are valid
    if request.temperature < 0.0 or request.temperature > 2.0:
        raise Exception("Temperature must be between 0.0 and 2.0")
    if request.top_k < 1 or request.top_k > 100:
        raise Exception("Top K must be between 1 and 100")
    if request.top_p < 0.0 or request.top_p > 1.0:
        raise Exception("Top P must be between 0.0 and 1.0")

    # check valid email address TODO: use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
    # Minimal shape check only: an '@' and a '.' somewhere after the last '@'.
    if "@" not in request.email or "." not in request.email.split("@")[-1]:
        raise Exception("Invalid email address")

    return request
def add_request_to_db(request: GenerationRequest):
    """Insert a generation request as a new row in the ``gen-requests`` table.

    Connection settings are read from the ``SUPABASE_URL`` and
    ``SUPABASE_KEY`` environment variables. The request's ``id``,
    ``created_at`` and ``num_retries`` are not part of the inserted payload.

    Args:
        request: The (already validated) request to persist.

    Raises:
        Exception: If either environment variable is unset, or if creating
            the client or inserting the row fails (original error chained).
    """
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    # os.getenv returns None for unset variables; fail with a specific message
    # instead of letting the client constructor fail obscurely.
    if not url or not key:
        raise Exception("Failed to add request to database: SUPABASE_URL and SUPABASE_KEY must be set")

    try:
        supabase: Client = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            ),
        )

        # NOTE(review): both dataset tokens are written into the row as-is.
        data = {
            "status": request.status.value,
            "input_dataset_name": request.input_dataset_name,
            "input_dataset_config": request.input_dataset_config,
            "input_dataset_split": request.input_dataset_split,
            "output_dataset_name": request.output_dataset_name,
            "prompt_column": request.prompt_column,
            "model_name_or_path": request.model_name_or_path,
            "model_revision": request.model_revision,
            "model_token": request.model_token,
            "system_prompt": request.system_prompt,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
            "top_k": request.top_k,
            "top_p": request.top_p,
            "input_dataset_token": request.input_dataset_token,
            "output_dataset_token": request.output_dataset_token,
            "username": request.username,
            "email": request.email,
            "num_output_examples": request.num_output_examples,
            "private": request.private,
        }

        supabase.table("gen-requests").insert(data).execute()
    except Exception as e:
        # Keep the user-facing message generic, but preserve the cause for logs.
        raise Exception("Failed to add request to database") from e
def main():
    """Build the Gradio UI, wire up its event handlers, and launch the app.

    The page consists of a sign-in row, an introduction, and a tabbed form
    whose submit button validates the request and stores it in the database.
    On page load the signed-in state is resolved once and used to set the
    limit banner, the sample-count slider maximum, and the login button text.

    NOTE(review): the source this was recovered from had lost all indentation;
    the nesting of layout containers below is reconstructed from the order of
    the components and should be checked against the intended layout.
    NOTE(review): several UI strings start with characters that look like
    mis-decoded emoji; they are reproduced exactly as found.
    """
    # Shown both as the initial banner and for anonymous users on page load.
    anonymous_limit_msg = "π€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples)."

    with gr.Blocks(title="Synthetic Data Generation") as demo:
        gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign in for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")

        # Add sign-in button at the top
        with gr.Row():
            gr.Markdown("")  # Empty space for alignment
            login_button = gr.LoginButton(value="π Sign in", size="sm")
            gr.Markdown("")  # Empty space for alignment

        pro_message = gr.Markdown(visible=False)
        main_interface = gr.Column(visible=True)

        # Store the current oauth token for use in submit_request
        current_oauth_token = gr.State(None)

        with main_interface:
            with gr.Group():
                with gr.Row():
                    gr.Markdown("# Synthetic Data Generation Request")
                with gr.Row():
                    gr.Markdown("""
                        Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
                        Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
                        """)
                with gr.Accordion("How it works", open=False):
                    with gr.Row():
                        gr.Markdown("""
                            **How it works:**
                            1. Provide an input dataset with prompts
                            2. Select a public language model for generation
                            3. Configure generation parameters
                            4. Submit your request.
                            """)
                        gr.Markdown("""
                            **Requirements:**
                            - Input dataset must be publicly accessible
                            - Model must be publicly accessible (and not gated)
                            - Maximum 10,000 samples per dataset
                            - Maximum of 8192 generated tokens
                            """)

            with gr.Tabs():
                with gr.TabItem("Generate Synthetic Data"):
                    with gr.Group():
                        gr.Markdown("## Model information")
                        with gr.Column():
                            with gr.Row():
                                model_name_or_path = gr.Dropdown(
                                    choices=[
                                        "microsoft/Phi-3.5-mini-instruct",
                                        "Qwen/Qwen2.5-7B-Instruct",
                                        "meta-llama/Llama-3.2-8B-Instruct",
                                        "mistralai/Mistral-7B-Instruct-v0.3",
                                        "google/gemma-2-9b-it",
                                        "microsoft/DialoGPT-medium",
                                        "HuggingFaceH4/zephyr-7b-beta",
                                        "teknium/OpenHermes-2.5-Mistral-7B",
                                        "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
                                        "01-ai/Yi-34B-Chat"
                                    ],
                                    label="Select Model",
                                    value="microsoft/Phi-3.5-mini-instruct",
                                    info="Choose from popular instruction-tuned models under 40B parameters"
                                )
                                # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")

                    with gr.Group():
                        gr.Markdown("## Dataset information")
                        # Dynamic user limit info - default to anonymous user
                        user_limit_info = gr.Markdown(value=anonymous_limit_msg, visible=True)
                        with gr.Row():
                            with gr.Column():
                                input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
                                prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
                            with gr.Column():
                                output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
                                # The maximum starts at the free-tier cap and is raised on load for PRO users.
                                num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1)
                        with gr.Accordion("Advanced Options", open=False):
                            with gr.Row():
                                input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
                                input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
                                model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")

                    with gr.Group():
                        gr.Markdown("### Generation Parameters")
                        with gr.Row():
                            with gr.Column():
                                with gr.Row():
                                    max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
                                    temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
                                with gr.Row():
                                    top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
                                    top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
                        with gr.Row():
                            system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")

                    with gr.Group():
                        gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
                        with gr.Row():
                            with gr.Column():
                                with gr.Row():
                                    email = gr.Textbox(label="Email", placeholder="your.email@example.com")
                                # with gr.Row():
                                #     input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
                                #     output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")

                    submit_btn = gr.Button("Submit Generation Request", variant="primary")
                    output_status = gr.Textbox(label="Status", interactive=False)

                with gr.TabItem("Coming Soon"):
                    gr.Markdown("## New features coming soon!")
                    gr.Markdown("This tab will contain additional functionality in future updates.")

        def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
                           max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
            """Build a request from the form values, validate it, and store it.

            Returns the status text shown to the user: a success message, or
            "Error: <reason>" for any failure during construction, validation
            or the database insert.
            """
            MASTER_ORG = "synthetic-data-universe/"
            # NOTE(review): False rather than None although the request field is
            # annotated `str | None`; kept as found - confirm what the DB expects.
            model_token = False  # This is currently not supported
            input_dataset_token = None  # This is currently not supported
            output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")
            try:
                request = GenerationRequest(
                    id="",  # Will be generated when adding to the database
                    created_at="",  # Will be set when adding to the database
                    status=GenerationStatus.PENDING,
                    input_dataset_name=input_dataset_name,
                    input_dataset_split=input_split,
                    input_dataset_config=input_dataset_config,
                    output_dataset_name=MASTER_ORG + output_dataset_name,
                    prompt_column=prompt_col,
                    model_name_or_path=model_name,
                    model_revision=model_rev,
                    model_token=model_token,
                    system_prompt=sys_prompt if sys_prompt else None,
                    max_tokens=int(max_tok),
                    temperature=temp,
                    top_k=int(top_k_val),
                    top_p=top_p_val,
                    input_dataset_token=input_dataset_token or None,
                    output_dataset_token=output_dataset_token,
                    # Cast like the other integer sliders: the value is later
                    # interpolated into the split slice "[:N]". 0 means "all"
                    # and is resolved during validation.
                    num_output_examples=int(num_output_samples),
                    username="user",
                    email=email_addr,
                )
                # check the input dataset exists and can be accessed with the provided token
                request = validate_request(request, oauth_token)
                add_request_to_db(request)
                return "Request submitted successfully!"
            except Exception as e:
                return f"Error: {str(e)}"

        submit_btn.click(
            submit_request,
            inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
                    model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples, current_oauth_token],
            outputs=output_status
        )

        def update_user_limits(oauth_token, is_pro=None):
            """Return the limit banner text for the given sign-in state.

            ``is_pro`` may be passed when the caller has already resolved the
            tier, which avoids a second status lookup; when omitted it is
            looked up here.
            """
            if oauth_token is None:
                return anonymous_limit_msg
            if is_pro is None:
                is_pro = verify_pro_status(oauth_token)
            if is_pro:
                return "β¨ **PRO User**: You can generate up to 10,000 samples per request."
            return "π€ **Free User**: You can generate up to 100 samples per request. [Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) for 10,000 samples."

        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
            """Compute the UI updates that depend on the sign-in state.

            Returns, in the order of the ``demo.load`` outputs: main interface
            visibility, hidden PRO message, the token to keep in state, the
            limit banner text, the slider maximum, and the login button label.
            """
            # Always show the interface, whether user is logged in or not
            signed_in = oauth_token is not None
            # Resolve the tier once; it drives the banner, the slider and the button.
            is_pro = verify_pro_status(oauth_token) if signed_in else False
            limit_msg = update_user_limits(oauth_token, is_pro)

            # Update slider maximum based on user tier
            max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE

            if not signed_in:
                button_text = "π Sign in for PRO benefits"
            else:
                display_name = profile.name if profile else 'User'
                if is_pro:
                    button_text = f"β¨ Signed in as PRO ({display_name})"
                else:
                    button_text = f"π€ Signed in as {display_name}"

            slider_update = gr.update(maximum=max_samples)
            button_update = gr.update(value=button_text)
            return gr.update(visible=True), gr.update(visible=False), oauth_token, limit_msg, slider_update, button_update

        # Handle login state changes - LoginButton automatically handles auth state changes
        # The demo.load will handle both initial load and auth changes
        demo.load(control_access, inputs=None, outputs=[main_interface, pro_message, current_oauth_token, user_limit_info, num_output_samples, login_button])

    demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
# Build and launch the app only when this file is executed as a script.
if __name__ == "__main__":
    main()