# Spaces: synthetic-data-universe / synth (Running)
# File size: 35,347 Bytes

import gradio as gr

from dataclasses import dataclass
import os
from supabase import create_client, Client
from supabase.client import ClientOptions
from enum import Enum
from datasets import get_dataset_infos
from transformers import AutoConfig
from huggingface_hub import whoami
from typing import Optional, Union

"""
 Still TODO:
 - validate the user is PRO
 - check the output dataset token is valid (hardcoded for now as a secret)
 - validate max model params
"""

# Hugging Face Hub model IDs offered as the choices of the "Select Model"
# dropdown built in main().
# NOTE(review): nothing in the visible code checks these entries against
# MAX_MODEL_PARAMS — confirm the larger models are intentionally listed.
SUPPORTED_MODELS = [
    "Qwen/Qwen3-4B-Instruct-2507",
    "Qwen/Qwen3-30B-A3B-Instruct-2507",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "baidu/ERNIE-4.5-21B-A3B-Thinking",
    "LLM360/K2-Think",
    "openai/gpt-oss-20b",
]


def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
    """Report whether the token's owner is a PRO user or in an enterprise org.

    Args:
        token: A Gradio OAuth token object, a raw token string, or None.

    Returns:
        A truthy value when the account's ``isPro`` flag is set or any of its
        orgs has ``isEnterprise`` set. False for a missing token, an
        unsupported token type, or any failure while querying the Hub.
    """
    # Nothing to look up without a token.
    if not token:
        return False

    # Normalise the two accepted token shapes to a plain string.
    if isinstance(token, str):
        raw_token = token
    elif isinstance(token, gr.OAuthToken):
        raw_token = token.token
    else:
        return False

    try:
        account = whoami(token=raw_token)
        personal_pro = account.get("isPro", False)
        if personal_pro:
            return personal_pro
        memberships = account.get("orgs", [])
        return any(org.get("isEnterprise", False) for org in memberships)
    except Exception as e:
        # Best-effort check: a failed lookup downgrades the user to the free tier.
        print(f"Could not verify user's PRO/Enterprise status: {e}")
        return False



class GenerationStatus(Enum):
    """Lifecycle states of a generation request.

    The member value (a string) is what add_request_to_db() writes to the
    ``status`` column; new requests are created as PENDING by the UI.
    """

    PENDING = "PENDING"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"


# Per-request sample caps; validate_request() and load_dataset_info() pick one
# of the two based on verify_pro_status().
MAX_SAMPLES_PRO = 10000  # max number of samples for PRO/Enterprise users
MAX_SAMPLES_FREE = 100   # max number of samples for free users
# Upper bound on a request's max_tokens; also the maximum of the "Max Tokens" slider.
MAX_TOKENS = 8192
# NOTE(review): not referenced by any other visible code — enforcement is still a TODO.
MAX_MODEL_PARAMS = 20_000_000_000  # 20 billion parameters (for now)

@dataclass
class GenerationRequest:
    """One synthetic-data generation job as submitted from the UI.

    Instances are built in submit_request(), checked (and partly rewritten) by
    validate_request(), and inserted into the "gen-requests" table by
    add_request_to_db().
    """

    # The UI passes "" for both; add_request_to_db() does not send them, so
    # presumably the database fills them in — confirm against the table schema.
    id: str
    created_at: str
    status: GenerationStatus
    # Source dataset on the Hub: repo name, config name and split name.
    input_dataset_name: str
    input_dataset_config: str
    # validate_request() may rewrite this to "<split>[:N]" when fewer than all
    # examples are requested.
    input_dataset_split: str
    # Target repo, expected as "<org>/<name>" (exactly one "/").
    output_dataset_name: str
    # Column of the input dataset that holds the prompts.
    prompt_column: str
    model_name_or_path: str
    model_revision: str
    # NOTE(review): the UI currently passes False here, not str/None.
    model_token: str | None
    system_prompt: str | None
    # Sampling parameters; ranges are enforced in validate_request().
    max_tokens: int
    temperature: float
    top_k: int
    top_p: float
    input_dataset_token: str | None
    output_dataset_token: str
    username: str
    email: str
    # 0 means "use the whole split"; validate_request() replaces it with the split size.
    num_output_examples: int
    private: bool = False
    # Not written by add_request_to_db(); presumably maintained by the worker — confirm.
    num_retries: int = 0

# Placeholder the UI submits while email collection is not implemented.
_EMAIL_PLACEHOLDER = "n/a"


def _output_dataset_exists(name: str, token: Optional[str]) -> bool:
    """Return True when dataset infos for `name` can be loaded from the Hub."""
    try:
        get_dataset_infos(name, token=token)
    except Exception:
        # Best-effort probe: any failure to load is treated as "does not exist".
        return False
    return True


def _output_name_already_requested(name: str) -> bool:
    """Return True when the requests table already has a row for this output name.

    Best-effort: missing credentials or any database error yields False so an
    unreachable database does not block validation.
    """
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    if not url or not key:
        return False

    try:
        supabase = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            )
        )
        existing_request = supabase.table("gen-requests").select("id").eq("output_dataset_name", name).execute()
    except Exception as e:
        print(f"Could not check existing requests for {name}: {e}")
        return False
    return bool(existing_request.data)


def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
    """Check a generation request and normalise it in place.

    Args:
        request: The request to validate. `num_output_examples` is replaced by
            the split size when it is 0; otherwise `input_dataset_split` is
            rewritten to "<split>[:N]".
        oauth_token: Token of the signed-in user, used to pick the sample
            limit. None is treated as the free tier.

    Returns:
        The same (possibly updated) request object.

    Raises:
        Exception: With a user-facing message for the first failed check.
    """
    # --- input dataset: must be loadable with the provided token ---
    try:
        dataset_infos = get_dataset_infos(request.input_dataset_name, token=request.input_dataset_token)
    except Exception as e:
        raise Exception(f"Dataset {request.input_dataset_name} does not exist or cannot be accessed with the provided token.") from e

    # A missing config used to surface as the misleading "does not exist" error above.
    if request.input_dataset_config not in dataset_infos:
        raise Exception(f"Dataset config {request.input_dataset_config} does not exist in dataset {request.input_dataset_name}. Available configs: {list(dataset_infos.keys())}")
    input_dataset_info = dataset_infos[request.input_dataset_config]

    # Guard against metadata without split/feature information.
    splits = input_dataset_info.splits or {}
    features = input_dataset_info.features or {}

    # check that the input dataset split exists
    if request.input_dataset_split not in splits:
        raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(splits.keys())}")

    # --- sample count: 0 means "all"; otherwise take the first N examples ---
    split_size = splits[request.input_dataset_split].num_examples
    if request.num_output_examples < 0:
        raise Exception("Requested number of output examples must be 0 (all examples) or a positive number.")
    if request.num_output_examples == 0:
        request.num_output_examples = split_size
    elif request.num_output_examples > split_size:
        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the number of examples in the input dataset split {split_size}.")
    else:
        request.input_dataset_split = f"{request.input_dataset_split}[:{request.num_output_examples}]"

    # --- tier limits: anonymous users (oauth_token is None) count as free tier ---
    is_pro = verify_pro_status(oauth_token) if oauth_token else False
    max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE

    if request.num_output_examples > max_samples:
        if oauth_token is None:
            user_tier = "non-signed-in"
        else:
            user_tier = "PRO/Enterprise" if is_pro else "free"
        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")

    # check the prompt column exists in the dataset
    if request.prompt_column not in features:
        raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(features.keys())}")

    # --- output dataset name: "<org>/<name>" with both parts non-empty ---
    # The org prefix is added by the caller; users only supply the bare name.
    org_part, _, name_part = request.output_dataset_name.partition("/")
    if request.output_dataset_name.count("/") != 1 or not org_part or not name_part:
        raise Exception("Output dataset name must be in the format 'dataset_name', e.g., 'my-dataset'. The dataset will be created under the org 'synthetic-data-universe/my-dataset'.")

    # The previous version raised this error inside its own try block, where
    # the bare `except Exception: pass` swallowed it, so the check never fired.
    if _output_dataset_exists(request.output_dataset_name, request.output_dataset_token):
        raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")

    # check the output dataset name doesn't already exist in the database
    if _output_name_already_requested(request.output_dataset_name):
        raise Exception(f"Output dataset {request.output_dataset_name} is already being generated or has been requested. Please choose a different name.")

    # --- model: must be public (token=False) and resolvable at this revision ---
    try:
        model_config = AutoConfig.from_pretrained(request.model_name_or_path,
                                                  revision=request.model_revision,
                                                  force_download=True,
                                                  token=False
        )
    except Exception as e:
        print(e)
        raise Exception(f"Model {request.model_name_or_path} revision {request.model_revision} does not exist or cannot be accessed. The model may be private or gated, which is not supported at this time.") from e

    # Requested max tokens must fit the model context and the service limit.
    # Configs without the attribute skip the context check instead of crashing.
    max_positions = getattr(model_config, "max_position_embeddings", None)
    if max_positions is not None and max_positions < request.max_tokens:
        raise Exception(f"Model {request.model_name_or_path} max position embeddings {max_positions} is less than the requested max tokens {request.max_tokens}.")
    if request.max_tokens > MAX_TOKENS:
        raise Exception(f"Requested max tokens {request.max_tokens} exceeds the limit of {MAX_TOKENS}.")

    # check sampling parameters are valid
    if request.temperature < 0.0 or request.temperature > 2.0:
        raise Exception("Temperature must be between 0.0 and 2.0")
    if request.top_k < 1 or request.top_k > 100:
        raise Exception("Top K must be between 1 and 100")
    if request.top_p < 0.0 or request.top_p > 1.0:
        raise Exception("Top P must be between 0.0 and 1.0")

    # The UI does not collect an email and always submits the placeholder, so
    # validating it unconditionally rejected every request. Only real
    # addresses are checked.
    # TODO: could use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
    if request.email != _EMAIL_PLACEHOLDER:
        if "@" not in request.email or "." not in request.email.split("@")[-1]:
            raise Exception("Invalid email address")

    return request


def _guess_prompt_column(column_choices):
    """Pick the first column whose name looks like a prompt, else the first column, else None."""
    for col in column_choices:
        if any(keyword in col.lower() for keyword in ['prompt', 'text', 'question', 'input']):
            return col
    return column_choices[0] if column_choices else None


def _suggest_output_name(model_name, dataset_name, max_length=86):
    """Build a "MODEL-dataset-YYMMDD-HHMM" name, truncating the dataset part to fit max_length."""
    from datetime import datetime

    # e.g. "Qwen/Qwen3-4B-Instruct-2507" -> "Qwen3-4B-Instruct-2507"
    model_short_name = model_name.split('/')[-1]
    dataset_base_name = dataset_name.split('/')[-1]
    timestamp = datetime.now().strftime("%y%m%d-%H%M")

    suggestion = f"{model_short_name}-{dataset_base_name}-{timestamp}"
    if len(suggestion) <= max_length:
        return suggestion

    # Too long: shorten (or drop) the dataset part; -2 accounts for the hyphens.
    available_for_dataset = max_length - len(model_short_name) - len(timestamp) - 2
    if available_for_dataset > 0:
        return f"{model_short_name}-{dataset_base_name[:available_for_dataset]}-{timestamp}"
    return f"{model_short_name}-{timestamp}"


def load_dataset_info(dataset_name, model_name, oauth_token=None, dataset_token=None, ):
    """Look up a dataset on the Hub and return updates for the dataset form.

    Args:
        dataset_name: Hub dataset repo name typed by the user.
        model_name: Selected model ID, used for the suggested output name.
        oauth_token: Token of the signed-in user, used to pick the sample limit.
        dataset_token: Optional token for reading the dataset (previously
            accepted but ignored).

    Returns:
        A 6-tuple matching the wired outputs: updates for the config, split
        and prompt-column dropdowns, the output-name textbox, the sample-count
        slider, and a status message string. Errors are reported through the
        status message rather than raised.
    """
    # Tolerate None as well as blank input, and drop stray whitespace.
    dataset_name = (dataset_name or "").strip()
    if not dataset_name:
        return (
            gr.update(choices=[], value=None),  # config
            gr.update(choices=[], value=None),  # split
            gr.update(choices=[], value=None),  # prompt_column
            gr.update(value="", interactive=True),  # output_dataset_name
            gr.update(interactive=False),  # num_output_samples
            "Please enter a dataset name first."
        )

    try:
        dataset_infos = get_dataset_infos(dataset_name, token=dataset_token)

        if not dataset_infos:
            raise Exception("No configs found for this dataset")

        # The first config drives the initial split / column choices.
        config_choices = list(dataset_infos.keys())
        default_config = config_choices[0]
        config_info = dataset_infos[default_config]

        # Metadata may lack split/feature information; treat that as empty.
        splits = config_info.splits or {}
        features = config_info.features or {}

        split_choices = list(splits.keys())
        default_split = split_choices[0] if split_choices else None

        column_choices = list(features.keys())
        default_column = _guess_prompt_column(column_choices)

        # Get sample count for the default split
        dataset_sample_count = splits[default_split].num_examples if default_split else 0

        # Determine user limits
        is_pro = verify_pro_status(oauth_token) if oauth_token else False
        user_max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE

        # Set slider maximum to the minimum of dataset samples and user limit
        slider_max = min(dataset_sample_count, user_max_samples) if dataset_sample_count > 0 else user_max_samples

        suggested_output_name = _suggest_output_name(model_name, dataset_name)

        status_msg = f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), {len(split_choices)} split(s), and {len(column_choices)} column(s)."
        if dataset_sample_count > 0:
            status_msg += f" Dataset has {dataset_sample_count:,} samples."
            if dataset_sample_count > user_max_samples:
                user_tier = "PRO/Enterprise" if is_pro else "free tier"
                status_msg += f" Limited to {user_max_samples:,} samples for {user_tier} users."

        return (
            gr.update(choices=config_choices, value=default_config, interactive=True),  # config
            gr.update(choices=split_choices, value=default_split, interactive=True),  # split
            gr.update(choices=column_choices, value=default_column, interactive=True),  # prompt_column
            gr.update(value=suggested_output_name, interactive=True),  # output_dataset_name
            gr.update(interactive=True, maximum=slider_max, value=0),  # num_output_samples
            status_msg
        )

    except Exception as e:
        # UI boundary: surface any failure as a status message and lock the form.
        return (
            gr.update(choices=[], value=None, interactive=False),  # config
            gr.update(choices=[], value=None, interactive=False),  # split
            gr.update(choices=[], value=None, interactive=False),  # prompt_column
            gr.update(value="", interactive=False),  # output_dataset_name
            gr.update(interactive=False),  # num_output_samples
            f"❌ Error loading dataset info: {str(e)}"
        )


def add_request_to_db(request: GenerationRequest):
    """Insert a generation request into the "gen-requests" table.

    The `id`, `created_at` and `num_retries` fields of the request are not
    sent.

    Args:
        request: The request to store.

    Raises:
        Exception: When the database credentials are missing or the insert
            fails. The underlying error is attached as the cause.
    """
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    if not url or not key:
        # Fail with an actionable message instead of an opaque client error.
        raise Exception("Failed to add request to database: missing SUPABASE_URL or SUPABASE_KEY environment variables")

    data = {
        "status": request.status.value,
        "input_dataset_name": request.input_dataset_name,
        "input_dataset_config": request.input_dataset_config,
        "input_dataset_split": request.input_dataset_split,
        "output_dataset_name": request.output_dataset_name,
        "prompt_column": request.prompt_column,
        "model_name_or_path": request.model_name_or_path,
        "model_revision": request.model_revision,
        "model_token": request.model_token,
        "system_prompt": request.system_prompt,
        "max_tokens": request.max_tokens,
        "temperature": request.temperature,
        "top_k": request.top_k,
        "top_p": request.top_p,
        "input_dataset_token": request.input_dataset_token,
        "output_dataset_token": request.output_dataset_token,
        "username": request.username,
        "email": request.email,
        "num_output_examples": request.num_output_examples,
        "private": request.private,
    }

    try:
        supabase: Client = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            )
        )
        supabase.table("gen-requests").insert(data).execute()
    except Exception as e:
        # Keep the user-facing message generic but preserve the root cause
        # for logs and tracebacks.
        print(f"Failed to add request to database: {e}")
        raise Exception("Failed to add request to database") from e




def get_generation_stats_safe():
    """Return the 50 most recent generation requests without token columns.

    Never raises. The result is a dict with "status" set to "success" or
    "error", a "data" list (empty on error) and, on error, a "message".
    """
    supabase_url = os.getenv("SUPABASE_URL")
    supabase_key = os.getenv("SUPABASE_KEY")

    if not (supabase_url and supabase_key):
        return {
            "status": "error",
            "message": "Missing SUPABASE_URL or SUPABASE_KEY environment variables",
            "data": [],
        }

    # Only non-sensitive columns are selected; token fields are left out.
    visible_columns = ", ".join((
        "id", "created_at", "status", "input_dataset_name", "input_dataset_config",
        "input_dataset_split", "output_dataset_name", "prompt_column",
        "model_name_or_path", "model_revision", "max_tokens", "temperature",
        "top_k", "top_p", "username", "num_output_examples", "private",
    ))

    try:
        client_options = ClientOptions(
            postgrest_client_timeout=10,
            storage_client_timeout=10,
            schema="public",
        )
        client = create_client(supabase_url, supabase_key, options=client_options)

        query = client.table("gen-requests").select(visible_columns)
        query = query.order("created_at", desc=True).limit(50)
        rows = query.execute().data
    except Exception as e:
        return {"status": "error", "message": str(e), "data": []}

    return {"status": "success", "data": rows}


# Old commented code removed - replaced with DatabaseManager and get_generation_stats_safe()


def main():
    """Build the Gradio interface, wire its event handlers, and launch the app.

    The layout is a sign-in prompt plus a main column that is hidden until a
    user is signed in. The main column holds the request form ("Generate
    Synthetic Data" tab) and a read-only table of recent requests
    ("Statistics Dashboard" tab). Access control runs on page load via
    control_access().
    """
    with gr.Blocks(title="Synthetic Data Generation") as demo:
        gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
        
        # Add sign-in button at the top
        with gr.Row():
            gr.Markdown("")  # Empty space for alignment
            login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
            gr.Markdown("")  # Empty space for alignment
        
        signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
        main_interface = gr.Column(visible=False)
        
        # Store the current oauth token for use in submit_request
        current_oauth_token = gr.State(None)
        
        with main_interface:
            with gr.Group():
                with gr.Row():
                    gr.Markdown("# Synthetic Data Generation Request")    
                with gr.Row():
                    gr.Markdown("""
                    Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
                    Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
                    """)
            with gr.Accordion("More Information", open=False):
                with gr.Row():
                    gr.Markdown("""
                    **How it works:**
                    1. Provide an input dataset with prompts
                    2. Select a public language model for generation
                    3. Configure generation parameters
                    4. Submit your request.
                    """)
                    gr.Markdown("""              
                    
                    **Requirements:**
                    - Input dataset must be publicly accessible
                    - Model must be publicly accessible (and not gated)
                    - Maximum 10,000 samples per dataset
                    - Maximum of 8192 generated tokens
                    """)
            
            with gr.Tabs():
                with gr.TabItem("Generate Synthetic Data"):
                    with gr.Row():
                        with gr.Column():
                            with gr.Group():
                                gr.Markdown("##  Model information")
                                with gr.Column():
                                    with gr.Row():
                                        model_name_or_path = gr.Dropdown(
                                            choices=SUPPORTED_MODELS,
                                            label="Select Model",
                                            value="Qwen/Qwen3-4B-Instruct-2507",
                                            info="Choose from popular instruction-tuned models under 40B parameters"
                                        )
                                    # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
                                with gr.Row():
                                    system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
                                gr.Markdown("### Generation Parameters")
                                with gr.Row():
                                    with gr.Column():
                                        with gr.Row():
                                            max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
                                            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
                                        with gr.Row():
                                            top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
                                            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)

                        with gr.Column():
                            with gr.Group():
                                gr.Markdown("##  Dataset information")
                                # Dynamic user limit info - default to anonymous user
                                user_limit_info = gr.Markdown(value="👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
                                with gr.Row():
                                    with gr.Column():
                                        input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
                                        load_info_btn = gr.Button("📊 Load Dataset Info", size="sm", variant="secondary")
                                        load_info_status = gr.Markdown("", visible=True)

                                    with gr.Column():
                                        output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
                                
                                with gr.Row():
                                    with gr.Column():
                                        input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                        prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                        
                                    with gr.Column():
                                        input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                        num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
                            

                    submit_btn = gr.Button("Submit Generation Request", variant="primary")
                    output_status = gr.Textbox(label="Status", interactive=False)
                
                with gr.TabItem("Statistics Dashboard"):
                    gr.Markdown("## Generation Requests Statistics")
                    gr.Markdown("📊 View recent synthetic data generation requests and their status.")
                    
                    with gr.Row():
                        refresh_stats_btn = gr.Button("🔄 Refresh Statistics", size="sm", variant="secondary")
                        clear_stats_btn = gr.Button("🗑️ Clear Display", size="sm")
                    
                    stats_status = gr.Markdown("Click 'Refresh Statistics' to load recent generation requests.", visible=True)
                    
                    stats_dataframe = gr.Dataframe(
                        headers=["ID", "Created", "Status", "Input Dataset", "Output Dataset", "Model", "Samples", "User"],
                        datatype=["str", "str", "str", "str", "str", "str", "number", "str"],
                        interactive=False,
                        wrap=True,
                        value=[],
                        label="Recent Generation Requests (Last 50)",
                        visible=False
                    )
                    
                    def load_statistics():
                        """Fetch recent requests and format them for the table.

                        Returns a (status markdown, dataframe update) pair.
                        """
                        from datetime import datetime

                        def shorten(value, limit):
                            # Database columns may be NULL; render those as empty text
                            # instead of failing on len(None).
                            text = "" if value is None else str(value)
                            return text[:limit] + "..." if len(text) > limit else text

                        try:
                            result = get_generation_stats_safe()
                            
                            if result["status"] == "error":
                                return (
                                    f"❌ **Error loading statistics**: {result['message']}",
                                    gr.update(visible=False),
                                )
                            
                            data = result["data"]
                            if not data:
                                return (
                                    "📝 **No data found**: The database appears to be empty or the table doesn't exist yet.",
                                    gr.update(visible=False),
                                )
                            
                            # Format data for display
                            formatted_data = []
                            for item in data:
                                # Show the timestamp as "YYYY-MM-DD HH:MM" when it parses;
                                # otherwise keep the raw value.
                                created_at = item.get('created_at') or 'Unknown'
                                if created_at != 'Unknown':
                                    try:
                                        dt = datetime.fromisoformat(str(created_at).replace('Z', '+00:00'))
                                        created_at = dt.strftime('%Y-%m-%d %H:%M')
                                    except ValueError:
                                        pass
                                
                                formatted_data.append([
                                    shorten(item.get('id', ''), 8),
                                    created_at,
                                    item.get('status', 'Unknown'),
                                    shorten(item.get('input_dataset_name', ''), 30),
                                    shorten(item.get('output_dataset_name', ''), 30),
                                    shorten(item.get('model_name_or_path', ''), 25),
                                    item.get('num_output_examples', 0),
                                    item.get('username', 'Anonymous')
                                ])
                            
                            return (
                                f"✅ **Statistics loaded successfully**: Found {len(formatted_data)} recent requests.",
                                gr.update(value=formatted_data, visible=True),
                            )
                            
                        except Exception as e:
                            # UI boundary: report instead of crashing the handler.
                            return (
                                f"❌ **Unexpected error**: {str(e)}",
                                gr.update(visible=False),
                            )
                    
                    def clear_statistics():
                        """Reset the status text and hide the (emptied) table."""
                        return (
                            "Click 'Refresh Statistics' to load recent generation requests.",
                            gr.update(value=[], visible=False),
                        )
                    
                    # Connect buttons to functions. Each component is listed once;
                    # stats_status is always visible, so no separate visibility
                    # update is needed.
                    refresh_stats_btn.click(
                        load_statistics,
                        outputs=[stats_status, stats_dataframe]
                    )
                    
                    clear_stats_btn.click(
                        clear_statistics,
                        outputs=[stats_status, stats_dataframe]
                    )

            def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, sys_prompt,
                                max_tok, temp, top_k_val, top_p_val, num_output_samples, oauth_token=None):
                """Build a request from the form values, validate it and store it.

                Returns the text shown in the "Status" box: a success message or
                "Error: ..." describing the first problem found.
                """
                MASTER_ORG = "synthetic-data-universe/"
                model_token = False # This is currently not supported
                input_dataset_token = None # This is currently not supported
                output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")

                # Reject empty form fields up front. An empty output name would
                # otherwise become just the org prefix and pass the slash-count check.
                required_fields = {
                    "Input Dataset Name": input_dataset_name,
                    "Dataset Config": input_dataset_config,
                    "Dataset Split": input_split,
                    "Prompt Column": prompt_col,
                    "Output Dataset Name": output_dataset_name,
                    "Model": model_name,
                }
                missing = [label for label, value in required_fields.items() if not value or not str(value).strip()]
                if missing:
                    return f"Error: Missing required field(s): {', '.join(missing)}. Load the dataset info and fill in the form first."
                
                try:
                    request = GenerationRequest(
                        id="",  # Will be generated when adding to the database
                        created_at="",  # Will be set when adding to the database
                        status=GenerationStatus.PENDING,
                        input_dataset_name=input_dataset_name.strip(),
                        input_dataset_split=input_split,
                        input_dataset_config=input_dataset_config,
                        output_dataset_name=MASTER_ORG + output_dataset_name.strip(),
                        prompt_column=prompt_col,
                        model_name_or_path=model_name,
                        model_revision="main",
                        model_token=model_token,
                        system_prompt=sys_prompt if sys_prompt else None,
                        max_tokens=int(max_tok),
                        temperature=temp,
                        top_k=int(top_k_val),
                        top_p=top_p_val,
                        input_dataset_token=input_dataset_token if input_dataset_token else None,
                        output_dataset_token=output_dataset_token,
                        # Coerce to int so the "[:N]" split slice built during
                        # validation is well-formed; 0 means "all examples".
                        num_output_examples=int(num_output_samples or 0),
                        username="user",
                        email="n/a",
                    )
                    
                    # check the input dataset exists and can be accessed with the provided token
                    request = validate_request(request, oauth_token)
                    add_request_to_db(request)

                    return "Request submitted successfully!"
                except Exception as e:
                    return f"Error: {str(e)}"
            
            # Wire up the Load Dataset Info button
            load_info_btn.click(
                load_dataset_info,
                inputs=[input_dataset_name, model_name_or_path, current_oauth_token],
                outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
            )

            submit_btn.click(
                submit_request,
                inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
                         system_prompt, max_tokens, temperature, top_k, top_p, num_output_samples, current_oauth_token],
                outputs=output_status
            )
        
        def update_user_limits(oauth_token, is_pro=None):
            """Return the tier banner text for the given token.

            `is_pro` may be passed by a caller that has already checked the
            user's status, which avoids a second Hub lookup.
            """
            if oauth_token is None:
                return "👤 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples)."
            
            if is_pro is None:
                is_pro = verify_pro_status(oauth_token)
            if is_pro:
                return "✨ **PRO User**: You can generate up to 10,000 samples per request."
            else:
                return "👤 **Free User**: You can generate up to 100 samples per request. [Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) for 10,000 samples."

        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
            """Show the sign-in prompt or the main interface depending on login state.

            Returns one value per wired output: main_interface, signin_message,
            current_oauth_token, user_limit_info, num_output_samples, login_button.
            """
            # Require users to be signed in
            if oauth_token is None:
                # User is not signed in - show sign-in prompt, hide main interface
                return (
                    gr.update(visible=False),  # main_interface
                    gr.update(visible=True),   # signin_message
                    oauth_token,               # current_oauth_token
                    "",                        # user_limit_info (empty when not signed in)
                    gr.update(),               # num_output_samples (no change)
                    gr.update(value="🔑 Sign in")  # login_button
                )

            # User is signed in - show main interface, hide sign-in prompt.
            # Check the tier once and reuse it for the banner, slider and button.
            is_pro = verify_pro_status(oauth_token)
            limit_msg = update_user_limits(oauth_token, is_pro=is_pro)
            max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE

            display_name = profile.name if profile else 'User'
            if is_pro:
                button_text = f"✨ Signed in as PRO ({display_name})"
            else:
                button_text = f"👤 Signed in as {display_name}"

            return (
                gr.update(visible=True),   # main_interface
                gr.update(visible=False),  # signin_message
                oauth_token,               # current_oauth_token
                limit_msg,                 # user_limit_info
                gr.update(maximum=max_samples),  # num_output_samples
                gr.update(value=button_text)     # login_button
            )


        # Access control runs on every page load, passing the OAuth profile and
        # token into control_access through its type-annotated parameters.
        
        demo.load(control_access, inputs=None, outputs=[main_interface, signin_message, current_oauth_token, user_limit_info, num_output_samples, login_button])
        demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)

if __name__ == "__main__":
    main()