Spaces:

synthetic-data-universe
/

synth

Sleeping

App Files Community

edbeeching committed on Sep 8

Commit

f32647d

1 Parent(s): ac0089d

make outputs in same org and public, remove tokens

Browse files

Files changed (1) hide show

app.py +46 -37

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ from transformers import AutoConfig
 """
  Still TODO:
  - validate the user is PRO
- - check the output dataset token is valid
  - validate max model params
 """
@@ -24,12 +24,13 @@ class GenerationStatus(Enum):
 MAX_SAMPLES = 10000  # max number of samples in the input dataset
-MAX_TOKENS = 32768
 MAX_MODEL_PARAMS = 20_000_000_000  # 20 billion parameters (for now)
 @dataclass
 class GenerationRequest:
     id: str
     status: GenerationStatus
     input_dataset_name: str
     input_dataset_config: str
@@ -48,7 +49,9 @@ class GenerationRequest:
     output_dataset_token: str
     username: str
     email: str
 def validate_request(request: GenerationRequest):
     # checks that the request is valid
@@ -130,7 +133,9 @@ def add_request_to_db(request: GenerationRequest):
             "input_dataset_token": request.input_dataset_token,
             "output_dataset_token": request.output_dataset_token,
             "username": request.username,
-            "email": request.email
         }
         supabase.table("gen-requests").insert(data).execute()
@@ -152,38 +157,37 @@ def create_gradio_interface():
                 gr.Markdown("""
                 **How it works:**
                 1. Provide an input dataset with prompts
-                2. Select a language model for generation
                 3. Configure generation parameters
                 4. Submit your request and receive generated data
                 """)
                 gr.Markdown("""
                 **Requirements:**
-                - Input dataset must be publicly accessible or you must provide a valid HuggingFace token
                 - Output dataset repository must exist and you must have write access
                 - Model must be accessible (public or with valid token)
                 - Maximum 10,000 samples per dataset
-                - Maximum of 32k generation tokens
                 """)
-        with gr.Row():
-            with gr.Group():
-                gr.Markdown("##  Dataset information")
-                with gr.Column():
-                    with gr.Row():
-                        input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
-                        input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
-                        input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
-                        prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
-                    with gr.Column():
-                        output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., MyOrg/my-generated-dataset")
-            with gr.Group():
-                gr.Markdown("##  Model information")
                 with gr.Column():
-                    with gr.Row():
-                        model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
-                        model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
-                    model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
         with gr.Group():
             gr.Markdown("##  Generation Parameters")
             with gr.Row():
@@ -194,33 +198,38 @@ def create_gradio_interface():
                     with gr.Row():
                         top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
                         top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
-                with gr.Column():
-                    system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
         with gr.Group():
-            gr.Markdown("##  User Information, for tokens refer to guide [here](https://huggingface.co/docs/hub/en/security-tokens#user-access-tokens)")
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
-                        username = gr.Textbox(label="Hugging Face Username", placeholder="Your HF username")
                         email = gr.Textbox(label="Email", placeholder="your.email@example.com")
-                    with gr.Row():
-                        input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
-                        output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
         submit_btn = gr.Button("Submit Generation Request", variant="primary")
         output_status = gr.Textbox(label="Status", interactive=False)
-        def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, model_token, sys_prompt,
-                            max_tok, temp, top_k_val, top_p_val, user, email_addr, input_dataset_token, output_dataset_token):
             try:
                 request = GenerationRequest(
                     id="",  # Will be generated when adding to the database
                     status=GenerationStatus.PENDING,
                     input_dataset_name=input_dataset_name,
                     input_dataset_split=input_split,
                     input_dataset_config=input_dataset_config,
-                    output_dataset_name=output_dataset_name,
                     prompt_column=prompt_col,
                     model_name_or_path=model_name,
                     model_revision=model_rev,
@@ -232,7 +241,7 @@ def create_gradio_interface():
                     top_p=top_p_val,
                     input_dataset_token=input_dataset_token if input_dataset_token else None,
                     output_dataset_token=output_dataset_token,
-                    username=user,
                     email=email_addr
                 )
@@ -247,8 +256,8 @@ def create_gradio_interface():
         submit_btn.click(
             submit_request,
             inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
-                    model_revision, model_token, system_prompt, max_tokens, temperature, top_k, top_p,
-                    username, email, input_dataset_token, output_dataset_token],
             outputs=output_status
         )

 """
  Still TODO:
  - validate the user is PRO
+ - check the output dataset token is valid (hardcoded for now as a secret)
  - validate max model params
 """
 MAX_SAMPLES = 10000  # max number of samples in the input dataset
+MAX_TOKENS = 8192
 MAX_MODEL_PARAMS = 20_000_000_000  # 20 billion parameters (for now)
 @dataclass
 class GenerationRequest:
     id: str
+    created_at: str
     status: GenerationStatus
     input_dataset_name: str
     input_dataset_config: str
     output_dataset_token: str
     username: str
     email: str
+    num_output_examples: int
+    private: bool = False
+    num_retries: int = 0
 def validate_request(request: GenerationRequest):
     # checks that the request is valid
             "input_dataset_token": request.input_dataset_token,
             "output_dataset_token": request.output_dataset_token,
             "username": request.username,
+            "email": request.email,
+            "num_output_examples": MAX_SAMPLES,  # currently always max samples
+            "private": False,
         }
         supabase.table("gen-requests").insert(data).execute()
                 gr.Markdown("""
                 **How it works:**
                 1. Provide an input dataset with prompts
+                2. Select a public language model for generation
                 3. Configure generation parameters
                 4. Submit your request and receive generated data
                 """)
                 gr.Markdown("""
                 **Requirements:**
+                - Input dataset must be publicly accessible
                 - Output dataset repository must exist and you must have write access
                 - Model must be accessible (public or with valid token)
                 - Maximum 10,000 samples per dataset
+                - Maximum of 8192 generation tokens
                 """)
+        with gr.Group():
+            gr.Markdown("##  Dataset information")
+            with gr.Column():
+                with gr.Row():
+                    input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
+                    input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
+                    input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
+                    prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
                 with gr.Column():
+                    output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
+        with gr.Group():
+            gr.Markdown("##  Model information")
+            with gr.Column():
+                with gr.Row():
+                    model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
+                    model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
+                # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
         with gr.Group():
             gr.Markdown("##  Generation Parameters")
             with gr.Row():
                     with gr.Row():
                         top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
                         top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
+                    with gr.Row():
+                        system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
         with gr.Group():
+            gr.Markdown("##  User Information, for notification when your job is completed")
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
                         email = gr.Textbox(label="Email", placeholder="your.email@example.com")
+                    # with gr.Row():
+                        # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
+                        # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
         submit_btn = gr.Button("Submit Generation Request", variant="primary")
         output_status = gr.Textbox(label="Status", interactive=False)
+        def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
+                            max_tok, temp, top_k_val, top_p_val, email_addr):
+            MASTER_ORG = "synthetic-data-universe/"
+            model_token = None # This is currently not supported
+            input_dataset_token = None # This is currently not supported
+            output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")
             try:
                 request = GenerationRequest(
                     id="",  # Will be generated when adding to the database
+                    created_at="",  # Will be set when adding to the database
                     status=GenerationStatus.PENDING,
                     input_dataset_name=input_dataset_name,
                     input_dataset_split=input_split,
                     input_dataset_config=input_dataset_config,
+                    output_dataset_name=MASTER_ORG + output_dataset_name,
                     prompt_column=prompt_col,
                     model_name_or_path=model_name,
                     model_revision=model_rev,
                     top_p=top_p_val,
                     input_dataset_token=input_dataset_token if input_dataset_token else None,
                     output_dataset_token=output_dataset_token,
+                    username="user",
                     email=email_addr
                 )
         submit_btn.click(
             submit_request,
             inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
+                    model_revision, system_prompt, max_tokens, temperature, top_k, top_p,
+                     email],
             outputs=output_status
         )