Spaces:
Sleeping
Sleeping
edbeeching
committed on
Commit
·
f32647d
1
Parent(s):
ac0089d
make outputs in same org and public, remove tokens
Browse files
app.py
CHANGED
|
@@ -11,7 +11,7 @@ from transformers import AutoConfig
|
|
| 11 |
"""
|
| 12 |
Still TODO:
|
| 13 |
- validate the user is PRO
|
| 14 |
-
- check the output dataset token is valid
|
| 15 |
- validate max model params
|
| 16 |
"""
|
| 17 |
|
|
@@ -24,12 +24,13 @@ class GenerationStatus(Enum):
|
|
| 24 |
|
| 25 |
|
| 26 |
MAX_SAMPLES = 10000 # max number of samples in the input dataset
|
| 27 |
-
MAX_TOKENS =
|
| 28 |
MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
|
| 29 |
|
| 30 |
@dataclass
|
| 31 |
class GenerationRequest:
|
| 32 |
id: str
|
|
|
|
| 33 |
status: GenerationStatus
|
| 34 |
input_dataset_name: str
|
| 35 |
input_dataset_config: str
|
|
@@ -48,7 +49,9 @@ class GenerationRequest:
|
|
| 48 |
output_dataset_token: str
|
| 49 |
username: str
|
| 50 |
email: str
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
|
| 53 |
def validate_request(request: GenerationRequest):
|
| 54 |
# checks that the request is valid
|
|
@@ -130,7 +133,9 @@ def add_request_to_db(request: GenerationRequest):
|
|
| 130 |
"input_dataset_token": request.input_dataset_token,
|
| 131 |
"output_dataset_token": request.output_dataset_token,
|
| 132 |
"username": request.username,
|
| 133 |
-
"email": request.email
|
|
|
|
|
|
|
| 134 |
}
|
| 135 |
|
| 136 |
supabase.table("gen-requests").insert(data).execute()
|
|
@@ -152,38 +157,37 @@ def create_gradio_interface():
|
|
| 152 |
gr.Markdown("""
|
| 153 |
**How it works:**
|
| 154 |
1. Provide an input dataset with prompts
|
| 155 |
-
2. Select a language model for generation
|
| 156 |
3. Configure generation parameters
|
| 157 |
4. Submit your request and receive generated data
|
| 158 |
""")
|
| 159 |
gr.Markdown("""
|
| 160 |
|
| 161 |
**Requirements:**
|
| 162 |
-
- Input dataset must be publicly accessible
|
| 163 |
- Output dataset repository must exist and you must have write access
|
| 164 |
- Model must be accessible (public or with valid token)
|
| 165 |
- Maximum 10,000 samples per dataset
|
| 166 |
-
- Maximum of
|
| 167 |
""")
|
| 168 |
|
| 169 |
-
with gr.
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
with gr.
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
|
| 178 |
-
with gr.Column():
|
| 179 |
-
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., MyOrg/my-generated-dataset")
|
| 180 |
-
with gr.Group():
|
| 181 |
-
gr.Markdown("## Model information")
|
| 182 |
with gr.Column():
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
with gr.Group():
|
| 188 |
gr.Markdown("## Generation Parameters")
|
| 189 |
with gr.Row():
|
|
@@ -194,33 +198,38 @@ def create_gradio_interface():
|
|
| 194 |
with gr.Row():
|
| 195 |
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 196 |
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 197 |
-
|
| 198 |
-
|
| 199 |
|
| 200 |
with gr.Group():
|
| 201 |
-
gr.Markdown("## User Information, for
|
| 202 |
with gr.Row():
|
| 203 |
with gr.Column():
|
| 204 |
with gr.Row():
|
| 205 |
-
username = gr.Textbox(label="Hugging Face Username", placeholder="Your HF username")
|
| 206 |
email = gr.Textbox(label="Email", placeholder="your.email@example.com")
|
| 207 |
-
with gr.Row():
|
| 208 |
-
input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
|
| 209 |
-
output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
|
| 210 |
|
| 211 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 212 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 213 |
|
| 214 |
-
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev,
|
| 215 |
-
max_tok, temp, top_k_val, top_p_val,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
try:
|
| 217 |
request = GenerationRequest(
|
| 218 |
id="", # Will be generated when adding to the database
|
|
|
|
| 219 |
status=GenerationStatus.PENDING,
|
| 220 |
input_dataset_name=input_dataset_name,
|
| 221 |
input_dataset_split=input_split,
|
| 222 |
input_dataset_config=input_dataset_config,
|
| 223 |
-
output_dataset_name=output_dataset_name,
|
| 224 |
prompt_column=prompt_col,
|
| 225 |
model_name_or_path=model_name,
|
| 226 |
model_revision=model_rev,
|
|
@@ -232,7 +241,7 @@ def create_gradio_interface():
|
|
| 232 |
top_p=top_p_val,
|
| 233 |
input_dataset_token=input_dataset_token if input_dataset_token else None,
|
| 234 |
output_dataset_token=output_dataset_token,
|
| 235 |
-
username=user,
|
| 236 |
email=email_addr
|
| 237 |
)
|
| 238 |
|
|
@@ -247,8 +256,8 @@ def create_gradio_interface():
|
|
| 247 |
submit_btn.click(
|
| 248 |
submit_request,
|
| 249 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 250 |
-
model_revision,
|
| 251 |
-
|
| 252 |
outputs=output_status
|
| 253 |
)
|
| 254 |
|
|
|
|
| 11 |
"""
|
| 12 |
Still TODO:
|
| 13 |
- validate the user is PRO
|
| 14 |
+
- check the output dataset token is valid (hardcoded for now as a secret)
|
| 15 |
- validate max model params
|
| 16 |
"""
|
| 17 |
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
MAX_SAMPLES = 10000 # max number of samples in the input dataset
|
| 27 |
+
MAX_TOKENS = 8192
|
| 28 |
MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
|
| 29 |
|
| 30 |
@dataclass
|
| 31 |
class GenerationRequest:
|
| 32 |
id: str
|
| 33 |
+
created_at: str
|
| 34 |
status: GenerationStatus
|
| 35 |
input_dataset_name: str
|
| 36 |
input_dataset_config: str
|
|
|
|
| 49 |
output_dataset_token: str
|
| 50 |
username: str
|
| 51 |
email: str
|
| 52 |
+
num_output_examples: int
|
| 53 |
+
private: bool = False
|
| 54 |
+
num_retries: int = 0
|
| 55 |
|
| 56 |
def validate_request(request: GenerationRequest):
|
| 57 |
# checks that the request is valid
|
|
|
|
| 133 |
"input_dataset_token": request.input_dataset_token,
|
| 134 |
"output_dataset_token": request.output_dataset_token,
|
| 135 |
"username": request.username,
|
| 136 |
+
"email": request.email,
|
| 137 |
+
"num_output_examples": MAX_SAMPLES, # currently always max samples
|
| 138 |
+
"private": False,
|
| 139 |
}
|
| 140 |
|
| 141 |
supabase.table("gen-requests").insert(data).execute()
|
|
|
|
| 157 |
gr.Markdown("""
|
| 158 |
**How it works:**
|
| 159 |
1. Provide an input dataset with prompts
|
| 160 |
+
2. Select a public language model for generation
|
| 161 |
3. Configure generation parameters
|
| 162 |
4. Submit your request and receive generated data
|
| 163 |
""")
|
| 164 |
gr.Markdown("""
|
| 165 |
|
| 166 |
**Requirements:**
|
| 167 |
+
- Input dataset must be publicly accessible
|
| 168 |
- Output dataset repository must exist and you must have write access
|
| 169 |
- Model must be accessible (public or with valid token)
|
| 170 |
- Maximum 10,000 samples per dataset
|
| 171 |
+
- Maximum of 8192 generation tokens
|
| 172 |
""")
|
| 173 |
|
| 174 |
+
with gr.Group():
|
| 175 |
+
gr.Markdown("## Dataset information")
|
| 176 |
+
with gr.Column():
|
| 177 |
+
with gr.Row():
|
| 178 |
+
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 179 |
+
input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
|
| 180 |
+
input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
|
| 181 |
+
prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
with gr.Column():
|
| 183 |
+
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
|
| 184 |
+
with gr.Group():
|
| 185 |
+
gr.Markdown("## Model information")
|
| 186 |
+
with gr.Column():
|
| 187 |
+
with gr.Row():
|
| 188 |
+
model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
|
| 189 |
+
model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
|
| 190 |
+
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 191 |
with gr.Group():
|
| 192 |
gr.Markdown("## Generation Parameters")
|
| 193 |
with gr.Row():
|
|
|
|
| 198 |
with gr.Row():
|
| 199 |
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 200 |
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 201 |
+
with gr.Row():
|
| 202 |
+
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 203 |
|
| 204 |
with gr.Group():
|
| 205 |
+
gr.Markdown("## User Information, for notification when your job is completed")
|
| 206 |
with gr.Row():
|
| 207 |
with gr.Column():
|
| 208 |
with gr.Row():
|
|
|
|
| 209 |
email = gr.Textbox(label="Email", placeholder="your.email@example.com")
|
| 210 |
+
# with gr.Row():
|
| 211 |
+
# input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
|
| 212 |
+
# output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
|
| 213 |
|
| 214 |
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 215 |
output_status = gr.Textbox(label="Status", interactive=False)
|
| 216 |
|
| 217 |
+
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
|
| 218 |
+
max_tok, temp, top_k_val, top_p_val, email_addr):
|
| 219 |
+
|
| 220 |
+
MASTER_ORG = "synthetic-data-universe/"
|
| 221 |
+
model_token = None # This is currently not supported
|
| 222 |
+
input_dataset_token = None # This is currently not supported
|
| 223 |
+
output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")
|
| 224 |
try:
|
| 225 |
request = GenerationRequest(
|
| 226 |
id="", # Will be generated when adding to the database
|
| 227 |
+
created_at="", # Will be set when adding to the database
|
| 228 |
status=GenerationStatus.PENDING,
|
| 229 |
input_dataset_name=input_dataset_name,
|
| 230 |
input_dataset_split=input_split,
|
| 231 |
input_dataset_config=input_dataset_config,
|
| 232 |
+
output_dataset_name=MASTER_ORG + output_dataset_name,
|
| 233 |
prompt_column=prompt_col,
|
| 234 |
model_name_or_path=model_name,
|
| 235 |
model_revision=model_rev,
|
|
|
|
| 241 |
top_p=top_p_val,
|
| 242 |
input_dataset_token=input_dataset_token if input_dataset_token else None,
|
| 243 |
output_dataset_token=output_dataset_token,
|
| 244 |
+
username="user",
|
| 245 |
email=email_addr
|
| 246 |
)
|
| 247 |
|
|
|
|
| 256 |
submit_btn.click(
|
| 257 |
submit_request,
|
| 258 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 259 |
+
model_revision, system_prompt, max_tokens, temperature, top_k, top_p,
|
| 260 |
+
email],
|
| 261 |
outputs=output_status
|
| 262 |
)
|
| 263 |
|