edbeeching committed on
Commit
f32647d
·
1 Parent(s): ac0089d

make outputs in same org and public, remove tokens

Browse files
Files changed (1) hide show
  1. app.py +46 -37
app.py CHANGED
@@ -11,7 +11,7 @@ from transformers import AutoConfig
11
  """
12
  Still TODO:
13
  - validate the user is PRO
14
- - check the output dataset token is valid
15
  - validate max model params
16
  """
17
 
@@ -24,12 +24,13 @@ class GenerationStatus(Enum):
24
 
25
 
26
  MAX_SAMPLES = 10000 # max number of samples in the input dataset
27
- MAX_TOKENS = 32768
28
  MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
29
 
30
  @dataclass
31
  class GenerationRequest:
32
  id: str
 
33
  status: GenerationStatus
34
  input_dataset_name: str
35
  input_dataset_config: str
@@ -48,7 +49,9 @@ class GenerationRequest:
48
  output_dataset_token: str
49
  username: str
50
  email: str
51
-
 
 
52
 
53
  def validate_request(request: GenerationRequest):
54
  # checks that the request is valid
@@ -130,7 +133,9 @@ def add_request_to_db(request: GenerationRequest):
130
  "input_dataset_token": request.input_dataset_token,
131
  "output_dataset_token": request.output_dataset_token,
132
  "username": request.username,
133
- "email": request.email
 
 
134
  }
135
 
136
  supabase.table("gen-requests").insert(data).execute()
@@ -152,38 +157,37 @@ def create_gradio_interface():
152
  gr.Markdown("""
153
  **How it works:**
154
  1. Provide an input dataset with prompts
155
- 2. Select a language model for generation
156
  3. Configure generation parameters
157
  4. Submit your request and receive generated data
158
  """)
159
  gr.Markdown("""
160
 
161
  **Requirements:**
162
- - Input dataset must be publicly accessible or you must provide a valid HuggingFace token
163
  - Output dataset repository must exist and you must have write access
164
  - Model must be accessible (public or with valid token)
165
  - Maximum 10,000 samples per dataset
166
- - Maximum of 32k generation tokens
167
  """)
168
 
169
- with gr.Row():
170
- with gr.Group():
171
- gr.Markdown("## Dataset information")
172
- with gr.Column():
173
- with gr.Row():
174
- input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
175
- input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
176
- input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
177
- prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
178
- with gr.Column():
179
- output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., MyOrg/my-generated-dataset")
180
- with gr.Group():
181
- gr.Markdown("## Model information")
182
  with gr.Column():
183
- with gr.Row():
184
- model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
185
- model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
186
- model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
 
 
 
 
187
  with gr.Group():
188
  gr.Markdown("## Generation Parameters")
189
  with gr.Row():
@@ -194,33 +198,38 @@ def create_gradio_interface():
194
  with gr.Row():
195
  top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
196
  top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
197
- with gr.Column():
198
- system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
199
 
200
  with gr.Group():
201
- gr.Markdown("## User Information, for tokens refer to guide [here](https://huggingface.co/docs/hub/en/security-tokens#user-access-tokens)")
202
  with gr.Row():
203
  with gr.Column():
204
  with gr.Row():
205
- username = gr.Textbox(label="Hugging Face Username", placeholder="Your HF username")
206
  email = gr.Textbox(label="Email", placeholder="your.email@example.com")
207
- with gr.Row():
208
- input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
209
- output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
210
 
211
  submit_btn = gr.Button("Submit Generation Request", variant="primary")
212
  output_status = gr.Textbox(label="Status", interactive=False)
213
 
214
- def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, model_token, sys_prompt,
215
- max_tok, temp, top_k_val, top_p_val, user, email_addr, input_dataset_token, output_dataset_token):
 
 
 
 
 
216
  try:
217
  request = GenerationRequest(
218
  id="", # Will be generated when adding to the database
 
219
  status=GenerationStatus.PENDING,
220
  input_dataset_name=input_dataset_name,
221
  input_dataset_split=input_split,
222
  input_dataset_config=input_dataset_config,
223
- output_dataset_name=output_dataset_name,
224
  prompt_column=prompt_col,
225
  model_name_or_path=model_name,
226
  model_revision=model_rev,
@@ -232,7 +241,7 @@ def create_gradio_interface():
232
  top_p=top_p_val,
233
  input_dataset_token=input_dataset_token if input_dataset_token else None,
234
  output_dataset_token=output_dataset_token,
235
- username=user,
236
  email=email_addr
237
  )
238
 
@@ -247,8 +256,8 @@ def create_gradio_interface():
247
  submit_btn.click(
248
  submit_request,
249
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
250
- model_revision, model_token, system_prompt, max_tokens, temperature, top_k, top_p,
251
- username, email, input_dataset_token, output_dataset_token],
252
  outputs=output_status
253
  )
254
 
 
11
  """
12
  Still TODO:
13
  - validate the user is PRO
14
+ - check the output dataset token is valid (hardcoded for now as a secret)
15
  - validate max model params
16
  """
17
 
 
24
 
25
 
26
  MAX_SAMPLES = 10000 # max number of samples in the input dataset
27
+ MAX_TOKENS = 8192
28
  MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
29
 
30
  @dataclass
31
  class GenerationRequest:
32
  id: str
33
+ created_at: str
34
  status: GenerationStatus
35
  input_dataset_name: str
36
  input_dataset_config: str
 
49
  output_dataset_token: str
50
  username: str
51
  email: str
52
+ num_output_examples: int
53
+ private: bool = False
54
+ num_retries: int = 0
55
 
56
  def validate_request(request: GenerationRequest):
57
  # checks that the request is valid
 
133
  "input_dataset_token": request.input_dataset_token,
134
  "output_dataset_token": request.output_dataset_token,
135
  "username": request.username,
136
+ "email": request.email,
137
+ "num_output_examples": MAX_SAMPLES, # currently always max samples
138
+ "private": False,
139
  }
140
 
141
  supabase.table("gen-requests").insert(data).execute()
 
157
  gr.Markdown("""
158
  **How it works:**
159
  1. Provide an input dataset with prompts
160
+ 2. Select a public language model for generation
161
  3. Configure generation parameters
162
  4. Submit your request and receive generated data
163
  """)
164
  gr.Markdown("""
165
 
166
  **Requirements:**
167
+ - Input dataset must be publicly accessible
168
  - Output dataset repository must exist and you must have write access
169
  - Model must be accessible (public or with valid token)
170
  - Maximum 10,000 samples per dataset
171
+ - Maximum of 8192 generation tokens
172
  """)
173
 
174
+ with gr.Group():
175
+ gr.Markdown("## Dataset information")
176
+ with gr.Column():
177
+ with gr.Row():
178
+ input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
179
+ input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
180
+ input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
181
+ prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
 
 
 
 
 
182
  with gr.Column():
183
+ output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
184
+ with gr.Group():
185
+ gr.Markdown("## Model information")
186
+ with gr.Column():
187
+ with gr.Row():
188
+ model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
189
+ model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
190
+ # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
191
  with gr.Group():
192
  gr.Markdown("## Generation Parameters")
193
  with gr.Row():
 
198
  with gr.Row():
199
  top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
200
  top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
201
+ with gr.Row():
202
+ system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
203
 
204
  with gr.Group():
205
+ gr.Markdown("## User Information, for notification when your job is completed")
206
  with gr.Row():
207
  with gr.Column():
208
  with gr.Row():
 
209
  email = gr.Textbox(label="Email", placeholder="your.email@example.com")
210
+ # with gr.Row():
211
+ # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
212
+ # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
213
 
214
  submit_btn = gr.Button("Submit Generation Request", variant="primary")
215
  output_status = gr.Textbox(label="Status", interactive=False)
216
 
217
+ def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
218
+ max_tok, temp, top_k_val, top_p_val, email_addr):
219
+
220
+ MASTER_ORG = "synthetic-data-universe/"
221
+ model_token = None # This is currently not supported
222
+ input_dataset_token = None # This is currently not supported
223
+ output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")
224
  try:
225
  request = GenerationRequest(
226
  id="", # Will be generated when adding to the database
227
+ created_at="", # Will be set when adding to the database
228
  status=GenerationStatus.PENDING,
229
  input_dataset_name=input_dataset_name,
230
  input_dataset_split=input_split,
231
  input_dataset_config=input_dataset_config,
232
+ output_dataset_name=MASTER_ORG + output_dataset_name,
233
  prompt_column=prompt_col,
234
  model_name_or_path=model_name,
235
  model_revision=model_rev,
 
241
  top_p=top_p_val,
242
  input_dataset_token=input_dataset_token if input_dataset_token else None,
243
  output_dataset_token=output_dataset_token,
244
+ username="user",
245
  email=email_addr
246
  )
247
 
 
256
  submit_btn.click(
257
  submit_request,
258
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
259
+ model_revision, system_prompt, max_tokens, temperature, top_k, top_p,
260
+ email],
261
  outputs=output_status
262
  )
263