import gradio as gr

from dataclasses import dataclass
import os
from supabase import create_client, Client
from supabase.client import ClientOptions
from enum import Enum
from datasets import get_dataset_infos
from transformers import AutoConfig, GenerationConfig
from huggingface_hub import whoami
from typing import Optional, Union

"""
 Still TODO:
 - validate the user is PRO
 - check the output dataset token is valid (hardcoded for now as a secret)
 - validate max model params
"""

class GenerationStatus(Enum):
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"


MAX_SAMPLES_PRO = 10000  # max number of samples for PRO/Enterprise users
MAX_SAMPLES_FREE = 100   # max number of samples for free users
MAX_TOKENS = 8192
MAX_MODEL_PARAMS = 20_000_000_000  # 20 billion parameters (for now)

# Cache for model generation parameters
MODEL_GEN_PARAMS_CACHE = {}

@dataclass
class GenerationRequest:
    id: str
    created_at: str
    status: GenerationStatus
    input_dataset_name: str
    input_dataset_config: str
    input_dataset_split: str
    output_dataset_name: str
    prompt_column: str
    model_name_or_path: str
    model_revision: str
    model_token: str | None
    system_prompt: str | None
    max_tokens: int
    temperature: float
    top_k: int
    top_p: float
    input_dataset_token: str | None
    output_dataset_token: str
    username: str
    email: str
    num_output_examples: int
    private: bool = False
    num_retries: int = 0
    
SUPPORTED_MODELS = [
    "Qwen/Qwen3-4B-Instruct-2507",
    "Qwen/Qwen3-30B-A3B-Instruct-2507",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "baidu/ERNIE-4.5-21B-A3B-Thinking",
    "LLM360/K2-Think",
    "openai/gpt-oss-20b",
]


def fetch_model_generation_params(model_name: str) -> dict:
    """Fetch generation parameters and model config from the hub"""
    default_params = {
        "max_tokens": 1024,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 0.95,
        "max_position_embeddings": 2048,
        "recommended_max_tokens": 1024
    }

    try:
        print(f"Attempting to fetch configs for: {model_name}")

        # Always try to load the model config first for max_position_embeddings
        model_config = None
        max_position_embeddings = default_params["max_position_embeddings"]

        # Fetch the token once up front; it is also reused below for the generation config,
        # so it must be defined even if loading the AutoConfig fails.
        output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")
        try:
            model_config = AutoConfig.from_pretrained(model_name, force_download=False, token=output_dataset_token)
            max_position_embeddings = getattr(model_config, 'max_position_embeddings', default_params["max_position_embeddings"])
            print(f"Loaded AutoConfig for {model_name}, max_position_embeddings: {max_position_embeddings}")
        except Exception as e:
            print(f"Failed to load AutoConfig for {model_name}: {e}")

        # Calculate recommended max tokens (conservative estimate)
        # Leave some room for the prompt, so use ~75% of max_position_embeddings
        recommended_max_tokens = min(int(max_position_embeddings * 0.75), MAX_TOKENS)
        recommended_max_tokens = max(256, recommended_max_tokens)  # Ensure minimum

        # Try to load the generation config
        gen_config = None
        try:
            gen_config = GenerationConfig.from_pretrained(model_name, force_download=False, token=output_dataset_token)
            print(f"Successfully loaded generation config for {model_name}")
        except Exception as e:
            print(f"Failed to load GenerationConfig for {model_name}: {e}")

        # Extract parameters from generation config or use model-specific defaults
        if gen_config:
            params = {
                "max_tokens": getattr(gen_config, 'max_new_tokens', None) or getattr(gen_config, 'max_length', recommended_max_tokens),
                "temperature": getattr(gen_config, 'temperature', default_params["temperature"]),
                "top_k": getattr(gen_config, 'top_k', default_params["top_k"]),
                "top_p": getattr(gen_config, 'top_p', default_params["top_p"]),
                "max_position_embeddings": max_position_embeddings,
                "recommended_max_tokens": recommended_max_tokens
            }
        else:
            params = dict(default_params)
            params["max_position_embeddings"] = max_position_embeddings
            params["recommended_max_tokens"] = recommended_max_tokens

        # Ensure parameters are within valid ranges
        params["max_tokens"] = max(256, min(params["max_tokens"], MAX_TOKENS, params["recommended_max_tokens"]))
        params["temperature"] = max(0.0, min(params["temperature"], 2.0))
        params["top_k"] = max(5, min(params["top_k"], 100))
        params["top_p"] = max(0.0, min(params["top_p"], 1.0))

        print(f"Final params for {model_name}: {params}")
        return params

    except Exception as e:
        print(f"Could not fetch configs for {model_name}: {e}")
        return default_params


def update_generation_params(model_name: str):
    """Update generation parameters based on selected model"""
    global MODEL_GEN_PARAMS_CACHE

    print(f"Updating generation parameters for model: {model_name}")
    print(f"Cache is empty: {len(MODEL_GEN_PARAMS_CACHE) == 0}")
    print(f"Current cache keys: {list(MODEL_GEN_PARAMS_CACHE.keys())}")

    # If cache is empty, try to populate it now
    if len(MODEL_GEN_PARAMS_CACHE) == 0:
        print("Cache is empty, attempting to populate now...")
        cache_all_model_params()

    if model_name in MODEL_GEN_PARAMS_CACHE:
        params = MODEL_GEN_PARAMS_CACHE[model_name]
        print(f"Found cached params for {model_name}: {params}")

        # Set the max_tokens slider maximum to the model's recommended max
        max_tokens_limit = min(params.get("recommended_max_tokens", MAX_TOKENS), MAX_TOKENS)

        return (
            gr.update(value=params["max_tokens"], maximum=max_tokens_limit),  # max_tokens with dynamic maximum
            gr.update(value=params["temperature"]),   # temperature
            gr.update(value=params["top_k"]),         # top_k
            gr.update(value=params["top_p"])          # top_p
        )
    else:
        # Fallback to defaults if model not in cache
        print(f"Model {model_name} not found in cache, using defaults")
        return (
            gr.update(value=1024, maximum=MAX_TOKENS),   # max_tokens
            gr.update(value=0.7),    # temperature
            gr.update(value=50),     # top_k
            gr.update(value=0.95)    # top_p
        )


def cache_all_model_params():
    """Cache generation parameters for all supported models at startup"""
    global MODEL_GEN_PARAMS_CACHE

    print(f"Starting to cache parameters for {len(SUPPORTED_MODELS)} models...")
    print(f"Supported models: {SUPPORTED_MODELS}")

    for model_name in SUPPORTED_MODELS:
        try:
            print(f"Processing model: {model_name}")
            params = fetch_model_generation_params(model_name)
            MODEL_GEN_PARAMS_CACHE[model_name] = params
            print(f"Successfully cached params for {model_name}: {params}")
        except Exception as e:
            print(f"Exception while caching params for {model_name}: {e}")
            # Use default parameters if caching fails
            default_params = {
                "max_tokens": 1024,
                "temperature": 0.7,
                "top_k": 50,
                "top_p": 0.95,
                "max_position_embeddings": 2048,
                "recommended_max_tokens": 1024
            }
            MODEL_GEN_PARAMS_CACHE[model_name] = default_params
            print(f"Using default params for {model_name}: {default_params}")

    print(f"Caching complete. Final cache contents:")
    for model, params in MODEL_GEN_PARAMS_CACHE.items():
        print(f"  {model}: {params}")
    print(f"Cache size: {len(MODEL_GEN_PARAMS_CACHE)} models")

def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
    """Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
    if not token:
        return False
    
    if isinstance(token, gr.OAuthToken):
        token_str = token.token
    elif isinstance(token, str):
        token_str = token
    else:
        return False
    
    try:
        user_info = whoami(token=token_str)
        return (
            user_info.get("isPro", False) or
            any(org.get("isEnterprise", False) for org in user_info.get("orgs", []))
        )
    except Exception as e:
        print(f"Could not verify user's PRO/Enterprise status: {e}")
        return False
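

# A minimal sketch for the "check the output dataset token is valid" item in the
# TODO note above, assuming a simple whoami() round-trip is enough to confirm the
# token works. Illustrative only: nothing in the app calls this helper yet.
def output_token_is_valid(token: Optional[str]) -> bool:
    """Best-effort check that the (hardcoded) output dataset token is usable."""
    if not token:
        return False
    try:
        whoami(token=token)  # raises if the token is invalid, expired, or revoked
        return True
    except Exception as e:
        print(f"Output dataset token validation failed: {e}")
        return False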




def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
    # checks that the request is valid
    # - input dataset exists and can be accessed with the provided token
    try:
        input_dataset_info = get_dataset_infos(request.input_dataset_name, token=request.input_dataset_token)[request.input_dataset_config]
    except Exception as e:
        raise Exception(f"Dataset {request.input_dataset_name} does not exist or cannot be accessed with the provided token.")
    
    # check that the input dataset split exists
    if request.input_dataset_split not in input_dataset_info.splits:
        raise Exception(f"Dataset split {request.input_dataset_split} does not exist in dataset {request.input_dataset_name}. Available splits: {list(input_dataset_info.splits.keys())}")

    # if num_output_examples is 0, set it to the number of examples in the input dataset split
    if request.num_output_examples == 0:
        request.num_output_examples = input_dataset_info.splits[request.input_dataset_split].num_examples
    else: 
        if request.num_output_examples > input_dataset_info.splits[request.input_dataset_split].num_examples:
            raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the number of examples in the input dataset split {input_dataset_info.splits[request.input_dataset_split].num_examples}.")
        request.input_dataset_split = f"{request.input_dataset_split}[:{request.num_output_examples}]"



    # Check user tier and apply appropriate limits
    # Anonymous users (oauth_token is None) are treated as free tier
    is_pro = verify_pro_status(oauth_token) if oauth_token else False
    max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
    
    if request.num_output_examples > max_samples:
        if oauth_token is None:
            user_tier = "non-signed-in"
        else:
            user_tier = "PRO/Enterprise" if is_pro else "free"
        raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")

    # check the prompt column exists in the dataset
    if request.prompt_column not in input_dataset_info.features:
        raise Exception(f"Prompt column {request.prompt_column} does not exist in dataset {request.input_dataset_name}. Available columns: {list(input_dataset_info.features.keys())}")

    # Custom namespaces are currently not supported; the output dataset is created under the org 'synthetic-data-universe'
    # check the output dataset name is valid
    if request.output_dataset_name.count("/") != 1:
        raise Exception("Invalid output dataset name. Provide a plain name without '/'; it will be created automatically as 'synthetic-data-universe/<name>'.")
        
    # check the output dataset does not already exist on the Hub
    # (the raise must happen outside the try, or it would be swallowed by the except)
    output_dataset_exists = False
    try:
        get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
        output_dataset_exists = True
    except Exception:
        pass  # dataset does not exist, which is expected
    if output_dataset_exists:
        raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
    
    # check the output dataset name doesn't already exist in the database
    try:
        url = os.getenv("SUPABASE_URL")
        key = os.getenv("SUPABASE_KEY")
        
        if url and key:
            supabase = create_client(
                url,
                key,
                options=ClientOptions(
                    postgrest_client_timeout=10,
                    storage_client_timeout=10,
                    schema="public",
                )
            )
            
            existing_request = supabase.table("gen-requests").select("id").eq("output_dataset_name", request.output_dataset_name).execute()
            if existing_request.data:
                raise Exception(f"Output dataset {request.output_dataset_name} is already being generated or has been requested. Please choose a different name.")
    except Exception as e:
        # If it's our custom exception about dataset already existing, re-raise it
        if "already being generated" in str(e):
            raise e
        # Otherwise, ignore database connection errors and continue
        pass

    # check the model exists
    try:
        model_config = AutoConfig.from_pretrained(
            request.model_name_or_path,
            revision=request.model_revision,
            force_download=True,
            token=False,
        )
    except Exception as e:
        print(e)
        raise Exception(f"Model {request.model_name_or_path} revision {request.model_revision} does not exist or cannot be accessed. The model may be private or gated, which is not supported at this time.")

    # check the model max position embeddings is greater than the requested max tokens and less than MAX_TOKENS    
    if model_config.max_position_embeddings < request.max_tokens:
        raise Exception(f"Model {request.model_name_or_path} max position embeddings {model_config.max_position_embeddings} is less than the requested max tokens {request.max_tokens}.")
    if request.max_tokens > MAX_TOKENS:
        raise Exception(f"Requested max tokens {request.max_tokens} exceeds the limit of {MAX_TOKENS}.")
    
    # check sampling parameters are valid
    if request.temperature < 0.0 or request.temperature > 2.0:
        raise Exception("Temperature must be between 0.0 and 2.0")
    if request.top_k < 1 or request.top_k > 100:
        raise Exception("Top K must be between 1 and 100")
    if request.top_p < 0.0 or request.top_p > 1.0:
        raise Exception("Top P must be between 0.0 and 1.0")   
        
    return request
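

# A minimal sketch for the "validate max model params" TODO: MAX_MODEL_PARAMS is
# defined above but never enforced. This assumes the model repo publishes
# safetensors metadata on the Hub (many repos do not), and fails open when the
# parameter count cannot be determined. Not yet wired into validate_request.
def check_model_size(model_name: str, revision: str = "main") -> None:
    from huggingface_hub import model_info

    try:
        info = model_info(model_name, revision=revision)
        total_params = info.safetensors.total if info.safetensors else None
    except Exception as e:
        print(f"Could not determine parameter count for {model_name}: {e}")
        return  # fail open: size metadata is often missing
    if total_params and total_params > MAX_MODEL_PARAMS:
        raise Exception(
            f"Model {model_name} has ~{total_params:,} parameters, which exceeds "
            f"the limit of {MAX_MODEL_PARAMS:,}."
        )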


def load_dataset_info(dataset_name, model_name, oauth_token=None):
    """Load dataset information and return choices for dropdowns"""
    if not dataset_name.strip():
        return (
            gr.update(choices=[], value=None),  # config
            gr.update(choices=[], value=None),  # split
            gr.update(choices=[], value=None),  # prompt_column
            gr.update(value="", interactive=True),  # output_dataset_name
            gr.update(interactive=False),  # num_output_samples
            "Please enter a dataset name first."
        )

    try:
        # Get dataset info
        dataset_infos = get_dataset_infos(dataset_name)

        if not dataset_infos:
            raise Exception("No configs found for this dataset")

        # Get available configs
        config_choices = list(dataset_infos.keys())
        default_config = config_choices[0] if config_choices else None

        # Get splits and features for the default config
        if default_config:
            config_info = dataset_infos[default_config]
            split_choices = list(config_info.splits.keys())
            default_split = split_choices[0] if split_choices else None

            # Get column choices (features)
            column_choices = list(config_info.features.keys())
            default_column = None
            # Try to find a likely prompt column
            for col in column_choices:
                if any(keyword in col.lower() for keyword in ['prompt', 'text', 'question', 'input']):
                    default_column = col
                    break
            if not default_column and column_choices:
                default_column = column_choices[0]

            # Get sample count for the default split
            dataset_sample_count = config_info.splits[default_split].num_examples if default_split else 0
        else:
            split_choices = []
            column_choices = []
            default_split = None
            default_column = None
            dataset_sample_count = 0

        # Determine user limits
        is_pro = verify_pro_status(oauth_token) if oauth_token else False
        user_max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE

        # Set slider maximum to the minimum of dataset samples and user limit
        slider_max = min(dataset_sample_count, user_max_samples) if dataset_sample_count > 0 else user_max_samples

        # Get username from OAuth token
        username = "anonymous"
        if oauth_token:
            try:
                if isinstance(oauth_token, gr.OAuthToken):
                    token_str = oauth_token.token
                elif isinstance(oauth_token, str):
                    token_str = oauth_token
                else:
                    token_str = None

                if token_str:
                    user_info = whoami(token=token_str)
                    username = user_info.get("name", "anonymous")
            except Exception:
                username = "anonymous"

        # Generate a suggested output dataset name: username-model-dataset
        dataset_base_name = dataset_name.split('/')[-1] if '/' in dataset_name else dataset_name

        # Extract model short name (e.g., "Qwen/Qwen3-4B-Instruct-2507" -> "Qwen3-4B-Instruct-2507")
        model_short_name = model_name.split('/')[-1]

        # Build the output name: username-model-dataset
        suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"

        # Limit to 86 characters
        if len(suggested_output_name) > 86:
            # Truncate dataset name to fit within limit
            available_for_dataset = 86 - len(username) - len(model_short_name) - 2  # -2 for the hyphens
            if available_for_dataset > 0:
                dataset_base_name = dataset_base_name[:available_for_dataset]
                suggested_output_name = f"{username}-{model_short_name}-{dataset_base_name}"
            else:
                suggested_output_name = f"{username}-{model_short_name}"

        status_msg = f"✅ Dataset info loaded successfully! Found {len(config_choices)} config(s), {len(split_choices)} split(s), and {len(column_choices)} column(s)."
        if dataset_sample_count > 0:
            status_msg += f" Dataset has {dataset_sample_count:,} samples."
            if dataset_sample_count > user_max_samples:
                user_tier = "PRO/Enterprise" if is_pro else "free tier"
                status_msg += f" Limited to {user_max_samples:,} samples for {user_tier} users."

        return (
            gr.update(choices=config_choices, value=default_config, interactive=True),  # config
            gr.update(choices=split_choices, value=default_split, interactive=True),  # split
            gr.update(choices=column_choices, value=default_column, interactive=True),  # prompt_column
            gr.update(value=suggested_output_name, interactive=True),  # output_dataset_name
            gr.update(interactive=True, maximum=slider_max, value=0),  # num_output_samples
            status_msg
        )

    except Exception as e:
        return (
            gr.update(choices=[], value=None, interactive=False),  # config
            gr.update(choices=[], value=None, interactive=False),  # split
            gr.update(choices=[], value=None, interactive=False),  # prompt_column
            gr.update(value="", interactive=False),  # output_dataset_name
            gr.update(interactive=False),  # num_output_samples
            f"❌ Error loading dataset info: {str(e)}"
        )
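

# Hedged refactoring sketch: the Supabase client construction is repeated almost
# verbatim in validate_request, add_request_to_db, and get_generation_stats_safe.
# A shared factory like this could replace those copies (not wired in yet).
def get_supabase_client() -> Client:
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    if not url or not key:
        raise Exception("Missing SUPABASE_URL or SUPABASE_KEY environment variables")
    return create_client(
        url,
        key,
        options=ClientOptions(
            postgrest_client_timeout=10,
            storage_client_timeout=10,
            schema="public",
        ),
    )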


def add_request_to_db(request: GenerationRequest):
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")
    
    try:
        supabase: Client = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            )
        )
        
        data = {
            "status": request.status.value,
            "input_dataset_name": request.input_dataset_name,
            "input_dataset_config": request.input_dataset_config,
            "input_dataset_split": request.input_dataset_split,
            "output_dataset_name": request.output_dataset_name,
            "prompt_column": request.prompt_column,
            "model_name_or_path": request.model_name_or_path,
            "model_revision": request.model_revision,
            "model_token": request.model_token,
            "system_prompt": request.system_prompt,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
            "top_k": request.top_k,
            "top_p": request.top_p,
            "input_dataset_token": request.input_dataset_token,
            "output_dataset_token": request.output_dataset_token,
            "username": request.username,
            "email": request.email,
            "num_output_examples": request.num_output_examples,
            "private": request.private,
        }
            
        supabase.table("gen-requests").insert(data).execute()
    except Exception as e:
        raise Exception(f"Failed to add request to database: {str(e)}")




def get_generation_stats_safe():
    """Safely fetch generation request statistics with proper error handling"""
    try:
        url = os.getenv("SUPABASE_URL")
        key = os.getenv("SUPABASE_KEY")
        
        if not url or not key:
            raise Exception("Missing SUPABASE_URL or SUPABASE_KEY environment variables")
        
        supabase = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            )
        )
        
        # Fetch data excluding sensitive token fields
        response = supabase.table("gen-requests").select(
            "id, created_at, status, input_dataset_name, input_dataset_config, "
            "input_dataset_split, output_dataset_name, prompt_column, "
            "model_name_or_path, model_revision, max_tokens, temperature, "
            "top_k, top_p, username, num_output_examples, private"
        ).order("created_at", desc=True).limit(50).execute()
        
        return {"status": "success", "data": response.data}
        
    except Exception as e:
        return {"status": "error", "message": str(e), "data": []}




def main():
    # Cache model generation parameters at startup
    print("Caching model generation parameters...")
    cache_all_model_params()
    print("Model parameter caching complete.")

    with gr.Blocks(title="DataForge - Synthetic Data Generation") as demo:
        gr.Image("dataforge.png", show_label=False, show_download_button=False, container=False, height=300)
        # Store the current oauth token for use in submit_request
        current_oauth_token = gr.State(None)
        with gr.Row():
            gr.Markdown("")  # Empty space for alignment
            login_button = gr.LoginButton(value="🔑 Sign in", size="sm")
            gr.Markdown("")  # Empty space for alignment

        signin_message = gr.Markdown("## 🔑 Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
        
        # Main description
        gr.Markdown("""
        This tool allows you to **generate synthetic data from existing datasets**, for all your **fine-tuning/research/data augmentation** needs!

        DataForge is built on top of [DataTrove](https://github.com/huggingface/datatrove); our backend data generation script is open-source and available on [GitHub](https://github.com/huggingface/dataforge). DataForge is **FREE** for Hugging Face PRO users (10,000 samples) • 100 samples for free users.
        """)
        gr.Markdown("**All generated datasets will be publicly available under the [synthetic-data-universe](https://huggingface.co/synthetic-data-universe) organization.**")
        

        # Usage guide and examples (right below description)
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Accordion("Usage Guide", open=False):
                    gr.Markdown("""
                    **Step-by-Step Process:**
                    1. **Choose Model**: Select one of the supported models
                    2. **Load Dataset**: Enter a HF dataset name
                    3. **Load Info**: Click "Load Dataset Info"
                    4. **Configure**: Set generation parameters
                    5. **Submit**: Monitor progress in Statistics tab

                    **Requirements:**
                    - Input dataset must be public on HF Hub
                    - Model must be publicly accessible
                    - Free users: 100 samples max, PRO: 10K max
                    - Token limit: 8,192 per sample
                    """)
            with gr.Column(scale=1):
                with gr.Accordion("Examples", open=False):
                    gr.Markdown("""
                    **Popular Use Cases:**

                    **Conversational**: Multi-turn dialogues
                    - Models: Llama-3.2-3B-Instruct, Qwen3-4B-Instruct
                    - Temperature: 0.7-0.9

                    **Code**: Problem → Solution
                    - Models: Qwen3-30B-A3B-Instruct, gpt-oss-20b
                    - Temperature: 0.1-0.3

                    **Example datasets to try:**
                    ```
                    simplescaling/s1K-1.1
                    HuggingFaceH4/ultrachat_200k
                    iamtarun/python_code_instructions_18k_alpaca
                    ```
                    """)


        # Main interface container (hidden until the user signs in)
        main_interface = gr.Column(visible=False)

        with main_interface:
            with gr.Tabs():
                with gr.TabItem("Generate Data"):
                    with gr.Row():
                        with gr.Column():
                            with gr.Group():
                                gr.Markdown("##  Model information")
                                with gr.Column():
                                    with gr.Row():
                                        model_name_or_path = gr.Dropdown(
                                            choices=SUPPORTED_MODELS,
                                            label="Select Model",
                                            value="Qwen/Qwen3-4B-Instruct-2507",
                                            info="Choose from popular instruction-tuned models under 40B parameters"
                                        )
                                    # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
                                with gr.Row():
                                    system_prompt = gr.Textbox(label="System Prompt (Optional)", placeholder="Optional system prompt... e.g., You are a helpful assistant.", info="Sets the AI's role/behavior. Leave empty for default model behavior.")
                                gr.Markdown("### Generation Parameters")
                                with gr.Row():
                                    with gr.Column():
                                        with gr.Row():
                                            max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256, info="Maximum tokens to generate per sample. Higher = longer responses.")
                                            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1, info="Creativity level: 0.1=focused, 0.7=balanced, 1.0+=creative")
                                        with gr.Row():
                                            top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5, info="Limits word choices to top K options. Lower = more focused.")
                                            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05, info="Nucleus sampling: 0.9=focused, 0.95=balanced diversity")

                        with gr.Column():
                            with gr.Group():
                                gr.Markdown("##  Dataset information")
                                # Dynamic user limit info - default to anonymous user
                                user_limit_info = gr.Markdown(value="👀 **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
                                with gr.Row():
                                    with gr.Column():
                                        input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1", info="Public HF dataset with prompts to generate from")
                                        load_info_btn = gr.Button("📊 Load Dataset Info", size="sm", variant="secondary")
                                        load_info_status = gr.Markdown("", visible=True)

                                    with gr.Column():
                                        output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
                                
                                with gr.Row():
                                    with gr.Column():
                                        input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                        prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                        
                                    with gr.Column():
                                        input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
                                        num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")


                    submit_btn = gr.Button("Submit Generation Request", variant="primary")
                    output_status = gr.Textbox(label="Status", interactive=False)
                
                with gr.TabItem("Statistics Dashboard"):
                    gr.Markdown("## DataForge Generation Statistics")
                    gr.Markdown("📊 View recent synthetic data generation requests and their status.")
                    
                    with gr.Row():
                        refresh_stats_btn = gr.Button("🔄 Refresh Statistics", size="sm", variant="secondary")
                        clear_stats_btn = gr.Button("🗑️ Clear Display", size="sm")
                    
                    stats_status = gr.Markdown("Click 'Refresh Statistics' to load recent generation requests.", visible=True)
                    
                    stats_dataframe = gr.Dataframe(
                        headers=["ID", "Created", "Status", "Input Dataset", "Output Dataset", "Model", "Samples", "User"],
                        datatype=["str", "str", "str", "str", "str", "str", "number", "str"],
                        interactive=False,
                        wrap=True,
                        value=[],
                        label="Recent Generation Requests (Last 50)",
                        visible=False
                    )
                    
                    def load_statistics():
                        """Load and format statistics data"""
                        try:
                            # Use the new safe database function
                            result = get_generation_stats_safe()
                            
                            if result["status"] == "error":
                                return (
                                    f"❌ **Error loading statistics**: {result['message']}",
                                    gr.update(visible=False),
                                    gr.update(visible=True)
                                )
                            
                            data = result["data"]
                            if not data:
                                return (
                                    "πŸ“ **No data found**: The database appears to be empty or the table doesn't exist yet.",
                                    gr.update(visible=False),
                                    gr.update(visible=True)
                                )
                            
                            # Format data for display
                            formatted_data = []
                            for item in data:
                                # Format timestamp
                                created_at = item.get('created_at', 'Unknown')
                                if created_at and created_at != 'Unknown':
                                    try:
                                        from datetime import datetime
                                        dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                                        created_at = dt.strftime('%Y-%m-%d %H:%M')
                                    except Exception:
                                        pass
                                
                                formatted_data.append([
                                    str(item.get('id', ''))[:8] + "..." if len(str(item.get('id', ''))) > 8 else str(item.get('id', '')),
                                    created_at,
                                    item.get('status', 'Unknown'),
                                    (item.get('input_dataset_name', '')[:30] + "...") if len(item.get('input_dataset_name', '')) > 30 else item.get('input_dataset_name', ''),
                                    (item.get('output_dataset_name', '')[:30] + "...") if len(item.get('output_dataset_name', '')) > 30 else item.get('output_dataset_name', ''),
                                    (item.get('model_name_or_path', '')[:25] + "...") if len(item.get('model_name_or_path', '')) > 25 else item.get('model_name_or_path', ''),
                                    item.get('num_output_examples', 0),
                                    item.get('username', 'Anonymous')
                                ])
                            
                            return (
                                f"✅ **Statistics loaded successfully**: Found {len(formatted_data)} recent requests.",
                                gr.update(value=formatted_data, visible=True)
                            )
                            
                        except Exception as e:
                            return (
                                f"❌ **Unexpected error**: {str(e)}",
                                gr.update(visible=False)
                            )
                    
                    def clear_statistics():
                        """Clear the statistics display"""
                        return (
                            "Click 'Refresh Statistics' to load recent generation requests.",
                            gr.update(value=[], visible=False)
                        )
                    
                    # Connect buttons to functions (each component may appear only once in outputs)
                    refresh_stats_btn.click(
                        load_statistics,
                        outputs=[stats_status, stats_dataframe]
                    )
                    
                    clear_stats_btn.click(
                        clear_statistics,
                        outputs=[stats_status, stats_dataframe]
                    )

            def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, sys_prompt,
                                max_tok, temp, top_k_val, top_p_val, num_output_samples, oauth_token=None):

                MASTER_ORG = "synthetic-data-universe/"
                model_token = None  # model tokens are currently not supported (GenerationRequest.model_token is str | None)
                input_dataset_token = None # This is currently not supported
                output_dataset_token = os.getenv("OUTPUT_DATASET_TOKEN")

                # Get username from OAuth token
                username = "anonymous"
                if oauth_token:
                    try:
                        if isinstance(oauth_token, gr.OAuthToken):
                            token_str = oauth_token.token
                        elif isinstance(oauth_token, str):
                            token_str = oauth_token
                        else:
                            token_str = None

                        if token_str:
                            user_info = whoami(token=token_str)
                            username = user_info.get("name", "unknown")
                    except Exception:
                        username = "unknown"

                try:
                    request = GenerationRequest(
                        id="",  # Will be generated when adding to the database
                        created_at="",  # Will be set when adding to the database
                        status=GenerationStatus.PENDING,
                        input_dataset_name=input_dataset_name,
                        input_dataset_split=input_split,
                        input_dataset_config=input_dataset_config,
                        output_dataset_name=MASTER_ORG + output_dataset_name,
                        prompt_column=prompt_col,
                        model_name_or_path=model_name,
                        model_revision="main",
                        model_token=model_token,
                        system_prompt=sys_prompt if sys_prompt else None,
                        max_tokens=int(max_tok),
                        temperature=temp,
                        top_k=int(top_k_val),
                        top_p=top_p_val,
                        input_dataset_token=input_dataset_token if input_dataset_token else None,
                        output_dataset_token=output_dataset_token,
                        num_output_examples=num_output_samples,  # will be set after validating the input dataset
                        username=username,
                        email="n/a",
                    )
                    
                    # check the input dataset exists and can be accessed with the provided token
                    request = validate_request(request, oauth_token)
                    add_request_to_db(request)

                    return "Request submitted successfully!"
                except Exception as e:
                    return f"Error: {str(e)}"
            
            # Wire up the Load Dataset Info button
            load_info_btn.click(
                load_dataset_info,
                inputs=[input_dataset_name, model_name_or_path, current_oauth_token],
                outputs=[input_dataset_config, input_dataset_split, prompt_column, output_dataset_name, num_output_samples, load_info_status]
            )

            # Wire up model change to update generation parameters
            model_name_or_path.change(
                update_generation_params,
                inputs=[model_name_or_path],
                outputs=[max_tokens, temperature, top_k, top_p]
            )

            submit_btn.click(
                submit_request,
                inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
                         system_prompt, max_tokens, temperature, top_k, top_p, num_output_samples, current_oauth_token],
                outputs=output_status
            )
        
        def update_user_limits(oauth_token):
            if oauth_token is None:
                return "πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples)."
            
            is_pro = verify_pro_status(oauth_token)
            if is_pro:
                return "✨ **PRO User**: You can generate up to 10,000 samples per request."
            else:
                return "πŸ‘€ **Free User**: You can generate up to 100 samples per request. [Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) for 10,000 samples."

        def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
            # Require users to be signed in
            if oauth_token is None:
                # User is not signed in - show sign-in prompt, hide main interface
                return (
                    gr.update(visible=False),  # main_interface
                    gr.update(visible=True),   # signin_message
                    oauth_token,               # current_oauth_token
                    "",                        # user_limit_info (empty when not signed in)
                    gr.update(),               # num_output_samples (no change)
                    gr.update(value="πŸ”‘ Sign in")  # login_button
                )
            else:
                # User is signed in - show main interface, hide sign-in prompt
                limit_msg = update_user_limits(oauth_token)
                is_pro = verify_pro_status(oauth_token)
                max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
                
                if is_pro:
                    button_text = f"✨ Signed in as PRO ({profile.name if profile else 'User'})"
                else:
                    button_text = f"πŸ‘€ Signed in as {profile.name if profile else 'User'}"
                
                return (
                    gr.update(visible=True),   # main_interface
                    gr.update(visible=False),  # signin_message
                    oauth_token,               # current_oauth_token
                    limit_msg,                 # user_limit_info
                    gr.update(maximum=max_samples),  # num_output_samples
                    gr.update(value=button_text)     # login_button
                )


        # Handle login state changes - LoginButton automatically handles auth state changes
        # The demo.load will handle both initial load and auth changes
        
        demo.load(control_access, inputs=None, outputs=[main_interface, signin_message, current_oauth_token, user_limit_info, num_output_samples, login_button])
        demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)

if __name__ == "__main__":
    main()