Xenova HF Staff committed on
Commit
b462912
·
verified ·
1 Parent(s): 093911c

Update tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +177 -56
tokenizer.json CHANGED
@@ -21,15 +21,6 @@
21
  "rstrip": false,
22
  "normalized": false
23
  },
24
- {
25
- "id": 2,
26
- "special": true,
27
- "content": "[SPACE]",
28
- "single_word": false,
29
- "lstrip": false,
30
- "rstrip": false,
31
- "normalized": true
32
- },
33
  {
34
  "id": 255,
35
  "special": true,
@@ -443,69 +434,199 @@
443
  "rstrip": false,
444
  "normalized": false,
445
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  }
447
  ],
448
  "normalizer": {
449
- "type": "Sequence",
450
- "normalizers": [
451
- {
452
- "type": "Replace",
453
- "pattern": { "String": " " },
454
- "content": "[SPACE]"
455
- }
456
- ]
457
- },
458
- "pre_tokenizer": {
459
- "type": "Whitespace"
460
  },
 
461
  "post_processor": {
462
  "type": "TemplateProcessing",
463
  "single": [
464
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
465
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
466
- { "Sequence": { "id": "A", "type_id": 0 } },
467
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
468
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
469
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  ],
471
  "pair": [
472
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
473
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
474
- { "Sequence": { "id": "A", "type_id": 0 } },
475
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
476
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
477
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
478
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 1 } },
479
- { "SpecialToken": { "id": "BOS", "type_id": 1 } },
480
- { "Sequence": { "id": "B", "type_id": 1 } },
481
- { "SpecialToken": { "id": "EOS", "type_id": 1 } },
482
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } },
483
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
  ],
485
  "special_tokens": {
486
- "BOS": {
487
- "id": "BOS",
488
- "ids": [255],
489
- "tokens": ["<s>"]
 
 
 
 
490
  },
491
- "EOS": {
492
- "id": "EOS",
493
- "ids": [0],
494
- "tokens": ["</s>"]
 
 
 
 
495
  },
496
- "EXAGGERATION": {
497
- "id": "EXAGGERATION",
498
- "ids": [6563],
499
- "tokens": ["<EXAGGERATION>"]
 
 
 
 
500
  },
501
- "START_SPEECH": {
502
- "id": "START_SPEECH",
503
- "ids": [6561],
504
- "tokens": ["<START_SPEECH>"]
 
 
 
 
505
  }
506
  }
507
  },
508
- "decoder": null,
 
 
509
  "model": {
510
  "type": "BPE",
511
  "dropout": null,
@@ -516,7 +637,7 @@
516
  "vocab": {
517
  "[STOP]": 0,
518
  "[UNK]": 1,
519
- "[SPACE]": 2,
520
  "!": 3,
521
  "'": 4,
522
  "(": 5,
 
21
  "rstrip": false,
22
  "normalized": false
23
  },
 
 
 
 
 
 
 
 
 
24
  {
25
  "id": 255,
26
  "special": true,
 
434
  "rstrip": false,
435
  "normalized": false,
436
  "special": true
437
+ },
438
+ {
439
+ "id": 6561,
440
+ "content": "[START_SPEECH]",
441
+ "single_word": false,
442
+ "lstrip": false,
443
+ "rstrip": false,
444
+ "normalized": false,
445
+ "special": true
446
+ },
447
+ {
448
+ "id": 6562,
449
+ "content": "[STOP_SPEECH]",
450
+ "single_word": false,
451
+ "lstrip": false,
452
+ "rstrip": false,
453
+ "normalized": false,
454
+ "special": true
455
+ },
456
+ {
457
+ "id": 6563,
458
+ "content": "[EXAGGERATION]",
459
+ "single_word": false,
460
+ "lstrip": false,
461
+ "rstrip": false,
462
+ "normalized": false,
463
+ "special": true
464
  }
465
  ],
466
  "normalizer": {
467
+ "type": "Replace",
468
+ "pattern": {
469
+ "Regex": "\\s+"
470
+ },
471
+ "content": " "
 
 
 
 
 
 
472
  },
473
+ "pre_tokenizer": null,
474
  "post_processor": {
475
  "type": "TemplateProcessing",
476
  "single": [
477
+ {
478
+ "SpecialToken": {
479
+ "id": "[EXAGGERATION]",
480
+ "type_id": 0
481
+ }
482
+ },
483
+ {
484
+ "SpecialToken": {
485
+ "id": "[START]",
486
+ "type_id": 0
487
+ }
488
+ },
489
+ {
490
+ "Sequence": {
491
+ "id": "A",
492
+ "type_id": 0
493
+ }
494
+ },
495
+ {
496
+ "SpecialToken": {
497
+ "id": "[STOP]",
498
+ "type_id": 0
499
+ }
500
+ },
501
+ {
502
+ "SpecialToken": {
503
+ "id": "[START_SPEECH]",
504
+ "type_id": 0
505
+ }
506
+ },
507
+ {
508
+ "SpecialToken": {
509
+ "id": "[START_SPEECH]",
510
+ "type_id": 0
511
+ }
512
+ }
513
  ],
514
  "pair": [
515
+ {
516
+ "SpecialToken": {
517
+ "id": "[EXAGGERATION]",
518
+ "type_id": 0
519
+ }
520
+ },
521
+ {
522
+ "SpecialToken": {
523
+ "id": "[START]",
524
+ "type_id": 0
525
+ }
526
+ },
527
+ {
528
+ "Sequence": {
529
+ "id": "A",
530
+ "type_id": 0
531
+ }
532
+ },
533
+ {
534
+ "SpecialToken": {
535
+ "id": "[STOP]",
536
+ "type_id": 0
537
+ }
538
+ },
539
+ {
540
+ "SpecialToken": {
541
+ "id": "[START_SPEECH]",
542
+ "type_id": 0
543
+ }
544
+ },
545
+ {
546
+ "SpecialToken": {
547
+ "id": "[START_SPEECH]",
548
+ "type_id": 0
549
+ }
550
+ },
551
+ {
552
+ "SpecialToken": {
553
+ "id": "[EXAGGERATION]",
554
+ "type_id": 1
555
+ }
556
+ },
557
+ {
558
+ "SpecialToken": {
559
+ "id": "[START]",
560
+ "type_id": 1
561
+ }
562
+ },
563
+ {
564
+ "Sequence": {
565
+ "id": "B",
566
+ "type_id": 1
567
+ }
568
+ },
569
+ {
570
+ "SpecialToken": {
571
+ "id": "[STOP]",
572
+ "type_id": 1
573
+ }
574
+ },
575
+ {
576
+ "SpecialToken": {
577
+ "id": "[START_SPEECH]",
578
+ "type_id": 1
579
+ }
580
+ },
581
+ {
582
+ "SpecialToken": {
583
+ "id": "[START_SPEECH]",
584
+ "type_id": 1
585
+ }
586
+ }
587
  ],
588
  "special_tokens": {
589
+ "[START]": {
590
+ "id": "[START]",
591
+ "ids": [
592
+ 255
593
+ ],
594
+ "tokens": [
595
+ "[START]"
596
+ ]
597
  },
598
+ "[STOP]": {
599
+ "id": "[STOP]",
600
+ "ids": [
601
+ 0
602
+ ],
603
+ "tokens": [
604
+ "[STOP]"
605
+ ]
606
  },
607
+ "[EXAGGERATION]": {
608
+ "id": "[EXAGGERATION]",
609
+ "ids": [
610
+ 6563
611
+ ],
612
+ "tokens": [
613
+ "[EXAGGERATION]"
614
+ ]
615
  },
616
+ "[START_SPEECH]": {
617
+ "id": "[START_SPEECH]",
618
+ "ids": [
619
+ 6561
620
+ ],
621
+ "tokens": [
622
+ "[START_SPEECH]"
623
+ ]
624
  }
625
  }
626
  },
627
+ "decoder": {
628
+ "type": "Fuse"
629
+ },
630
  "model": {
631
  "type": "BPE",
632
  "dropout": null,
 
637
  "vocab": {
638
  "[STOP]": 0,
639
  "[UNK]": 1,
640
+ " ": 2,
641
  "!": 3,
642
  "'": 4,
643
  "(": 5,