Update tokenizer
Browse files- tokenizer.json +177 -56
tokenizer.json
CHANGED
|
@@ -21,15 +21,6 @@
|
|
| 21 |
"rstrip": false,
|
| 22 |
"normalized": false
|
| 23 |
},
|
| 24 |
-
{
|
| 25 |
-
"id": 2,
|
| 26 |
-
"special": true,
|
| 27 |
-
"content": "[SPACE]",
|
| 28 |
-
"single_word": false,
|
| 29 |
-
"lstrip": false,
|
| 30 |
-
"rstrip": false,
|
| 31 |
-
"normalized": true
|
| 32 |
-
},
|
| 33 |
{
|
| 34 |
"id": 255,
|
| 35 |
"special": true,
|
|
@@ -443,69 +434,199 @@
|
|
| 443 |
"rstrip": false,
|
| 444 |
"normalized": false,
|
| 445 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 446 |
}
|
| 447 |
],
|
| 448 |
"normalizer": {
|
| 449 |
-
"type": "
|
| 450 |
-
"
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
"content": "[SPACE]"
|
| 455 |
-
}
|
| 456 |
-
]
|
| 457 |
-
},
|
| 458 |
-
"pre_tokenizer": {
|
| 459 |
-
"type": "Whitespace"
|
| 460 |
},
|
|
|
|
| 461 |
"post_processor": {
|
| 462 |
"type": "TemplateProcessing",
|
| 463 |
"single": [
|
| 464 |
-
{
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
],
|
| 471 |
"pair": [
|
| 472 |
-
{
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
{
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
],
|
| 485 |
"special_tokens": {
|
| 486 |
-
"
|
| 487 |
-
"id": "
|
| 488 |
-
"ids": [
|
| 489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
},
|
| 491 |
-
"
|
| 492 |
-
"id": "
|
| 493 |
-
"ids": [
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
},
|
| 496 |
-
"EXAGGERATION": {
|
| 497 |
-
"id": "EXAGGERATION",
|
| 498 |
-
"ids": [
|
| 499 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
},
|
| 501 |
-
"START_SPEECH": {
|
| 502 |
-
"id": "START_SPEECH",
|
| 503 |
-
"ids": [
|
| 504 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
}
|
| 506 |
}
|
| 507 |
},
|
| 508 |
-
"decoder":
|
|
|
|
|
|
|
| 509 |
"model": {
|
| 510 |
"type": "BPE",
|
| 511 |
"dropout": null,
|
|
@@ -516,7 +637,7 @@
|
|
| 516 |
"vocab": {
|
| 517 |
"[STOP]": 0,
|
| 518 |
"[UNK]": 1,
|
| 519 |
-
"
|
| 520 |
"!": 3,
|
| 521 |
"'": 4,
|
| 522 |
"(": 5,
|
|
|
|
| 21 |
"rstrip": false,
|
| 22 |
"normalized": false
|
| 23 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
{
|
| 25 |
"id": 255,
|
| 26 |
"special": true,
|
|
|
|
| 434 |
"rstrip": false,
|
| 435 |
"normalized": false,
|
| 436 |
"special": true
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"id": 6561,
|
| 440 |
+
"content": "[START_SPEECH]",
|
| 441 |
+
"single_word": false,
|
| 442 |
+
"lstrip": false,
|
| 443 |
+
"rstrip": false,
|
| 444 |
+
"normalized": false,
|
| 445 |
+
"special": true
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"id": 6562,
|
| 449 |
+
"content": "[STOP_SPEECH]",
|
| 450 |
+
"single_word": false,
|
| 451 |
+
"lstrip": false,
|
| 452 |
+
"rstrip": false,
|
| 453 |
+
"normalized": false,
|
| 454 |
+
"special": true
|
| 455 |
+
},
|
| 456 |
+
{
|
| 457 |
+
"id": 6563,
|
| 458 |
+
"content": "[EXAGGERATION]",
|
| 459 |
+
"single_word": false,
|
| 460 |
+
"lstrip": false,
|
| 461 |
+
"rstrip": false,
|
| 462 |
+
"normalized": false,
|
| 463 |
+
"special": true
|
| 464 |
}
|
| 465 |
],
|
| 466 |
"normalizer": {
|
| 467 |
+
"type": "Replace",
|
| 468 |
+
"pattern": {
|
| 469 |
+
"Regex": "\\s+"
|
| 470 |
+
},
|
| 471 |
+
"content": " "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
},
|
| 473 |
+
"pre_tokenizer": null,
|
| 474 |
"post_processor": {
|
| 475 |
"type": "TemplateProcessing",
|
| 476 |
"single": [
|
| 477 |
+
{
|
| 478 |
+
"SpecialToken": {
|
| 479 |
+
"id": "[EXAGGERATION]",
|
| 480 |
+
"type_id": 0
|
| 481 |
+
}
|
| 482 |
+
},
|
| 483 |
+
{
|
| 484 |
+
"SpecialToken": {
|
| 485 |
+
"id": "[START]",
|
| 486 |
+
"type_id": 0
|
| 487 |
+
}
|
| 488 |
+
},
|
| 489 |
+
{
|
| 490 |
+
"Sequence": {
|
| 491 |
+
"id": "A",
|
| 492 |
+
"type_id": 0
|
| 493 |
+
}
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"SpecialToken": {
|
| 497 |
+
"id": "[STOP]",
|
| 498 |
+
"type_id": 0
|
| 499 |
+
}
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"SpecialToken": {
|
| 503 |
+
"id": "[START_SPEECH]",
|
| 504 |
+
"type_id": 0
|
| 505 |
+
}
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"SpecialToken": {
|
| 509 |
+
"id": "[START_SPEECH]",
|
| 510 |
+
"type_id": 0
|
| 511 |
+
}
|
| 512 |
+
}
|
| 513 |
],
|
| 514 |
"pair": [
|
| 515 |
+
{
|
| 516 |
+
"SpecialToken": {
|
| 517 |
+
"id": "[EXAGGERATION]",
|
| 518 |
+
"type_id": 0
|
| 519 |
+
}
|
| 520 |
+
},
|
| 521 |
+
{
|
| 522 |
+
"SpecialToken": {
|
| 523 |
+
"id": "[START]",
|
| 524 |
+
"type_id": 0
|
| 525 |
+
}
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"Sequence": {
|
| 529 |
+
"id": "A",
|
| 530 |
+
"type_id": 0
|
| 531 |
+
}
|
| 532 |
+
},
|
| 533 |
+
{
|
| 534 |
+
"SpecialToken": {
|
| 535 |
+
"id": "[STOP]",
|
| 536 |
+
"type_id": 0
|
| 537 |
+
}
|
| 538 |
+
},
|
| 539 |
+
{
|
| 540 |
+
"SpecialToken": {
|
| 541 |
+
"id": "[START_SPEECH]",
|
| 542 |
+
"type_id": 0
|
| 543 |
+
}
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
"SpecialToken": {
|
| 547 |
+
"id": "[START_SPEECH]",
|
| 548 |
+
"type_id": 0
|
| 549 |
+
}
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"SpecialToken": {
|
| 553 |
+
"id": "[EXAGGERATION]",
|
| 554 |
+
"type_id": 1
|
| 555 |
+
}
|
| 556 |
+
},
|
| 557 |
+
{
|
| 558 |
+
"SpecialToken": {
|
| 559 |
+
"id": "[START]",
|
| 560 |
+
"type_id": 1
|
| 561 |
+
}
|
| 562 |
+
},
|
| 563 |
+
{
|
| 564 |
+
"Sequence": {
|
| 565 |
+
"id": "B",
|
| 566 |
+
"type_id": 1
|
| 567 |
+
}
|
| 568 |
+
},
|
| 569 |
+
{
|
| 570 |
+
"SpecialToken": {
|
| 571 |
+
"id": "[STOP]",
|
| 572 |
+
"type_id": 1
|
| 573 |
+
}
|
| 574 |
+
},
|
| 575 |
+
{
|
| 576 |
+
"SpecialToken": {
|
| 577 |
+
"id": "[START_SPEECH]",
|
| 578 |
+
"type_id": 1
|
| 579 |
+
}
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"SpecialToken": {
|
| 583 |
+
"id": "[START_SPEECH]",
|
| 584 |
+
"type_id": 1
|
| 585 |
+
}
|
| 586 |
+
}
|
| 587 |
],
|
| 588 |
"special_tokens": {
|
| 589 |
+
"[START]": {
|
| 590 |
+
"id": "[START]",
|
| 591 |
+
"ids": [
|
| 592 |
+
255
|
| 593 |
+
],
|
| 594 |
+
"tokens": [
|
| 595 |
+
"[START]"
|
| 596 |
+
]
|
| 597 |
},
|
| 598 |
+
"[STOP]": {
|
| 599 |
+
"id": "[STOP]",
|
| 600 |
+
"ids": [
|
| 601 |
+
0
|
| 602 |
+
],
|
| 603 |
+
"tokens": [
|
| 604 |
+
"[STOP]"
|
| 605 |
+
]
|
| 606 |
},
|
| 607 |
+
"[EXAGGERATION]": {
|
| 608 |
+
"id": "[EXAGGERATION]",
|
| 609 |
+
"ids": [
|
| 610 |
+
6563
|
| 611 |
+
],
|
| 612 |
+
"tokens": [
|
| 613 |
+
"[EXAGGERATION]"
|
| 614 |
+
]
|
| 615 |
},
|
| 616 |
+
"[START_SPEECH]": {
|
| 617 |
+
"id": "[START_SPEECH]",
|
| 618 |
+
"ids": [
|
| 619 |
+
6561
|
| 620 |
+
],
|
| 621 |
+
"tokens": [
|
| 622 |
+
"[START_SPEECH]"
|
| 623 |
+
]
|
| 624 |
}
|
| 625 |
}
|
| 626 |
},
|
| 627 |
+
"decoder": {
|
| 628 |
+
"type": "Fuse"
|
| 629 |
+
},
|
| 630 |
"model": {
|
| 631 |
"type": "BPE",
|
| 632 |
"dropout": null,
|
|
|
|
| 637 |
"vocab": {
|
| 638 |
"[STOP]": 0,
|
| 639 |
"[UNK]": 1,
|
| 640 |
+
" ": 2,
|
| 641 |
"!": 3,
|
| 642 |
"'": 4,
|
| 643 |
"(": 5,
|