Spaces:
Running
Running
Implemented edit support for tokenizer.ggml.scores and token_types
Browse files- _hf_explorer.py +2 -2
- _hf_gguf.py +50 -0
- app.py +207 -26
_hf_explorer.py
CHANGED
|
@@ -33,7 +33,7 @@ class FileExplorer(Component):
|
|
| 33 |
value: str | list[str] | Callable | None = None,
|
| 34 |
file_count: Literal["single", "multiple"] = "multiple",
|
| 35 |
root_dir: str = None,
|
| 36 |
-
branch: str =
|
| 37 |
token: str | None = None,
|
| 38 |
ignore_glob: str | None = None,
|
| 39 |
label: str | None = None,
|
|
@@ -76,7 +76,7 @@ class FileExplorer(Component):
|
|
| 76 |
key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
|
| 77 |
"""
|
| 78 |
self.root_dir = root_dir
|
| 79 |
-
self.branch = branch
|
| 80 |
self.fs = HfFileSystem(token = token)
|
| 81 |
self.glob = glob
|
| 82 |
self.ignore_glob = ignore_glob
|
|
|
|
| 33 |
value: str | list[str] | Callable | None = None,
|
| 34 |
file_count: Literal["single", "multiple"] = "multiple",
|
| 35 |
root_dir: str = None,
|
| 36 |
+
branch: str | None = None,
|
| 37 |
token: str | None = None,
|
| 38 |
ignore_glob: str | None = None,
|
| 39 |
label: str | None = None,
|
|
|
|
| 76 |
key: if assigned, will be used to assume identity across a re-render. Components that have the same key across a re-render will have their value preserved.
|
| 77 |
"""
|
| 78 |
self.root_dir = root_dir
|
| 79 |
+
self.branch = branch or "main"
|
| 80 |
self.fs = HfFileSystem(token = token)
|
| 81 |
self.glob = glob
|
| 82 |
self.ignore_glob = ignore_glob
|
_hf_gguf.py
CHANGED
|
@@ -4,6 +4,56 @@ from fsspec.spec import AbstractBufferedFile
|
|
| 4 |
from typing import Any, Iterator, NamedTuple
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
class GGUFValueType(IntEnum):
|
| 8 |
UINT8 = 0
|
| 9 |
INT8 = 1
|
|
|
|
| 4 |
from typing import Any, Iterator, NamedTuple
|
| 5 |
|
| 6 |
|
| 7 |
+
class TokenType(IntEnum):
    """Type codes for entries of the ``tokenizer.ggml.token_type`` array.

    Members are numbered from 1, so a zero-based index into
    ``[e.name for e in TokenType]`` maps to a member value by adding one.
    """

    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class LlamaFileType(IntEnum):
    """File-type codes used to give ``general.file_type`` a readable name.

    Values are contiguous from 0 to 37; ``LlamaFileType(value).name`` yields
    the label for a stored code and raises ``ValueError`` for an unlisted one.
    """

    ALL_F32 = 0
    MOSTLY_F16 = 1
    MOSTLY_Q4_0 = 2
    MOSTLY_Q4_1 = 3
    MOSTLY_Q4_1_SOME_F16 = 4
    MOSTLY_Q4_2 = 5
    MOSTLY_Q4_3 = 6
    MOSTLY_Q8_0 = 7
    MOSTLY_Q5_0 = 8
    MOSTLY_Q5_1 = 9
    MOSTLY_Q2_K = 10
    MOSTLY_Q3_K_S = 11
    MOSTLY_Q3_K_M = 12
    MOSTLY_Q3_K_L = 13
    MOSTLY_Q4_K_S = 14
    MOSTLY_Q4_K_M = 15
    MOSTLY_Q5_K_S = 16
    MOSTLY_Q5_K_M = 17
    MOSTLY_Q6_K = 18
    MOSTLY_IQ2_XXS = 19
    MOSTLY_IQ2_XS = 20
    MOSTLY_Q2_K_S = 21
    MOSTLY_IQ3_XS = 22
    MOSTLY_IQ3_XXS = 23
    MOSTLY_IQ1_S = 24
    MOSTLY_IQ4_NL = 25
    MOSTLY_IQ3_S = 26
    MOSTLY_IQ3_M = 27
    MOSTLY_IQ2_S = 28
    MOSTLY_IQ2_M = 29
    MOSTLY_IQ4_XS = 30
    MOSTLY_IQ1_M = 31
    MOSTLY_BF16 = 32
    MOSTLY_Q4_0_4_4 = 33
    MOSTLY_Q4_0_4_8 = 34
    MOSTLY_Q4_0_8_8 = 35
    MOSTLY_TQ1_0 = 36
    MOSTLY_TQ2_0 = 37
|
| 55 |
+
|
| 56 |
+
|
| 57 |
class GGUFValueType(IntEnum):
|
| 58 |
UINT8 = 0
|
| 59 |
INT8 = 1
|
app.py
CHANGED
|
@@ -9,7 +9,7 @@ from typing import Annotated, Any, NamedTuple
|
|
| 9 |
from urllib.parse import urlencode
|
| 10 |
|
| 11 |
from _hf_explorer import FileExplorer
|
| 12 |
-
from _hf_gguf import standard_metadata, GGUFValueType, HuggingGGUFstream
|
| 13 |
|
| 14 |
|
| 15 |
hfapi = HfApi()
|
|
@@ -49,6 +49,14 @@ def human_readable_metadata(
|
|
| 49 |
val = str(val[:8])[:-1] + ', ...]'
|
| 50 |
else:
|
| 51 |
val = str(val)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
elif key.endswith('_token_id'):
|
| 53 |
tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
|
| 54 |
|
|
@@ -113,8 +121,23 @@ with gr.Blocks(
|
|
| 113 |
)
|
| 114 |
|
| 115 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
meta_lookup = gr.Dropdown(
|
| 117 |
label = 'Lookup token',
|
|
|
|
| 118 |
allow_custom_value = True,
|
| 119 |
visible = False,
|
| 120 |
)
|
|
@@ -169,6 +192,8 @@ with gr.Blocks(
|
|
| 169 |
# BUG: For some reason using gr.State initial value turns tuple to list?
|
| 170 |
meta_state.value = init_state()
|
| 171 |
|
|
|
|
|
|
|
| 172 |
file_change_components = [
|
| 173 |
meta_changes,
|
| 174 |
file_meta,
|
|
@@ -400,6 +425,8 @@ with gr.Blocks(
|
|
| 400 |
],
|
| 401 |
outputs = [
|
| 402 |
meta_boolean,
|
|
|
|
|
|
|
| 403 |
meta_lookup,
|
| 404 |
meta_number,
|
| 405 |
meta_string,
|
|
@@ -420,7 +447,21 @@ with gr.Blocks(
|
|
| 420 |
elif not key:
|
| 421 |
typ = None
|
| 422 |
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
# TODO: Support arrays?
|
| 425 |
typ = GGUFValueType.ARRAY
|
| 426 |
|
|
@@ -435,15 +476,25 @@ with gr.Blocks(
|
|
| 435 |
value = val if typ == GGUFValueType.BOOL and data is not None else False,
|
| 436 |
visible = True if typ == GGUFValueType.BOOL else False,
|
| 437 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
meta_lookup: gr.Dropdown(
|
| 439 |
None,
|
| 440 |
-
value = tokens[val] if is_number and data is not None and
|
| 441 |
-
visible = True if is_number and
|
| 442 |
),
|
| 443 |
meta_number: gr.Number(
|
| 444 |
-
value = val if is_number and data is not None else 0,
|
| 445 |
precision = 10 if typ == GGUFValueType.FLOAT32 or typ == GGUFValueType.FLOAT64 else 0,
|
| 446 |
-
|
|
|
|
| 447 |
),
|
| 448 |
meta_string: gr.Textbox(
|
| 449 |
value = val if typ == GGUFValueType.STRING else '',
|
|
@@ -483,8 +534,9 @@ with gr.Blocks(
|
|
| 483 |
changes = [(k, 'rem') for k in meta.rem]
|
| 484 |
|
| 485 |
for k, v in meta.add.items():
|
|
|
|
| 486 |
changes.append((k, 'add'))
|
| 487 |
-
changes.append((str(
|
| 488 |
|
| 489 |
m = []
|
| 490 |
for k, v in meta.key.items():
|
|
@@ -498,7 +550,7 @@ with gr.Blocks(
|
|
| 498 |
link += '&' + urlencode(
|
| 499 |
{
|
| 500 |
'rem': meta.rem,
|
| 501 |
-
'add': [json.dumps([k, *v], ensure_ascii = False) for k, v in meta.add.items()],
|
| 502 |
},
|
| 503 |
doseq = True,
|
| 504 |
safe = '[]{}:"\',',
|
|
@@ -554,6 +606,97 @@ with gr.Blocks(
|
|
| 554 |
)
|
| 555 |
|
| 556 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
@gr.on(
|
| 558 |
triggers = [
|
| 559 |
meta_lookup.key_up,
|
|
@@ -563,6 +706,7 @@ with gr.Blocks(
|
|
| 563 |
],
|
| 564 |
outputs = [
|
| 565 |
meta_lookup,
|
|
|
|
| 566 |
],
|
| 567 |
show_progress = 'hidden',
|
| 568 |
trigger_mode = 'always_last',
|
|
@@ -571,16 +715,13 @@ with gr.Blocks(
|
|
| 571 |
meta: MetadataState,
|
| 572 |
keyup: gr.KeyUpData,
|
| 573 |
):
|
| 574 |
-
found =
|
| 575 |
-
value = keyup.input_value.lower()
|
| 576 |
-
tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
|
| 577 |
-
|
| 578 |
-
any(((found.append(t), len(found) > 5)[1] for i, t in enumerate(tokens) if value in t.lower()))
|
| 579 |
|
| 580 |
return {
|
| 581 |
meta_lookup: gr.Dropdown(
|
| 582 |
-
found,
|
| 583 |
),
|
|
|
|
| 584 |
}
|
| 585 |
|
| 586 |
|
|
@@ -590,6 +731,8 @@ with gr.Blocks(
|
|
| 590 |
typ: int | None,
|
| 591 |
val: Any,
|
| 592 |
request: gr.Request,
|
|
|
|
|
|
|
| 593 |
):
|
| 594 |
if not key or typ is None:
|
| 595 |
if key:
|
|
@@ -603,7 +746,18 @@ with gr.Blocks(
|
|
| 603 |
if key in meta.rem:
|
| 604 |
meta.rem.remove(key)
|
| 605 |
|
| 606 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
|
| 608 |
if key.startswith('tokenizer.chat_template.'):
|
| 609 |
template = key[24:]
|
|
@@ -617,29 +771,25 @@ with gr.Blocks(
|
|
| 617 |
)
|
| 618 |
|
| 619 |
|
| 620 |
-
def
|
| 621 |
-
|
| 622 |
-
|
| 623 |
):
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
try:
|
| 627 |
-
found = tokens.index(token)
|
| 628 |
-
except Exception as e:
|
| 629 |
raise gr.Error('Token not found')
|
| 630 |
|
| 631 |
return {
|
| 632 |
meta_number: gr.Number(
|
| 633 |
-
|
| 634 |
),
|
| 635 |
}
|
| 636 |
|
| 637 |
|
| 638 |
meta_lookup.input(
|
| 639 |
-
|
| 640 |
inputs = [
|
| 641 |
-
meta_state,
|
| 642 |
meta_lookup,
|
|
|
|
| 643 |
],
|
| 644 |
outputs = [
|
| 645 |
meta_number,
|
|
@@ -668,6 +818,20 @@ with gr.Blocks(
|
|
| 668 |
] + state_change_components,
|
| 669 |
)
|
| 670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
meta_number.submit(
|
| 672 |
add_metadata,
|
| 673 |
inputs = [
|
|
@@ -675,6 +839,8 @@ with gr.Blocks(
|
|
| 675 |
meta_keys,
|
| 676 |
meta_types,
|
| 677 |
meta_number,
|
|
|
|
|
|
|
| 678 |
],
|
| 679 |
outputs = [
|
| 680 |
] + state_change_components,
|
|
@@ -736,9 +902,24 @@ def stream_repo_file(
|
|
| 736 |
for k in rem_meta:
|
| 737 |
gguf.remove_metadata(k)
|
| 738 |
|
|
|
|
| 739 |
for k in add_meta:
|
| 740 |
k = json.loads(k)
|
| 741 |
if isinstance(k, list) and len(k) == 3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
gguf.add_metadata(*k)
|
| 743 |
|
| 744 |
yield gguf.filesize
|
|
|
|
| 9 |
from urllib.parse import urlencode
|
| 10 |
|
| 11 |
from _hf_explorer import FileExplorer
|
| 12 |
+
from _hf_gguf import standard_metadata, TokenType, LlamaFileType, GGUFValueType, HuggingGGUFstream
|
| 13 |
|
| 14 |
|
| 15 |
hfapi = HfApi()
|
|
|
|
| 49 |
val = str(val[:8])[:-1] + ', ...]'
|
| 50 |
else:
|
| 51 |
val = str(val)
|
| 52 |
+
elif isinstance(val, dict):
|
| 53 |
+
val = '[' + ', '.join((f'{k}: {v}' for k, v in val.items())) + ']'
|
| 54 |
+
elif key == 'general.file_type':
|
| 55 |
+
try:
|
| 56 |
+
ftype = LlamaFileType(val).name
|
| 57 |
+
except:
|
| 58 |
+
ftype = 'UNKNOWN'
|
| 59 |
+
val = f'{ftype} ({val})'
|
| 60 |
elif key.endswith('_token_id'):
|
| 61 |
tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
|
| 62 |
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
with gr.Row():
|
| 124 |
+
meta_token_select = gr.Dropdown(
|
| 125 |
+
label = 'Select token',
|
| 126 |
+
type = 'index',
|
| 127 |
+
allow_custom_value = True,
|
| 128 |
+
visible = False,
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
meta_token_type = gr.Dropdown(
|
| 132 |
+
[e.name for e in TokenType],
|
| 133 |
+
label = 'Token type',
|
| 134 |
+
type = 'index',
|
| 135 |
+
visible = False,
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
meta_lookup = gr.Dropdown(
|
| 139 |
label = 'Lookup token',
|
| 140 |
+
type = 'index',
|
| 141 |
allow_custom_value = True,
|
| 142 |
visible = False,
|
| 143 |
)
|
|
|
|
| 192 |
# BUG: For some reason using gr.State initial value turns tuple to list?
|
| 193 |
meta_state.value = init_state()
|
| 194 |
|
| 195 |
+
token_select_indices = gr.State([])
|
| 196 |
+
|
| 197 |
file_change_components = [
|
| 198 |
meta_changes,
|
| 199 |
file_meta,
|
|
|
|
| 425 |
],
|
| 426 |
outputs = [
|
| 427 |
meta_boolean,
|
| 428 |
+
meta_token_select,
|
| 429 |
+
meta_token_type,
|
| 430 |
meta_lookup,
|
| 431 |
meta_number,
|
| 432 |
meta_string,
|
|
|
|
| 447 |
elif not key:
|
| 448 |
typ = None
|
| 449 |
|
| 450 |
+
do_select_token = False
|
| 451 |
+
do_lookup_token = False
|
| 452 |
+
do_token_type = False
|
| 453 |
+
match key:
|
| 454 |
+
case 'tokenizer.ggml.scores':
|
| 455 |
+
do_select_token = True
|
| 456 |
+
case 'tokenizer.ggml.token_type':
|
| 457 |
+
do_select_token = True
|
| 458 |
+
do_token_type = True
|
| 459 |
+
case s if s.endswith('_token_id'):
|
| 460 |
+
do_lookup_token = True
|
| 461 |
+
case _:
|
| 462 |
+
pass
|
| 463 |
+
|
| 464 |
+
if isinstance(val, list) and not do_select_token:
|
| 465 |
# TODO: Support arrays?
|
| 466 |
typ = GGUFValueType.ARRAY
|
| 467 |
|
|
|
|
| 476 |
value = val if typ == GGUFValueType.BOOL and data is not None else False,
|
| 477 |
visible = True if typ == GGUFValueType.BOOL else False,
|
| 478 |
),
|
| 479 |
+
meta_token_select: gr.Dropdown(
|
| 480 |
+
None,
|
| 481 |
+
value = '',
|
| 482 |
+
visible = True if do_select_token else False,
|
| 483 |
+
),
|
| 484 |
+
meta_token_type: gr.Dropdown(
|
| 485 |
+
interactive = False,
|
| 486 |
+
visible = True if do_token_type else False,
|
| 487 |
+
),
|
| 488 |
meta_lookup: gr.Dropdown(
|
| 489 |
None,
|
| 490 |
+
value = tokens[val] if is_number and data is not None and do_lookup_token and val < len(tokens) else '',
|
| 491 |
+
visible = True if is_number and do_lookup_token else False,
|
| 492 |
),
|
| 493 |
meta_number: gr.Number(
|
| 494 |
+
value = val if is_number and data is not None and not do_select_token else 0,
|
| 495 |
precision = 10 if typ == GGUFValueType.FLOAT32 or typ == GGUFValueType.FLOAT64 else 0,
|
| 496 |
+
interactive = False if do_select_token else True,
|
| 497 |
+
visible = True if is_number and not do_token_type else False,
|
| 498 |
),
|
| 499 |
meta_string: gr.Textbox(
|
| 500 |
value = val if typ == GGUFValueType.STRING else '',
|
|
|
|
| 534 |
changes = [(k, 'rem') for k in meta.rem]
|
| 535 |
|
| 536 |
for k, v in meta.add.items():
|
| 537 |
+
key, typ, val = human_readable_metadata(meta, k, *v)
|
| 538 |
changes.append((k, 'add'))
|
| 539 |
+
changes.append((str(val), None))
|
| 540 |
|
| 541 |
m = []
|
| 542 |
for k, v in meta.key.items():
|
|
|
|
| 550 |
link += '&' + urlencode(
|
| 551 |
{
|
| 552 |
'rem': meta.rem,
|
| 553 |
+
'add': [json.dumps([k, *v], ensure_ascii = False, separators = (',', ':')) for k, v in meta.add.items()],
|
| 554 |
},
|
| 555 |
doseq = True,
|
| 556 |
safe = '[]{}:"\',',
|
|
|
|
| 606 |
)
|
| 607 |
|
| 608 |
|
| 609 |
+
def token_search(
    meta: MetadataState,
    name: str,
    limit: int = 7,
) -> dict[int, str]:
    """Find tokens whose text contains ``name``, ignoring case.

    Args:
        meta: Metadata state; the token list is read from element ``[1]`` of
            ``meta.key['tokenizer.ggml.tokens']`` (empty when the key is absent).
        name: Substring to look for. An empty string matches every token.
        limit: Maximum number of matches to return. The default of 7 is the
            number of matches the search has always stopped at.

    Returns:
        A dict mapping token id (position in the token list) to token text,
        in token-list order, holding at most ``limit`` entries.
    """
    found: dict[int, str] = {}
    if limit <= 0:
        return found

    needle = name.lower()
    tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]

    for i, t in enumerate(tokens):
        if needle in t.lower():
            found[i] = t
            # Stop scanning as soon as the cap is reached; vocabularies are large.
            if len(found) >= limit:
                break

    return found
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
@gr.on(
    triggers = [
        meta_token_select.key_up,
    ],
    inputs = [
        meta_state,
    ],
    outputs = [
        meta_token_select,
        token_select_indices,
    ],
    show_progress = 'hidden',
    trigger_mode = 'always_last',
)
def token_select(
    meta: MetadataState,
    keyup: gr.KeyUpData,
):
    """Refresh the token selector's choices while the user types.

    Searches the token list for the text currently typed into the dropdown,
    then returns the matching token texts as the new choices together with
    their token ids. The ids are stored in ``token_select_indices`` in the
    same order as the choices, so a selected dropdown index can be mapped
    back to a token id.
    """
    found = token_search(meta, keyup.input_value)

    return {
        meta_token_select: gr.Dropdown(
            list(found.values()),
        ),
        token_select_indices: list(found.keys()),
    }
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
@gr.on(
|
| 651 |
+
triggers = [
|
| 652 |
+
meta_token_select.input,
|
| 653 |
+
],
|
| 654 |
+
inputs = [
|
| 655 |
+
meta_state,
|
| 656 |
+
meta_keys,
|
| 657 |
+
meta_token_select,
|
| 658 |
+
token_select_indices,
|
| 659 |
+
],
|
| 660 |
+
outputs = [
|
| 661 |
+
meta_token_type,
|
| 662 |
+
meta_number,
|
| 663 |
+
],
|
| 664 |
+
)
|
| 665 |
+
def token_selected(
|
| 666 |
+
meta: MetadataState,
|
| 667 |
+
key: str,
|
| 668 |
+
choice: int,
|
| 669 |
+
indices: list[int],
|
| 670 |
+
):
|
| 671 |
+
if choice < 0 or choice >= len(indices) or (token := indices[choice]) < 0:
|
| 672 |
+
raise gr.Error('Token not found')
|
| 673 |
+
|
| 674 |
+
tokens = meta.key.get('tokenizer.ggml.tokens', (-1, []))[1]
|
| 675 |
+
|
| 676 |
+
if token >= len(tokens):
|
| 677 |
+
raise gr.Error('Invalid token')
|
| 678 |
+
|
| 679 |
+
data = meta.key.get(key, (-1, []))[1]
|
| 680 |
+
|
| 681 |
+
match key:
|
| 682 |
+
case 'tokenizer.ggml.scores':
|
| 683 |
+
return {
|
| 684 |
+
meta_number: gr.Number(
|
| 685 |
+
value = data[token] if data and len(data) > token else 0.0,
|
| 686 |
+
interactive = True,
|
| 687 |
+
),
|
| 688 |
+
}
|
| 689 |
+
case 'tokenizer.ggml.token_type':
|
| 690 |
+
return {
|
| 691 |
+
meta_token_type: gr.Dropdown(
|
| 692 |
+
value = TokenType(data[token]).name if data and len(data) > token else TokenType.NORMAL.name,
|
| 693 |
+
interactive = True,
|
| 694 |
+
),
|
| 695 |
+
}
|
| 696 |
+
case _:
|
| 697 |
+
raise gr.Error('Invalid metadata key')
|
| 698 |
+
|
| 699 |
+
|
| 700 |
@gr.on(
|
| 701 |
triggers = [
|
| 702 |
meta_lookup.key_up,
|
|
|
|
| 706 |
],
|
| 707 |
outputs = [
|
| 708 |
meta_lookup,
|
| 709 |
+
token_select_indices,
|
| 710 |
],
|
| 711 |
show_progress = 'hidden',
|
| 712 |
trigger_mode = 'always_last',
|
|
|
|
| 715 |
meta: MetadataState,
|
| 716 |
keyup: gr.KeyUpData,
|
| 717 |
):
|
| 718 |
+
found = token_search(meta, keyup.input_value)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
|
| 720 |
return {
|
| 721 |
meta_lookup: gr.Dropdown(
|
| 722 |
+
list(found.values()),
|
| 723 |
),
|
| 724 |
+
token_select_indices: list(found.keys()),
|
| 725 |
}
|
| 726 |
|
| 727 |
|
|
|
|
| 731 |
typ: int | None,
|
| 732 |
val: Any,
|
| 733 |
request: gr.Request,
|
| 734 |
+
choice: int | None = None,
|
| 735 |
+
indices: list[int] | None = None,
|
| 736 |
):
|
| 737 |
if not key or typ is None:
|
| 738 |
if key:
|
|
|
|
| 746 |
if key in meta.rem:
|
| 747 |
meta.rem.remove(key)
|
| 748 |
|
| 749 |
+
match key:
|
| 750 |
+
case 'tokenizer.ggml.scores' | 'tokenizer.ggml.token_type':
|
| 751 |
+
if choice >= 0 and choice < len(indices) and (token := indices[choice]) >= 0:
|
| 752 |
+
tok = meta.add.setdefault(key, (typ, {}))[1]
|
| 753 |
+
tok[str(token)] = val + 1 if key == 'tokenizer.ggml.token_type' else val
|
| 754 |
+
|
| 755 |
+
data = meta.key.setdefault(key, (typ, [0.0 if key == 'tokenizer.ggml.scores' else int(TokenType.NORMAL)] * len(meta.key.get('tokenizer.ggml.tokens', (-1, []))[1])))[1]
|
| 756 |
+
if data:
|
| 757 |
+
for k, v in tok.items():
|
| 758 |
+
data[int(k)] = v
|
| 759 |
+
case _:
|
| 760 |
+
meta.key[key] = meta.add[key] = (typ, val)
|
| 761 |
|
| 762 |
if key.startswith('tokenizer.chat_template.'):
|
| 763 |
template = key[24:]
|
|
|
|
| 771 |
)
|
| 772 |
|
| 773 |
|
| 774 |
+
def token_select_to_id(
    choice: int | None,
    indices: list[int],
):
    """Translate a picked dropdown entry into its token id for the number field.

    Args:
        choice: Zero-based index of the picked dropdown entry. ``None`` is
            accepted because an index-typed dropdown that allows custom text
            has no index to report for text that is not one of its choices.
        indices: Token ids parallel to the dropdown's choices.

    Returns:
        An update setting ``meta_number`` to the resolved token id.

    Raises:
        gr.Error: If the choice is missing, out of range, or maps to a
            negative token id.
    """
    # Guard against None first: comparing None with an int raises TypeError.
    if choice is None or choice < 0 or choice >= len(indices):
        raise gr.Error('Token not found')

    token = indices[choice]
    if token < 0:
        raise gr.Error('Token not found')

    return {
        meta_number: gr.Number(
            token,
        ),
    }
|
| 786 |
|
| 787 |
|
| 788 |
meta_lookup.input(
|
| 789 |
+
token_select_to_id,
|
| 790 |
inputs = [
|
|
|
|
| 791 |
meta_lookup,
|
| 792 |
+
token_select_indices,
|
| 793 |
],
|
| 794 |
outputs = [
|
| 795 |
meta_number,
|
|
|
|
| 818 |
] + state_change_components,
|
| 819 |
)
|
| 820 |
|
| 821 |
+
meta_token_type.input(
|
| 822 |
+
add_metadata,
|
| 823 |
+
inputs = [
|
| 824 |
+
meta_state,
|
| 825 |
+
meta_keys,
|
| 826 |
+
meta_types,
|
| 827 |
+
meta_token_type,
|
| 828 |
+
meta_token_select,
|
| 829 |
+
token_select_indices,
|
| 830 |
+
],
|
| 831 |
+
outputs = [
|
| 832 |
+
] + state_change_components,
|
| 833 |
+
)
|
| 834 |
+
|
| 835 |
meta_number.submit(
|
| 836 |
add_metadata,
|
| 837 |
inputs = [
|
|
|
|
| 839 |
meta_keys,
|
| 840 |
meta_types,
|
| 841 |
meta_number,
|
| 842 |
+
meta_token_select,
|
| 843 |
+
token_select_indices,
|
| 844 |
],
|
| 845 |
outputs = [
|
| 846 |
] + state_change_components,
|
|
|
|
| 902 |
for k in rem_meta:
|
| 903 |
gguf.remove_metadata(k)
|
| 904 |
|
| 905 |
+
tokens = gguf.metadata.get('tokenizer.ggml.tokens')
|
| 906 |
for k in add_meta:
|
| 907 |
k = json.loads(k)
|
| 908 |
if isinstance(k, list) and len(k) == 3:
|
| 909 |
+
if isinstance(k[2], dict):
|
| 910 |
+
if tokens:
|
| 911 |
+
if (data := gguf.metadata.get(k[0])):
|
| 912 |
+
data = data.value
|
| 913 |
+
else:
|
| 914 |
+
data = [0.0 if k[0] == 'tokenizer.ggml.scores' else int(TokenType.NORMAL)] * len(tokens.value)
|
| 915 |
+
|
| 916 |
+
for i, v in k[2].items():
|
| 917 |
+
data[int(i)] = v
|
| 918 |
+
|
| 919 |
+
k[2] = data
|
| 920 |
+
else:
|
| 921 |
+
k[2] = []
|
| 922 |
+
|
| 923 |
gguf.add_metadata(*k)
|
| 924 |
|
| 925 |
yield gguf.filesize
|