Spaces:
Running
Running
Bor Hodošček
committed on
feat: remove debug
Browse files
app.py
CHANGED
|
@@ -160,7 +160,9 @@ def _(doc, mo, pl):
|
|
| 160 |
str(token.morph) for token in doc
|
| 161 |
], # To be more precise, this should be merged back in via .to_dict()
|
| 162 |
"Token Position": list(range(len(doc))),
|
| 163 |
-
"Sentence Number": [
|
|
|
|
|
|
|
| 164 |
}
|
| 165 |
)
|
| 166 |
|
|
@@ -367,7 +369,6 @@ def _(hashlib):
|
|
| 367 |
@app.function
|
| 368 |
def fix_token(token: str) -> str:
|
| 369 |
"""Fix token for display with improved space visualization."""
|
| 370 |
-
print(token)
|
| 371 |
# Replace SentencePiece space marker U+2581 with a middle dot
|
| 372 |
token = token.replace(" ", "·")
|
| 373 |
# Replace BPE space marker 'Ġ' with a middle dot
|
|
@@ -388,7 +389,6 @@ def get_tokenizer_info(tokenizer):
|
|
| 388 |
Extract useful information from a tokenizer.
|
| 389 |
Returns a dictionary with tokenizer details.
|
| 390 |
"""
|
| 391 |
-
print(tokenizer)
|
| 392 |
|
| 393 |
info = {}
|
| 394 |
try:
|
|
@@ -485,7 +485,6 @@ def _(
|
|
| 485 |
# Tokenize the input text
|
| 486 |
# Use tokenize to get string representations for analysis and display
|
| 487 |
all_tokens = tokenizer.tokenize(current_text)
|
| 488 |
-
print(all_tokens)
|
| 489 |
total_token_count = len(all_tokens)
|
| 490 |
|
| 491 |
# Limit the number of tokens for display to avoid browser slowdown
|
|
@@ -528,15 +527,19 @@ def _(
|
|
| 528 |
style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
|
| 529 |
# Add title attribute for hover info (original token + ID)
|
| 530 |
title = f"Original: {item['original']}\nID: {item['token_id']}"
|
| 531 |
-
display_content =
|
|
|
|
|
|
|
| 532 |
html_parts.append(
|
| 533 |
f'<span style="{style}" title="{title}">{display_content}</span>'
|
| 534 |
)
|
| 535 |
|
| 536 |
-
token_viz_html = mo.Html(
|
|
|
|
|
|
|
| 537 |
|
| 538 |
-
basic_stats = token_stats[
|
| 539 |
-
length_stats = token_stats[
|
| 540 |
|
| 541 |
basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
|
| 542 |
f"- **{key.replace('_', ' ').title()}:** `{value}`"
|
|
|
|
| 160 |
str(token.morph) for token in doc
|
| 161 |
], # To be more precise, this should be merged back in via .to_dict()
|
| 162 |
"Token Position": list(range(len(doc))),
|
| 163 |
+
"Sentence Number": [
|
| 164 |
+
i for i, sent in enumerate(doc.sents) for token in sent
|
| 165 |
+
],
|
| 166 |
}
|
| 167 |
)
|
| 168 |
|
|
|
|
| 369 |
@app.function
|
| 370 |
def fix_token(token: str) -> str:
|
| 371 |
"""Fix token for display with improved space visualization."""
|
|
|
|
| 372 |
# Replace SentencePiece space marker U+2581 with a middle dot
|
| 373 |
token = token.replace(" ", "·")
|
| 374 |
# Replace BPE space marker 'Ġ' with a middle dot
|
|
|
|
| 389 |
Extract useful information from a tokenizer.
|
| 390 |
Returns a dictionary with tokenizer details.
|
| 391 |
"""
|
|
|
|
| 392 |
|
| 393 |
info = {}
|
| 394 |
try:
|
|
|
|
| 485 |
# Tokenize the input text
|
| 486 |
# Use tokenize to get string representations for analysis and display
|
| 487 |
all_tokens = tokenizer.tokenize(current_text)
|
|
|
|
| 488 |
total_token_count = len(all_tokens)
|
| 489 |
|
| 490 |
# Limit the number of tokens for display to avoid browser slowdown
|
|
|
|
| 527 |
style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;"
|
| 528 |
# Add title attribute for hover info (original token + ID)
|
| 529 |
title = f"Original: {item['original']}\nID: {item['token_id']}"
|
| 530 |
+
display_content = (
|
| 531 |
+
str(item["token_id"]) if show_ids_switch.value else item["display"]
|
| 532 |
+
)
|
| 533 |
html_parts.append(
|
| 534 |
f'<span style="{style}" title="{title}">{display_content}</span>'
|
| 535 |
)
|
| 536 |
|
| 537 |
+
token_viz_html = mo.Html(
|
| 538 |
+
f'<div style="line-height: 1.6;">{"".join(html_parts)}</div>'
|
| 539 |
+
)
|
| 540 |
|
| 541 |
+
basic_stats = token_stats["basic_stats"]
|
| 542 |
+
length_stats = token_stats["length_stats"]
|
| 543 |
|
| 544 |
basic_stats_md = "**Basic Stats:**\n\n" + "\n".join(
|
| 545 |
f"- **{key.replace('_', ' ').title()}:** `{value}`"
|