LLM-Token-Visual

Running

App Files Community

openfree committed on Apr 19

Commit

623f1f7

verified ·

1 Parent(s): 80e2685

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -16

app.py CHANGED Viewed

@@ -184,12 +184,18 @@ def get_varied_color(token: str) -> dict:
         'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
     }
-def fix_token(token: str) -> str:
-    """Fix token for display with improved space visualization."""
-    if token.startswith('Ġ'):
-        space_count = token.count('Ġ')
-        return '·' * space_count + token[space_count:]
-    return token
 def get_token_stats(tokens: list, original_text: str) -> dict:
     """Calculate enhanced statistics about the tokens."""
@@ -286,14 +292,23 @@ def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, f
     token_data = []
     for idx, token in enumerate(display_tokens):
         colors = get_varied_color(token)
-        fixed_token = fix_token(token)
         # Compute the numerical token ID from the tokenizer
         token_id = tokenizer.convert_tokens_to_ids(token)
         token_data.append({
-            'original': token,
-            'display': fixed_token[:-1] if fixed_token.endswith('Ċ') else fixed_token,
             'colors': colors,
-            'newline': fixed_token.endswith('Ċ'),
             'token_id': token_id,
             'token_index': idx
         })
@@ -1549,12 +1564,6 @@ HTML_TEMPLATE = """
             fileDropZone[0].addEventListener('drop', handleDrop, false);
-            function handleDrop(e) {
-                const dt = e.dataTransfer;
-                const files = dt.files;
-                handleFiles(files);
-            }
             fileUploadIcon.on('click', function() {
                 const input = document.createElement('input');
                 input.type = 'file';

         'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
     }
+def fix_token(token: str, tokenizer) -> str:
+    """
+    실제로 UI에 표시하기 전에, tokenizer.decode()를 통해
+    사람이 읽을 수 있는 형태로 디코딩한다.
+    """
+    if not token.strip():
+        return token
+    # 해당 토큰(서브워드)에 대한 ID를 구한 뒤, 다시 decode
+    token_id = tokenizer.convert_tokens_to_ids(token)
+    decoded = tokenizer.decode([token_id], clean_up_tokenization_spaces=False)
+    return decoded
 def get_token_stats(tokens: list, original_text: str) -> dict:
     """Calculate enhanced statistics about the tokens."""
     token_data = []
     for idx, token in enumerate(display_tokens):
         colors = get_varied_color(token)
+        # 디코딩된 토큰으로 교체
+        decoded_token = fix_token(token, tokenizer)
         # Compute the numerical token ID from the tokenizer
         token_id = tokenizer.convert_tokens_to_ids(token)
+        # 개행 여부를 단순히 decoded_token의 끝이 newline인지만 확인 (원하는대로 조정 가능)
+        newline_flag = decoded_token.endswith('\n')
+        # UI에 넣을 display(맨 끝 \n 제거 등)
+        display_str = decoded_token[:-1] if newline_flag else decoded_token
         token_data.append({
+            'original': token,       # raw token
+            'display': display_str,  # 사람이 읽을 수 있는 디코딩된 토큰
             'colors': colors,
+            'newline': newline_flag,
             'token_id': token_id,
             'token_index': idx
         })
             fileDropZone[0].addEventListener('drop', handleDrop, false);
             fileUploadIcon.on('click', function() {
                 const input = document.createElement('input');
                 input.type = 'file';