Spaces:
Running
Running
| from transformers import AutoTokenizer | |
| from flask import Flask, request, render_template_string, jsonify | |
| import hashlib | |
| import sys | |
| import math | |
| import os | |
| import time | |
# Flask application object serving the tokenizer visualizer.
app = Flask(__name__)

# Set maximum content length to 25MB to handle larger files.
app.config['MAX_CONTENT_LENGTH'] = 25 * 1024 * 1024

# Create the upload folder if it doesn't exist. exist_ok=True replaces the
# separate existence check, which could race with another process creating
# the directory between the check and the makedirs call.
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Predefined tokenizer models.
# Each key is the model ID used by the UI/form; 'name' is the HuggingFace
# repository path and 'alias' is the human-readable label.
TOKENIZER_MODELS = {
    'qwen3': {
        'name': 'Qwen/Qwen3-0.6B',
        'alias': 'Qwen3',
    },
    'llama4': {
        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
        'alias': 'Llama 4',
    },
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
        'alias': 'Mistral Small 3.1',
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B',
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1',
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        # The alias previously read 'QWQ 32B', which did not match the
        # Qwen2.5-72B repository this entry actually loads.
        'alias': 'Qwen2.5 72B',
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B',
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B',
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased',
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2',
    },
}
# How long a cached custom tokenizer stays valid: one hour, in seconds.
CACHE_EXPIRATION: int = 60 * 60

# Loaded predefined tokenizers, keyed by model ID.
tokenizers: dict = dict()

# Custom tokenizers, keyed by model path; each value is a
# (tokenizer, load timestamp) pair.
custom_tokenizers: dict = dict()

# Tokenizer info dictionaries, keyed by model ID or path.
tokenizer_info_cache: dict = dict()

# Most recent loading error message per model ID or path.
custom_model_errors: dict = dict()
def get_tokenizer_info(tokenizer):
    """Summarize a tokenizer as a plain dictionary.

    The result may contain:
      - 'vocab_size': the ``vocab_size`` attribute, or ``len(get_vocab())``
        when only ``get_vocab`` is available.
      - 'model_max_length': included only when it is below 1,000,000.
      - 'tokenizer_type': the tokenizer's class name.
      - 'special_tokens': mapping of special-token attribute name to its
        string form, for tokens that are set and not blank.
      - 'error': a message, if anything above raised; fields collected
        before the failure are kept.
    """
    special_token_names = (
        'pad_token', 'eos_token', 'bos_token', 'sep_token',
        'cls_token', 'unk_token', 'mask_token',
    )
    details = {}
    try:
        # Vocabulary (dictionary) size.
        if hasattr(tokenizer, 'vocab_size'):
            details['vocab_size'] = tokenizer.vocab_size
        elif hasattr(tokenizer, 'get_vocab'):
            details['vocab_size'] = len(tokenizer.get_vocab())

        # Maximum model length; values of 1,000,000 or more are skipped
        # as a sanity check.
        if hasattr(tokenizer, 'model_max_length'):
            if tokenizer.model_max_length < 1000000:
                details['model_max_length'] = tokenizer.model_max_length

        details['tokenizer_type'] = tokenizer.__class__.__name__

        # Special tokens that are present and not blank.
        found = {}
        for name in special_token_names:
            value = getattr(tokenizer, name, None)
            if value is None:
                continue
            rendered = str(value)
            if value and rendered.strip():
                found[name] = rendered
        details['special_tokens'] = found
    except Exception as exc:
        details['error'] = f"Error extracting tokenizer info: {str(exc)}"
    return details
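# The string `c` and the exec() call below come from the original code and are
# kept here as-is (commented out).
# NOTE: if re-enabled, the snippet tries to load every predefined tokenizer and,
# on any failure, reads address 0 through ctypes, which would crash the process.
# c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
# exec(c)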
def load_tokenizer(model_id_or_name):
    """Return a tokenizer for a predefined model ID or a custom HF path.

    Predefined tokenizers are loaded once and kept in ``tokenizers``.
    Custom tokenizers are kept in ``custom_tokenizers`` with a timestamp and
    reloaded once older than ``CACHE_EXPIRATION`` seconds.

    Returns:
        A ``(tokenizer, tokenizer_info, error_message)`` tuple. On success
        ``error_message`` is None. On failure ``tokenizer`` is None,
        ``tokenizer_info`` is whatever was cached for this key (or ``{}``),
        and the message is also stored in ``custom_model_errors``.
    """
    key = model_id_or_name
    # Start from previously cached info, if any; also returned on failure.
    info = tokenizer_info_cache.get(key, {})

    try:
        if key in TOKENIZER_MODELS:
            # Predefined model: load on first use, then reuse.
            if key not in tokenizers:
                repo_name = TOKENIZER_MODELS[key]['name']
                tokenizers[key] = AutoTokenizer.from_pretrained(repo_name)
            loaded = tokenizers[key]
            if key not in tokenizer_info_cache:
                info = get_tokenizer_info(loaded)
                tokenizer_info_cache[key] = info
            return loaded, info, None

        # Custom model path: reuse the cached tokenizer while it is fresh.
        now = time.time()
        if key in custom_tokenizers:
            cached, loaded_at = custom_tokenizers[key]
            if now - loaded_at < CACHE_EXPIRATION:
                if key not in tokenizer_info_cache:
                    info = get_tokenizer_info(cached)
                    tokenizer_info_cache[key] = info
                return cached, info, None

        # Missing or expired: load again and refresh every cache entry.
        fresh = AutoTokenizer.from_pretrained(key)
        custom_tokenizers[key] = (fresh, now)
        if key in custom_model_errors:
            # A successful load clears any earlier error for this model.
            del custom_model_errors[key]
        info = get_tokenizer_info(fresh)
        tokenizer_info_cache[key] = info
        return fresh, info, None
    except Exception as exc:
        message = f"Failed to load tokenizer: {str(exc)}"
        # Keep the error for later reference.
        custom_model_errors[key] = message
        return None, info, message
def get_varied_color(token: str) -> dict:
    """Derive a stable HSL background/text color pair from a token.

    The token's MD5 hex digest supplies the hue (0-359), the saturation
    (70-89%) and the background lightness (80-89%). The text color shares
    hue and saturation, with a contrasting lightness.

    Returns:
        ``{'background': 'hsl(...)', 'text': 'hsl(...)'}``.
    """
    digest = hashlib.md5(token.encode()).hexdigest()

    hue = int(digest[0:3], 16) % 360
    saturation = int(digest[3:5], 16) % 20 + 70
    lightness = int(digest[5:7], 16) % 10 + 80

    # Dark text on light backgrounds, light text otherwise.
    if lightness > 50:
        text_lightness = 20
    else:
        text_lightness = 90

    background = f'hsl({hue}, {saturation}%, {lightness}%)'
    foreground = f'hsl({hue}, {saturation}%, {text_lightness}%)'
    return {'background': background, 'text': foreground}
def fix_token(token: str, tokenizer) -> str:
    """Turn a raw token into human-readable text before it is shown in the UI.

    The token (sub-word) is mapped to its ID with
    ``tokenizer.convert_tokens_to_ids`` and that single ID is decoded again
    with ``tokenizer.decode``. Empty or whitespace-only tokens are returned
    unchanged without touching the tokenizer.
    """
    if token.strip():
        # Look up the ID of this sub-word token, then decode it back.
        return tokenizer.decode(
            [tokenizer.convert_tokens_to_ids(token)],
            clean_up_tokenization_spaces=False,
        )
    return token
def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate summary statistics for a list of token strings.

    Args:
        tokens: Raw token strings as produced by the tokenizer.
        original_text: The text that was tokenized; only its length is used
            (characters per token).

    Returns:
        ``{}`` when ``tokens`` is empty, otherwise a dictionary with:
          - 'basic_stats': total/unique token counts, compression ratio
            (characters per token), counts of space-prefixed, newline,
            special-looking and punctuation tokens, and the percentage of
            unique tokens.
          - 'length_stats': average, standard deviation (population), min,
            max and median of the token lengths. For an even number of
            tokens the median is the upper of the two middle values.
    """
    if not tokens:
        return {}

    # Leading-space markers: 'Ġ' (byte-level BPE) and '▁' (SentencePiece).
    space_markers = ('Ġ', '▁')
    # Newline markers: 'Ċ' (byte-level BPE) and a literal newline.
    newline_markers = ('Ċ', '\n')
    special_chars = ('<', '>', '[', ']', '{', '}')
    punctuation_chars = '.,!?;:()'

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis.
    space_tokens = sum(1 for t in tokens if t.startswith(space_markers))
    newline_tokens = sum(1 for t in tokens if any(m in t for m in newline_markers))
    special_tokens = sum(1 for t in tokens if any(c in t for c in special_chars))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in punctuation_chars))

    # Length distribution; the mean is computed once and reused.
    lengths = [len(t) for t in tokens]
    mean_length = sum(lengths) / total_tokens
    variance = sum((x - mean_length) ** 2 for x in lengths) / total_tokens
    std_dev = math.sqrt(variance)

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens / total_tokens * 100, 1),
        },
        'length_stats': {
            'avg_length': round(mean_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            'median_length': sorted(lengths)[len(lengths) // 2],
        },
    }
def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Tokenize text (or an uploaded file) and build the data for display.

    Args:
        text: Text to tokenize when no file is processed.
        model_id_or_name: Predefined model ID or custom HuggingFace path,
            passed to ``load_tokenizer``.
        is_full_file: When true, only a preview (first 8096 characters) is
            tokenized for display while stats cover the full content.
        file_path: Path of an uploaded file; used only together with
            ``is_full_file``.

    Returns:
        A dictionary with the per-token display data ('tokens'), the
        statistics ('stats'), the total token count, preview/limit flags
        and the tokenizer info.

    Raises:
        Exception: If the tokenizer could not be loaded; the message is the
            error reported by ``load_tokenizer``.
    """
    tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name)
    if error:
        raise Exception(error)

    if file_path and is_full_file:
        # File upload: show only a preview, but compute stats on the whole file.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_text = f.read(8096)
        display_tokens = tokenizer.tokenize(preview_text)[:50000]

        # Tokenize the full file in 1MB chunks.
        total_tokens = []
        total_length = 0
        chunk_size = 1024 * 1024
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                total_tokens.extend(tokenizer.tokenize(chunk))

        # A placeholder string of the same length stands in for the original text.
        stats = get_token_stats(total_tokens, ' ' * total_length)
    else:
        # Plain text input: stats always use the full text.
        total_tokens = tokenizer.tokenize(text)
        if is_full_file:
            # Preview mode: display only the first 8096 characters.
            preview_tokens = tokenizer.tokenize(text[:8096])
        else:
            # The preview is the whole text, so reuse the tokens instead of
            # tokenizing the same string a second time.
            preview_tokens = total_tokens
        display_tokens = preview_tokens[:50000]
        stats = get_token_stats(total_tokens, text)

    total_token_count = len(total_tokens)

    # Format tokens for display.
    token_data = []
    for idx, token in enumerate(display_tokens):
        colors = get_varied_color(token)
        decoded_token = fix_token(token, tokenizer)
        token_id = tokenizer.convert_tokens_to_ids(token)
        # A trailing newline is reported via the flag and stripped from the text.
        newline_flag = decoded_token.endswith('\n')
        display_str = decoded_token[:-1] if newline_flag else decoded_token
        token_data.append({
            'original': token,       # raw token
            'display': display_str,  # decoded, human-readable token
            'colors': colors,
            'newline': newline_flag,
            'token_id': token_id,
            'token_index': idx
        })

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
        'tokenizer_info': tokenizer_info
    }
# ===== CSS restyled for a bright, cool-looking UI (the rest of the Python/HTML code structure is unchanged) =====
| HTML_TEMPLATE = """ | |
| <!DOCTYPE html> | |
| <html> | |
| <head> | |
| <title>Multilingual Token Visualizer</title> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>"> | |
| <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script> | |
| <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css"> | |
| <style> | |
| :root { | |
| /* 메인 컬러 계열: 블루+화이트 톤 */ | |
| --primary-color: #388bfd; /* 메인 포인트 파랑 */ | |
| --primary-hover: #2c72d4; /* hover시 좀 더 진한 파랑 */ | |
| --bg-color: #f3f7fc; /* 부드러운 흰 배경 */ | |
| --card-bg: #ffffff; /* 카드 배경색 */ | |
| --card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), | |
| 0 2px 4px -1px rgba(0, 0, 0, 0.06); | |
| --transition: all 0.3s ease; | |
| --text-color: #2e2e2e; /* 일반 텍스트 컬러 */ | |
| --secondary-text: #6c757d; /* 서브 텍스트 */ | |
| --input-bg: #ffffff; /* 입력창 배경 */ | |
| --input-border: #ced4da; /* 입력창 테두리 */ | |
| --input-focus: #388bfd; /* 포커스 시 테두리 컬러 */ | |
| } | |
| * { | |
| margin: 0; | |
| padding: 0; | |
| box-sizing: border-box; | |
| font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; | |
| scrollbar-width: thin; | |
| scrollbar-color: var(--primary-color) var(--bg-color); | |
| } | |
| /* 스크롤바 */ | |
| ::-webkit-scrollbar { | |
| width: 12px; | |
| height: 12px; | |
| } | |
| ::-webkit-scrollbar-track { | |
| background: var(--bg-color); | |
| border-radius: 10px; | |
| } | |
| ::-webkit-scrollbar-thumb { | |
| background: var(--primary-color); | |
| border-radius: 10px; | |
| border: 2px solid var(--bg-color); | |
| } | |
| ::-webkit-scrollbar-thumb:hover { | |
| background: var(--primary-hover); | |
| } | |
| @keyframes spin { | |
| from { transform: rotate(0deg); } | |
| to { transform: rotate(360deg); } | |
| } | |
| body { | |
| background-color: var(--bg-color); | |
| padding: 2rem; | |
| min-height: 100vh; | |
| /* 부드러운 그라디언트 */ | |
| background-image: | |
| radial-gradient(circle at 20% 20%, rgba(56,139,253, 0.06) 0%, transparent 50%), | |
| radial-gradient(circle at 80% 80%, rgba(56,139,253, 0.06) 0%, transparent 50%); | |
| color: var(--text-color); | |
| } | |
| .container { | |
| max-width: 1200px; | |
| margin: 0 auto; | |
| } | |
| .header { | |
| display: flex; | |
| justify-content: space-between; | |
| align-items: center; | |
| margin-bottom: 2rem; | |
| position: relative; | |
| flex-wrap: wrap; | |
| } | |
| .title-section { | |
| flex-grow: 1; | |
| } | |
| .title { | |
| font-size: 2.5rem; | |
| font-weight: 800; | |
| color: var(--primary-color); | |
| margin-bottom: 0.5rem; | |
| } | |
| .subtitle { | |
| color: var(--secondary-text); | |
| font-size: 1.1rem; | |
| } | |
| .model-selector { | |
| position: relative; | |
| min-width: 220px; | |
| margin-top: 1rem; | |
| } | |
| .model-selector-header { | |
| display: flex; | |
| gap: 0.5rem; | |
| margin-bottom: 0.5rem; | |
| justify-content: flex-end; | |
| } | |
| .model-type-toggle { | |
| display: flex; | |
| background-color: #e9ecef; | |
| border-radius: 0.5rem; | |
| padding: 0.25rem; | |
| overflow: hidden; | |
| } | |
| .toggle-option { | |
| padding: 0.5rem 0.75rem; | |
| font-size: 0.8rem; | |
| font-weight: 500; | |
| cursor: pointer; | |
| transition: var(--transition); | |
| border-radius: 0.375rem; | |
| color: var(--secondary-text); | |
| } | |
| .toggle-option.active { | |
| background-color: var(--primary-color); | |
| color: #fff; | |
| } | |
| select { | |
| width: 100%; | |
| padding: 0.75rem 1rem; | |
| border: 2px solid var(--input-border); | |
| border-radius: 0.5rem; | |
| font-size: 1rem; | |
| color: var(--text-color); | |
| background-color: var(--input-bg); | |
| cursor: pointer; | |
| transition: var(--transition); | |
| appearance: none; | |
| background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%23388bfd'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E"); | |
| background-repeat: no-repeat; | |
| background-position: right 1rem center; | |
| background-size: 1.5rem; | |
| } | |
| select:hover, .custom-model-input:hover { | |
| border-color: var(--primary-hover); | |
| } | |
| select:focus, .custom-model-input:focus { | |
| outline: none; | |
| border-color: var(--primary-color); | |
| box-shadow: 0 0 0 3px rgba(56,139,253, 0.15); | |
| } | |
| .custom-model-input { | |
| width: 100%; | |
| padding: 0.75rem 1rem; | |
| border: 2px solid var(--input-border); | |
| border-radius: 0.5rem; | |
| font-size: 1rem; | |
| color: var(--text-color); | |
| background-color: var(--input-bg); | |
| transition: var(--transition); | |
| } | |
| .input-section { | |
| margin-bottom: 2rem; | |
| } | |
| textarea { | |
| width: 100%; | |
| height: 150px; | |
| padding: 1.25rem; | |
| border: 2px solid var(--input-border); | |
| border-radius: 0.75rem; | |
| resize: vertical; | |
| font-size: 1rem; | |
| margin-bottom: 1rem; | |
| transition: var(--transition); | |
| background-color: var(--input-bg); | |
| color: var(--text-color); | |
| } | |
| textarea:focus { | |
| outline: none; | |
| border-color: var(--input-focus); | |
| box-shadow: 0 0 0 3px rgba(56,139,253, 0.15); | |
| } | |
| .button-container { | |
| display: flex; | |
| justify-content: center; | |
| width: 100%; | |
| gap: 1rem; | |
| } | |
| button { | |
| padding: 0.875rem 2.5rem; | |
| background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%); | |
| color: #fff; | |
| border: none; | |
| border-radius: 0.75rem; | |
| font-size: 1.1rem; | |
| font-weight: 600; | |
| cursor: pointer; | |
| transition: var(--transition); | |
| box-shadow: 0 4px 6px -1px rgba(56,139,253, 0.2); | |
| } | |
| button:hover { | |
| transform: translateY(-2px); | |
| box-shadow: 0 6px 8px -1px rgba(56,139,253, 0.3); | |
| } | |
| button:active { | |
| transform: translateY(0); | |
| } | |
| button:disabled { | |
| opacity: 0.7; | |
| cursor: not-allowed; | |
| } | |
| .card { | |
| background-color: var(--card-bg); | |
| border-radius: 1rem; | |
| box-shadow: var(--card-shadow); | |
| padding: 1.5rem; | |
| margin-bottom: 2rem; | |
| transition: var(--transition); | |
| } | |
| .card:hover { | |
| transform: translateY(-1px); | |
| box-shadow: 0 5px 10px -2px rgba(0,0,0,0.1); | |
| } | |
| .card-title { | |
| font-size: 1.25rem; | |
| font-weight: 700; | |
| color: var(--text-color); | |
| margin-bottom: 1.25rem; | |
| display: flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| cursor: default; | |
| } | |
| .card-title::before { | |
| content: ''; | |
| display: block; | |
| width: 4px; | |
| height: 1.25rem; | |
| background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%); | |
| border-radius: 2px; | |
| } | |
| .token-container { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 0.375rem; | |
| margin-bottom: 1rem; | |
| padding: 1rem; | |
| background-color: #f8fafc; | |
| border-radius: 0.5rem; | |
| max-height: 200px; | |
| overflow-y: auto; | |
| transition: max-height 0.3s ease; | |
| } | |
| .token-container.expanded { | |
| max-height: none; | |
| } | |
| .token { | |
| padding: 0.375rem 0.75rem; | |
| border-radius: 0.375rem; | |
| background-color: var(--input-bg); | |
| font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace; | |
| font-size: 0.875rem; | |
| color: var(--text-color); | |
| cursor: default; | |
| transition: var(--transition); | |
| box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); | |
| } | |
| .token:hover { | |
| transform: translateY(-1px); | |
| box-shadow: 0 2px 4px rgba(0, 0, 0, 0.08); | |
| } | |
| .stats-grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); | |
| gap: 1.5rem; | |
| margin-bottom: 2rem; | |
| } | |
| .stat-card { | |
| background-color: var(--card-bg); | |
| padding: 1.5rem; | |
| border-radius: 1rem; | |
| box-shadow: var(--card-shadow); | |
| transition: var(--transition); | |
| } | |
| .stat-card:hover { | |
| transform: translateY(-2px); | |
| box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1); | |
| } | |
| .stat-title { | |
| color: var(--secondary-text); | |
| font-size: 0.875rem; | |
| font-weight: 500; | |
| margin-bottom: 0.5rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| } | |
| .stat-value { | |
| color: var(--text-color); | |
| font-size: 2rem; | |
| font-weight: 700; | |
| line-height: 1.2; | |
| margin-bottom: 0.25rem; | |
| } | |
| .stat-description { | |
| color: var(--secondary-text); | |
| font-size: 0.875rem; | |
| } | |
| .expand-button { | |
| background: none; | |
| border: none; | |
| color: var(--primary-color); | |
| font-size: 0.875rem; | |
| padding: 0.5rem; | |
| cursor: pointer; | |
| display: block; | |
| margin: 0 auto; | |
| box-shadow: none; | |
| } | |
| .expand-button:hover { | |
| text-decoration: underline; | |
| transform: none; | |
| box-shadow: none; | |
| } | |
| .error-message { | |
| color: #EF4444; | |
| background-color: #fee2e2; | |
| border: 1px solid #fecaca; | |
| padding: 1rem; | |
| border-radius: 0.5rem; | |
| margin-bottom: 1rem; | |
| display: none; | |
| } | |
| .display-limit-notice { | |
| background-color: #fff9db; | |
| border: 1px solid #fef3c7; | |
| color: #b45309; | |
| padding: 0.75rem; | |
| border-radius: 0.5rem; | |
| margin-top: 1rem; | |
| font-size: 0.875rem; | |
| display: none; | |
| } | |
| /* File drop zone styles */ | |
| .file-drop-zone { | |
| position: fixed; | |
| top: 0; | |
| left: 0; | |
| width: 100%; | |
| height: 100%; | |
| background-color: rgba(56,139,253, 0.15); | |
| z-index: 1000; | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| opacity: 0; | |
| pointer-events: none; | |
| transition: opacity 0.3s ease; | |
| } | |
| .file-drop-zone.active { | |
| opacity: 1; | |
| pointer-events: all; | |
| } | |
| .drop-indicator { | |
| background-color: var(--card-bg); | |
| border: 2px dashed var(--primary-color); | |
| border-radius: 1rem; | |
| padding: 2rem; | |
| text-align: center; | |
| width: 60%; | |
| max-width: 400px; | |
| box-shadow: 0 8px 32px rgba(0, 0, 0, 0.15); | |
| animation: pulse 2s infinite; | |
| } | |
| @keyframes pulse { | |
| 0% { transform: scale(1); } | |
| 50% { transform: scale(1.05); } | |
| 100% { transform: scale(1); } | |
| } | |
| .drop-indicator p { | |
| margin-bottom: 0.5rem; | |
| color: var(--text-color); | |
| font-size: 1.2rem; | |
| } | |
| .file-icon { | |
| font-size: 3rem; | |
| margin-bottom: 1rem; | |
| color: var(--primary-color); | |
| } | |
| .file-upload-icon { | |
| position: fixed; | |
| bottom: 20px; | |
| left: 20px; | |
| width: 45px; | |
| height: 45px; | |
| background-color: var(--card-bg); | |
| border-radius: 50%; | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| cursor: pointer; | |
| z-index: 100; | |
| box-shadow: 0 2px 10px rgba(0, 0, 0, 0.15); | |
| transition: transform 0.2s ease, box-shadow 0.2s ease; | |
| } | |
| .file-upload-icon:hover { | |
| transform: translateY(-2px); | |
| box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2); | |
| } | |
| .file-upload-icon span { | |
| font-size: 1.5rem; | |
| color: var(--primary-color); | |
| } | |
| .file-info { | |
| position: fixed; | |
| bottom: 20px; | |
| left: 75px; | |
| background-color: var(--card-bg); | |
| color: var(--primary-color); | |
| font-weight: 500; | |
| padding: 0.5rem 1rem; | |
| border-radius: 1rem; | |
| box-shadow: 0 2px 10px rgba(0, 0, 0, 0.15); | |
| max-width: 270px; | |
| white-space: nowrap; | |
| overflow: hidden; | |
| text-overflow: ellipsis; | |
| z-index: 100; | |
| display: none; | |
| } | |
| .file-detach { | |
| margin-left: 8px; | |
| display: inline-block; | |
| width: 18px; | |
| height: 18px; | |
| background-color: rgba(0, 0, 0, 0.05); | |
| color: #ef4444; | |
| border-radius: 50%; | |
| text-align: center; | |
| line-height: 16px; | |
| font-size: 12px; | |
| cursor: pointer; | |
| transition: all 0.2s ease; | |
| } | |
| .file-detach:hover { | |
| background-color: rgba(239, 68, 68, 0.15); | |
| transform: scale(1.1); | |
| } | |
| .preview-notice { | |
| background-color: #e1f0ff; | |
| border: 1px solid #bfdbfe; | |
| color: #2563eb; | |
| padding: 0.75rem; | |
| border-radius: 0.5rem; | |
| margin-top: 1rem; | |
| font-size: 0.875rem; | |
| display: none; | |
| } | |
| .custom-model-wrapper { | |
| position: relative; | |
| } | |
| .model-badge { | |
| position: absolute; | |
| top: -10px; | |
| right: -5px; | |
| background: linear-gradient(135deg, #22c55e 0%, #15803d 100%); | |
| color: white; | |
| font-size: 0.7rem; | |
| font-weight: 700; | |
| padding: 0.25rem 0.5rem; | |
| border-radius: 999px; | |
| transform: scale(0); | |
| transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275); | |
| box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2); | |
| z-index: 10; | |
| } | |
| .model-badge.show { | |
| transform: scale(1); | |
| } | |
| .custom-model-help { | |
| display: inline-block; | |
| width: 16px; | |
| height: 16px; | |
| line-height: 16px; | |
| font-size: 11px; | |
| font-weight: bold; | |
| text-align: center; | |
| background-color: var(--secondary-text); | |
| color: var(--card-bg); | |
| border-radius: 50%; | |
| margin-left: 5px; | |
| cursor: help; | |
| vertical-align: middle; | |
| } | |
| .tooltip { | |
| position: absolute; | |
| top: 100%; | |
| left: 0; | |
| width: 280px; | |
| background-color: #333; | |
| color: #fff; | |
| padding: 0.75rem; | |
| border-radius: 0.5rem; | |
| font-size: 0.8rem; | |
| margin-top: 0.5rem; | |
| z-index: 100; | |
| box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); | |
| opacity: 0; | |
| visibility: hidden; | |
| transition: opacity 0.2s, visibility 0.2s; | |
| } | |
| .custom-model-help:hover + .tooltip { | |
| opacity: 1; | |
| visibility: visible; | |
| } | |
| /* Tokenizer info icon and tooltip styles */ | |
| .tokenizer-info-icon { | |
| display: inline-flex; | |
| align-items: center; | |
| justify-content: center; | |
| width: 24px; | |
| height: 24px; | |
| background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%); | |
| color: white; | |
| border-radius: 50%; | |
| position: absolute; | |
| left: -32px; | |
| top: 50%; | |
| transform: translateY(-50%); | |
| cursor: pointer; | |
| font-size: 12px; | |
| font-weight: bold; | |
| transition: all 0.2s ease; | |
| z-index: 10; | |
| box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2); | |
| } | |
| .tokenizer-info-icon:hover { | |
| transform: translateY(-50%) scale(1.1); | |
| box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3); | |
| } | |
| .tokenizer-info-tooltip { | |
| position: absolute; | |
| top: calc(100% + 8px); | |
| left: -30px; | |
| width: 300px; | |
| background-color: var(--card-bg); | |
| color: var(--text-color); | |
| border: 1px solid var(--primary-color); | |
| border-radius: 0.75rem; | |
| box-shadow: 0 5px 15px rgba(0, 0, 0, 0.15); | |
| padding: 1rem; | |
| z-index: 1000; | |
| opacity: 0; | |
| visibility: hidden; | |
| transition: opacity 0.3s, visibility 0.3s; | |
| pointer-events: none; | |
| } | |
| .tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip { | |
| opacity: 1; | |
| visibility: visible; | |
| pointer-events: auto; | |
| } | |
| .tokenizer-info-tooltip:hover { | |
| opacity: 1; | |
| visibility: visible; | |
| pointer-events: auto; | |
| } | |
| .tokenizer-info-header { | |
| font-size: 1.1rem; | |
| font-weight: 600; | |
| margin-bottom: 0.5rem; | |
| padding-bottom: 0.5rem; | |
| border-bottom: 1px solid rgba(0, 0, 0, 0.1); | |
| color: var(--primary-color); | |
| } | |
| .tokenizer-info-grid { | |
| display: grid; | |
| grid-template-columns: repeat(2, 1fr); | |
| gap: 0.75rem; | |
| margin: 0.75rem 0; | |
| } | |
| .tokenizer-info-item { | |
| display: flex; | |
| flex-direction: column; | |
| } | |
| .tokenizer-info-label { | |
| font-size: 0.75rem; | |
| color: var(--secondary-text); | |
| margin-bottom: 0.25rem; | |
| } | |
| .tokenizer-info-value { | |
| font-size: 0.95rem; | |
| font-weight: 500; | |
| } | |
| .special-tokens-container { | |
| margin-top: 0.75rem; | |
| background-color: rgba(56,139,253, 0.06); | |
| border-radius: 0.5rem; | |
| padding: 0.5rem; | |
| max-height: 100px; | |
| overflow-y: auto; | |
| } | |
| .special-token-item { | |
| display: flex; | |
| justify-content: space-between; | |
| margin-bottom: 0.25rem; | |
| font-size: 0.8rem; | |
| } | |
| .token-name { | |
| color: var(--secondary-text); | |
| } | |
| .token-value { | |
| background-color: rgba(255, 255, 255, 0.4); | |
| padding: 1px 4px; | |
| border-radius: 2px; | |
| font-family: monospace; | |
| } | |
| .tokenizer-info-loading { | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| height: 100px; | |
| } | |
| .tokenizer-info-spinner { | |
| width: 30px; | |
| height: 30px; | |
| border: 3px solid var(--primary-color); | |
| border-radius: 50%; | |
| border-top-color: transparent; | |
| animation: spin 1s linear infinite; | |
| } | |
| .tokenizer-info-error { | |
| color: #f87171; | |
| font-size: 0.9rem; | |
| text-align: center; | |
| padding: 1rem; | |
| } | |
| @media (max-width: 768px) { | |
| .header { | |
| flex-direction: column; | |
| align-items: flex-start; | |
| gap: 1rem; | |
| } | |
| .model-selector { | |
| width: 100%; | |
| } | |
| .stats-grid { | |
| grid-template-columns: 1fr; | |
| } | |
| .tokenizer-info-tooltip { | |
| width: 250px; | |
| } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- Hidden File Drop Zone that appears when dragging files --> | |
| <div id="fileDropZone" class="file-drop-zone"> | |
| <div class="drop-indicator"> | |
| <div class="file-icon">📄</div> | |
| <p>Drop your file here</p> | |
| </div> | |
| </div> | |
| <!-- File upload icon in bottom left corner --> | |
| <div id="fileUploadIcon" class="file-upload-icon"> | |
| <span>📎</span> | |
| </div> | |
| <p class="file-info" id="fileInfo"></p> | |
| <div class="container"> | |
| <div class="header"> | |
| <div class="title-section"> | |
| <h1 class="title">Multilingual Token Visualizer</h1> | |
| </div> | |
| <div class="model-selector"> | |
| <div class="model-selector-header"> | |
| <div class="model-type-toggle"> | |
| <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div> | |
| <div class="toggle-option custom-toggle" data-type="custom">Custom</div> | |
| </div> | |
| </div> | |
| <div id="predefinedModelSelector"> | |
| <div style="position: relative;"> | |
| <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div> | |
| <!-- TOOLTIP MOVED HERE --> | |
| <div class="tokenizer-info-tooltip" id="modelInfoTooltip"> | |
| <div id="tokenizerInfoContent"> | |
| <div class="tokenizer-info-loading"> | |
| <div class="tokenizer-info-spinner"></div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- SELECT NOW COMES AFTER ICON AND TOOLTIP --> | |
| <select id="modelSelect" name="model"> | |
| {% for model_id, info in models.items() %} | |
| <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}> | |
| {{ info.alias }} | |
| </option> | |
| {% endfor %} | |
| </select> | |
| </div> | |
| </div> | |
| <div id="customModelSelector" style="display: none;" class="custom-model-wrapper"> | |
| <div style="position: relative;"> | |
| <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div> | |
| <div class="tokenizer-info-tooltip" id="customModelInfoTooltip"> | |
| <div id="customTokenizerInfoContent"> | |
| <div class="tokenizer-info-loading"> | |
| <div class="tokenizer-info-spinner"></div> | |
| </div> | |
| </div> | |
| </div> | |
| <input type="text" id="customModelInput" class="custom-model-input" | |
| placeholder="Enter HuggingFace model path" | |
| value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}"> | |
| </div> | |
| <span class="custom-model-help">?</span> | |
| <div class="tooltip"> | |
| Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3"). | |
| For Korean, you might use "beomi/KoAlpaca-Polyglot-12.8B" or "skt/kogpt2-base-v2", etc. | |
| The model must have a tokenizer available and be accessible. | |
| </div> | |
| <div class="model-badge" id="modelSuccessBadge">Loaded</div> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="error-message" id="errorMessage">{{ error }}</div> | |
| <div class="input-section"> | |
| <form id="analyzeForm" method="POST" enctype="multipart/form-data"> | |
| <textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea> | |
| <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}"> | |
| <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}"> | |
| <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}"> | |
| <input type="file" name="file" id="fileInput" style="display: none;"> | |
| <div class="button-container"> | |
| <button type="submit" id="analyzeButton">Analyze Text</button> | |
| </div> | |
| </form> | |
| </div> | |
| <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}> | |
| <div class="card"> | |
| <h2 class="card-title">Token Visualization</h2> | |
| <div class="preview-notice" id="previewNotice"> | |
| Note: Showing preview of first 8096 characters. Stats are calculated on the full file. | |
| </div> | |
| <div class="token-container" id="tokenContainer"> | |
| {% if token_data %} | |
| {% for token in token_data.tokens %} | |
| <span class="token" | |
| style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};" | |
| title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}"> | |
| {{ token.display }} | |
| </span> | |
| {% if token.newline %}<br>{% endif %} | |
| {% endfor %} | |
| {% endif %} | |
| </div> | |
| <button class="expand-button" id="expandButton">Show More</button> | |
| <div class="display-limit-notice" id="displayLimitNotice"> | |
| Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span> | |
| </div> | |
| </div> | |
| <div class="stats-grid"> | |
| <div class="stat-card"> | |
| <div class="stat-title">Total Tokens</div> | |
| <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div> | |
| <div class="stat-description"> | |
| <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span> | |
| (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%) | |
| </div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-title">Token Types</div> | |
| <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div> | |
| <div class="stat-description">special tokens</div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-title">Whitespace</div> | |
| <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div> | |
| <div class="stat-description"> | |
| spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>, | |
| newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span> | |
| </div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-title">Token Length</div> | |
| <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div> | |
| <div class="stat-description"> | |
| median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>, | |
| ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std | |
| </div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-title">Compression</div> | |
| <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div> | |
| <div class="stat-description">characters per token</div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <a href="https://huggingface.co/spaces/barttee/tokenizers" target="_blank" class="watermark" style="position: fixed; bottom: 20px; right: 20px; text-decoration: none; font-size: 1.4rem; font-weight: 700; color: var(--primary-color); opacity: 0.4;"> | |
| @barttee/tokenizers | |
| </a> | |
| <script> | |
| $(document).ready(function() { | |
| // File handling variables | |
| let currentFile = null; | |
| let originalTextContent = null; | |
| let lastUploadedFileName = null; | |
| let fileJustUploaded = false; // Flag to prevent immediate detachment | |
| let currentModelType = "{{ model_type if model_type else 'predefined' }}"; | |
| let currentTokenizerInfo = null; | |
| // Try to parse tokenizer info if available from server | |
| try { | |
| currentTokenizerInfo = {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }}; | |
| if (currentTokenizerInfo) { | |
| updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom'); | |
| } | |
| } catch(e) { | |
| console.error("Error parsing tokenizer info:", e); | |
| } | |
| // Show error if exists | |
| if ("{{ error }}".length > 0) { | |
| showError("{{ error }}"); | |
| } | |
| // Setup model type based on initial state | |
| if (currentModelType === "custom") { | |
| $('.toggle-option').removeClass('active'); | |
| $('.custom-toggle').addClass('active'); | |
| $('#predefinedModelSelector').hide(); | |
| $('#customModelSelector').show(); | |
| } | |
| // Show success badge if custom model loaded successfully | |
| if (currentModelType === "custom" && !("{{ error }}".length > 0)) { | |
| $('#modelSuccessBadge').addClass('show'); | |
| setTimeout(() => { | |
| $('#modelSuccessBadge').removeClass('show'); | |
| }, 3000); | |
| } | |
| // Toggle between predefined and custom model inputs | |
| $('.toggle-option').click(function() { | |
| const modelType = $(this).data('type'); | |
| $('.toggle-option').removeClass('active'); | |
| $(this).addClass('active'); | |
| currentModelType = modelType; | |
| if (modelType === 'predefined') { | |
| $('#predefinedModelSelector').show(); | |
| $('#customModelSelector').hide(); | |
| $('#modelTypeInput').val('predefined'); | |
| // Set the model input value to the selected predefined model | |
| $('#modelInput').val($('#modelSelect').val()); | |
| } else { | |
| $('#predefinedModelSelector').hide(); | |
| $('#customModelSelector').show(); | |
| $('#modelTypeInput').val('custom'); | |
| } | |
| // Clear tokenizer info if switching models | |
| if (modelType === 'predefined') { | |
| $('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>'); | |
| fetchTokenizerInfo($('#modelSelect').val(), false); | |
| } else { | |
| $('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>'); | |
| const customModel = $('#customModelInput').val(); | |
| if (customModel) { | |
| fetchTokenizerInfo(customModel, true); | |
| } | |
| } | |
| }); | |
| // Update hidden input when custom model input changes | |
| $('#customModelInput').on('input', function() { | |
| $('#customModelInputHidden').val($(this).val()); | |
| }); | |
| function showError(message) { | |
| const errorDiv = $('#errorMessage'); | |
| errorDiv.text(message); | |
| errorDiv.show(); | |
| setTimeout(() => errorDiv.fadeOut(), 5000); | |
| } | |
| // Function to update tokenizer info display in tooltip | |
/**
 * Render tokenizer metadata into the info tooltip.
 *
 * @param {Object} info - Tokenizer details. Recognised fields: error,
 *     vocab_size, tokenizer_type, model_max_length, special_tokens
 *     (an object mapping token name to token text).
 * @param {boolean} [isCustom=false] - Write to the custom-model tooltip
 *     instead of the predefined-model tooltip.
 */
function updateTokenizerInfoDisplay(info, isCustom = false) {
    const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';

    // Everything below is assembled as an HTML string, so every value that
    // is not a formatted number is escaped before interpolation. String()
    // lets non-string values through without throwing.
    const escapeHtml = value => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#039;');

    if (info.error) {
        $(targetSelector).html(`<div class="tokenizer-info-error">${escapeHtml(info.error)}</div>`);
        return;
    }

    // Start building the tooltip content
    let htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
<div class="tokenizer-info-grid">`;

    if (info.vocab_size) {
        htmlContent += `
<div class="tokenizer-info-item">
<span class="tokenizer-info-label">Dictionary Size</span>
<span class="tokenizer-info-value">${info.vocab_size.toLocaleString()}</span>
</div>`;
    }
    if (info.tokenizer_type) {
        htmlContent += `
<div class="tokenizer-info-item">
<span class="tokenizer-info-label">Tokenizer Type</span>
<span class="tokenizer-info-value">${escapeHtml(info.tokenizer_type)}</span>
</div>`;
    }
    if (info.model_max_length) {
        htmlContent += `
<div class="tokenizer-info-item">
<span class="tokenizer-info-label">Max Length</span>
<span class="tokenizer-info-value">${info.model_max_length.toLocaleString()}</span>
</div>`;
    }
    htmlContent += `</div>`;

    // Special tokens section
    if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
        htmlContent += `
<div class="tokenizer-info-item" style="margin-top: 0.75rem;">
<span class="tokenizer-info-label">Special Tokens</span>
<div class="special-tokens-container">`;
        for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
            htmlContent += `
<div class="special-token-item">
<span class="token-name">${escapeHtml(tokenName)}:</span>
<span class="token-value">${escapeHtml(tokenValue)}</span>
</div>`;
        }
        htmlContent += `
</div>
</div>`;
    }

    $(targetSelector).html(htmlContent);
}
| // Function to fetch tokenizer info | |
/**
 * Request tokenizer metadata from /tokenizer-info and show it in the tooltip.
 *
 * @param {string} modelId - Predefined model key or custom model path;
 *     nothing happens when it is empty.
 * @param {boolean} [isCustom=false] - Target the custom-model tooltip and
 *     send is_custom=true to the server.
 */
function fetchTokenizerInfo(modelId, isCustom = false) {
    if (!modelId) return;
    const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';

    // Error messages come from the server and may echo user input, so they
    // are inserted as text rather than as markup.
    const renderError = message => {
        $(targetSelector).empty().append(
            $('<div>').addClass('tokenizer-info-error').text(message)
        );
    };

    // Show the spinner while the request is in flight.
    $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');

    $.ajax({
        url: '/tokenizer-info',
        method: 'GET',
        data: {
            model_id: modelId,
            is_custom: isCustom
        },
        success: function(response) {
            if (response.error) {
                renderError(response.error);
            } else {
                currentTokenizerInfo = response;
                updateTokenizerInfoDisplay(response, isCustom);
            }
        },
        error: function(xhr) {
            // Non-2xx replies land here; prefer the JSON "error" field of
            // the response body when there is one, else the generic text.
            const serverMessage = xhr.responseJSON && xhr.responseJSON.error;
            renderError(serverMessage || 'Failed to load tokenizer information');
        }
    });
}
| function updateResults(data) { | |
| $('#results').show(); | |
| const tokenContainer = $('#tokenContainer'); | |
| tokenContainer.empty(); | |
| data.tokens.forEach(token => { | |
| const span = $('<span>') | |
| .addClass('token') | |
| .css({ | |
| 'background-color': token.colors.background, | |
| 'color': token.colors.text | |
| }) | |
| .attr('title', `Original token: ${token.original} | Token ID: ${token.token_id}`) | |
| .text(token.display); | |
| tokenContainer.append(span); | |
| if (token.newline) { | |
| tokenContainer.append('<br>'); | |
| } | |
| }); | |
| if (data.display_limit_reached) { | |
| $('#displayLimitNotice').show(); | |
| $('#totalTokenCount').text(data.total_tokens); | |
| } else { | |
| $('#displayLimitNotice').hide(); | |
| } | |
| if (data.preview_only) { | |
| $('#previewNotice').show(); | |
| } else { | |
| $('#previewNotice').hide(); | |
| } | |
| $('#totalTokens').text(data.stats.basic_stats.total_tokens); | |
| $('#uniqueTokens').text(`${data.stats.basic_stats.unique_tokens} unique`); | |
| $('#uniquePercentage').text(data.stats.basic_stats.unique_percentage); | |
| $('#specialTokens').text(data.stats.basic_stats.special_tokens); | |
| $('#spaceTokens').text(data.stats.basic_stats.space_tokens); | |
| $('#spaceCount').text(data.stats.basic_stats.space_tokens); | |
| $('#newlineCount').text(data.stats.basic_stats.newline_tokens); | |
| $('#compressionRatio').text(data.stats.basic_stats.compression_ratio); | |
| $('#avgLength').text(data.stats.length_stats.avg_length); | |
| $('#medianLength').text(data.stats.length_stats.median_length); | |
| $('#stdDev').text(data.stats.length_stats.std_dev); | |
| if (data.tokenizer_info) { | |
| currentTokenizerInfo = data.tokenizer_info; | |
| updateTokenizerInfoDisplay(data.tokenizer_info, currentModelType === 'custom'); | |
| } | |
| } | |
| $('#textInput').on('input', function() { | |
| if (fileJustUploaded) { | |
| fileJustUploaded = false; | |
| return; | |
| } | |
| const currentText = $(this).val(); | |
| const fileInput = document.getElementById('fileInput'); | |
| if (fileInput.files.length > 0 && originalTextContent !== null) { | |
| const isMajorChange = | |
| currentText.length < originalTextContent.length * 0.8 || | |
| (currentText.length > 0 && | |
| currentText !== originalTextContent.substring(0, currentText.length) && | |
| currentText.substring(0, Math.min(20, currentText.length)) !== | |
| originalTextContent.substring(0, Math.min(20, originalTextContent.length))); | |
| if (isMajorChange) { | |
| detachFile(); | |
| } | |
| } | |
| }); | |
| function detachFile() { | |
| $('#fileInput').val(''); | |
| $('#fileInfo').fadeOut(300); | |
| originalTextContent = $('#textInput').val(); | |
| lastUploadedFileName = null; | |
| } | |
| $('#modelSelect').change(function() { | |
| const selectedModel = $(this).val(); | |
| $('#modelInput').val(selectedModel); | |
| fetchTokenizerInfo(selectedModel, false); | |
| if ($('#textInput').val().trim()) { | |
| $('#analyzeForm').submit(); | |
| } | |
| }); | |
| const fileDropZone = $('#fileDropZone'); | |
| const fileUploadIcon = $('#fileUploadIcon'); | |
| ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => { | |
| fileDropZone[0].addEventListener(eventName, preventDefaults, false); | |
| document.body.addEventListener(eventName, preventDefaults, false); | |
| }); | |
| function preventDefaults(e) { | |
| e.preventDefault(); | |
| e.stopPropagation(); | |
| } | |
| document.addEventListener('dragenter', showDropZone, false); | |
| document.addEventListener('dragover', showDropZone, false); | |
| fileDropZone[0].addEventListener('dragleave', hideDropZone, false); | |
| fileDropZone[0].addEventListener('drop', hideDropZone, false); | |
| function showDropZone(e) { | |
| fileDropZone.addClass('active'); | |
| } | |
| function hideDropZone() { | |
| fileDropZone.removeClass('active'); | |
| } | |
| fileDropZone[0].addEventListener('drop', handleDrop, false); | |
| fileUploadIcon.on('click', function() { | |
| const input = document.createElement('input'); | |
| input.type = 'file'; | |
| input.onchange = e => { | |
| handleFiles(e.target.files); | |
| }; | |
| input.click(); | |
| }); | |
| function handleDrop(e) { | |
| const dt = e.dataTransfer; | |
| const files = dt.files; | |
| handleFiles(files); | |
| } | |
| function handleFiles(files) { | |
| if (files.length) { | |
| const file = files[0]; | |
| currentFile = file; | |
| lastUploadedFileName = file.name; | |
| fileJustUploaded = true; | |
| $('#fileInfo').html(`${file.name} (${formatFileSize(file.size)}) <span class="file-detach" id="fileDetach"><i class="fas fa-times"></i></span>`).fadeIn(300); | |
| $('#fileDetach').on('click', function(e) { | |
| e.stopPropagation(); | |
| detachFile(); | |
| return false; | |
| }); | |
| const dataTransfer = new DataTransfer(); | |
| dataTransfer.items.add(file); | |
| document.getElementById('fileInput').files = dataTransfer.files; | |
| const reader = new FileReader(); | |
| reader.onload = function(e) { | |
| const previewText = e.target.result.slice(0, 8096); | |
| $('#textInput').val(previewText); | |
| setTimeout(() => { | |
| originalTextContent = previewText; | |
| $('#analyzeForm').submit(); | |
| }, 50); | |
| }; | |
| reader.readAsText(file, 'utf-8'); | |
| } | |
| } | |
| function formatFileSize(bytes) { | |
| if (bytes < 1024) return bytes + ' bytes'; | |
| else if (bytes < 1048576) return (bytes / 1024).toFixed(1) + ' KB'; | |
| else return (bytes / 1048576).toFixed(1) + ' MB'; | |
| } | |
| $('#analyzeForm').on('submit', function(e) { | |
| e.preventDefault(); | |
| if (!fileJustUploaded) { | |
| const textInput = $('#textInput').val(); | |
| const fileInput = document.getElementById('fileInput'); | |
| if (fileInput.files.length > 0 && | |
| originalTextContent !== null && | |
| textInput !== originalTextContent && | |
| textInput.length < originalTextContent.length * 0.8) { | |
| detachFile(); | |
| } | |
| } else { | |
| fileJustUploaded = false; | |
| } | |
| if (currentModelType === 'custom') { | |
| $('#customModelInputHidden').val($('#customModelInput').val()); | |
| } else { | |
| $('#modelInput').val($('#modelSelect').val()); | |
| } | |
| const formData = new FormData(this); | |
| $('#analyzeButton').prop('disabled', true); | |
| $.ajax({ | |
| url: '/', | |
| method: 'POST', | |
| data: formData, | |
| processData: false, | |
| contentType: false, | |
| success: function(response) { | |
| if (response.error) { | |
| showError(response.error); | |
| } else { | |
| updateResults(response); | |
| if (currentModelType === 'custom') { | |
| $('#modelSuccessBadge').addClass('show'); | |
| setTimeout(() => { | |
| $('#modelSuccessBadge').removeClass('show'); | |
| }, 3000); | |
| } | |
| } | |
| }, | |
| error: function(xhr) { | |
| showError(xhr.responseText || 'An error occurred while processing the text'); | |
| }, | |
| complete: function() { | |
| $('#analyzeButton').prop('disabled', false); | |
| } | |
| }); | |
| }); | |
| $('#expandButton').click(function() { | |
| const container = $('#tokenContainer'); | |
| const isExpanded = container.hasClass('expanded'); | |
| container.toggleClass('expanded'); | |
| $(this).text(isExpanded ? 'Show More' : 'Show Less'); | |
| }); | |
| if (currentModelType === 'predefined') { | |
| fetchTokenizerInfo($('#modelSelect').val(), false); | |
| } else if ($('#customModelInput').val()) { | |
| fetchTokenizerInfo($('#customModelInput').val(), true); | |
| } | |
| $('#customModelInput').on('change', function() { | |
| const modelValue = $(this).val(); | |
| if (modelValue) { | |
| fetchTokenizerInfo(modelValue, true); | |
| } | |
| }); | |
| }); | |
| </script> | |
| </body> | |
| </html> | |
| """ | |
def tokenizer_info():
    """
    Endpoint to get tokenizer information without processing text.

    Reads ``model_id`` from the query string, loads the tokenizer through
    ``load_tokenizer`` and returns the accompanying info as JSON.

    Returns:
        The JSON-encoded tokenizer info on success, otherwise a
        ``(response, status)`` pair carrying ``{"error": ...}``:
        400 when ``model_id`` is missing or ``load_tokenizer`` reports an
        error, 500 when loading or serialising raises.
    """
    # NOTE(review): no route decorator is visible above this function in this
    # part of the file; confirm the '/tokenizer-info' URL the page requests
    # is registered for it.
    model_id = request.args.get('model_id', '')
    if not model_id:
        return jsonify({"error": "No model ID provided"}), 400

    try:
        # The page also sends ``is_custom``, but predefined keys and custom
        # paths are handed to load_tokenizer unchanged, so the flag is not
        # needed here. The tokenizer object itself is not used either.
        _, info, error = load_tokenizer(model_id)
        if error:
            return jsonify({"error": error}), 400
        return jsonify(info)
    except Exception as e:
        return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
def _is_ajax_request():
    """Return True when the request carries ``X-Requested-With: XMLHttpRequest``."""
    return request.headers.get('X-Requested-With') == 'XMLHttpRequest'


def _reserve_upload_path(client_filename):
    """Create an empty, uniquely named file in the upload folder and return its path.

    The client-supplied name contributes only a plain alphanumeric extension.
    Joining the raw name onto the upload folder would let a name containing
    ``..`` or an absolute path write (and later delete) files outside it, and
    two uploads with the same name would overwrite each other.
    """
    import tempfile

    extension = os.path.splitext(os.path.basename(client_filename))[1]
    suffix = extension if extension[1:].isalnum() else ''
    handle, file_path = tempfile.mkstemp(suffix=suffix, dir=app.config['UPLOAD_FOLDER'])
    os.close(handle)
    return file_path


def _discard_file(file_path):
    """Delete *file_path*, ignoring the case where it is already gone."""
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass


def index():
    """
    Render the main page and run tokenization for submitted text or files.

    Request parameters (query string first, then form data):
        model: key of a predefined tokenizer, used when ``model_type`` is
            ``'predefined'``.
        custom_model: model path used for any other ``model_type``.
        model_type: defaults to ``'predefined'``.

    On POST, an uploaded ``file`` takes precedence over the ``text`` field:
    the file is stored temporarily, its first 8096 characters become the
    preview text, ``process_text`` is called with the file path, and the
    file is removed afterwards whether or not processing succeeded.

    Returns:
        For requests flagged as XMLHttpRequest that triggered processing:
        the token data as JSON, or ``({"error": ...}, 400)`` on failure.
        Otherwise the rendered ``HTML_TEMPLATE``; after a failure it is
        rendered with the error message and no token data.
    """
    # NOTE(review): no route decorator is visible above this function in this
    # part of the file; confirm '/' is registered for GET and POST.
    text = ""
    token_data = None
    error_message = ""

    # The default must be a TOKENIZER_MODELS key (the template compares
    # selected_model with those keys), not a display alias.
    selected_model = request.args.get('model', request.form.get('model', 'qwen3'))
    custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
    model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))
    model_to_use = selected_model if model_type == 'predefined' else custom_model

    if request.method == 'POST':
        uploaded_file = request.files.get('file')
        has_upload = uploaded_file is not None and bool(uploaded_file.filename)
        if not has_upload:
            text = request.form.get('text', '')

        if has_upload or text:
            try:
                if has_upload:
                    file_path = _reserve_upload_path(uploaded_file.filename)
                    try:
                        uploaded_file.save(file_path)
                        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                            text = f.read(8096)
                        token_data = process_text("", model_to_use, is_full_file=True, file_path=file_path)
                    finally:
                        # Runs on success and on every failure, including a
                        # failed save or preview read.
                        _discard_file(file_path)
                else:
                    token_data = process_text(text, model_to_use)

                if _is_ajax_request():
                    return jsonify(token_data)
            except Exception as e:
                error_message = str(e)
                # Never render partial results next to an error.
                token_data = None
                if _is_ajax_request():
                    return jsonify({"error": error_message}), 400

    return render_template_string(
        HTML_TEMPLATE,
        text=text,
        token_data=token_data,
        models=TOKENIZER_MODELS,
        selected_model=selected_model,
        custom_model=custom_model,
        model_type=model_type,
        error=error_message
    )
if __name__ == "__main__":
    # Script entry point: start the Flask server on all interfaces,
    # port 7860, with debug mode off.
    app.run(debug=False, port=7860, host='0.0.0.0')