<!doctype html>
<html lang="en">
<head>
  <!-- Encoding is declared first so it is known before any text is parsed -->
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Tokenizer Pro</title>
  <!-- Inline SVG favicon: white glyph on a blue disc. '#' in the fill colour is
       percent-encoded (%23) because the SVG lives inside a data: URL. -->
  <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
  <!-- jQuery is loaded synchronously (no defer/async) so it is available to any
       script that runs later in the document -->
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
  <!-- Application stylesheet, resolved through Flask's static endpoint -->
  <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
| <body> | |
<!-- Drop target for dragged files; stays hidden until a file is dragged over the page -->
<div class="file-drop-zone" id="fileDropZone">
  <div class="drop-indicator">
    <div class="file-icon">📄</div>
    <p>Drop your file here</p>
  </div>
</div>
<!-- Overlay with a large spinner and a status line (default text: "Analyzing text...") -->
<div class="loading-overlay" id="loadingOverlay">
  <div class="loading-content">
    <div class="loading-spinner large"></div>
    <div class="loading-text" id="loadingText">Analyzing text...</div>
  </div>
</div>
<!-- Paper-clip upload affordance shown in the bottom-left corner, followed by
     an initially empty paragraph for file information -->
{# NOTE(review): this is a plain div, so it cannot receive keyboard focus; any
   click handling is presumably bound by script that is not visible here. #}
<div class="file-upload-icon" id="fileUploadIcon">
  <span>📎</span>
</div>
<p class="file-info" id="fileInfo"></p>
| <div class="container"> | |
| <div class="header"> | |
<!-- Page heading: product name (the document's only h1) and a one-line tagline -->
<div class="title-section">
  <h1 class="title">Tokenizer Pro</h1>
  <p class="subtitle">Advanced tokenization analysis and visualization</p>
</div>
<!-- Tokenizer picker: a Predefined/Custom toggle plus one panel per mode.
     Only the predefined panel is visible initially; the custom panel starts
     with display: none. -->
<div class="model-selector">
  <div class="model-selector-header">
    <div class="model-type-toggle">
      <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
      <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
    </div>
  </div>

  <!-- Predefined mode: dropdown populated from the `models` mapping -->
  <div id="predefinedModelSelector">
    <div style="position: relative;">
      <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
      <!-- Tooltip sits directly after its icon and before the select; it
           starts out holding only a loading spinner -->
      <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
        <div id="tokenizerInfoContent">
          <div class="tokenizer-info-loading">
            <div class="tokenizer-info-spinner"></div>
          </div>
        </div>
      </div>
      <!-- The select comes after the icon and tooltip. It has no visible
           label, so aria-label provides its accessible name. -->
      <select id="modelSelect" name="model" aria-label="Predefined tokenizer model">
        {% for model_id, info in models.items() %}
        <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
          {{ info.alias }}
        </option>
        {% endfor %}
      </select>
    </div>
  </div>

  <!-- Custom mode: free-text HuggingFace model path, hidden initially -->
  <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
    <div style="position: relative;">
      <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
      <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
        <div id="customTokenizerInfoContent">
          <div class="tokenizer-info-loading">
            <div class="tokenizer-info-spinner"></div>
          </div>
        </div>
      </div>
      <!-- A placeholder is not a label, so aria-label names the field; the
           value is pre-filled only when custom_model is a non-empty string -->
      <input type="text" id="customModelInput" class="custom-model-input"
             placeholder="Enter HuggingFace model path"
             aria-label="HuggingFace model path"
             value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
    </div>
    <span class="custom-model-help">?</span>
    <div class="tooltip">
      Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
      The model must have a tokenizer available and must be not restricted. (with some exceptions)
      Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
      Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
    </div>
    <div class="model-badge" id="modelSuccessBadge">Loaded</div>
  </div>
</div>
| </div> | |
<!-- Error banner. The guard keeps a missing or None `error` from rendering as the literal text "None"; role="alert" lets assistive technology announce errors written into this element. -->
<div class="error-message" id="errorMessage" role="alert">{{ error if error else '' }}</div>
<!-- Text entry form. Posts as multipart/form-data to the current URL and
     carries the chosen model in hidden fields alongside the text or file. -->
<div class="input-section">
  <div class="keyboard-shortcut-hint">Ctrl+Enter</div>
  <form id="analyzeForm" method="POST" enctype="multipart/form-data">
    <!-- The placeholder is not a label, so aria-label names the control.
         The template expression must stay flush against the tags: any
         whitespace inside a textarea becomes part of its value. The guard
         keeps a None `text` from showing up as the word "None". -->
    <textarea name="text" id="textInput" aria-label="Text to analyze" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text if text else '' }}</textarea>
    <!-- Hidden fields for the current model selection, submitted with the text -->
    <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
    <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
    <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
    <!-- Hidden native file picker -->
    <input type="file" name="file" id="fileInput" style="display: none;">
    <div class="button-container">
      <button type="submit" id="analyzeButton">Analyze Text</button>
    </div>
  </form>
</div>
| <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}> | |
<!-- Token visualization card: coloured token spans, an optional search bar,
     truncation notices and a token-frequency chart. -->
<div class="card">
  <div class="card-header">
    <h2 class="card-title">Token Visualization</h2>
    <!-- Icon-only button: aria-label supplies the accessible name and the
         decorative SVG is hidden from assistive technology -->
    <button type="button" class="search-toggle-btn" id="searchToggleBtn" title="Toggle token search" aria-label="Toggle token search" style="display: none;">
      <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
        <path d="M15.5 14h-.79l-.28-.27C15.41 12.59 16 11.11 16 9.5 16 5.91 13.09 3 9.5 3S3 5.91 3 9.5 5.91 16 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"/>
      </svg>
    </button>
  </div>

  <!-- Search bar, hidden initially. Every button has an explicit
       type="button"; the arrow glyphs get spoken names via aria-label. -->
  <div class="token-search-container" id="tokenSearchContainer" style="display: none;">
    <div class="token-search-row">
      <input type="text" class="token-search-input" id="tokenSearchInput" placeholder="Search tokens..." aria-label="Search tokens">
      <div class="token-search-controls">
        <button type="button" class="token-search-btn" id="prevMatch" aria-label="Previous match">◀</button>
        <span class="token-search-count" id="searchCount">0/0</span>
        <button type="button" class="token-search-btn" id="nextMatch" aria-label="Next match">▶</button>
        <button type="button" class="token-search-btn" id="clearSearch">Clear</button>
      </div>
    </div>
  </div>

  <div class="preview-notice" id="previewNotice">
    Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
  </div>

  <!-- Server-side render of the tokens: one coloured span per token, with the
       original token text and its id in the tooltip, and a line break after
       tokens flagged as newlines -->
  <div class="token-container" id="tokenContainer">
    {% if token_data %}
      {% for token in token_data.tokens %}
        <span class="token"
              style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
              title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
          {{ token.display }}
        </span>
        {% if token.newline %}<br>{% endif %}
      {% endfor %}
    {% endif %}
  </div>

  <button type="button" class="expand-button" id="expandButton">Show More</button>
  <div class="display-limit-notice" id="displayLimitNotice">
    Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
  </div>

  <!-- Frequency chart, hidden initially; the chart element starts empty -->
  <div class="frequency-chart-container" id="frequencyChartContainer" style="display: none;">
    <div class="frequency-chart-title">
      <span>Top Token Frequencies</span>
      <button type="button" class="chart-toggle-btn" id="toggleFrequencyChart">Show Chart</button>
    </div>
    <div class="frequency-chart" id="frequencyChart"></div>
  </div>
</div>
{# Short aliases for the two statistics groups. They are only dereferenced
   behind the same `if token_data` guards as before, so every value falls
   back to 0 when there is no token data. #}
{% set basic_stats = token_data.stats.basic_stats if token_data else none %}
{% set length_stats = token_data.stats.length_stats if token_data else none %}
<!-- Summary statistics: one card per metric -->
<div class="stats-grid">
  <!-- Token totals and uniqueness -->
  <div class="stat-card">
    <div class="stat-title">Total Tokens</div>
    <div class="stat-value" id="totalTokens">{{ basic_stats.total_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      <span id="uniqueTokens">{{ basic_stats.unique_tokens if token_data else 0 }} unique</span>
      (<span id="uniquePercentage">{{ basic_stats.unique_percentage if token_data else 0 }}</span>%)
    </div>
  </div>

  <!-- Special-token count -->
  <div class="stat-card">
    <div class="stat-title">Token Types</div>
    <div class="stat-value" id="specialTokens">{{ basic_stats.special_tokens if token_data else 0 }}</div>
    <div class="stat-description">special tokens</div>
  </div>

  <!-- Whitespace breakdown -->
  {# NOTE(review): the headline value and the "spaces:" figure both render
     space_tokens - confirm that this duplication is intended. #}
  <div class="stat-card">
    <div class="stat-title">Whitespace</div>
    <div class="stat-value" id="spaceTokens">{{ basic_stats.space_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      spaces: <span id="spaceCount">{{ basic_stats.space_tokens if token_data else 0 }}</span>,
      newlines: <span id="newlineCount">{{ basic_stats.newline_tokens if token_data else 0 }}</span>
    </div>
  </div>

  <!-- Token length: average, median and standard deviation -->
  <div class="stat-card">
    <div class="stat-title">Token Length</div>
    <div class="stat-value" id="avgLength">{{ length_stats.avg_length if token_data else 0 }}</div>
    <div class="stat-description">
      median: <span id="medianLength">{{ length_stats.median_length if token_data else 0 }}</span>,
      ±<span id="stdDev">{{ length_stats.std_dev if token_data else 0 }}</span> std
    </div>
  </div>

  <!-- Compression ratio, labelled as characters per token -->
  <div class="stat-card">
    <div class="stat-title">Compression</div>
    <div class="stat-value" id="compressionRatio">{{ basic_stats.compression_ratio if token_data else 0 }}</div>
    <div class="stat-description">characters per token</div>
  </div>
</div>
| </div> | |
| </div> | |
<!-- Attribution link. It opens in a new tab, so rel="noopener noreferrer"
     stops the opened page from reaching back through window.opener. -->
<a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" rel="noopener noreferrer" class="watermark">
  @bartar/tokenizers
</a>
<!-- Application script, loaded at the end of the body after all markup above -->
<script src="{{ url_for('static', filename='js/main.js') }}"></script>
| </body> | |
| </html> |