Spaces:
Running
Running
Commit
·
1e32a60
1
Parent(s):
dbda7b0
Add support for davanstrien/rolm-test dataset with model info display
Browse files
- Add inference_info column detection and rolmocr_text as improved text column
- Add example dataset selector dropdown with quick access to both datasets
- Extract and display model metadata (model name, processing date, parameters)
- Move model info panel above metrics for better visibility
- Fix sidebar scrolling to prevent model info from being hidden
- Add debug logging for troubleshooting column detection
🤖 Generated with [Claude Code](https://claude.ai/code)
Co-Authored-By: Claude <noreply@anthropic.com>
- index.html +61 -2
- js/app.js +64 -0
- js/dataset-api.js +49 -3
index.html
CHANGED
|
@@ -63,6 +63,27 @@
|
|
| 63 |
>
|
| 64 |
Load
|
| 65 |
</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
</div>
|
| 67 |
</div>
|
| 68 |
|
|
@@ -202,8 +223,8 @@
|
|
| 202 |
<!-- Content Area -->
|
| 203 |
<div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
|
| 204 |
<!-- Image Panel -->
|
| 205 |
-
<div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-
|
| 206 |
-
<div
|
| 207 |
<div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
|
| 208 |
<img
|
| 209 |
:src="getImageSrc()"
|
|
@@ -221,6 +242,44 @@
|
|
| 221 |
</div>
|
| 222 |
</div>
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
<!-- Statistics Panel -->
|
| 225 |
<div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
|
| 226 |
<h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>
|
|
|
|
| 63 |
>
|
| 64 |
Load
|
| 65 |
</button>
|
| 66 |
+
|
| 67 |
+
<!-- Example Dataset Selector -->
|
| 68 |
+
<div class="relative group">
|
| 69 |
+
<button class="px-3 py-1.5 text-sm text-gray-600 hover:text-gray-900 dark:text-gray-400 dark:hover:text-gray-200 border border-gray-300 dark:border-gray-600 rounded-md flex items-center space-x-1">
|
| 70 |
+
<svg class="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 71 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2"></path>
|
| 72 |
+
</svg>
|
| 73 |
+
<span>Examples</span>
|
| 74 |
+
</button>
|
| 75 |
+
<div class="absolute left-0 mt-1 w-72 bg-white dark:bg-gray-800 rounded-md shadow-lg border border-gray-200 dark:border-gray-700 hidden group-hover:block z-50">
|
| 76 |
+
<template x-for="dataset in exampleDatasets" :key="dataset.id">
|
| 77 |
+
<button
|
| 78 |
+
@click="selectDataset(dataset.id)"
|
| 79 |
+
class="block w-full text-left px-4 py-3 hover:bg-gray-50 dark:hover:bg-gray-700 border-b border-gray-100 dark:border-gray-600 last:border-b-0"
|
| 80 |
+
>
|
| 81 |
+
<div class="font-medium text-sm text-gray-900 dark:text-gray-100" x-text="dataset.name"></div>
|
| 82 |
+
<div class="text-xs text-gray-500 dark:text-gray-400 mt-1" x-text="dataset.description"></div>
|
| 83 |
+
</button>
|
| 84 |
+
</template>
|
| 85 |
+
</div>
|
| 86 |
+
</div>
|
| 87 |
</div>
|
| 88 |
</div>
|
| 89 |
|
|
|
|
| 223 |
<!-- Content Area -->
|
| 224 |
<div x-show="!loading && !error && currentSample" class="flex-1 flex h-full">
|
| 225 |
<!-- Image Panel -->
|
| 226 |
+
<div class="w-1/3 bg-gray-100 dark:bg-gray-800 p-4 overflow-y-auto border-r border-gray-200 dark:border-gray-700">
|
| 227 |
+
<div>
|
| 228 |
<div class="bg-white dark:bg-gray-700 rounded-lg shadow-sm overflow-hidden">
|
| 229 |
<img
|
| 230 |
:src="getImageSrc()"
|
|
|
|
| 242 |
</div>
|
| 243 |
</div>
|
| 244 |
|
| 245 |
+
<!-- Model Info Panel -->
|
| 246 |
+
<div x-show="modelInfo" x-transition class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
|
| 247 |
+
<h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3 flex items-center">
|
| 248 |
+
<svg class="w-4 h-4 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 249 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9.75 17L9 20l-1 1h8l-1-1-.75-3M3 13h18M5 17h14a2 2 0 002-2V5a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z"></path>
|
| 250 |
+
</svg>
|
| 251 |
+
Model Information
|
| 252 |
+
</h3>
|
| 253 |
+
<div class="space-y-2 text-xs">
|
| 254 |
+
<div class="flex justify-between items-center">
|
| 255 |
+
<span class="text-gray-600 dark:text-gray-400">Model</span>
|
| 256 |
+
<span class="font-medium text-gray-900 dark:text-gray-100" x-text="modelInfo?.modelName || '-'"></span>
|
| 257 |
+
</div>
|
| 258 |
+
<div x-show="modelInfo?.processingDate" class="flex justify-between items-center">
|
| 259 |
+
<span class="text-gray-600 dark:text-gray-400">Processed</span>
|
| 260 |
+
<span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.processingDate || '-'"></span>
|
| 261 |
+
</div>
|
| 262 |
+
<div x-show="modelInfo?.batchSize" class="flex justify-between items-center">
|
| 263 |
+
<span class="text-gray-600 dark:text-gray-400">Batch Size</span>
|
| 264 |
+
<span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.batchSize || '-'"></span>
|
| 265 |
+
</div>
|
| 266 |
+
<div x-show="modelInfo?.maxTokens" class="flex justify-between items-center">
|
| 267 |
+
<span class="text-gray-600 dark:text-gray-400">Max Tokens</span>
|
| 268 |
+
<span class="text-gray-900 dark:text-gray-100" x-text="modelInfo?.maxTokens?.toLocaleString() || '-'"></span>
|
| 269 |
+
</div>
|
| 270 |
+
<div x-show="modelInfo?.scriptUrl" class="mt-2 pt-2 border-t border-gray-200 dark:border-gray-600">
|
| 271 |
+
<a :href="modelInfo?.scriptUrl"
|
| 272 |
+
target="_blank"
|
| 273 |
+
class="text-blue-600 dark:text-blue-400 hover:underline flex items-center">
|
| 274 |
+
<svg class="w-3 h-3 mr-1" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 275 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10 6H6a2 2 0 00-2 2v10a2 2 0 002 2h10a2 2 0 002-2v-4M14 4h6m0 0v6m0-6L10 14"></path>
|
| 276 |
+
</svg>
|
| 277 |
+
View Script
|
| 278 |
+
</a>
|
| 279 |
+
</div>
|
| 280 |
+
</div>
|
| 281 |
+
</div>
|
| 282 |
+
|
| 283 |
<!-- Statistics Panel -->
|
| 284 |
<div class="mt-4 bg-white dark:bg-gray-700 rounded-lg shadow-sm p-4">
|
| 285 |
<h3 class="text-sm font-medium text-gray-700 dark:text-gray-300 mb-3">OCR Quality Metrics</h3>
|
js/app.js
CHANGED
|
@@ -9,6 +9,12 @@ document.addEventListener('alpine:init', () => {
|
|
| 9 |
datasetConfig: 'default',
|
| 10 |
datasetSplit: 'train',
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
// Navigation state
|
| 13 |
currentIndex: 0,
|
| 14 |
totalSamples: null,
|
|
@@ -53,6 +59,10 @@ document.addEventListener('alpine:init', () => {
|
|
| 53 |
// Markdown cache
|
| 54 |
markdownCache: new Map(),
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
async init() {
|
| 57 |
// Initialize API
|
| 58 |
this.api = new DatasetAPI();
|
|
@@ -171,6 +181,14 @@ document.addEventListener('alpine:init', () => {
|
|
| 171 |
|
| 172 |
this.currentSample = data.row;
|
| 173 |
this.currentIndex = index;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
// Check if improved text contains markdown
|
| 176 |
const improvedText = this.getImprovedText();
|
|
@@ -215,6 +233,52 @@ document.addEventListener('alpine:init', () => {
|
|
| 215 |
}
|
| 216 |
},
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
getOriginalText() {
|
| 219 |
if (!this.currentSample) return '';
|
| 220 |
const columns = this.api.detectColumns(null, this.currentSample);
|
|
|
|
| 9 |
datasetConfig: 'default',
|
| 10 |
datasetSplit: 'train',
|
| 11 |
|
| 12 |
+
// Example datasets
|
| 13 |
+
exampleDatasets: [
|
| 14 |
+
{ id: 'davanstrien/exams-ocr', name: 'Exams OCR', description: 'Historical exam papers with VLM corrections' },
|
| 15 |
+
{ id: 'davanstrien/rolm-test', name: 'ROLM Test', description: 'Documents processed with RolmOCR model' }
|
| 16 |
+
],
|
| 17 |
+
|
| 18 |
// Navigation state
|
| 19 |
currentIndex: 0,
|
| 20 |
totalSamples: null,
|
|
|
|
| 59 |
// Markdown cache
|
| 60 |
markdownCache: new Map(),
|
| 61 |
|
| 62 |
+
// Model info
|
| 63 |
+
modelInfo: null,
|
| 64 |
+
columnInfo: null,
|
| 65 |
+
|
| 66 |
async init() {
|
| 67 |
// Initialize API
|
| 68 |
this.api = new DatasetAPI();
|
|
|
|
| 181 |
|
| 182 |
this.currentSample = data.row;
|
| 183 |
this.currentIndex = index;
|
| 184 |
+
this.columnInfo = data.columns;
|
| 185 |
+
|
| 186 |
+
// Extract model info if available
|
| 187 |
+
this.extractModelInfo();
|
| 188 |
+
|
| 189 |
+
// Debug: Log column info
|
| 190 |
+
console.log('Column info:', this.columnInfo);
|
| 191 |
+
console.log('Current sample keys:', Object.keys(this.currentSample));
|
| 192 |
|
| 193 |
// Check if improved text contains markdown
|
| 194 |
const improvedText = this.getImprovedText();
|
|
|
|
| 233 |
}
|
| 234 |
},
|
| 235 |
|
| 236 |
+
async selectDataset(datasetId) {
|
| 237 |
+
this.datasetId = datasetId;
|
| 238 |
+
await this.loadDataset();
|
| 239 |
+
},
|
| 240 |
+
|
| 241 |
+
extractModelInfo() {
|
| 242 |
+
this.modelInfo = null;
|
| 243 |
+
|
| 244 |
+
if (!this.currentSample || !this.columnInfo || !this.columnInfo.inferenceInfo) {
|
| 245 |
+
console.log('No inference info column detected');
|
| 246 |
+
return;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
const inferenceData = this.currentSample[this.columnInfo.inferenceInfo];
|
| 250 |
+
if (!inferenceData) {
|
| 251 |
+
console.log('No inference data in current sample');
|
| 252 |
+
return;
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
console.log('Raw inference data:', inferenceData);
|
| 256 |
+
const parsed = this.api.parseInferenceInfo(inferenceData);
|
| 257 |
+
console.log('Parsed inference data:', parsed);
|
| 258 |
+
|
| 259 |
+
if (parsed) {
|
| 260 |
+
const formattedInfo = this.formatModelInfo(parsed);
|
| 261 |
+
// Ensure it's a plain object, not a proxy
|
| 262 |
+
this.modelInfo = formattedInfo ? {...formattedInfo} : null;
|
| 263 |
+
console.log('Formatted model info:', this.modelInfo);
|
| 264 |
+
}
|
| 265 |
+
},
|
| 266 |
+
|
| 267 |
+
formatModelInfo(info) {
|
| 268 |
+
if (!info) return null;
|
| 269 |
+
|
| 270 |
+
return {
|
| 271 |
+
modelId: info.model_id || 'Unknown',
|
| 272 |
+
modelName: info.model_id ? info.model_id.split('/').pop() : 'Unknown',
|
| 273 |
+
processingDate: info.processing_date ? new Date(info.processing_date).toLocaleDateString() : null,
|
| 274 |
+
scriptVersion: info.script_version || null,
|
| 275 |
+
batchSize: info.batch_size || null,
|
| 276 |
+
maxTokens: info.max_tokens || null,
|
| 277 |
+
scriptUrl: info.script_url || null,
|
| 278 |
+
columnName: info.column_name || null
|
| 279 |
+
};
|
| 280 |
+
},
|
| 281 |
+
|
| 282 |
getOriginalText() {
|
| 283 |
if (!this.currentSample) return '';
|
| 284 |
const columns = this.api.detectColumns(null, this.currentSample);
|
js/dataset-api.js
CHANGED
|
@@ -176,6 +176,7 @@ class DatasetAPI {
|
|
| 176 |
let imageColumn = null;
|
| 177 |
let originalTextColumn = null;
|
| 178 |
let improvedTextColumn = null;
|
|
|
|
| 179 |
|
| 180 |
// Try to detect from features first
|
| 181 |
for (const feature of features || []) {
|
|
@@ -192,9 +193,14 @@ class DatasetAPI {
|
|
| 192 |
originalTextColumn = name;
|
| 193 |
}
|
| 194 |
|
| 195 |
-
if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected'].includes(name)) {
|
| 196 |
improvedTextColumn = name;
|
| 197 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
}
|
| 199 |
|
| 200 |
// Fallback: detect from sample row
|
|
@@ -217,15 +223,21 @@ class DatasetAPI {
|
|
| 217 |
}
|
| 218 |
|
| 219 |
if (!improvedTextColumn) {
|
| 220 |
-
const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved'];
|
| 221 |
improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
|
| 222 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
}
|
| 224 |
|
| 225 |
return {
|
| 226 |
image: imageColumn,
|
| 227 |
originalText: originalTextColumn,
|
| 228 |
-
improvedText: improvedTextColumn
|
|
|
|
| 229 |
};
|
| 230 |
}
|
| 231 |
|
|
@@ -267,6 +279,40 @@ class DatasetAPI {
|
|
| 267 |
clearCache() {
|
| 268 |
this.cache.clear();
|
| 269 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
}
|
| 271 |
|
| 272 |
// Export for use in other scripts
|
|
|
|
| 176 |
let imageColumn = null;
|
| 177 |
let originalTextColumn = null;
|
| 178 |
let improvedTextColumn = null;
|
| 179 |
+
let inferenceInfoColumn = null;
|
| 180 |
|
| 181 |
// Try to detect from features first
|
| 182 |
for (const feature of features || []) {
|
|
|
|
| 193 |
originalTextColumn = name;
|
| 194 |
}
|
| 195 |
|
| 196 |
+
if (!improvedTextColumn && ['markdown', 'new_ocr', 'corrected_text', 'improved', 'vlm_ocr', 'corrected', 'rolmocr_text'].includes(name)) {
|
| 197 |
improvedTextColumn = name;
|
| 198 |
}
|
| 199 |
+
|
| 200 |
+
// Detect inference info column
|
| 201 |
+
if (name === 'inference_info') {
|
| 202 |
+
inferenceInfoColumn = name;
|
| 203 |
+
}
|
| 204 |
}
|
| 205 |
|
| 206 |
// Fallback: detect from sample row
|
|
|
|
| 223 |
}
|
| 224 |
|
| 225 |
if (!improvedTextColumn) {
|
| 226 |
+
const candidates = ['markdown', 'new_ocr', 'corrected_text', 'improved', 'rolmocr_text'];
|
| 227 |
improvedTextColumn = keys.find(k => candidates.includes(k)) || null;
|
| 228 |
}
|
| 229 |
+
|
| 230 |
+
// Check for inference info in sample row
|
| 231 |
+
if (!inferenceInfoColumn && keys.includes('inference_info')) {
|
| 232 |
+
inferenceInfoColumn = 'inference_info';
|
| 233 |
+
}
|
| 234 |
}
|
| 235 |
|
| 236 |
return {
|
| 237 |
image: imageColumn,
|
| 238 |
originalText: originalTextColumn,
|
| 239 |
+
improvedText: improvedTextColumn,
|
| 240 |
+
inferenceInfo: inferenceInfoColumn
|
| 241 |
};
|
| 242 |
}
|
| 243 |
|
|
|
|
| 279 |
clearCache() {
|
| 280 |
this.cache.clear();
|
| 281 |
}
|
| 282 |
+
|
| 283 |
+
/**
|
| 284 |
+
* Parse inference info JSON safely
|
| 285 |
+
*/
|
| 286 |
+
parseInferenceInfo(inferenceInfoData) {
|
| 287 |
+
if (!inferenceInfoData) return null;
|
| 288 |
+
|
| 289 |
+
try {
|
| 290 |
+
// Handle if it's already an object (some datasets might store it as object)
|
| 291 |
+
if (typeof inferenceInfoData === 'object' && !Array.isArray(inferenceInfoData)) {
|
| 292 |
+
return inferenceInfoData;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
// Handle if it's a JSON string
|
| 296 |
+
if (typeof inferenceInfoData === 'string') {
|
| 297 |
+
const parsed = JSON.parse(inferenceInfoData);
|
| 298 |
+
// If it's an array, take the first item
|
| 299 |
+
if (Array.isArray(parsed) && parsed.length > 0) {
|
| 300 |
+
return parsed[0];
|
| 301 |
+
}
|
| 302 |
+
return parsed;
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
// Handle if it's already an array
|
| 306 |
+
if (Array.isArray(inferenceInfoData) && inferenceInfoData.length > 0) {
|
| 307 |
+
return inferenceInfoData[0];
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
return null;
|
| 311 |
+
} catch (error) {
|
| 312 |
+
console.warn('Failed to parse inference info:', error);
|
| 313 |
+
return null;
|
| 314 |
+
}
|
| 315 |
+
}
|
| 316 |
}
|
| 317 |
|
| 318 |
// Export for use in other scripts
|