Spaces:
Running
Running
içerik daha iyi türkçeye ocr çok bozuk iyi yapamıyor. abbyfinereader tarzında markdown, json,text formatına belge yapısına uygun çeviri yapsın. bu talimatı uygulama
Browse files
script.js
CHANGED
|
@@ -188,11 +188,13 @@ return {
|
|
| 188 |
|
| 189 |
reader.onload = async function(event) {
|
| 190 |
try {
|
| 191 |
-
|
|
|
|
|
|
|
| 192 |
|
| 193 |
// Enhanced PDF loading with multiple extraction strategies
|
| 194 |
const loadingTask = pdfjsLib.getDocument({
|
| 195 |
-
data: typedArray,
|
| 196 |
cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
|
| 197 |
cMapPacked: true,
|
| 198 |
standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
|
|
@@ -202,8 +204,7 @@ return {
|
|
| 202 |
disableAutoFetch: false,
|
| 203 |
disableStream: false
|
| 204 |
});
|
| 205 |
-
|
| 206 |
-
const pdf = await loadingTask.promise;
|
| 207 |
let fullText = '';
|
| 208 |
let metadata = await pdf.getMetadata();
|
| 209 |
|
|
@@ -264,23 +265,46 @@ return {
|
|
| 264 |
|
| 265 |
// Strategy 2: Enhanced Turkish character decoding
|
| 266 |
fullText = decodeTurkishText(fullText);
|
| 267 |
-
|
| 268 |
// Strategy 3: If still poor quality, try OCR with preprocessing
|
| 269 |
if (!fullText.trim() || fullText.trim().length < 50) {
|
| 270 |
console.warn('Primary text extraction failed, attempting enhanced OCR...');
|
| 271 |
-
|
|
|
|
|
|
|
| 272 |
}
|
| 273 |
-
|
| 274 |
-
// Strategy 4: Apply text quality improvements
|
| 275 |
fullText = improveTextQuality(fullText);
|
| 276 |
|
| 277 |
resolve(fullText);
|
| 278 |
-
|
| 279 |
} catch (error) {
|
| 280 |
console.error('PDF extraction error:', error);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
reject(new Error('Failed to extract text from PDF: ' + error.message));
|
| 282 |
}
|
| 283 |
-
|
| 284 |
|
| 285 |
reader.onerror = () => reject(new Error('Failed to read PDF file'));
|
| 286 |
reader.readAsArrayBuffer(file);
|
|
@@ -332,11 +356,20 @@ return {
|
|
| 332 |
|
| 333 |
return decodedText;
|
| 334 |
}
|
| 335 |
-
|
| 336 |
// Enhanced OCR fallback with multiple engines
|
| 337 |
async function enhancedOCRFallback(pdfData) {
|
| 338 |
-
|
| 339 |
-
let
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
for (const image of images) {
|
| 342 |
// Try multiple OCR approaches
|
|
@@ -419,17 +452,29 @@ return {
|
|
| 419 |
ctx.putImageData(imageData, 0, 0);
|
| 420 |
return canvas;
|
| 421 |
}
|
| 422 |
-
|
| 423 |
// Enhanced PDF to image conversion
|
| 424 |
async function convertPDFToImagesEnhanced(pdfData) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 425 |
const loadingTask = pdfjsLib.getDocument({
|
| 426 |
-
data:
|
| 427 |
cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
|
| 428 |
cMapPacked: true,
|
| 429 |
-
standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
});
|
| 431 |
-
|
| 432 |
-
const pdf = await loadingTask.promise;
|
| 433 |
const images = [];
|
| 434 |
|
| 435 |
// Process all pages with higher resolution
|
|
@@ -602,85 +647,74 @@ return {
|
|
| 602 |
reader.readAsArrayBuffer(file);
|
| 603 |
});
|
| 604 |
}
|
| 605 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
return await convertPDFToImagesEnhanced(pdfData);
|
| 607 |
}
|
| 608 |
-
async function extractTextFromImage(file) {
|
| 609 |
return new Promise(async (resolve, reject) => {
|
| 610 |
try {
|
| 611 |
-
// Apply learned corrections before OCR
|
| 612 |
-
let trainedWords = {};
|
| 613 |
-
if (window.ocrLearningDict) {
|
| 614 |
-
for (const [word, data] of Object.entries(window.ocrLearningDict)) {
|
| 615 |
-
if (data.confirmedCorrect && data.confirmedCorrect !== word) {
|
| 616 |
-
trainedWords[word] = data.confirmedCorrect;
|
| 617 |
-
}
|
| 618 |
-
}
|
| 619 |
-
}
|
| 620 |
-
|
| 621 |
const imageElement = file instanceof HTMLCanvasElement ? file : file;
|
| 622 |
|
| 623 |
-
//
|
| 624 |
-
const
|
| 625 |
-
logger: m => {
|
| 626 |
-
if (m.status === 'recognizing text') {
|
| 627 |
-
console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
|
| 628 |
-
}
|
| 629 |
-
},
|
| 630 |
-
preserve_interword_spaces: '1',
|
| 631 |
-
tessedit_pageseg_mode: '6', // Assume uniform text block
|
| 632 |
-
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
|
| 633 |
-
user_defined_words: Object.keys(trainedWords).join(' '),
|
| 634 |
-
tessedit_create_hocr: '1',
|
| 635 |
-
load_system_dawg: '1',
|
| 636 |
-
load_freq_dawg: '1',
|
| 637 |
-
user_words_suffix: 'tur',
|
| 638 |
-
user_patterns_suffix: 'tur',
|
| 639 |
-
tessedit_ocr_engine_mode: '1', // LSTM OCR engine
|
| 640 |
-
tessedit_do_ocr: '1',
|
| 641 |
-
tessedit_load_image: '1'
|
| 642 |
-
};
|
| 643 |
-
|
| 644 |
-
// Try multiple OCR approaches
|
| 645 |
-
const results = await Promise.allSettled([
|
| 646 |
-
// Primary: Turkish + English with enhanced preprocessing
|
| 647 |
-
performOCRWithPreprocessing(imageElement, 'tur+eng', config),
|
| 648 |
-
// Secondary: Different page segmentation
|
| 649 |
-
Tesseract.recognize(imageElement, 'tur+eng', {
|
| 650 |
-
...config,
|
| 651 |
-
tessedit_pageseg_mode: '1' // Automatic page segmentation
|
| 652 |
-
}),
|
| 653 |
-
// Tertiary: Only English if Turkish fails
|
| 654 |
-
Tesseract.recognize(imageElement, 'eng', config)
|
| 655 |
-
]);
|
| 656 |
|
| 657 |
-
//
|
| 658 |
-
|
| 659 |
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
(
|
| 667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
}
|
| 669 |
-
}
|
| 670 |
-
}
|
| 671 |
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
|
|
|
|
|
|
| 676 |
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
} else {
|
| 682 |
-
resolve(bestResult.text);
|
| 683 |
-
}
|
| 684 |
} else {
|
| 685 |
resolve('No text could be extracted from the image.');
|
| 686 |
}
|
|
@@ -690,43 +724,432 @@ async function extractTextFromImage(file) {
|
|
| 690 |
reject(error);
|
| 691 |
}
|
| 692 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
|
| 694 |
-
//
|
| 695 |
-
async function
|
| 696 |
-
|
|
|
|
|
|
|
| 697 |
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 702 |
|
| 703 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
}
|
| 705 |
|
| 706 |
-
// Calculate
|
| 707 |
-
function
|
| 708 |
if (!text || text.trim().length === 0) return 0;
|
| 709 |
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
|
| 719 |
-
//
|
| 720 |
-
|
|
|
|
|
|
|
| 721 |
}
|
| 722 |
|
| 723 |
-
//
|
| 724 |
-
function
|
| 725 |
-
return
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 729 |
.trim();
|
|
|
|
|
|
|
| 730 |
}
|
| 731 |
function processFormattedOCR(hocr) {
|
| 732 |
// Apply learned corrections
|
|
|
|
| 188 |
|
| 189 |
reader.onload = async function(event) {
|
| 190 |
try {
|
| 191 |
+
// Create a copy of the ArrayBuffer to avoid detachment issues
|
| 192 |
+
const arrayBuffer = event.target.result;
|
| 193 |
+
const typedArray = new Uint8Array(arrayBuffer.slice(0));
|
| 194 |
|
| 195 |
// Enhanced PDF loading with multiple extraction strategies
|
| 196 |
const loadingTask = pdfjsLib.getDocument({
|
| 197 |
+
data: typedArray.buffer,
|
| 198 |
cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
|
| 199 |
cMapPacked: true,
|
| 200 |
standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
|
|
|
|
| 204 |
disableAutoFetch: false,
|
| 205 |
disableStream: false
|
| 206 |
});
|
| 207 |
+
const pdf = await loadingTask.promise;
|
|
|
|
| 208 |
let fullText = '';
|
| 209 |
let metadata = await pdf.getMetadata();
|
| 210 |
|
|
|
|
| 265 |
|
| 266 |
// Strategy 2: Enhanced Turkish character decoding
|
| 267 |
fullText = decodeTurkishText(fullText);
|
|
|
|
| 268 |
// Strategy 3: If still poor quality, try OCR with preprocessing
|
| 269 |
if (!fullText.trim() || fullText.trim().length < 50) {
|
| 270 |
console.warn('Primary text extraction failed, attempting enhanced OCR...');
|
| 271 |
+
// Create a fresh copy for OCR to avoid detachment
|
| 272 |
+
const ocrArrayBuffer = arrayBuffer.slice(0);
|
| 273 |
+
fullText = await enhancedOCRFallback(ocrArrayBuffer);
|
| 274 |
}
|
| 275 |
+
// Strategy 4: Apply text quality improvements
|
|
|
|
| 276 |
fullText = improveTextQuality(fullText);
|
| 277 |
|
| 278 |
resolve(fullText);
|
|
|
|
| 279 |
} catch (error) {
|
| 280 |
console.error('PDF extraction error:', error);
|
| 281 |
+
|
| 282 |
+
// Try a simpler extraction method as fallback
|
| 283 |
+
try {
|
| 284 |
+
console.warn('Attempting simplified PDF extraction...');
|
| 285 |
+
const simpleArray = new Uint8Array(arrayBuffer.slice(0));
|
| 286 |
+
const simpleLoadingTask = pdfjsLib.getDocument(simpleArray.buffer);
|
| 287 |
+
const simplePdf = await simpleLoadingTask.promise;
|
| 288 |
+
let simpleText = '';
|
| 289 |
+
|
| 290 |
+
for (let i = 1; i <= Math.min(simplePdf.numPages, 5); i++) {
|
| 291 |
+
const page = await simplePdf.getPage(i);
|
| 292 |
+
const simpleContent = await page.getTextContent();
|
| 293 |
+
const pageText = simpleContent.items.map(item => item.str).join(' ');
|
| 294 |
+
simpleText += pageText + '\n';
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
if (simpleText.trim()) {
|
| 298 |
+
resolve(decodeTurkishText(improveTextQuality(simpleText)));
|
| 299 |
+
return;
|
| 300 |
+
}
|
| 301 |
+
} catch (fallbackError) {
|
| 302 |
+
console.error('Fallback extraction also failed:', fallbackError);
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
reject(new Error('Failed to extract text from PDF: ' + error.message));
|
| 306 |
}
|
| 307 |
+
};
|
| 308 |
|
| 309 |
reader.onerror = () => reject(new Error('Failed to read PDF file'));
|
| 310 |
reader.readAsArrayBuffer(file);
|
|
|
|
| 356 |
|
| 357 |
return decodedText;
|
| 358 |
}
|
|
|
|
| 359 |
// Enhanced OCR fallback with multiple engines
|
| 360 |
async function enhancedOCRFallback(pdfData) {
|
| 361 |
+
// Ensure we have a valid ArrayBuffer
|
| 362 |
+
let arrayBuffer;
|
| 363 |
+
if (pdfData instanceof ArrayBuffer) {
|
| 364 |
+
arrayBuffer = pdfData;
|
| 365 |
+
} else if (pdfData instanceof Uint8Array) {
|
| 366 |
+
arrayBuffer = pdfData.buffer;
|
| 367 |
+
} else {
|
| 368 |
+
throw new Error('Invalid PDF data format for OCR fallback');
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
const images = await convertPDFToImagesEnhanced(arrayBuffer);
|
| 372 |
+
let ocrResults = [];
|
| 373 |
|
| 374 |
for (const image of images) {
|
| 375 |
// Try multiple OCR approaches
|
|
|
|
| 452 |
ctx.putImageData(imageData, 0, 0);
|
| 453 |
return canvas;
|
| 454 |
}
|
|
|
|
| 455 |
// Enhanced PDF to image conversion
|
| 456 |
async function convertPDFToImagesEnhanced(pdfData) {
|
| 457 |
+
// Ensure we have a fresh copy of the data
|
| 458 |
+
let data;
|
| 459 |
+
if (pdfData instanceof ArrayBuffer) {
|
| 460 |
+
data = new Uint8Array(pdfData.slice(0));
|
| 461 |
+
} else if (pdfData instanceof Uint8Array) {
|
| 462 |
+
data = new Uint8Array(pdfData.buffer.slice(0));
|
| 463 |
+
} else {
|
| 464 |
+
throw new Error('Invalid PDF data format for image conversion');
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
const loadingTask = pdfjsLib.getDocument({
|
| 468 |
+
data: data.buffer,
|
| 469 |
cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
|
| 470 |
cMapPacked: true,
|
| 471 |
+
standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
|
| 472 |
+
// Disable worker for image conversion to avoid detachment issues
|
| 473 |
+
useWorkerFetch: false,
|
| 474 |
+
isEvalSupported: false,
|
| 475 |
+
disableWorker: true
|
| 476 |
});
|
| 477 |
+
const pdf = await loadingTask.promise;
|
|
|
|
| 478 |
const images = [];
|
| 479 |
|
| 480 |
// Process all pages with higher resolution
|
|
|
|
| 647 |
reader.readAsArrayBuffer(file);
|
| 648 |
});
|
| 649 |
}
|
| 650 |
+
/**
 * Render a PDF to page images, handing the renderer a private copy of the bytes.
 *
 * The surrounding code copies buffers before every pdf.js call "to avoid
 * detachment issues"; this wrapper does the same before delegating to
 * convertPDFToImagesEnhanced.
 *
 * @param {ArrayBuffer|Uint8Array|*} pdfData - Raw PDF bytes. Any other value is
 *     forwarded untouched so convertPDFToImagesEnhanced can reject it.
 * @returns {Promise<*>} Whatever convertPDFToImagesEnhanced resolves to.
 */
async function convertPDFToImages(pdfData) {
    if (pdfData instanceof ArrayBuffer) {
        return convertPDFToImagesEnhanced(pdfData.slice(0));
    }

    if (pdfData instanceof Uint8Array) {
        // TypedArray#slice copies only the bytes this view covers. Copying
        // `pdfData.buffer` instead would include bytes outside the view when
        // the array is a window onto a larger buffer (non-zero byteOffset or
        // shorter byteLength), producing a corrupt PDF.
        return convertPDFToImagesEnhanced(pdfData.slice().buffer);
    }

    return convertPDFToImagesEnhanced(pdfData);
}
|
| 659 |
+
async function extractTextFromImage(file) {
|
| 660 |
return new Promise(async (resolve, reject) => {
|
| 661 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
const imageElement = file instanceof HTMLCanvasElement ? file : file;
|
| 663 |
|
| 664 |
+
// Apply advanced preprocessing
|
| 665 |
+
const processedImages = await applyAdvancedPreprocessing(imageElement);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
|
| 667 |
+
// Multi-strategy OCR approach
|
| 668 |
+
const ocrResults = [];
|
| 669 |
|
| 670 |
+
for (const processedImage of processedImages) {
|
| 671 |
+
const results = await Promise.allSettled([
|
| 672 |
+
// Strategy 1: Turkish with best settings
|
| 673 |
+
performAdvancedOCR(processedImage, 'tur', {
|
| 674 |
+
tessedit_pageseg_mode: '6',
|
| 675 |
+
preserve_interword_spaces: '1',
|
| 676 |
+
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
|
| 677 |
+
tessedit_ocr_engine_mode: '1',
|
| 678 |
+
tessedit_do_ocr: '1',
|
| 679 |
+
tessedit_load_image: '1'
|
| 680 |
+
}),
|
| 681 |
+
// Strategy 2: Turkish+English with auto segmentation
|
| 682 |
+
performAdvancedOCR(processedImage, 'tur+eng', {
|
| 683 |
+
tessedit_pageseg_mode: '1',
|
| 684 |
+
preserve_interword_spaces: '1',
|
| 685 |
+
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
|
| 686 |
+
tessedit_ocr_engine_mode: '1'
|
| 687 |
+
}),
|
| 688 |
+
// Strategy 3: Single column mode
|
| 689 |
+
performAdvancedOCR(processedImage, 'tur', {
|
| 690 |
+
tessedit_pageseg_mode: '3',
|
| 691 |
+
preserve_interword_spaces: '1',
|
| 692 |
+
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ'
|
| 693 |
+
})
|
| 694 |
+
]);
|
| 695 |
+
|
| 696 |
+
results.forEach(result => {
|
| 697 |
+
if (result.status === 'fulfilled' && result.value.text.trim().length > 10) {
|
| 698 |
+
ocrResults.push({
|
| 699 |
+
text: result.value.text,
|
| 700 |
+
confidence: result.value.confidence || calculateConfidence(result.value.text),
|
| 701 |
+
strategy: result.value.strategy
|
| 702 |
+
});
|
| 703 |
}
|
| 704 |
+
});
|
| 705 |
+
}
|
| 706 |
|
| 707 |
+
// Select best result using advanced scoring
|
| 708 |
+
const bestResult = selectBestResult(ocrResults);
|
| 709 |
+
|
| 710 |
+
if (bestResult) {
|
| 711 |
+
// Apply document structure analysis
|
| 712 |
+
const structuredText = await analyzeDocumentStructure(bestResult.text);
|
| 713 |
|
| 714 |
+
// Apply intelligent Turkish text corrections
|
| 715 |
+
const correctedText = applyIntelligentTurkishCorrections(structuredText);
|
| 716 |
+
|
| 717 |
+
resolve(correctedText);
|
|
|
|
|
|
|
|
|
|
| 718 |
} else {
|
| 719 |
resolve('No text could be extracted from the image.');
|
| 720 |
}
|
|
|
|
| 724 |
reject(error);
|
| 725 |
}
|
| 726 |
});
|
| 727 |
+
/**
 * Build the list of image variants that OCR is attempted on.
 *
 * The first entry is the source image as a canvas (converted with
 * imageToCanvas when it is not already one). It is followed by one variant per
 * enhancement type: 'contrast', 'denoise', 'sharpen', 'binarize'.
 *
 * Each enhancement is applied to its own copy of the source canvas, so the
 * variants are independent of each other and the source stays untouched even
 * if applyImageEnhancement edits the canvas it receives in place.
 *
 * @param {HTMLCanvasElement|*} imageElement - Canvas, or anything
 *     imageToCanvas accepts.
 * @returns {Promise<HTMLCanvasElement[]>} Source canvas first, then the
 *     variants that could be produced.
 */
async function applyAdvancedPreprocessing(imageElement) {
    const baseCanvas = imageElement instanceof HTMLCanvasElement
        ? imageElement
        : await imageToCanvas(imageElement);

    // Pixel-for-pixel copy so a filter never touches the shared source.
    const cloneCanvas = (source) => {
        const copy = document.createElement('canvas');
        copy.width = source.width;
        copy.height = source.height;
        copy.getContext('2d').drawImage(source, 0, 0);
        return copy;
    };

    const processedImages = [baseCanvas];
    const enhancementTypes = ['contrast', 'denoise', 'sharpen', 'binarize'];

    for (const type of enhancementTypes) {
        try {
            const enhanced = await applyImageEnhancement(cloneCanvas(baseCanvas), type);
            if (enhanced) {
                processedImages.push(enhanced);
            }
        } catch (error) {
            // One failing filter must not prevent OCR on the remaining images.
            console.warn(`Image enhancement "${type}" failed, skipping:`, error);
        }
    }

    return processedImages;
}
|
| 755 |
+
|
| 756 |
+
/**
 * Draw an image source onto a new canvas of the same size.
 *
 * A canvas input is loaded through its data URL; anything else is passed to
 * URL.createObjectURL (so it is expected to be a Blob/File).
 *
 * @param {HTMLCanvasElement|Blob} image - Image source.
 * @returns {Promise<HTMLCanvasElement>} Canvas containing the decoded image.
 *     Rejects when the image cannot be loaded or drawn.
 */
async function imageToCanvas(image) {
    return new Promise((resolve, reject) => {
        const isCanvas = image instanceof HTMLCanvasElement;
        const sourceUrl = isCanvas ? image.toDataURL() : URL.createObjectURL(image);

        // Object URLs keep the underlying data alive until revoked.
        const releaseUrl = () => {
            if (!isCanvas) {
                URL.revokeObjectURL(sourceUrl);
            }
        };

        const img = new Image();

        img.onload = () => {
            try {
                const canvas = document.createElement('canvas');
                canvas.width = img.width;
                canvas.height = img.height;
                canvas.getContext('2d').drawImage(img, 0, 0);
                resolve(canvas);
            } catch (error) {
                reject(error);
            } finally {
                releaseUrl();
            }
        };

        // Without this handler a corrupt or unsupported file would leave the
        // promise pending forever and stall the whole OCR pipeline.
        img.onerror = () => {
            releaseUrl();
            reject(new Error('Failed to load image for OCR preprocessing'));
        };

        img.src = sourceUrl;
    });
}
|
| 771 |
|
| 772 |
+
/**
 * Produce an enhanced copy of a canvas for OCR.
 *
 * Supported `type` values:
 *  - 'contrast': stretches each RGB channel around mid-grey (128) by 1.5.
 *  - 'denoise':  any RGB channel deviating more than 30 from the pixel's own
 *                RGB mean is replaced by that mean.
 *  - 'sharpen':  3x3 convolution with kernel [0,-1,0,-1,5,-1,0,-1,0];
 *                neighbours outside the image are skipped; alpha is set to 255.
 *  - 'binarize': fixed global threshold - luminance above 128 becomes white,
 *                everything else black.
 * Any other value yields an unmodified copy.
 *
 * The result is written to a NEW canvas. The input canvas is only read, so the
 * same source can be enhanced several times and each result is independent.
 *
 * @param {HTMLCanvasElement} canvas - Source image (read-only here).
 * @param {string} type - One of the enhancement names above.
 * @returns {Promise<HTMLCanvasElement>} New canvas holding the enhanced pixels.
 */
async function applyImageEnhancement(canvas, type) {
    const { width, height } = canvas;
    const src = canvas.getContext('2d').getImageData(0, 0, width, height).data;

    const enhanced = document.createElement('canvas');
    enhanced.width = width;
    enhanced.height = height;
    const enhancedCtx = enhanced.getContext('2d');
    const output = enhancedCtx.createImageData(width, height);
    const dst = output.data;

    // Start from an exact copy; each filter overwrites only what it changes.
    dst.set(src);

    switch (type) {
        case 'contrast': {
            const contrast = 1.5;
            for (let i = 0; i < src.length; i += 4) {
                for (let channel = 0; channel < 3; channel++) {
                    // Out-of-range results are clamped by the pixel array.
                    dst[i + channel] = ((src[i + channel] - 128) * contrast) + 128;
                }
            }
            break;
        }

        case 'denoise': {
            const threshold = 30;
            for (let i = 0; i < src.length; i += 4) {
                const avg = (src[i] + src[i + 1] + src[i + 2]) / 3;
                for (let channel = 0; channel < 3; channel++) {
                    if (Math.abs(src[i + channel] - avg) > threshold) {
                        dst[i + channel] = avg;
                    }
                }
            }
            break;
        }

        case 'sharpen': {
            const weights = [0, -1, 0, -1, 5, -1, 0, -1, 0];
            const side = Math.round(Math.sqrt(weights.length));
            const halfSide = Math.floor(side / 2);

            for (let y = 0; y < height; y++) {
                for (let x = 0; x < width; x++) {
                    let r = 0;
                    let g = 0;
                    let b = 0;

                    for (let cy = 0; cy < side; cy++) {
                        for (let cx = 0; cx < side; cx++) {
                            const scy = y + cy - halfSide;
                            const scx = x + cx - halfSide;
                            if (scy < 0 || scy >= height || scx < 0 || scx >= width) {
                                continue;
                            }
                            const srcOff = (scy * width + scx) * 4;
                            const wt = weights[cy * side + cx];
                            r += src[srcOff] * wt;
                            g += src[srcOff + 1] * wt;
                            b += src[srcOff + 2] * wt;
                        }
                    }

                    const dstOff = (y * width + x) * 4;
                    dst[dstOff] = r;
                    dst[dstOff + 1] = g;
                    dst[dstOff + 2] = b;
                    dst[dstOff + 3] = 255;
                }
            }
            break;
        }

        case 'binarize': {
            for (let i = 0; i < src.length; i += 4) {
                const gray = src[i] * 0.299 + src[i + 1] * 0.587 + src[i + 2] * 0.114;
                const level = gray > 128 ? 255 : 0;
                dst[i] = level;
                dst[i + 1] = level;
                dst[i + 2] = level;
            }
            break;
        }

        default:
            // Unknown enhancement: return the plain copy.
            break;
    }

    enhancedCtx.putImageData(output, 0, 0);
    return enhanced;
}
|
| 852 |
+
|
| 853 |
+
/**
 * Run one OCR attempt and normalise its outcome.
 *
 * Failures are logged and reported as an empty result instead of being thrown,
 * so callers can run several strategies side by side and keep whichever ones
 * produced text.
 *
 * NOTE(review): the tessedit_* entries of `config` are forwarded as the third
 * argument of Tesseract.recognize together with `logger`. Whether the bundled
 * Tesseract.js version applies them as engine parameters should be confirmed
 * against its API documentation.
 *
 * @param {*} image - Image source accepted by Tesseract.recognize.
 * @param {string} languages - Language code(s), e.g. 'tur' or 'tur+eng'.
 * @param {Object} [config={}] - Extra options; `tessedit_pageseg_mode` is also
 *     used to label the strategy.
 * @returns {Promise<{text: string, confidence: number, strategy: string}>}
 */
async function performAdvancedOCR(image, languages, config = {}) {
    // A missing config must not turn a successful recognition into a failure.
    const settings = config || {};
    const strategy = `OCR_${languages}_${settings.tessedit_pageseg_mode}`;

    try {
        const result = await Tesseract.recognize(image, languages, {
            logger: m => console.log(`OCR (${languages}): ${m.status} - ${Math.round(m.progress * 100)}%`),
            ...settings
        });

        return {
            text: result.data.text,
            confidence: result.data.confidence || 0,
            strategy
        };
    } catch (error) {
        console.error(`OCR strategy failed:`, error);
        // Same shape as a success so callers can read `strategy` either way.
        return { text: '', confidence: 0, strategy };
    }
}
|
| 871 |
+
/**
 * Pick the OCR candidate with the highest calculateAdvancedScore.
 *
 * On equal scores the earliest candidate wins, because a later one must be
 * strictly better to replace it.
 *
 * @param {Array<{text: string, confidence: number}>} results - Candidates.
 * @returns {Object|null} The winning candidate, or null for an empty list.
 */
function selectBestResult(results) {
    if (results.length === 0) return null;

    let winner = null;
    let topScore = -1;

    for (const candidate of results) {
        const candidateScore = calculateAdvancedScore(candidate.text, candidate.confidence);
        if (candidateScore > topScore) {
            topScore = candidateScore;
            winner = candidate;
        }
    }

    return winner;
}
|
| 888 |
|
| 889 |
+
/**
 * Score an OCR result; higher means more likely to be good Turkish text.
 *
 * The score starts at `baseConfidence` and adds:
 *  - up to 40 for the share of Turkish-specific letters among non-space chars,
 *  - up to 20 for the share of words containing a Turkish-specific letter,
 *  - up to 20 for average sentence length (saturating at 10 words).
 * Texts shorter than 20 characters are halved; the result is capped at 100.
 *
 * NOTE(review): baseConfidence is added unscaled, so results with a high
 * engine confidence all saturate at 100 and tie - confirm the intended
 * weighting.
 *
 * @param {string} text - Recognised text.
 * @param {number} [baseConfidence] - Engine confidence; falsy values count as 0.
 * @returns {number} Score between 0 and 100 (0 for empty text).
 */
function calculateAdvancedScore(text, baseConfidence) {
    if (!text || text.trim().length === 0) return 0;

    // Letters that exist in Turkish but not in the basic Latin alphabet.
    // Deliberately non-global so .test() carries no lastIndex state.
    const turkishLetter = /[ğüşıöçĞÜŞİÖÇ]/;

    let score = baseConfidence || 0;

    // Turkish character detection (up to 40 points)
    const turkishChars = (text.match(/[ğüşıöçĞÜŞİÖÇ]/g) || []).length;
    const totalChars = text.replace(/\s/g, '').length;
    const turkishRatio = totalChars > 0 ? turkishChars / totalChars : 0;
    score += turkishRatio * 40;

    // Word detection (up to 20 points). `\w` and `\b` only know ASCII, which
    // would split "için" into "i" + "in"; Unicode property escapes keep
    // Turkish words whole so the letter test sees the real word.
    const words = text.match(/[\p{L}\p{N}_]+/gu) || [];
    const turkishWords = words.filter(word => turkishLetter.test(word));
    const wordRatio = words.length > 0 ? turkishWords.length / words.length : 0;
    score += wordRatio * 20;

    // Sentence structure (up to 20 points)
    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 5);
    const avgSentenceLength = sentences.length > 0
        ? sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length
        : 0;
    score += Math.min(avgSentenceLength / 10, 1) * 20;

    // Very short texts are unreliable.
    if (text.trim().length < 20) score *= 0.5;

    return Math.min(score, 100);
}
|
| 919 |
+
/**
 * Group the lines of recognised text into typed sections and format them.
 *
 * Blank lines are dropped and every remaining line is trimmed. Each line is
 * classified in this order: heading (isHeading), list item (isListItem),
 * table row (isTableRow), otherwise paragraph. Consecutive lines of the same
 * non-heading type share one section; every heading forms its own section
 * whose level comes from detectHeadingLevel.
 *
 * @param {string} text - Recognised text; null/undefined is treated as empty.
 * @returns {Promise<*>} Result of formatStructuredText for the sections.
 */
async function analyzeDocumentStructure(text) {
    const lines = String(text === null || text === undefined ? '' : text)
        .split('\n')
        .map(line => line.trim())
        .filter(line => line.length > 0);

    const structuredSections = [];
    let currentSection = { type: 'paragraph', content: [], level: 0 };

    // Close the running section (if it holds anything) and open a new one.
    const startSection = (type, content, level) => {
        if (currentSection.content.length > 0) {
            structuredSections.push(currentSection);
        }
        currentSection = { type, content, level };
    };

    // Headings take precedence over lists, lists over tables.
    const classify = (line) => {
        if (isHeading(line)) return 'heading';
        if (isListItem(line)) return 'list';
        if (isTableRow(line)) return 'table';
        return 'paragraph';
    };

    for (const line of lines) {
        const type = classify(line);

        if (type === 'heading') {
            startSection('heading', [line], detectHeadingLevel(line));
            continue;
        }

        if (currentSection.type !== type) {
            startSection(type, [], 0);
        }
        currentSection.content.push(line);
    }

    if (currentSection.content.length > 0) {
        structuredSections.push(currentSection);
    }

    return formatStructuredText(structuredSections);
}
|
| 979 |
+
|
| 980 |
+
/**
 * Heuristically decide whether a line is a heading.
 *
 * Short lines (under 50 characters and at most 8 words) are judged solely by
 * capitalisation: more than 60% of the words must start with an uppercase
 * (including Turkish uppercase) letter. Longer lines count as headings when
 * they end with a colon or start with a number followed by a capitalised word.
 *
 * @param {string} line - A single trimmed line of text.
 * @returns {boolean} True when the line looks like a heading.
 */
function isHeading(line) {
    const tokens = line.split(/\s+/);
    const isShortLine = line.length < 50 && tokens.length <= 8;

    if (isShortLine) {
        let capitalised = 0;
        for (const token of tokens) {
            if (/^[A-ZÇĞİÖŞÜ]/.test(token)) {
                capitalised += 1;
            }
        }
        return capitalised / tokens.length > 0.6;
    }

    return line.endsWith(':') || /^\d+\.?\s+[A-ZÇĞİÖŞÜ]/.test(line);
}
|
| 997 |
|
| 998 |
+
/**
 * Derive a heading level (1 = top level) for a heading line.
 *
 * Numbered headings use their numbering depth: "1 Title" and "1. Title" are
 * level 1, "1.2 Title" level 2, "1.2.3 Title" level 3, capped at 6. The
 * optional dot after the number mirrors the numbered-heading pattern used by
 * isHeading, so "1. Title" is not ranked deeper than "1.1 Title".
 * Unnumbered headings fall back to length: under 30 characters is level 1,
 * under 40 is level 2, anything longer is level 3.
 *
 * @param {string} line - Heading text.
 * @returns {number} Heading level between 1 and 6.
 */
function detectHeadingLevel(line) {
    const numbering = /^(\d+(?:\.\d+)*)\.?\s+/.exec(line);
    if (numbering) {
        const depth = numbering[1].split('.').length;
        return Math.min(depth, 6);
    }

    if (line.length < 30) return 1;
    if (line.length < 40) return 2;
    return 3;
}
|
| 1006 |
+
|
| 1007 |
+
/**
 * Decide whether a line starts with a list marker.
 *
 * Recognised markers (each must be followed by whitespace):
 *  - bullets: "-", "*", "•"
 *  - numbers: "1." and "1)"
 *  - letters: "(a)", "a)" - upper or lower case, including Turkish letters,
 *    which lettered lists in Turkish documents may use (e.g. "ç)").
 *
 * @param {string} line - A single trimmed line of text.
 * @returns {boolean} True when the line begins with a list marker.
 */
function isListItem(line) {
    return /^[-*•]\s+/.test(line) ||
           /^\d+[.)]\s+/.test(line) ||
           /^\(?[a-zA-ZçğıöşüÇĞİÖŞÜ]\)\s+/.test(line);
}
|
| 1013 |
+
|
| 1014 |
+
/**
 * Decide whether a line looks like a table row.
 *
 * A line qualifies when it contains at least two tab characters or at least
 * three pipe characters.
 *
 * @param {string} line - A single line of text.
 * @returns {boolean} True when the line has enough column separators.
 */
function isTableRow(line) {
    // Splitting on a separator yields one more piece than there are separators.
    const occurrences = (separator) => line.split(separator).length - 1;

    if (occurrences('\t') >= 2) {
        return true;
    }
    return occurrences('|') >= 3;
}
|
| 1020 |
+
|
| 1021 |
+
/**
 * Render sections in the format currently selected via `outputFormat.value`.
 *
 * 'markdown', 'json' and 'formatted' delegate to the matching formatter. Any
 * other value produces plain text: each section's lines joined by a space,
 * sections separated by a blank line.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} The formatted document.
 */
function formatStructuredText(sections) {
    switch (outputFormat.value) {
        case 'markdown':
            return formatAsMarkdown(sections);
        case 'json':
            return formatAsJSON(sections);
        case 'formatted':
            return formatAsStructuredText(sections);
        default: {
            const plainBlocks = sections.map(section => section.content.join(' '));
            return plainBlocks.join('\n\n');
        }
    }
}
|
| 1032 |
+
|
| 1033 |
+
/**
 * Render sections as Markdown.
 *
 *  - heading:   "#" repeated `level` times (clamped to 1-6) plus the title.
 *  - list:      one item per line. An existing "-", "*" or "•" marker is
 *               replaced by "- " so items are not double-bulleted; numbered
 *               items ("1. x") are kept as they are.
 *  - table:     rows are split on tabs or pipes, cells trimmed, and empty
 *               cells produced by leading/trailing pipes dropped. The first
 *               row is treated as the header and followed by a delimiter
 *               row so the block is a well-formed pipe table.
 *  - paragraph: lines joined with a space.
 * Blocks are separated by one blank line.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} Markdown text without leading/trailing whitespace.
 */
function formatAsMarkdown(sections) {
    const toListItem = (item) => {
        if (/^\d+\.\s+/.test(item)) {
            return item;
        }
        return `- ${item.replace(/^[-*•]\s+/, '')}`;
    };

    const toCells = (row) => {
        const cells = row.split(/\t+|\|/).map(cell => cell.trim());
        // "| a | b |" splits into ['', 'a', 'b', '']; the edge blanks are
        // artefacts of the outer pipes, not real cells.
        if (cells.length > 0 && cells[0] === '') cells.shift();
        if (cells.length > 0 && cells[cells.length - 1] === '') cells.pop();
        return cells;
    };

    const toRow = (cells) => `| ${cells.join(' | ')} |`;

    const toTable = (rows) => {
        const parsed = rows.map(toCells);
        const lines = parsed.map(toRow);
        if (parsed.length > 0) {
            const delimiter = toRow(parsed[0].map(() => '---'));
            lines.splice(1, 0, delimiter);
        }
        return lines.join('\n');
    };

    const blocks = sections.map(section => {
        switch (section.type) {
            case 'heading': {
                const level = Math.min(Math.max(section.level || 1, 1), 6);
                return `${'#'.repeat(level)} ${section.content[0]}`;
            }
            case 'list':
                return section.content.map(toListItem).join('\n');
            case 'table':
                return toTable(section.content);
            case 'paragraph':
                return section.content.join(' ');
            default:
                return '';
        }
    });

    return blocks.filter(block => block.length > 0).join('\n\n').trim();
}
|
| 1063 |
+
|
| 1064 |
+
/**
 * Serialise sections as pretty-printed JSON (2-space indent).
 *
 * Only `type`, `level` and `content` of each section are emitted, in that
 * order; any other properties are left out.
 *
 * @param {Array<{type: string, level: number, content: string[]}>} sections
 * @returns {string} JSON array text.
 */
function formatAsJSON(sections) {
    const payload = [];

    for (const { type, level, content } of sections) {
        payload.push({ type, level, content });
    }

    return JSON.stringify(payload, null, 2);
}
|
| 1074 |
+
|
| 1075 |
+
/**
 * Render sections as plain text with visual structure.
 *
 *  - heading:   title in upper case, underlined with "=".
 *  - list:      each item prefixed with " • "; an existing "-", "*" or "•"
 *               marker is removed first so items are not double-bulleted.
 *  - table:     rows emitted unchanged, one per line.
 *  - paragraph: lines joined with a space.
 *
 * Headings are upper-cased with Turkish casing rules (i -> İ, ı -> I), since
 * this pipeline targets Turkish documents.
 * NOTE(review): English headings in mixed documents will also get "İ" for
 * "i" - confirm this trade-off is acceptable.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} Formatted text without leading/trailing whitespace.
 */
function formatAsStructuredText(sections) {
    const stripBullet = (item) => item.replace(/^[-*•]\s+/, '');

    let text = '';

    for (const section of sections) {
        switch (section.type) {
            case 'heading': {
                const title = section.content[0].toLocaleUpperCase('tr-TR');
                // Underline the text actually printed; casing can change length.
                text += '\n' + title + '\n';
                text += '='.repeat(title.length) + '\n\n';
                break;
            }
            case 'list':
                for (const item of section.content) {
                    text += ' • ' + stripBullet(item) + '\n';
                }
                text += '\n';
                break;
            case 'table':
                for (const row of section.content) {
                    text += row + '\n';
                }
                text += '\n';
                break;
            case 'paragraph':
                text += section.content.join(' ') + '\n\n';
                break;
            default:
                break;
        }
    }

    return text.trim();
}
|
| 1105 |
+
|
| 1106 |
+
/**
 * Apply conservative post-OCR corrections to Turkish text.
 *
 * Only corrections that cannot damage valid text are applied:
 *  - a 0, 1 or 5 sitting directly between two uppercase letters is replaced
 *    by its look-alike letter (O, İ, S), e.g. "T0PLAM" -> "TOPLAM"; digits in
 *    real numbers are left alone;
 *  - the ASCII-folded word "icin" is restored to "için", keeping its casing;
 *  - spaces before punctuation are removed, and a missing space is added
 *    after , ; ! ? when a letter follows, and after . or : when they sit
 *    between a lowercase and an uppercase letter;
 *  - repeated spaces inside a line and trailing spaces are removed.
 *
 * Line breaks and leading indentation are preserved, because the input is an
 * already structured document (Markdown, JSON or formatted text).
 *
 * Context-free single-letter rewrites (c -> ç, o -> ö, i -> ı, ...) and word
 * rewrites such as "su" -> "şu" are intentionally not performed: those
 * letters and words are valid Turkish and rewriting them corrupts correct
 * text.
 *
 * @param {string} text - Structured OCR output; null/undefined yields ''.
 * @returns {string} Corrected text.
 */
function applyIntelligentTurkishCorrections(text) {
    if (text === null || text === undefined) return '';

    const digitLookalikes = { '0': 'O', '1': 'İ', '5': 'S' };

    const restoreIcin = (word) => {
        if (word === 'ICIN') return 'İÇİN';
        return word[0] === 'I' ? 'İçin' : 'için';
    };

    return String(text)
        // Digit misread inside an uppercase word. The lookahead keeps the
        // following letter available as the left context of the next match.
        .replace(/(\p{Lu})([015])(?=\p{Lu})/gu,
            (match, before, digit) => before + digitLookalikes[digit])
        // Whole word "icin" only; boundaries are Unicode-aware so Turkish
        // letters next to it count as part of a word.
        .replace(/(^|[^\p{L}\p{N}_])(icin)(?![\p{L}\p{N}_])/giu,
            (match, lead, word) => lead + restoreIcin(word))
        // Spacing around punctuation, never touching line breaks.
        .replace(/ +([.,!?;:])/g, '$1')
        .replace(/([,;!?])(?=\p{L})/gu, '$1 ')
        .replace(/(\p{Ll})([.:])(?=\p{Lu})/gu, '$1$2 ')
        // Collapse space runs after a visible character; indentation stays.
        .replace(/(\S) {2,}/g, '$1 ')
        .replace(/[ \t]+$/gm, '')
        .trim();
}
|
| 1154 |
function processFormattedOCR(hocr) {
|
| 1155 |
// Apply learned corrections
|