ASDAD34 committed on
Commit
602f295
·
verified ·
1 Parent(s): ed84526

içerik daha iyi türkçeye ocr çok bozuk iyi yapamıyor. abbyfinereader tarzında markdown, json,text formatına belge yapısına uygun çeviri yapsın. bu talimatı uygulama

Browse files
Files changed (1) hide show
  1. script.js +533 -110
script.js CHANGED
@@ -188,11 +188,13 @@ return {
188
 
189
  reader.onload = async function(event) {
190
  try {
191
- const typedArray = new Uint8Array(event.target.result);
 
 
192
 
193
  // Enhanced PDF loading with multiple extraction strategies
194
  const loadingTask = pdfjsLib.getDocument({
195
- data: typedArray,
196
  cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
197
  cMapPacked: true,
198
  standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
@@ -202,8 +204,7 @@ return {
202
  disableAutoFetch: false,
203
  disableStream: false
204
  });
205
-
206
- const pdf = await loadingTask.promise;
207
  let fullText = '';
208
  let metadata = await pdf.getMetadata();
209
 
@@ -264,23 +265,46 @@ return {
264
 
265
  // Strategy 2: Enhanced Turkish character decoding
266
  fullText = decodeTurkishText(fullText);
267
-
268
  // Strategy 3: If still poor quality, try OCR with preprocessing
269
  if (!fullText.trim() || fullText.trim().length < 50) {
270
  console.warn('Primary text extraction failed, attempting enhanced OCR...');
271
- fullText = await enhancedOCRFallback(typedArray);
 
 
272
  }
273
-
274
- // Strategy 4: Apply text quality improvements
275
  fullText = improveTextQuality(fullText);
276
 
277
  resolve(fullText);
278
-
279
  } catch (error) {
280
  console.error('PDF extraction error:', error);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  reject(new Error('Failed to extract text from PDF: ' + error.message));
282
  }
283
- };
284
 
285
  reader.onerror = () => reject(new Error('Failed to read PDF file'));
286
  reader.readAsArrayBuffer(file);
@@ -332,11 +356,20 @@ return {
332
 
333
  return decodedText;
334
  }
335
-
336
  // Enhanced OCR fallback with multiple engines
337
  async function enhancedOCRFallback(pdfData) {
338
- const images = await convertPDFToImagesEnhanced(pdfData);
339
- let ocrResults = [];
 
 
 
 
 
 
 
 
 
 
340
 
341
  for (const image of images) {
342
  // Try multiple OCR approaches
@@ -419,17 +452,29 @@ return {
419
  ctx.putImageData(imageData, 0, 0);
420
  return canvas;
421
  }
422
-
423
  // Enhanced PDF to image conversion
424
  async function convertPDFToImagesEnhanced(pdfData) {
 
 
 
 
 
 
 
 
 
 
425
  const loadingTask = pdfjsLib.getDocument({
426
- data: pdfData,
427
  cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
428
  cMapPacked: true,
429
- standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/'
 
 
 
 
430
  });
431
-
432
- const pdf = await loadingTask.promise;
433
  const images = [];
434
 
435
  // Process all pages with higher resolution
@@ -602,85 +647,74 @@ return {
602
  reader.readAsArrayBuffer(file);
603
  });
604
  }
605
- async function convertPDFToImages(pdfData) {
 
 
 
 
 
 
606
  return await convertPDFToImagesEnhanced(pdfData);
607
  }
608
- async function extractTextFromImage(file) {
609
  return new Promise(async (resolve, reject) => {
610
  try {
611
- // Apply learned corrections before OCR
612
- let trainedWords = {};
613
- if (window.ocrLearningDict) {
614
- for (const [word, data] of Object.entries(window.ocrLearningDict)) {
615
- if (data.confirmedCorrect && data.confirmedCorrect !== word) {
616
- trainedWords[word] = data.confirmedCorrect;
617
- }
618
- }
619
- }
620
-
621
  const imageElement = file instanceof HTMLCanvasElement ? file : file;
622
 
623
- // Enhanced OCR configuration
624
- const config = {
625
- logger: m => {
626
- if (m.status === 'recognizing text') {
627
- console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
628
- }
629
- },
630
- preserve_interword_spaces: '1',
631
- tessedit_pageseg_mode: '6', // Assume uniform text block
632
- tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
633
- user_defined_words: Object.keys(trainedWords).join(' '),
634
- tessedit_create_hocr: '1',
635
- load_system_dawg: '1',
636
- load_freq_dawg: '1',
637
- user_words_suffix: 'tur',
638
- user_patterns_suffix: 'tur',
639
- tessedit_ocr_engine_mode: '1', // LSTM OCR engine
640
- tessedit_do_ocr: '1',
641
- tessedit_load_image: '1'
642
- };
643
-
644
- // Try multiple OCR approaches
645
- const results = await Promise.allSettled([
646
- // Primary: Turkish + English with enhanced preprocessing
647
- performOCRWithPreprocessing(imageElement, 'tur+eng', config),
648
- // Secondary: Different page segmentation
649
- Tesseract.recognize(imageElement, 'tur+eng', {
650
- ...config,
651
- tessedit_pageseg_mode: '1' // Automatic page segmentation
652
- }),
653
- // Tertiary: Only English if Turkish fails
654
- Tesseract.recognize(imageElement, 'eng', config)
655
- ]);
656
 
657
- // Find and return the best result
658
- let bestResult = { text: '', confidence: 0 };
659
 
660
- results.forEach(result => {
661
- if (result.status === 'fulfilled') {
662
- const text = result.value.text;
663
- const confidence = calculateConfidence(text);
664
-
665
- if (text.trim().length > bestResult.text.length ||
666
- (text.trim().length === bestResult.text.length && confidence > bestResult.confidence)) {
667
- bestResult = { text, confidence };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
  }
669
- }
670
- });
671
 
672
- if (bestResult.text) {
673
- // Apply text quality improvements
674
- bestResult.text = decodeTurkishText(bestResult.text);
675
- bestResult.text = improveTextQuality(bestResult.text);
 
 
676
 
677
- if (outputFormat.value === 'formatted') {
678
- // Create formatted output
679
- const formatted = createFormattedText(bestResult.text);
680
- resolve(formatted);
681
- } else {
682
- resolve(bestResult.text);
683
- }
684
  } else {
685
  resolve('No text could be extracted from the image.');
686
  }
@@ -690,43 +724,432 @@ async function extractTextFromImage(file) {
690
  reject(error);
691
  }
692
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
 
694
- // OCR with image preprocessing
695
- async function performOCRWithPreprocessing(image, languages, config) {
696
- let processedImage = image;
 
 
697
 
698
- if (image instanceof HTMLCanvasElement) {
699
- // Apply preprocessing to canvas
700
- processedImage = await preprocessImage(image);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
  }
 
 
 
 
 
 
 
702
 
703
- return await Tesseract.recognize(processedImage, languages, config);
 
 
 
 
 
 
 
 
704
  }
705
 
706
- // Calculate text confidence score
707
- function calculateConfidence(text) {
708
  if (!text || text.trim().length === 0) return 0;
709
 
710
- // Score based on Turkish word detection
711
- const turkishWords = text.match(/[ğüşıöçĞÜŞİÖÇ]+/g) || [];
712
- const wordCount = text.split(/\s+/).length;
713
- const turkishRatio = turkishWords.length / wordCount;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
 
715
- // Score based on sentence structure
716
- const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
717
- const avgSentenceLength = sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length;
 
 
 
 
 
 
 
 
 
 
 
718
 
719
- // Combined confidence score
720
- return (turkishRatio * 0.5) + (Math.min(avgSentenceLength / 10, 1) * 0.5);
 
 
721
  }
722
 
723
- // Create formatted text output
724
- function createFormattedText(text) {
725
- return text
726
- .replace(/([.!?])\s+/g, '$1\n\n') // Better paragraph breaks
727
- .replace(/\n{3,}/g, '\n\n') // Remove excessive line breaks
728
- .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)\s+/g, '$1 ') // Preserve Turkish words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
729
  .trim();
 
 
730
  }
731
  function processFormattedOCR(hocr) {
732
  // Apply learned corrections
 
188
 
189
  reader.onload = async function(event) {
190
  try {
191
+ // Create a copy of the ArrayBuffer to avoid detachment issues
192
+ const arrayBuffer = event.target.result;
193
+ const typedArray = new Uint8Array(arrayBuffer.slice(0));
194
 
195
  // Enhanced PDF loading with multiple extraction strategies
196
  const loadingTask = pdfjsLib.getDocument({
197
+ data: typedArray.buffer,
198
  cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
199
  cMapPacked: true,
200
  standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
 
204
  disableAutoFetch: false,
205
  disableStream: false
206
  });
207
+ const pdf = await loadingTask.promise;
 
208
  let fullText = '';
209
  let metadata = await pdf.getMetadata();
210
 
 
265
 
266
  // Strategy 2: Enhanced Turkish character decoding
267
  fullText = decodeTurkishText(fullText);
 
268
  // Strategy 3: If still poor quality, try OCR with preprocessing
269
  if (!fullText.trim() || fullText.trim().length < 50) {
270
  console.warn('Primary text extraction failed, attempting enhanced OCR...');
271
+ // Create a fresh copy for OCR to avoid detachment
272
+ const ocrArrayBuffer = arrayBuffer.slice(0);
273
+ fullText = await enhancedOCRFallback(ocrArrayBuffer);
274
  }
275
+ // Strategy 4: Apply text quality improvements
 
276
  fullText = improveTextQuality(fullText);
277
 
278
  resolve(fullText);
 
279
  } catch (error) {
280
  console.error('PDF extraction error:', error);
281
+
282
+ // Try a simpler extraction method as fallback
283
+ try {
284
+ console.warn('Attempting simplified PDF extraction...');
285
+ const simpleArray = new Uint8Array(arrayBuffer.slice(0));
286
+ const simpleLoadingTask = pdfjsLib.getDocument(simpleArray.buffer);
287
+ const simplePdf = await simpleLoadingTask.promise;
288
+ let simpleText = '';
289
+
290
+ for (let i = 1; i <= Math.min(simplePdf.numPages, 5); i++) {
291
+ const page = await simplePdf.getPage(i);
292
+ const simpleContent = await page.getTextContent();
293
+ const pageText = simpleContent.items.map(item => item.str).join(' ');
294
+ simpleText += pageText + '\n';
295
+ }
296
+
297
+ if (simpleText.trim()) {
298
+ resolve(decodeTurkishText(improveTextQuality(simpleText)));
299
+ return;
300
+ }
301
+ } catch (fallbackError) {
302
+ console.error('Fallback extraction also failed:', fallbackError);
303
+ }
304
+
305
  reject(new Error('Failed to extract text from PDF: ' + error.message));
306
  }
307
+ };
308
 
309
  reader.onerror = () => reject(new Error('Failed to read PDF file'));
310
  reader.readAsArrayBuffer(file);
 
356
 
357
  return decodedText;
358
  }
 
359
  // Enhanced OCR fallback with multiple engines
360
  async function enhancedOCRFallback(pdfData) {
361
+ // Ensure we have a valid ArrayBuffer
362
+ let arrayBuffer;
363
+ if (pdfData instanceof ArrayBuffer) {
364
+ arrayBuffer = pdfData;
365
+ } else if (pdfData instanceof Uint8Array) {
366
+ arrayBuffer = pdfData.buffer;
367
+ } else {
368
+ throw new Error('Invalid PDF data format for OCR fallback');
369
+ }
370
+
371
+ const images = await convertPDFToImagesEnhanced(arrayBuffer);
372
+ let ocrResults = [];
373
 
374
  for (const image of images) {
375
  // Try multiple OCR approaches
 
452
  ctx.putImageData(imageData, 0, 0);
453
  return canvas;
454
  }
 
455
  // Enhanced PDF to image conversion
456
  async function convertPDFToImagesEnhanced(pdfData) {
457
+ // Ensure we have a fresh copy of the data
458
+ let data;
459
+ if (pdfData instanceof ArrayBuffer) {
460
+ data = new Uint8Array(pdfData.slice(0));
461
+ } else if (pdfData instanceof Uint8Array) {
462
+ data = new Uint8Array(pdfData.buffer.slice(0));
463
+ } else {
464
+ throw new Error('Invalid PDF data format for image conversion');
465
+ }
466
+
467
  const loadingTask = pdfjsLib.getDocument({
468
+ data: data.buffer,
469
  cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
470
  cMapPacked: true,
471
+ standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
472
+ // Disable worker for image conversion to avoid detachment issues
473
+ useWorkerFetch: false,
474
+ isEvalSupported: false,
475
+ disableWorker: true
476
  });
477
+ const pdf = await loadingTask.promise;
 
478
  const images = [];
479
 
480
  // Process all pages with higher resolution
 
647
  reader.readAsArrayBuffer(file);
648
  });
649
  }
650
/**
 * Renders a PDF to page images by delegating to convertPDFToImagesEnhanced,
 * handing it a private copy of the PDF bytes.
 *
 * @param {ArrayBuffer|Uint8Array|*} pdfData - Raw PDF bytes. Any other value is
 *   forwarded unchanged so the callee can apply its own validation.
 * @returns {Promise<*>} Whatever convertPDFToImagesEnhanced resolves with.
 */
async function convertPDFToImages(pdfData) {
    if (pdfData instanceof ArrayBuffer) {
        return await convertPDFToImagesEnhanced(pdfData.slice(0));
    }

    if (pdfData instanceof Uint8Array) {
        // A typed array may be a window onto a larger buffer; copy only the
        // bytes the view actually covers instead of the whole backing store.
        const start = pdfData.byteOffset;
        const end = start + pdfData.byteLength;
        return await convertPDFToImagesEnhanced(pdfData.buffer.slice(start, end));
    }

    return await convertPDFToImagesEnhanced(pdfData);
}
659
+ async function extractTextFromImage(file) {
660
  return new Promise(async (resolve, reject) => {
661
  try {
 
 
 
 
 
 
 
 
 
 
662
  const imageElement = file instanceof HTMLCanvasElement ? file : file;
663
 
664
+ // Apply advanced preprocessing
665
+ const processedImages = await applyAdvancedPreprocessing(imageElement);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
 
667
+ // Multi-strategy OCR approach
668
+ const ocrResults = [];
669
 
670
+ for (const processedImage of processedImages) {
671
+ const results = await Promise.allSettled([
672
+ // Strategy 1: Turkish with best settings
673
+ performAdvancedOCR(processedImage, 'tur', {
674
+ tessedit_pageseg_mode: '6',
675
+ preserve_interword_spaces: '1',
676
+ tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
677
+ tessedit_ocr_engine_mode: '1',
678
+ tessedit_do_ocr: '1',
679
+ tessedit_load_image: '1'
680
+ }),
681
+ // Strategy 2: Turkish+English with auto segmentation
682
+ performAdvancedOCR(processedImage, 'tur+eng', {
683
+ tessedit_pageseg_mode: '1',
684
+ preserve_interword_spaces: '1',
685
+ tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
686
+ tessedit_ocr_engine_mode: '1'
687
+ }),
688
+ // Strategy 3: Single column mode
689
+ performAdvancedOCR(processedImage, 'tur', {
690
+ tessedit_pageseg_mode: '3',
691
+ preserve_interword_spaces: '1',
692
+ tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ'
693
+ })
694
+ ]);
695
+
696
+ results.forEach(result => {
697
+ if (result.status === 'fulfilled' && result.value.text.trim().length > 10) {
698
+ ocrResults.push({
699
+ text: result.value.text,
700
+ confidence: result.value.confidence || calculateConfidence(result.value.text),
701
+ strategy: result.value.strategy
702
+ });
703
  }
704
+ });
705
+ }
706
 
707
+ // Select best result using advanced scoring
708
+ const bestResult = selectBestResult(ocrResults);
709
+
710
+ if (bestResult) {
711
+ // Apply document structure analysis
712
+ const structuredText = await analyzeDocumentStructure(bestResult.text);
713
 
714
+ // Apply intelligent Turkish text corrections
715
+ const correctedText = applyIntelligentTurkishCorrections(structuredText);
716
+
717
+ resolve(correctedText);
 
 
 
718
  } else {
719
  resolve('No text could be extracted from the image.');
720
  }
 
724
  reject(error);
725
  }
726
  });
727
/**
 * Builds the set of image variants that OCR is attempted on: the untouched
 * source canvas followed by contrast, denoise, sharpen and binarize variants.
 *
 * applyImageEnhancement is handed a fresh clone for every variant, so the
 * source canvas is never modified and every entry in the result is a distinct
 * canvas.
 *
 * @param {HTMLCanvasElement|*} imageElement - A canvas, or anything
 *   imageToCanvas can turn into one.
 * @returns {Promise<Array>} The original canvas followed by its variants.
 */
async function applyAdvancedPreprocessing(imageElement) {
    const original = imageElement instanceof HTMLCanvasElement
        ? imageElement
        : await imageToCanvas(imageElement);

    // Pixel-for-pixel copy so an enhancement cannot leak into other variants.
    const cloneCanvas = (source) => {
        const copy = document.createElement('canvas');
        copy.width = source.width;
        copy.height = source.height;
        copy.getContext('2d').drawImage(source, 0, 0);
        return copy;
    };

    const processedImages = [original];
    const enhancementTypes = ['contrast', 'denoise', 'sharpen', 'binarize'];

    for (const type of enhancementTypes) {
        const enhanced = await applyImageEnhancement(cloneCanvas(original), type);
        if (enhanced !== null) {
            processedImages.push(enhanced);
        }
    }

    return processedImages;
}
755
+
756
/**
 * Draws an image source onto a new canvas of the same pixel size.
 *
 * @param {HTMLCanvasElement|Blob|File} image - A canvas (re-encoded through a
 *   data URL) or an object accepted by URL.createObjectURL.
 * @returns {Promise<HTMLCanvasElement>} Resolves with the populated canvas;
 *   rejects if the image cannot be loaded or drawn.
 */
async function imageToCanvas(image) {
    const isCanvas = image instanceof HTMLCanvasElement;
    const src = isCanvas ? image.toDataURL() : URL.createObjectURL(image);

    // Object URLs keep their backing data alive until revoked.
    const releaseSource = () => {
        if (!isCanvas) {
            URL.revokeObjectURL(src);
        }
    };

    return new Promise((resolve, reject) => {
        const img = new Image();

        img.onload = () => {
            try {
                const canvas = document.createElement('canvas');
                canvas.width = img.width;
                canvas.height = img.height;
                canvas.getContext('2d').drawImage(img, 0, 0);
                resolve(canvas);
            } catch (error) {
                reject(error);
            } finally {
                releaseSource();
            }
        };

        // Without this handler an undecodable image would leave the promise
        // pending forever.
        img.onerror = () => {
            releaseSource();
            reject(new Error('Failed to load image for canvas conversion'));
        };

        img.src = src;
    });
}
771
 
772
/**
 * Applies one enhancement filter to a canvas IN PLACE and returns that same
 * canvas. Callers that need the source untouched must pass a copy.
 *
 * Supported types:
 *  - 'contrast': scales each colour channel around mid-grey (128) by 1.5.
 *  - 'denoise':  pulls a channel to the pixel's RGB mean when it deviates from
 *                that mean by more than 30.
 *  - 'sharpen':  3x3 sharpen kernel; alpha is forced to 255.
 *  - 'binarize': fixed global threshold (luma > 128 becomes white, else black).
 * Any other type writes the pixels back unchanged.
 *
 * @param {HTMLCanvasElement} canvas - Canvas to modify.
 * @param {string} type - One of the filter names above.
 * @returns {Promise<HTMLCanvasElement>} The canvas that was passed in.
 */
async function applyImageEnhancement(canvas, type) {
    const ctx = canvas.getContext('2d');
    const width = canvas.width;
    const height = canvas.height;
    const imageData = ctx.getImageData(0, 0, width, height);
    const data = imageData.data;

    switch (type) {
        case 'contrast': {
            const contrast = 1.5;
            for (let i = 0; i < data.length; i += 4) {
                data[i] = ((data[i] - 128) * contrast) + 128;
                data[i + 1] = ((data[i + 1] - 128) * contrast) + 128;
                data[i + 2] = ((data[i + 2] - 128) * contrast) + 128;
            }
            break;
        }

        case 'denoise': {
            const threshold = 30;
            for (let i = 0; i < data.length; i += 4) {
                // The mean is taken before any channel of this pixel changes.
                const avg = (data[i] + data[i + 1] + data[i + 2]) / 3;
                if (Math.abs(data[i] - avg) > threshold) data[i] = avg;
                if (Math.abs(data[i + 1] - avg) > threshold) data[i + 1] = avg;
                if (Math.abs(data[i + 2] - avg) > threshold) data[i + 2] = avg;
            }
            break;
        }

        case 'sharpen': {
            const kernel = [0, -1, 0, -1, 5, -1, 0, -1, 0];
            const side = 3;
            const half = 1;
            const output = ctx.createImageData(width, height);
            const dst = output.data;
            const clamp = (value, max) => Math.min(Math.max(value, 0), max);

            for (let y = 0; y < height; y++) {
                for (let x = 0; x < width; x++) {
                    let r = 0;
                    let g = 0;
                    let b = 0;

                    for (let ky = 0; ky < side; ky++) {
                        for (let kx = 0; kx < side; kx++) {
                            const weight = kernel[ky * side + kx];
                            if (weight === 0) continue;
                            // Replicate edge pixels for out-of-range taps so the
                            // kernel still sums to 1 at the borders; dropping
                            // those taps would brighten the image edges.
                            const sy = clamp(y + ky - half, height - 1);
                            const sx = clamp(x + kx - half, width - 1);
                            const srcOff = (sy * width + sx) * 4;
                            r += data[srcOff] * weight;
                            g += data[srcOff + 1] * weight;
                            b += data[srcOff + 2] * weight;
                        }
                    }

                    const dstOff = (y * width + x) * 4;
                    dst[dstOff] = r;
                    dst[dstOff + 1] = g;
                    dst[dstOff + 2] = b;
                    dst[dstOff + 3] = 255;
                }
            }

            ctx.putImageData(output, 0, 0);
            return canvas;
        }

        case 'binarize': {
            for (let i = 0; i < data.length; i += 4) {
                const gray = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
                const value = gray > 128 ? 255 : 0;
                data[i] = value;
                data[i + 1] = value;
                data[i + 2] = value;
            }
            break;
        }

        default:
            break;
    }

    ctx.putImageData(imageData, 0, 0);
    return canvas;
}
852
+
853
/**
 * Runs a single Tesseract recognition pass and normalises the outcome.
 *
 * Failures are logged and converted into an empty result rather than thrown,
 * so the returned promise always fulfils.
 *
 * @param {*} image - Image source handed to Tesseract.recognize.
 * @param {string} languages - Tesseract language code(s), e.g. 'tur' or 'tur+eng'.
 * @param {Object} [config] - Extra options spread into the recognize options;
 *   a `logger` key here overrides the default progress logger.
 * @returns {Promise<{text: string, confidence: number, strategy: string}>}
 *   Recognised text, confidence (0 when missing or on failure) and a label
 *   identifying the language/page-segmentation combination used.
 */
async function performAdvancedOCR(image, languages, config) {
    const options = config || {};
    const strategy = `OCR_${languages}_${options.tessedit_pageseg_mode}`;

    try {
        const result = await Tesseract.recognize(image, languages, {
            // Some status messages may carry no progress value; treat as 0.
            logger: m => console.log(`OCR (${languages}): ${m.status} - ${Math.round((m.progress || 0) * 100)}%`),
            ...options
        });

        return {
            text: result.data.text,
            confidence: result.data.confidence || 0,
            strategy
        };
    } catch (error) {
        console.error(`OCR strategy failed:`, error);
        // Same shape as the success result so callers can read `strategy`.
        return { text: '', confidence: 0, strategy };
    }
}
871
/**
 * Picks the OCR candidate with the highest calculateAdvancedScore value.
 * On a tie the earliest candidate wins.
 *
 * @param {Array<{text: string, confidence: number}>} results - OCR candidates.
 * @returns {Object|null} The winning candidate, or null when there are none.
 */
function selectBestResult(results) {
    if (!Array.isArray(results) || results.length === 0) return null;

    let bestScore = -1;
    let bestResult = null;

    for (const result of results) {
        const score = calculateAdvancedScore(result.text, result.confidence);
        if (score > bestScore) {
            bestScore = score;
            bestResult = result;
        }
    }

    return bestResult;
}
888
 
889
/**
 * Scores an OCR result. Starts from the engine confidence and adds up to 40
 * points for the share of Turkish-specific characters, up to 20 for the share
 * of words containing one, and up to 20 for average sentence length. Texts
 * shorter than 20 characters are halved. The result is capped at 100.
 *
 * @param {string} text - Recognised text.
 * @param {number} baseConfidence - Engine confidence; falsy values count as 0.
 * @returns {number} Score between 0 and 100; 0 for empty or blank text.
 */
function calculateAdvancedScore(text, baseConfidence) {
    if (!text || text.trim().length === 0) return 0;

    let score = baseConfidence || 0;

    // Turkish character share (up to 40 points).
    const turkishChars = (text.match(/[ğüşıöçĞÜŞİÖÇ]/g) || []).length;
    const totalChars = text.replace(/\s/g, '').length;
    const turkishRatio = totalChars > 0 ? turkishChars / totalChars : 0;
    score += turkishRatio * 40;

    // Share of words containing a Turkish-specific letter (up to 20 points).
    // Tokenise with Unicode letter classes: `\w` is ASCII-only and would cut
    // words apart at every ğ/ş/ö/... Plain `i`/`I` are not Turkish-specific,
    // so they are not part of the marker class.
    const words = text.match(/[\p{L}\p{N}_]+/gu) || [];
    const turkishMarker = /[ğüşıöçĞÜŞİÖÇ]/;
    const turkishWords = words.filter(word => turkishMarker.test(word));
    const wordRatio = words.length > 0 ? turkishWords.length / words.length : 0;
    score += wordRatio * 20;

    // Average sentence length in words, saturating at 10 (up to 20 points).
    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 5);
    const avgSentenceLength = sentences.length > 0
        ? sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length
        : 0;
    score += Math.min(avgSentenceLength / 10, 1) * 20;

    // Very short texts are unreliable.
    if (text.trim().length < 20) score *= 0.5;

    return Math.min(score, 100);
}
919
/**
 * Groups the non-blank lines of OCR text into heading, list, table and
 * paragraph sections, then renders them with formatStructuredText.
 *
 * Each line is classified in this order: isHeading, isListItem, isTableRow,
 * otherwise paragraph. Every heading line becomes its own section; consecutive
 * lines of any other kind are merged into one section.
 *
 * @param {string} text - Raw text; a non-string is treated as empty.
 * @returns {Promise<*>} Whatever formatStructuredText returns for the sections.
 */
async function analyzeDocumentStructure(text) {
    const source = typeof text === 'string' ? text : '';
    const lines = source
        .split('\n')
        .map(line => line.trim())
        .filter(line => line.length > 0);

    const structuredSections = [];
    let currentSection = { type: 'paragraph', content: [], level: 0 };

    // Store the section being built, unless it never received any content.
    const flush = () => {
        if (currentSection.content.length > 0) {
            structuredSections.push(currentSection);
        }
    };

    // Append to the running section when the kind matches, else start a new one.
    const appendTo = (type, line) => {
        if (currentSection.type !== type) {
            flush();
            currentSection = { type, content: [], level: 0 };
        }
        currentSection.content.push(line);
    };

    for (const line of lines) {
        if (isHeading(line)) {
            flush();
            currentSection = {
                type: 'heading',
                content: [line],
                level: detectHeadingLevel(line)
            };
        } else if (isListItem(line)) {
            appendTo('list', line);
        } else if (isTableRow(line)) {
            appendTo('table', line);
        } else {
            appendTo('paragraph', line);
        }
    }

    flush();

    return formatStructuredText(structuredSections);
}
979
+
980
/**
 * Heuristically decides whether a line of text is a heading.
 *
 * A line is a heading when any of these holds:
 *  1. it is short (under 50 characters, at most 8 words) and more than 60% of
 *     its words start with an uppercase letter (Turkish capitals included);
 *  2. it ends with a colon;
 *  3. it is not short and starts with a number followed by a capitalised word.
 *
 * Rule 3 stays limited to longer lines because short "1. Text" lines are also
 * matched by the list-item pattern and should remain list items.
 *
 * @param {string} line - A single trimmed line.
 * @returns {boolean} True when the line looks like a heading.
 */
function isHeading(line) {
    if (typeof line !== 'string' || line.trim().length === 0) return false;

    const words = line.split(/\s+/);
    const isShort = line.length < 50 && words.length <= 8;

    if (isShort) {
        const titleWords = words.filter(word => /^[A-ZÇĞİÖŞÜ]/.test(word));
        // Only a positive match decides here; otherwise fall through so the
        // colon rule below still applies to short lines.
        if (titleWords.length / words.length > 0.6) return true;
    }

    if (line.endsWith(':')) return true;

    if (!isShort && /^\d+\.?\s+[A-ZÇĞİÖŞÜ]/.test(line)) return true;

    return false;
}
997
 
998
/**
 * Estimates the nesting level of a heading line.
 *
 * Numbered headings use the depth of their number ("1" or "1." gives 1,
 * "1.2" gives 2, "1.2.3" gives 3, capped at 6). Unnumbered headings fall back
 * to length: under 30 characters gives 1, under 40 gives 2, otherwise 3.
 *
 * @param {string} line - Heading text.
 * @returns {number} Heading level, 1 or greater.
 */
function detectHeadingLevel(line) {
    // Optional trailing dot so "1. Title" and "1.2. Title" count as numbered.
    const numbered = /^(\d+(?:\.\d+)*)\.?\s+/.exec(line);
    if (numbered) {
        return Math.min(numbered[1].split('.').length, 6);
    }

    if (line.length < 30) return 1;
    if (line.length < 40) return 2;
    return 3;
}
1006
+
1007
/**
 * Tells whether a line starts with a list marker followed by whitespace:
 * a bullet (-, *, •), a number with a dot ("3."), or a lowercase letter in
 * parentheses ("(a)").
 *
 * @param {string} line - A single trimmed line.
 * @returns {boolean} True when the line starts with one of those markers.
 */
function isListItem(line) {
    const markers = [
        /^[-*•]\s+/,
        /^\d+\.\s+/,
        /^\([a-z]\)\s+/
    ];
    return markers.some(marker => marker.test(line));
}
1013
+
1014
/**
 * Tells whether a line looks like a table row: it contains at least two tab
 * characters or at least three pipe characters.
 *
 * @param {string} line - A single line.
 * @returns {boolean} True when the separator count suggests a table row.
 */
function isTableRow(line) {
    const tabCount = line.split('\t').length - 1;
    const pipeCount = line.split('|').length - 1;
    return tabCount >= 2 || pipeCount >= 3;
}
1020
+
1021
/**
 * Renders sections in the format currently selected in `outputFormat.value`:
 * 'markdown', 'json' or 'formatted'. Any other value yields plain text with
 * each section's lines joined by spaces and sections separated by a blank line.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} The rendered document.
 */
function formatStructuredText(sections) {
    switch (outputFormat.value) {
        case 'markdown':
            return formatAsMarkdown(sections);
        case 'json':
            return formatAsJSON(sections);
        case 'formatted':
            return formatAsStructuredText(sections);
        default:
            return sections.map(s => s.content.join(' ')).join('\n\n');
    }
}
1032
+
1033
/**
 * Renders sections as Markdown.
 *
 *  - heading:   '#' repeated `level` times (clamped to 1..6) plus the text.
 *  - list:      bullet markers (-, *, •) become '- '; numbered items keep
 *               their number; anything else is prefixed with '- '.
 *  - table:     rows are split on tabs or pipes, cells trimmed, and a
 *               '| --- |' separator is inserted after the first row unless the
 *               second row already is one.
 *  - paragraph: lines joined with single spaces.
 * Sections of any other type are skipped. Blocks are separated by a blank line.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} Markdown text.
 */
function formatAsMarkdown(sections) {
    // List lines still carry their source marker; reuse it rather than
    // stacking a second one in front ("- - item").
    const renderListItem = (item) => {
        const ordered = /^(\d+)\.\s+(.*)$/.exec(item);
        if (ordered) return `${ordered[1]}. ${ordered[2]}`;
        const bullet = /^[-*•]\s+(.*)$/.exec(item);
        if (bullet) return `- ${bullet[1]}`;
        return `- ${item}`;
    };

    // Split a row into trimmed cells, dropping the empty cells produced by a
    // leading or trailing pipe.
    const splitRow = (row) => {
        const cells = row.split(/\t+|\|/).map(cell => cell.trim());
        if (cells.length > 1 && cells[0] === '') cells.shift();
        if (cells.length > 1 && cells[cells.length - 1] === '') cells.pop();
        return cells;
    };

    const renderRow = (cells) => `| ${cells.join(' | ')} |`;
    const isSeparatorRow = (cells) => cells.every(cell => /^:?-+:?$/.test(cell));

    const renderTable = (content) => {
        const rows = content.map(splitRow);
        if (rows.length === 0) return '';
        const lines = [renderRow(rows[0])];
        if (!(rows.length > 1 && isSeparatorRow(rows[1]))) {
            lines.push(renderRow(rows[0].map(() => '---')));
        }
        rows.slice(1).forEach(cells => lines.push(renderRow(cells)));
        return lines.join('\n');
    };

    const blocks = sections.map(section => {
        switch (section.type) {
            case 'heading': {
                const level = Math.min(Math.max(section.level || 1, 1), 6);
                return `${'#'.repeat(level)} ${section.content[0]}`;
            }
            case 'list':
                return section.content.map(renderListItem).join('\n');
            case 'table':
                return renderTable(section.content);
            case 'paragraph':
                return section.content.join(' ');
            default:
                return '';
        }
    });

    return blocks.filter(block => block.length > 0).join('\n\n').trim();
}
1063
+
1064
/**
 * Serialises sections as pretty-printed JSON (2-space indent), keeping only
 * the `type`, `level` and `content` fields of each section.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} JSON text.
 */
function formatAsJSON(sections) {
    const structured = sections.map(({ type, level, content }) => ({ type, level, content }));
    return JSON.stringify(structured, null, 2);
}
1074
+
1075
/**
 * Renders sections as plain text with visual structure.
 *
 *  - heading:   upper-cased title underlined with '=' characters.
 *  - list:      bullet markers (-, *, •) are replaced by ' • '; numbered and
 *               lettered items keep their own marker; unmarked items get ' • '.
 *  - table:     rows are emitted unchanged, one per line.
 *  - paragraph: lines joined with single spaces.
 * Sections of any other type are skipped.
 *
 * @param {Array<{type: string, content: string[], level: number}>} sections
 * @returns {string} The formatted text, trimmed.
 */
function formatAsStructuredText(sections) {
    // List lines still carry their source marker, so swap it for the bullet
    // instead of prefixing a second one (" • - item").
    const renderListItem = (item) => {
        const bullet = /^[-*•]\s+(.*)$/.exec(item);
        if (bullet) return ' • ' + bullet[1];
        if (/^\d+\.\s+/.test(item) || /^\([a-z]\)\s+/.test(item)) return ' ' + item;
        return ' • ' + item;
    };

    let text = '';

    for (const section of sections) {
        switch (section.type) {
            case 'heading': {
                const title = section.content[0];
                // NOTE(review): toUpperCase() is locale-independent, so Turkish
                // "i" becomes "I" rather than "İ". Switching to
                // toLocaleUpperCase('tr-TR') would fix Turkish headings but
                // alter English ones; confirm the intended document language.
                text += '\n' + title.toUpperCase() + '\n';
                text += '='.repeat(title.length) + '\n\n';
                break;
            }
            case 'list':
                for (const item of section.content) {
                    text += renderListItem(item) + '\n';
                }
                text += '\n';
                break;
            case 'table':
                for (const row of section.content) {
                    text += row + '\n';
                }
                text += '\n';
                break;
            case 'paragraph':
                text += section.content.join(' ') + '\n\n';
                break;
            default:
                break;
        }
    }

    return text.trim();
}
1105
+
1106
/**
 * Applies conservative clean-up to OCR output without disturbing its line
 * structure, so Markdown or JSON produced earlier stays intact.
 *
 * Corrections performed:
 *  - a 0, 1 or 5 sandwiched between uppercase letters becomes O, İ or S
 *    (e.g. "T0PLAM" becomes "TOPLAM"); digits elsewhere are left alone so
 *    numbers survive;
 *  - the standalone word "icin" becomes "için" ("Icin"/"İcin" become "İçin");
 *  - spaces before punctuation are removed, a missing space is added after
 *    , ; ! ? when a letter follows, and after a full stop between a lowercase
 *    and an uppercase letter;
 *  - runs of spaces inside a line collapse to one (leading indentation and
 *    newlines are preserved).
 *
 * Standalone letters and valid words such as "o" or "su" are deliberately not
 * rewritten: blanket substitutions corrupt correct Turkish text.
 *
 * @param {string} text - Text to clean.
 * @returns {string} Corrected text, trimmed; '' for empty or non-string input.
 */
function applyIntelligentTurkishCorrections(text) {
    if (typeof text !== 'string' || text.length === 0) return '';

    const UPPER = 'A-ZÇĞİÖŞÜ';
    const LETTER = 'A-Za-zÇĞİÖŞÜçğıöşü';
    // `\b` only knows ASCII word characters, so word edges are spelled out.
    const WORD_CHAR = `${LETTER}0-9_`;

    let correctedText = text;

    // Digit/letter confusions, only where both neighbours are capitals.
    const digitFixes = [['0', 'O'], ['1', 'İ'], ['5', 'S']];
    for (const [digit, letter] of digitFixes) {
        const pattern = new RegExp(`([${UPPER}])${digit}(?=[${UPPER}])`, 'g');
        correctedText = correctedText.replace(pattern, `$1${letter}`);
    }

    // "icin" has no valid reading other than a mis-recognised "için".
    const icinPattern = new RegExp(`(^|[^${WORD_CHAR}])([iIİ])cin(?![${WORD_CHAR}])`, 'g');
    correctedText = correctedText.replace(
        icinPattern,
        (match, before, first) => before + (first === 'i' ? 'i' : 'İ') + 'çin'
    );

    return correctedText
        // "kelime ," becomes "kelime," (but keep " .5"-style numbers).
        .replace(/(\S) +([.,!?;:])(?!\d)/g, '$1$2')
        // "gitti,ama" becomes "gitti, ama"; decimals such as "3,14" are untouched.
        .replace(new RegExp(`([,;!?])(?=[${LETTER}])`, 'g'), '$1 ')
        // "Geldi.Sonra" becomes "Geldi. Sonra"; lowercase URLs are untouched.
        .replace(/([a-zçğıöşü])\.(?=[A-ZÇĞİÖŞÜ])/g, '$1. ')
        // Collapse interior space runs only; newlines and indentation survive.
        .replace(/(\S) {2,}/g, '$1 ')
        .trim();
}
1154
  function processFormattedOCR(hocr) {
1155
  // Apply learned corrections