Spaces:

ASDAD34
/

docextractor-pro

Running

App Files Files Community

ASDAD34 committed on Nov 29, 2025

Commit

602f295

verified ·

1 Parent(s): ed84526

içerik daha iyi türkçeye ocr çok bozuk iyi yapamıyor. abbyfinereader tarzında markdown, json,text formatına belge yapısına uygun çeviri yapsın. bu talimatı uygulama

Browse files

Files changed (1) hide show

script.js +533 -110

script.js CHANGED Viewed

@@ -188,11 +188,13 @@ return {
             reader.onload = async function(event) {
                 try {
-                    const typedArray = new Uint8Array(event.target.result);
                     // Enhanced PDF loading with multiple extraction strategies
                     const loadingTask = pdfjsLib.getDocument({
-                        data: typedArray,
                         cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
                         cMapPacked: true,
                         standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
@@ -202,8 +204,7 @@ return {
                         disableAutoFetch: false,
                         disableStream: false
                     });
-                    const pdf = await loadingTask.promise;
                     let fullText = '';
                     let metadata = await pdf.getMetadata();
@@ -264,23 +265,46 @@ return {
                     // Strategy 2: Enhanced Turkish character decoding
                     fullText = decodeTurkishText(fullText);
                     // Strategy 3: If still poor quality, try OCR with preprocessing
                     if (!fullText.trim() || fullText.trim().length < 50) {
                         console.warn('Primary text extraction failed, attempting enhanced OCR...');
-                        fullText = await enhancedOCRFallback(typedArray);
                     }
-                    // Strategy 4: Apply text quality improvements
                     fullText = improveTextQuality(fullText);
                     resolve(fullText);
                 } catch (error) {
                     console.error('PDF extraction error:', error);
                     reject(new Error('Failed to extract text from PDF: ' + error.message));
                 }
-            };
             reader.onerror = () => reject(new Error('Failed to read PDF file'));
             reader.readAsArrayBuffer(file);
@@ -332,11 +356,20 @@ return {
         return decodedText;
     }
     // Enhanced OCR fallback with multiple engines
     async function enhancedOCRFallback(pdfData) {
-        const images = await convertPDFToImagesEnhanced(pdfData);
-        let ocrResults = [];
         for (const image of images) {
             // Try multiple OCR approaches
@@ -419,17 +452,29 @@ return {
         ctx.putImageData(imageData, 0, 0);
         return canvas;
     }
     // Enhanced PDF to image conversion
     async function convertPDFToImagesEnhanced(pdfData) {
         const loadingTask = pdfjsLib.getDocument({
-            data: pdfData,
             cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
             cMapPacked: true,
-            standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/'
         });
-        const pdf = await loadingTask.promise;
         const images = [];
         // Process all pages with higher resolution
@@ -602,85 +647,74 @@ return {
             reader.readAsArrayBuffer(file);
         });
     }
-    async function convertPDFToImages(pdfData) {
         return await convertPDFToImagesEnhanced(pdfData);
     }
-async function extractTextFromImage(file) {
         return new Promise(async (resolve, reject) => {
             try {
-                // Apply learned corrections before OCR
-                let trainedWords = {};
-                if (window.ocrLearningDict) {
-                    for (const [word, data] of Object.entries(window.ocrLearningDict)) {
-                        if (data.confirmedCorrect && data.confirmedCorrect !== word) {
-                            trainedWords[word] = data.confirmedCorrect;
-                        }
-                    }
-                }
                 const imageElement = file instanceof HTMLCanvasElement ? file : file;
-                // Enhanced OCR configuration
-                const config = {
-                    logger: m => {
-                        if (m.status === 'recognizing text') {
-                            console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
-                        }
-                    },
-                    preserve_interword_spaces: '1',
-                    tessedit_pageseg_mode: '6', // Assume uniform text block
-                    tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
-                    user_defined_words: Object.keys(trainedWords).join(' '),
-                    tessedit_create_hocr: '1',
-                    load_system_dawg: '1',
-                    load_freq_dawg: '1',
-                    user_words_suffix: 'tur',
-                    user_patterns_suffix: 'tur',
-                    tessedit_ocr_engine_mode: '1', // LSTM OCR engine
-                    tessedit_do_ocr: '1',
-                    tessedit_load_image: '1'
-                };
-                // Try multiple OCR approaches
-                const results = await Promise.allSettled([
-                    // Primary: Turkish + English with enhanced preprocessing
-                    performOCRWithPreprocessing(imageElement, 'tur+eng', config),
-                    // Secondary: Different page segmentation
-                    Tesseract.recognize(imageElement, 'tur+eng', {
-                        ...config,
-                        tessedit_pageseg_mode: '1' // Automatic page segmentation
-                    }),
-                    // Tertiary: Only English if Turkish fails
-                    Tesseract.recognize(imageElement, 'eng', config)
-                ]);
-                // Find and return the best result
-                let bestResult = { text: '', confidence: 0 };
-                results.forEach(result => {
-                    if (result.status === 'fulfilled') {
-                        const text = result.value.text;
-                        const confidence = calculateConfidence(text);
-                        if (text.trim().length > bestResult.text.length ||
-                            (text.trim().length === bestResult.text.length && confidence > bestResult.confidence)) {
-                            bestResult = { text, confidence };
                         }
-                    }
-                });
-                if (bestResult.text) {
-                    // Apply text quality improvements
-                    bestResult.text = decodeTurkishText(bestResult.text);
-                    bestResult.text = improveTextQuality(bestResult.text);
-                    if (outputFormat.value === 'formatted') {
-                        // Create formatted output
-                        const formatted = createFormattedText(bestResult.text);
-                        resolve(formatted);
-                    } else {
-                        resolve(bestResult.text);
-                    }
                 } else {
                     resolve('No text could be extracted from the image.');
                 }
@@ -690,43 +724,432 @@ async function extractTextFromImage(file) {
                 reject(error);
             }
         });
-        // OCR with image preprocessing
-        async function performOCRWithPreprocessing(image, languages, config) {
-            let processedImage = image;
-            if (image instanceof HTMLCanvasElement) {
-                // Apply preprocessing to canvas
-                processedImage = await preprocessImage(image);
             }
-            return await Tesseract.recognize(processedImage, languages, config);
         }
-        // Calculate text confidence score
-        function calculateConfidence(text) {
             if (!text || text.trim().length === 0) return 0;
-            // Score based on Turkish word detection
-            const turkishWords = text.match(/[ğüşıöçĞÜŞİÖÇ]+/g) || [];
-            const wordCount = text.split(/\s+/).length;
-            const turkishRatio = turkishWords.length / wordCount;
-            // Score based on sentence structure
-            const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
-            const avgSentenceLength = sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length;
-            // Combined confidence score
-            return (turkishRatio * 0.5) + (Math.min(avgSentenceLength / 10, 1) * 0.5);
         }
-        // Create formatted text output
-        function createFormattedText(text) {
-            return text
-                .replace(/([.!?])\s+/g, '$1\n\n') // Better paragraph breaks
-                .replace(/\n{3,}/g, '\n\n') // Remove excessive line breaks
-                .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)\s+/g, '$1 ') // Preserve Turkish words
                 .trim();
         }
 function processFormattedOCR(hocr) {
         // Apply learned corrections

             reader.onload = async function(event) {
                 try {
+                    // Create a copy of the ArrayBuffer to avoid detachment issues
+                    const arrayBuffer = event.target.result;
+                    const typedArray = new Uint8Array(arrayBuffer.slice(0));
                     // Enhanced PDF loading with multiple extraction strategies
                     const loadingTask = pdfjsLib.getDocument({
+                        data: typedArray.buffer,
                         cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
                         cMapPacked: true,
                         standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
                         disableAutoFetch: false,
                         disableStream: false
                     });
+const pdf = await loadingTask.promise;
                     let fullText = '';
                     let metadata = await pdf.getMetadata();
                     // Strategy 2: Enhanced Turkish character decoding
                     fullText = decodeTurkishText(fullText);
                     // Strategy 3: If still poor quality, try OCR with preprocessing
                     if (!fullText.trim() || fullText.trim().length < 50) {
                         console.warn('Primary text extraction failed, attempting enhanced OCR...');
+                        // Create a fresh copy for OCR to avoid detachment
+                        const ocrArrayBuffer = arrayBuffer.slice(0);
+                        fullText = await enhancedOCRFallback(ocrArrayBuffer);
                     }
+// Strategy 4: Apply text quality improvements
                     fullText = improveTextQuality(fullText);
                     resolve(fullText);
                 } catch (error) {
                     console.error('PDF extraction error:', error);
+                    // Try a simpler extraction method as fallback
+                    try {
+                        console.warn('Attempting simplified PDF extraction...');
+                        const simpleArray = new Uint8Array(arrayBuffer.slice(0));
+                        const simpleLoadingTask = pdfjsLib.getDocument(simpleArray.buffer);
+                        const simplePdf = await simpleLoadingTask.promise;
+                        let simpleText = '';
+                        for (let i = 1; i <= Math.min(simplePdf.numPages, 5); i++) {
+                            const page = await simplePdf.getPage(i);
+                            const simpleContent = await page.getTextContent();
+                            const pageText = simpleContent.items.map(item => item.str).join(' ');
+                            simpleText += pageText + '\n';
+                        }
+                        if (simpleText.trim()) {
+                            resolve(decodeTurkishText(improveTextQuality(simpleText)));
+                            return;
+                        }
+                    } catch (fallbackError) {
+                        console.error('Fallback extraction also failed:', fallbackError);
+                    }
                     reject(new Error('Failed to extract text from PDF: ' + error.message));
                 }
+};
             reader.onerror = () => reject(new Error('Failed to read PDF file'));
             reader.readAsArrayBuffer(file);
         return decodedText;
     }
     // Enhanced OCR fallback with multiple engines
     async function enhancedOCRFallback(pdfData) {
+        // Ensure we have a valid ArrayBuffer
+        let arrayBuffer;
+        if (pdfData instanceof ArrayBuffer) {
+            arrayBuffer = pdfData;
+        } else if (pdfData instanceof Uint8Array) {
+            arrayBuffer = pdfData.buffer;
+        } else {
+            throw new Error('Invalid PDF data format for OCR fallback');
+        }
+        const images = await convertPDFToImagesEnhanced(arrayBuffer);
+let ocrResults = [];
         for (const image of images) {
             // Try multiple OCR approaches
         ctx.putImageData(imageData, 0, 0);
         return canvas;
     }
     // Enhanced PDF to image conversion
     async function convertPDFToImagesEnhanced(pdfData) {
+        // Ensure we have a fresh copy of the data
+        let data;
+        if (pdfData instanceof ArrayBuffer) {
+            data = new Uint8Array(pdfData.slice(0));
+        } else if (pdfData instanceof Uint8Array) {
+            data = new Uint8Array(pdfData.buffer.slice(0));
+        } else {
+            throw new Error('Invalid PDF data format for image conversion');
+        }
         const loadingTask = pdfjsLib.getDocument({
+            data: data.buffer,
             cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
             cMapPacked: true,
+            standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
+            // Disable worker for image conversion to avoid detachment issues
+            useWorkerFetch: false,
+            isEvalSupported: false,
+            disableWorker: true
         });
+const pdf = await loadingTask.promise;
         const images = [];
         // Process all pages with higher resolution
             reader.readAsArrayBuffer(file);
         });
     }
/**
 * Convert PDF bytes to page images by delegating to
 * convertPDFToImagesEnhanced with a fresh copy of the data.
 *
 * @param {ArrayBuffer|Uint8Array|*} pdfData - Raw PDF bytes. Values of any
 *     other type are forwarded unchanged.
 * @returns {Promise<*>} Whatever convertPDFToImagesEnhanced resolves to.
 */
async function convertPDFToImages(pdfData) {
    if (pdfData instanceof ArrayBuffer) {
        return await convertPDFToImagesEnhanced(pdfData.slice(0));
    }
    if (pdfData instanceof Uint8Array) {
        // TypedArray#slice copies exactly the bytes this view covers. Copying
        // pdfData.buffer instead ignores byteOffset/byteLength and would pass
        // along unrelated bytes when the view is a window into a larger buffer.
        return await convertPDFToImagesEnhanced(pdfData.slice().buffer);
    }
    return await convertPDFToImagesEnhanced(pdfData);
}
+    async function extractTextFromImage(file) {
         return new Promise(async (resolve, reject) => {
             try {
                 const imageElement = file instanceof HTMLCanvasElement ? file : file;
+                // Apply advanced preprocessing
+                const processedImages = await applyAdvancedPreprocessing(imageElement);
+                // Multi-strategy OCR approach
+                const ocrResults = [];
+                for (const processedImage of processedImages) {
+                    const results = await Promise.allSettled([
+                        // Strategy 1: Turkish with best settings
+                        performAdvancedOCR(processedImage, 'tur', {
+                            tessedit_pageseg_mode: '6',
+                            preserve_interword_spaces: '1',
+                            tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
+                            tessedit_ocr_engine_mode: '1',
+                            tessedit_do_ocr: '1',
+                            tessedit_load_image: '1'
+                        }),
+                        // Strategy 2: Turkish+English with auto segmentation
+                        performAdvancedOCR(processedImage, 'tur+eng', {
+                            tessedit_pageseg_mode: '1',
+                            preserve_interword_spaces: '1',
+                            tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
+                            tessedit_ocr_engine_mode: '1'
+                        }),
+                        // Strategy 3: Fully automatic page segmentation without OSD (PSM 3)
+                        performAdvancedOCR(processedImage, 'tur', {
+                            tessedit_pageseg_mode: '3',
+                            preserve_interword_spaces: '1',
+                            tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ'
+                        })
+                    ]);
+                    results.forEach(result => {
+                        if (result.status === 'fulfilled' && result.value.text.trim().length > 10) {
+                            ocrResults.push({
+                                text: result.value.text,
+                                confidence: result.value.confidence || calculateConfidence(result.value.text),
+                                strategy: result.value.strategy
+                            });
                         }
+                    });
+                }
+                // Select best result using advanced scoring
+                const bestResult = selectBestResult(ocrResults);
+                if (bestResult) {
+                    // Apply document structure analysis
+                    const structuredText = await analyzeDocumentStructure(bestResult.text);
+                    // Apply intelligent Turkish text corrections
+                    const correctedText = applyIntelligentTurkishCorrections(structuredText);
+                    resolve(correctedText);
                 } else {
                     resolve('No text could be extracted from the image.');
                 }
                 reject(error);
             }
         });
+        // Apply advanced image preprocessing techniques
/**
 * Build the set of images that OCR is attempted on: the source canvas plus
 * one separately enhanced copy per enhancement type.
 *
 * @param {HTMLCanvasElement|*} imageElement - A canvas, or any value that
 *     imageToCanvas accepts.
 * @returns {Promise<Array>} The base canvas followed by the contrast,
 *     denoise, sharpen and binarize variations, in that order.
 */
async function applyAdvancedPreprocessing(imageElement) {
    const baseCanvas = imageElement instanceof HTMLCanvasElement
        ? imageElement
        : await imageToCanvas(imageElement);

    // applyImageEnhancement writes into the canvas it is given and returns
    // that same canvas. Each variation therefore gets its own copy; otherwise
    // every entry would alias the base canvas and the filters would stack.
    const cloneCanvas = (source) => {
        const copy = document.createElement('canvas');
        copy.width = source.width;
        copy.height = source.height;
        copy.getContext('2d').drawImage(source, 0, 0);
        return copy;
    };

    const processedImages = [baseCanvas];
    for (const type of ['contrast', 'denoise', 'sharpen', 'binarize']) {
        const enhanced = await applyImageEnhancement(cloneCanvas(baseCanvas), type);
        if (enhanced !== null) {
            processedImages.push(enhanced);
        }
    }
    return processedImages;
}
+        // Convert image to canvas
/**
 * Load an image source into a new canvas of the same pixel size.
 *
 * @param {HTMLCanvasElement|*} image - A canvas (loaded through its data URL)
 *     or a value accepted by URL.createObjectURL.
 * @returns {Promise<HTMLCanvasElement>} Canvas with the image drawn at (0, 0).
 *     Rejects when the image fails to load or cannot be drawn.
 */
async function imageToCanvas(image) {
    const fromCanvas = image instanceof HTMLCanvasElement;
    const objectUrl = fromCanvas ? null : URL.createObjectURL(image);
    // Object URLs stay alive until revoked, so release ours once the image
    // has either been drawn or failed to load.
    const releaseUrl = () => {
        if (objectUrl !== null) {
            URL.revokeObjectURL(objectUrl);
        }
    };

    return new Promise((resolve, reject) => {
        const img = new Image();
        img.onload = () => {
            try {
                const canvas = document.createElement('canvas');
                canvas.width = img.width;
                canvas.height = img.height;
                canvas.getContext('2d').drawImage(img, 0, 0);
                resolve(canvas);
            } catch (error) {
                reject(error);
            } finally {
                releaseUrl();
            }
        };
        // Without an error handler the promise would stay pending forever
        // when the source cannot be decoded.
        img.onerror = () => {
            releaseUrl();
            reject(new Error('Failed to load image for canvas conversion'));
        };
        img.src = fromCanvas ? image.toDataURL() : objectUrl;
    });
}
+        // Apply specific image enhancement
/**
 * Apply one named filter to a canvas in place.
 *
 * Supported types: 'contrast' (scale RGB around 128 by 1.5), 'denoise' (pull
 * channels that differ from the pixel's RGB mean by more than 30 onto that
 * mean), 'sharpen' (3x3 kernel, alpha forced to 255) and 'binarize' (fixed
 * luminance threshold of 128). Any other type writes the pixels back
 * unchanged.
 *
 * @param {HTMLCanvasElement} canvas - Canvas whose pixels are rewritten.
 * @param {string} type - Filter name.
 * @returns {Promise<HTMLCanvasElement>} The same canvas that was passed in.
 */
async function applyImageEnhancement(canvas, type) {
    const context = canvas.getContext('2d');
    const frame = context.getImageData(0, 0, canvas.width, canvas.height);
    const pixels = frame.data;

    if (type === 'sharpen') {
        const kernel = [0, -1, 0, -1, 5, -1, 0, -1, 0];
        const kernelSize = Math.round(Math.sqrt(kernel.length));
        const reach = Math.floor(kernelSize / 2);
        const sharpened = context.createImageData(canvas.width, canvas.height);
        const target = sharpened.data;

        for (let row = 0; row < canvas.height; row++) {
            for (let col = 0; col < canvas.width; col++) {
                const sums = [0, 0, 0];
                kernel.forEach((weight, k) => {
                    const srcRow = row + Math.floor(k / kernelSize) - reach;
                    const srcCol = col + (k % kernelSize) - reach;
                    const outside = srcRow < 0 || srcRow >= canvas.height ||
                        srcCol < 0 || srcCol >= canvas.width;
                    // Neighbours beyond the image border contribute nothing.
                    if (outside) return;
                    const from = (srcRow * canvas.width + srcCol) * 4;
                    sums[0] += pixels[from] * weight;
                    sums[1] += pixels[from + 1] * weight;
                    sums[2] += pixels[from + 2] * weight;
                });
                const to = (row * canvas.width + col) * 4;
                target[to] = sums[0];
                target[to + 1] = sums[1];
                target[to + 2] = sums[2];
                target[to + 3] = 255;
            }
        }
        context.putImageData(sharpened, 0, 0);
        return canvas;
    }

    // Filters that rewrite each pixel independently, keyed by type name.
    const perPixelFilters = new Map([
        ['contrast', (at) => {
            const factor = 1.5;
            for (let channel = 0; channel < 3; channel++) {
                pixels[at + channel] = ((pixels[at + channel] - 128) * factor) + 128;
            }
        }],
        ['denoise', (at) => {
            const mean = (pixels[at] + pixels[at + 1] + pixels[at + 2]) / 3;
            const limit = 30;
            for (let channel = 0; channel < 3; channel++) {
                if (Math.abs(pixels[at + channel] - mean) > limit) {
                    pixels[at + channel] = mean;
                }
            }
        }],
        ['binarize', (at) => {
            const luminance = pixels[at] * 0.299 + pixels[at + 1] * 0.587 + pixels[at + 2] * 0.114;
            const level = luminance > 128 ? 255 : 0;
            pixels[at] = level;
            pixels[at + 1] = level;
            pixels[at + 2] = level;
        }],
    ]);

    const filter = perPixelFilters.get(type);
    if (filter !== undefined) {
        for (let at = 0; at < pixels.length; at += 4) {
            filter(at);
        }
    }
    context.putImageData(frame, 0, 0);
    return canvas;
}
+        // Advanced OCR processing
/**
 * Run one Tesseract recognition pass and normalise its outcome.
 *
 * @param {*} image - Image source handed to Tesseract.recognize.
 * @param {string} languages - Language spec, e.g. 'tur' or 'tur+eng'.
 * @param {Object} [config] - Extra entries spread into the options object
 *     passed to Tesseract.recognize; `tessedit_pageseg_mode` is also used to
 *     label the strategy. May be omitted.
 * @returns {Promise<{text: string, confidence: number, strategy: string}>}
 *     The recognised text, or an empty result when recognition throws. This
 *     function never rejects for recognition failures.
 */
async function performAdvancedOCR(image, languages, config) {
    // A missing config used to throw inside the try block *after* a
    // successful recognition, silently discarding the recognised text.
    const settings = config || {};
    const strategy = `OCR_${languages}_${settings.tessedit_pageseg_mode}`;
    try {
        const result = await Tesseract.recognize(image, languages, {
            logger: m => console.log(`OCR (${languages}): ${m.status} - ${Math.round(m.progress * 100)}%`),
            ...settings
        });
        return {
            text: result.data.text,
            confidence: result.data.confidence || 0,
            strategy
        };
    } catch (error) {
        console.error(`OCR strategy failed:`, error);
        // Same shape as the success result so callers can always read
        // `strategy`.
        return { text: '', confidence: 0, strategy };
    }
}
+        // Select best OCR result using advanced scoring
/**
 * Pick the OCR candidate with the highest calculateAdvancedScore value.
 *
 * @param {Array<{text: string, confidence: number}>} results - Candidates.
 * @returns {Object|null} The first candidate holding the top score, or null
 *     when the list is empty.
 */
function selectBestResult(results) {
    if (results.length === 0) return null;

    let winner = null;
    let topScore = -1;
    for (const candidate of results) {
        const candidateScore = calculateAdvancedScore(candidate.text, candidate.confidence);
        // Strict comparison: on a tie the earlier candidate is kept.
        if (candidateScore > topScore) {
            topScore = candidateScore;
            winner = candidate;
        }
    }
    return winner;
}
+        // Calculate advanced scoring for OCR results
/**
 * Score an OCR result so competing recognition passes can be ranked.
 *
 * Starts from the engine confidence and adds bonuses for Turkish-specific
 * letters (up to 40), words containing such letters (up to 20) and average
 * sentence length (up to 20). Texts shorter than 20 characters are halved.
 *
 * @param {string} text - Recognised text.
 * @param {number} baseConfidence - Engine confidence; falsy values count as 0.
 * @returns {number} Score capped at 100; 0 for empty or whitespace-only text.
 */
function calculateAdvancedScore(text, baseConfidence) {
    if (!text || text.trim().length === 0) return 0;

    const turkishLetter = /[ğüşıöçĞÜŞİÖÇ]/;
    let score = baseConfidence || 0;

    // Share of non-whitespace characters that are Turkish-specific letters.
    const turkishChars = (text.match(/[ğüşıöçĞÜŞİÖÇ]/g) || []).length;
    const totalChars = text.replace(/\s/g, '').length;
    const turkishRatio = totalChars > 0 ? turkishChars / totalChars : 0;
    score += turkishRatio * 40;

    // Share of words containing a Turkish-specific letter. Words are matched
    // with Unicode classes because \w is ASCII-only and would split words at
    // every Turkish letter; plain ASCII i/I must not count as Turkish either,
    // or ordinary English text earns the bonus.
    const words = text.match(/[\p{L}\p{N}_]+/gu) || [];
    const turkishWords = words.filter(word => turkishLetter.test(word));
    const wordRatio = words.length > 0 ? turkishWords.length / words.length : 0;
    score += wordRatio * 20;

    // Average words per sentence, saturating at 10 words.
    const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 5);
    const avgSentenceLength = sentences.length > 0
        ? sentences.reduce((sum, s) => sum + s.split(/\s+/).length, 0) / sentences.length
        : 0;
    score += Math.min(avgSentenceLength / 10, 1) * 20;

    // Very short texts are rarely a trustworthy recognition.
    if (text.trim().length < 20) score *= 0.5;

    return Math.min(score, 100);
}
+        // Analyze document structure in the style of ABBYY FineReader
/**
 * Group the non-blank lines of a text into heading, list, table and paragraph
 * sections and hand them to formatStructuredText.
 *
 * Each heading line becomes its own section; consecutive list, table or
 * paragraph lines are collected into one section per run.
 *
 * @param {string} text - Plain text, one logical line per '\n'.
 * @returns {Promise<*>} Whatever formatStructuredText returns for the sections.
 */
async function analyzeDocumentStructure(text) {
    const sections = [];
    let active = { type: 'paragraph', content: [], level: 0 };

    // Store the section being built, unless it never received a line.
    const commitActive = () => {
        if (active.content.length > 0) {
            sections.push(active);
        }
    };

    // Heading wins over list, list over table, everything else is paragraph.
    const classify = (line) => {
        if (isHeading(line)) return 'heading';
        if (isListItem(line)) return 'list';
        if (isTableRow(line)) return 'table';
        return 'paragraph';
    };

    for (const rawLine of text.split('\n')) {
        const line = rawLine.trim();
        if (line.length === 0) continue;

        const kind = classify(line);
        if (kind === 'heading') {
            commitActive();
            active = { type: 'heading', content: [line], level: detectHeadingLevel(line) };
            continue;
        }
        if (active.type !== kind) {
            commitActive();
            active = { type: kind, content: [], level: 0 };
        }
        active.content.push(line);
    }

    commitActive();
    return formatStructuredText(sections);
}
+        // Check if line is a heading
/**
 * Heuristically decide whether a line looks like a heading.
 *
 * Short lines (under 50 characters and at most 8 words) are judged only by
 * the share of words starting with an uppercase letter (must exceed 60%).
 * Longer lines count as headings when they end with ':' or start with a
 * number followed by an uppercase letter.
 *
 * NOTE(review): because short lines return early, the colon and numbering
 * rules are never consulted for them — confirm this is intended.
 *
 * @param {string} line - A single trimmed line of text.
 * @returns {boolean} True when the line is treated as a heading.
 */
function isHeading(line) {
    const tokens = line.split(/\s+/);
    const isShortLine = line.length < 50 && tokens.length <= 8;

    if (isShortLine) {
        let capitalised = 0;
        for (const token of tokens) {
            if (/^[A-ZÇĞİÖŞÜ]/.test(token)) {
                capitalised += 1;
            }
        }
        return capitalised / tokens.length > 0.6;
    }

    return line.endsWith(':') || /^\d+\.?\s+[A-ZÇĞİÖŞÜ]/.test(line);
}
+        // Detect heading level
/**
 * Estimate a heading depth from its numbering prefix or, failing that, its
 * length.
 *
 * @param {string} line - Heading text.
 * @returns {number} 2 for an "N.N " prefix; 1 for an "N " prefix or fewer
 *     than 30 characters; 2 for fewer than 40 characters; otherwise 3.
 */
function detectHeadingLevel(line) {
    const hasSubsectionNumber = /^\d+\.\d+\s+/.test(line);
    if (hasSubsectionNumber) return 2;

    const hasSectionNumber = /^\d+\s+/.test(line);
    if (hasSectionNumber || line.length < 30) return 1;

    return line.length < 40 ? 2 : 3;
}
+        // Check if line is a list item
/**
 * Tell whether a line starts with a list marker followed by whitespace:
 * a bullet (-, * or •), a number with a dot, or a lowercase letter in
 * parentheses.
 *
 * @param {string} line - A single line of text.
 * @returns {boolean} True when the line starts with one of the markers.
 */
function isListItem(line) {
    const markers = [
        /^[-*•]\s+/,
        /^\d+\.\s+/,
        /^\([a-z]\)\s+/,
    ];
    return markers.some((marker) => marker.test(line));
}
+        // Check if line is a table row
/**
 * Tell whether a line looks like a table row: it contains at least two tab
 * characters or at least three pipe characters.
 *
 * @param {string} line - A single line of text.
 * @returns {boolean} True when the line is treated as a table row.
 */
function isTableRow(line) {
    const tabCount = (line.match(/\t/g) || []).length;
    if (tabCount >= 2) return true;

    const pipeCount = (line.match(/\|/g) || []).length;
    return pipeCount >= 3;
}
+        // Format structured text based on output format
/**
 * Render sections with the formatter selected by `outputFormat.value`.
 *
 * @param {Array<{content: string[]}>} sections - Document sections.
 * @returns {*} Output of formatAsMarkdown ('markdown'), formatAsJSON ('json')
 *     or formatAsStructuredText ('formatted'); for any other value, each
 *     section's lines joined by spaces and sections separated by blank lines.
 */
function formatStructuredText(sections) {
    switch (outputFormat.value) {
        case 'markdown':
            return formatAsMarkdown(sections);
        case 'json':
            return formatAsJSON(sections);
        case 'formatted':
            return formatAsStructuredText(sections);
        default:
            return sections
                .map(({ content }) => content.join(' '))
                .join('\n\n');
    }
}
+        // Format as Markdown
/**
 * Render document sections as Markdown.
 *
 * - heading: '#' repeated `level` times (clamped to 1..6) plus the first line.
 * - list: numbered items are kept verbatim; other items get a single '- '
 *   marker, replacing any bullet (-, *, •) already on the line.
 * - table: rows are split on tabs or pipes, cells trimmed, and a header
 *   separator row is inserted after the first row.
 * - paragraph: lines joined with spaces.
 *
 * @param {Array<{type: string, level: number, content: string[]}>} sections
 * @returns {string} Markdown with blocks separated by one blank line.
 */
function formatAsMarkdown(sections) {
    const separatorCell = /^:?-{3,}:?$/;

    // Split a raw row into trimmed cells. Rows written as "| a | b |" yield
    // an empty cell at each end, which is dropped.
    const toCells = (row) => {
        const cells = row.split(/\t+|\|/).map((cell) => cell.trim());
        if (cells.length > 1 && cells[0] === '') cells.shift();
        if (cells.length > 1 && cells[cells.length - 1] === '') cells.pop();
        return cells;
    };
    const toRow = (cells) => `| ${cells.join(' | ')} |`;

    const blocks = [];
    for (const section of sections) {
        switch (section.type) {
            case 'heading': {
                // A level below 1 would emit no '#' and so no heading at all.
                const depth = Math.min(Math.max(section.level || 1, 1), 6);
                blocks.push(`${'#'.repeat(depth)} ${section.content[0]}`);
                break;
            }
            case 'list': {
                // Items still carry their source marker; prefixing '- '
                // blindly produced "- - item" and "- 1. item".
                const items = section.content.map((item) => (
                    /^\d+\.\s+/.test(item) ? item : `- ${item.replace(/^[-*•]\s+/, '')}`
                ));
                blocks.push(items.join('\n'));
                break;
            }
            case 'table': {
                // Drop separator rows already in the source; one is added
                // below, since a Markdown table needs it after the header.
                const rows = section.content
                    .map(toCells)
                    .filter((cells) => !cells.every((cell) => separatorCell.test(cell)));
                if (rows.length === 0) break;
                const [header, ...body] = rows;
                const divider = header.map(() => '---');
                blocks.push([header, divider, ...body].map(toRow).join('\n'));
                break;
            }
            case 'paragraph':
                blocks.push(section.content.join(' '));
                break;
        }
    }
    return blocks.join('\n\n').trim();
}
+        // Format as JSON
+        function formatAsJSON(sections) {
+            const structured = sections.map(section => ({
+                type: section.type,
+                level: section.level,
+                content: section.content
+            }));
+            return JSON.stringify(structured, null, 2);
+        }
+        // Format as structured text
+        function formatAsStructuredText(sections) {
+            let text = '';
+            sections.forEach(section => {
+                switch(section.type) {
+                    case 'heading':
+                        text += '\n' + section.content[0].toUpperCase() + '\n';
+                        text += '='.repeat(section.content[0].length) + '\n\n';
+                        break;
+                    case 'list':
+                        section.content.forEach(item => {
+                            text += '  • ' + item + '\n';
+                        });
+                        text += '\n';
+                        break;
+                    case 'table':
+                        section.content.forEach(row => {
+                            text += row + '\n';
+                        });
+                        text += '\n';
+                        break;
+                    case 'paragraph':
+                        text += section.content.join(' ') + '\n\n';
+                        break;
+                }
+            });
+            return text.trim();
+        }
+        // Apply intelligent Turkish corrections
+        function applyIntelligentTurkishCorrections(text) {
+            // Turkish character corrections based on context
+            const corrections = [
+                // Common OCR mistakes
+                { pattern: /\bc\b/g, replacement: 'ç' },
+                { pattern: /\bC\b/g, replacement: 'Ç' },
+                { pattern: /\bg\b/g, replacement: 'ğ', context: /[aeiou]/i },
+                { pattern: /\bG\b/g, replacement: 'Ğ', context: /[AEIOU]/i },
+                { pattern: /\bi\b/g, replacement: 'ı', context: /[^iİ]/g },
+                { pattern: /\bI\b/g, replacement: 'İ' },
+                { pattern: /\bo\b/g, replacement: 'ö', context: /[aeiou]/i },
+                { pattern: /\bO\b/g, replacement: 'Ö', context: /[AEIOU]/i },
+                { pattern: /\bs\b/g, replacement: 'ş', context: /[aeiou]/i },
+                { pattern: /\bS\b/g, replacement: 'Ş', context: /[AEIOU]/i },
+                { pattern: /\bu\b/g, replacement: 'ü', context: /[aeiou]/i },
+                { pattern: /\bU\b/g, replacement: 'Ü', context: /[AEIOU]/i },
+                // Number and symbol corrections
+                { pattern: /0/g, replacement: 'O', context: /[A-Z]/ },
+                { pattern: /1/g, replacement: 'İ', context: /[A-Z]/ },
+                { pattern: /5/g, replacement: 'S', context: /[A-Z]/ },
+                // Common word corrections
+                { pattern: /\bve\b/gi, replacement: 've' },
+                { pattern: /\bile\b/gi, replacement: 'ile' },
+                { pattern: /\bicin\b/gi, replacement: 'için' },
+                { pattern: /\bsizin\b/gi, replacement: 'sizin' },
+                { pattern: /\bbir\b/gi, replacement: 'bir' },
+                { pattern: /\bbu\b/gi, replacement: 'bu' },
+                { pattern: /\bsu\b/gi, replacement: 'şu' }
+            ];
+            let correctedText = text;
+            corrections.forEach(correction => {
+                correctedText = correctedText.replace(correction.pattern, correction.replacement);
+            });
+            // Fix spacing around punctuation
+            correctedText = correctedText
+                .replace(/\s+([.,!?;:])/g, '$1')
+                .replace(/([.,!?;:])\s*/g, '$1 ')
+                .replace(/\s+/g, ' ')
                 .trim();
+            return correctedText;
         }
 function processFormattedOCR(hocr) {
         // Apply learned corrections