import { PDFDocument } from 'pdf-lib'
import mammoth from 'mammoth'

/** One extracted page: 1-based page number, its text, and any images (always empty in this module). */
export type Page = { page_num: number; text: string; images: Buffer[] }

/** Matches a `BT ... ET` text object in the raw PDF byte stream (non-greedy). */
const PDF_TEXT_OBJECT = /BT[\s\S]*?ET/g

/**
 * Matches a parenthesised PDF literal string. A backslash always consumes the
 * following character, so an escaped `\)` no longer terminates the literal early.
 */
const PDF_LITERAL_STRING = /\((?:\\[\s\S]|[^\\)])*\)/g

/** Matches one escape sequence inside a literal: line continuation, 1-3 octal digits, or a single char. */
const PDF_ESCAPE_SEQUENCE = /\\(\r\n|\r|\n|[0-7]{1,3}|[\s\S])/g

/** Named single-character escapes defined for PDF literal strings. */
const PDF_NAMED_ESCAPES: Record<string, string> = {
  n: '\n',
  r: '\r',
  t: '\t',
  b: '\b',
  f: '\f'
}

/**
 * Decodes the escape sequences inside the body of a PDF literal string
 * (the text between the outer parentheses).
 *
 * @param body - Literal contents without the surrounding `(` and `)`.
 * @returns The body with `\n`, `\(`, `\\`, octal codes, etc. resolved.
 */
function decodePdfLiteral(body: string): string {
  return body.replace(PDF_ESCAPE_SEQUENCE, (_whole: string, esc: string) => {
    // Backslash followed by an end-of-line is a line continuation: drop both.
    if (esc === '\r\n' || esc === '\r' || esc === '\n') return ''
    // Octal character code; high-order overflow is ignored (single byte).
    if (/^[0-7]+$/.test(esc)) return String.fromCharCode(parseInt(esc, 8) & 0xff)
    // Named escape, or any other character taken literally (covers `\(`, `\)`, `\\`).
    return PDF_NAMED_ESCAPES[esc] ?? esc
  })
}

/**
 * Best-effort text extraction by scanning the raw bytes for `BT ... ET` text
 * objects and collecting the literal strings inside them. Only content that
 * appears verbatim in the byte stream is found.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The extracted text, or an empty string when nothing was found.
 */
function extractRawPdfText(buffer: Buffer): string {
  // latin1 maps every byte to exactly one character, so no bytes are lost.
  const pdfString = buffer.toString('latin1')
  const textObjects = pdfString.match(PDF_TEXT_OBJECT) || []
  return textObjects
    .map(textObject =>
      (textObject.match(PDF_LITERAL_STRING) || [])
        .map(literal => decodePdfLiteral(literal.slice(1, -1))) // strip outer parentheses
        .join(' ')
    )
    .join(' ')
    .trim()
}

/**
 * Simple text extraction from a PDF using basic parsing.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The extracted text, or a bracketed placeholder describing why no
 *   text is available. Never returns an empty string and never throws.
 */
function extractTextFromPdfBuffer(buffer: Buffer): string {
  try {
    return extractRawPdfText(buffer) || `[PDF Content - ${buffer.length} bytes - Text extraction limited]`
  } catch (error) {
    return `[PDF Content - ${buffer.length} bytes - Text extraction failed: ${error}]`
  }
}

/**
 * Distributes text evenly (by character count) over `pageCount` pages. This is
 * a heuristic: the basic extractor has no knowledge of real page boundaries.
 *
 * @param text - Extracted text; may be empty.
 * @param pageCount - Number of pages to produce (expected to be >= 1).
 * @returns One entry per page; pages that receive no text get `[Page n - Content]`.
 */
function splitTextAcrossPages(text: string, pageCount: number): Page[] {
  const charsPerPage = Math.ceil(text.length / pageCount)
  return Array.from({ length: pageCount }, (_unused, index): Page => {
    // `slice` clamps the end index to the string length.
    const pageText = text.slice(index * charsPerPage, (index + 1) * charsPerPage).trim()
    return { page_num: index + 1, text: pageText || `[Page ${index + 1} - Content]`, images: [] }
  })
}

/**
 * Extracts per-page text with pdfjs-dist. The library is imported lazily so it
 * is only loaded when this path is actually used.
 *
 * @param buf - Raw PDF bytes.
 * @returns One entry per PDF page with whitespace-normalised text.
 * @throws Whatever pdfjs-dist throws while importing, loading or reading the document.
 */
async function parsePdfWithPdfjs(buf: Buffer): Promise<Page[]> {
  // pdfjs-dist is consumed untyped here, so `any` is scoped to this helper.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const pdfjs: any = await import('pdfjs-dist/legacy/build/pdf')
  if (pdfjs.GlobalWorkerOptions) {
    pdfjs.GlobalWorkerOptions.workerSrc = undefined
  }
  const loadingTask = pdfjs.getDocument({
    data: new Uint8Array(buf),
    disableFontFace: true,
    useSystemFonts: true,
    isEvalSupported: false
  })
  try {
    const pdf = await loadingTask.promise
    const pageCount: number = pdf.numPages
    console.log(`[PARSER_DEBUG] pdfjs-dist loaded. Pages: ${pageCount}`)
    const pages: Page[] = []
    // Pages are read one at a time rather than in parallel.
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
      const page = await pdf.getPage(pageNumber)
      const textContent = await page.getTextContent()
      const text = (textContent.items || [])
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        .map((item: any) => (typeof item.str === 'string' ? item.str : ''))
        .join(' ')
        .replace(/\s+/g, ' ')
        .trim()
      pages.push({ page_num: pageNumber, text, images: [] })
    }
    return pages
  } finally {
    // Release the document held by the loading task. Guarded and best-effort so
    // that cleanup can never replace a successful result with an error.
    if (typeof loadingTask.destroy === 'function') {
      try {
        await loadingTask.destroy()
      } catch {
        // Ignore cleanup failures.
      }
    }
  }
}

/**
 * Parses a PDF into pages.
 *
 * When `PARSER_USE_PDFJS` is `'1'`, pdfjs-dist is tried first. Otherwise, or
 * when pdfjs-dist fails, the page count comes from pdf-lib and the text from
 * the basic extractor, split evenly across the pages.
 *
 * @param buf - Raw PDF bytes.
 * @returns The parsed pages. Errors are reported as a single placeholder page
 *   rather than thrown.
 */
export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
  console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)

  // Optional heavy parser (guarded by env to avoid OOM locally)
  if (process.env.PARSER_USE_PDFJS === '1') {
    try {
      const pages = await parsePdfWithPdfjs(buf)
      console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via pdfjs-dist`)
      return pages
    } catch (err) {
      console.warn('[PARSER_DEBUG] pdfjs-dist extraction failed, falling back to basic extractor:', err)
    }
  }

  // Fallback: use lightweight extractor and page-count from pdf-lib
  try {
    const pdfDoc = await PDFDocument.load(buf)
    const pageCount = pdfDoc.getPageCount()
    let pages: Page[]
    if (pageCount > 1) {
      // Split only genuinely extracted text. Slicing the "no text" placeholder
      // by character count would scatter fragments of it across the pages.
      let rawText = ''
      try {
        rawText = extractRawPdfText(buf)
      } catch (error) {
        console.warn('[PARSER_DEBUG] Basic text extraction failed:', error)
      }
      pages = splitTextAcrossPages(rawText, pageCount)
    } else {
      pages = [{ page_num: 1, text: extractTextFromPdfBuffer(buf), images: [] }]
    }
    console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via fallback`)
    return pages
  } catch (error) {
    console.error('[PARSER_DEBUG] PDF parsing error (fallback):', error)
    return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
  }
}

/**
 * Extracts the raw text of a DOCX file with mammoth.
 *
 * @param buf - Raw DOCX bytes.
 * @returns A single page holding all the text, or a single page holding an
 *   error placeholder when extraction fails. Never throws.
 */
export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
  try {
    const { value } = await mammoth.extractRawText({ buffer: buf })
    const text = value || ''
    console.log(`[PARSER_DEBUG] DOCX extracted ${text.length} characters`)
    return [{ page_num: 1, text, images: [] }]
  } catch (error) {
    console.error('[PARSER_DEBUG] DOCX parsing error:', error)
    return [{ page_num: 1, text: `[DOCX Parse Error: ${error}]`, images: [] }]
  }
}

/**
 * Infers a MIME type from the file extension (case-insensitive).
 *
 * @param filename - File name or path.
 * @returns The PDF or DOCX MIME type, or `application/octet-stream` for anything else.
 */
export function inferMime(filename: string): string {
  const lower = filename.toLowerCase()
  if (lower.endsWith('.pdf')) return 'application/pdf'
  if (lower.endsWith('.docx')) return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  return 'application/octet-stream'
}

/**
 * Extracts pages from a supported document, dispatching on the MIME type
 * inferred from the file name.
 *
 * @param filename - Name used to infer the document type.
 * @param file - Raw file bytes.
 * @returns The extracted pages.
 * @throws Error when the file name maps to neither PDF nor DOCX.
 */
export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
  const mime = inferMime(filename)
  console.log(`[PARSER_DEBUG] Processing ${filename} as ${mime}`)
  if (mime === 'application/pdf') return parsePdfBytes(file)
  if (mime === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') return parseDocxBytes(file)
  throw new Error(`Unsupported file type: ${filename}`)
}