Spaces:
Sleeping
Sleeping
| import { PDFDocument } from 'pdf-lib' | |
| import mammoth from 'mammoth' | |
/**
 * One extracted page of a parsed document.
 */
export type Page = {
  /** 1-based position of the page within the source document. */
  page_num: number
  /** Plain text recovered for this page (may be a bracketed placeholder message). */
  text: string
  /** Image payloads for the page; the parsers in this file always emit an empty list. */
  images: Buffer[]
}
/**
 * Collects the literal strings (`( ... )` operands) found in one PDF text
 * object and decodes their backslash escapes.
 *
 * Handles balanced nested parentheses, the single-character escapes
 * (`\n \r \t \b \f \( \) \\`), octal escapes of one to three digits, and
 * backslash-newline line continuations. A literal that is never closed is
 * dropped and scanning of the segment stops there, which keeps the scan linear.
 *
 * @param segment Text of a single `BT ... ET` object.
 * @returns Decoded literal strings in order of appearance.
 */
function readPdfLiteralStrings(segment: string): string[] {
  const simpleEscapes: Record<string, string> = {
    n: '\n',
    r: '\r',
    t: '\t',
    b: '\b',
    f: '\f',
    '(': '(',
    ')': ')',
    '\\': '\\'
  }
  const literals: string[] = []
  let pos = 0

  while (pos < segment.length) {
    if (segment.charAt(pos) !== '(') {
      pos++
      continue
    }

    pos++ // step past the opening parenthesis
    let depth = 1
    let decoded = ''

    while (pos < segment.length && depth > 0) {
      const ch = segment.charAt(pos)

      if (ch === '\\') {
        // Octal escape: up to three digits, high-order overflow discarded.
        let digits = ''
        while (digits.length < 3) {
          const d = segment.charAt(pos + 1 + digits.length)
          if (d === '' || d < '0' || d > '7') break
          digits += d
        }
        if (digits.length > 0) {
          decoded += String.fromCharCode(parseInt(digits, 8) & 0xff)
          pos += 1 + digits.length
          continue
        }

        const next = segment.charAt(pos + 1)
        if (next === '\r' && segment.charAt(pos + 2) === '\n') {
          pos += 3 // backslash + CRLF is a line continuation: emits nothing
        } else if (next === '\r' || next === '\n') {
          pos += 2 // backslash + single EOL is a line continuation
        } else {
          // Unknown escapes drop the backslash and keep the character.
          decoded += simpleEscapes[next] ?? next
          pos += 2
        }
        continue
      }

      if (ch === '(') {
        depth++
        decoded += ch
      } else if (ch === ')') {
        depth--
        // Only inner (nested) closing parentheses belong to the text.
        if (depth > 0) decoded += ch
      } else {
        decoded += ch
      }
      pos++
    }

    if (depth !== 0) break // unterminated literal: nothing reliable follows
    literals.push(decoded)
  }

  return literals
}

/**
 * Best-effort text extraction from raw PDF bytes without a PDF library.
 *
 * The buffer is decoded as latin1, scanned for `BT ... ET` text objects, and
 * the literal strings inside each object are joined with single spaces. No
 * stream decompression is performed, so only text that appears uncompressed
 * in the file can be recovered.
 *
 * @param buffer Raw bytes of the PDF file.
 * @returns The recovered text, or a bracketed placeholder message when no text
 *          was found or extraction threw.
 */
function extractTextFromPdfBuffer(buffer: Buffer): string {
  try {
    // latin1 maps every byte to exactly one character, so nothing is lost or merged.
    const pdfString = buffer.toString('latin1')

    // Each BT ... ET span is one text object.
    const textObjects = pdfString.match(/BT[\s\S]*?ET/g) ?? []

    const textContent = textObjects
      .map(textObject => readPdfLiteralStrings(textObject).join(' '))
      .join(' ')
      .trim()

    return textContent || `[PDF Content - ${buffer.length} bytes - Text extraction limited]`
  } catch (error) {
    return `[PDF Content - ${buffer.length} bytes - Text extraction failed: ${error}]`
  }
}
/**
 * Converts PDF bytes into per-page text records.
 *
 * Two strategies are used:
 *  1. When the environment variable `PARSER_USE_PDFJS` is `'1'`, `pdfjs-dist`
 *     is loaded dynamically and each page's text items are joined and
 *     whitespace-normalized. Any failure falls through to strategy 2.
 *  2. Otherwise (or on failure) `pdf-lib` supplies the page count and the
 *     lightweight byte scanner supplies one blob of text, which is divided
 *     evenly by character count across the pages. This split is an
 *     approximation, not true page boundaries.
 *
 * This function never rejects: if the fallback itself fails, a single page
 * carrying a bracketed error message is returned.
 *
 * @param buf Raw bytes of the PDF file.
 * @returns One `Page` per detected page (at least one entry).
 */
export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
  console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)

  // Optional heavy parser (guarded by env to avoid OOM locally)
  if (process.env.PARSER_USE_PDFJS === '1') {
    // Held outside the try so the finally block can release it on every path.
    let loadingTask: any
    try {
      const pdfjs: any = await import('pdfjs-dist/legacy/build/pdf')
      if (pdfjs.GlobalWorkerOptions) {
        pdfjs.GlobalWorkerOptions.workerSrc = undefined
      }
      loadingTask = pdfjs.getDocument({
        data: new Uint8Array(buf),
        disableFontFace: true,
        useSystemFonts: true,
        isEvalSupported: false
      })
      const pdf = await loadingTask.promise
      const pageCount: number = pdf.numPages
      console.log(`[PARSER_DEBUG] pdfjs-dist loaded. Pages: ${pageCount}`)

      const pages: Page[] = []
      for (let i = 1; i <= pageCount; i++) {
        const page = await pdf.getPage(i)
        const textContent = await page.getTextContent()
        const text = (textContent.items || [])
          .map((it: any) => (typeof it.str === 'string' ? it.str : ''))
          .join(' ')
          .replace(/\s+/g, ' ')
          .trim()
        pages.push({ page_num: i, text, images: [] })
      }
      console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via pdfjs-dist`)
      return pages
    } catch (err) {
      console.warn('[PARSER_DEBUG] pdfjs-dist extraction failed, falling back to basic extractor:', err)
    } finally {
      // Release the document held by pdfjs whether extraction succeeded or
      // failed; the returned pages are plain strings and do not depend on it.
      if (loadingTask && typeof loadingTask.destroy === 'function') {
        try {
          await loadingTask.destroy()
        } catch (cleanupError) {
          // Best-effort cleanup: a failure here must not mask the real result.
          console.warn('[PARSER_DEBUG] pdfjs-dist cleanup failed:', cleanupError)
        }
      }
    }
  }

  // Fallback: use lightweight extractor and page-count from pdf-lib
  try {
    const pdfDoc = await PDFDocument.load(buf)
    const pageCount = pdfDoc.getPageCount()
    const extractedText = extractTextFromPdfBuffer(buf)

    // The extractor reports "nothing found" / "failed" as a bracketed message
    // with this prefix. Such a message must not be chopped into per-page pieces.
    const isPlaceholder = extractedText.startsWith(
      `[PDF Content - ${buf.length} bytes - Text extraction `
    )

    const pages: Page[] = []
    if (pageCount > 1) {
      if (isPlaceholder) {
        // No real text to distribute: every page carries the whole message.
        for (let i = 0; i < pageCount; i++) {
          pages.push({ page_num: i + 1, text: extractedText, images: [] })
        }
      } else {
        // Approximate page boundaries by giving each page an equal share of characters.
        const textPerPage = Math.ceil(extractedText.length / pageCount)
        for (let i = 0; i < pageCount; i++) {
          const start = i * textPerPage
          const end = Math.min((i + 1) * textPerPage, extractedText.length)
          const pageText = extractedText.slice(start, end).trim() || `[Page ${i + 1} - Content]`
          pages.push({ page_num: i + 1, text: pageText, images: [] })
        }
      }
    } else {
      pages.push({ page_num: 1, text: extractedText || `[PDF Content - ${buf.length} bytes]`, images: [] })
    }
    console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via fallback`)
    return pages
  } catch (error) {
    console.error('[PARSER_DEBUG] PDF parsing error (fallback):', error)
    return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
  }
}
/**
 * Extracts the raw text of a DOCX file via `mammoth` and returns it as a
 * single page.
 *
 * Never rejects: when extraction throws, the single page carries a bracketed
 * error message instead of document text.
 *
 * @param buf Raw bytes of the DOCX file.
 * @returns A one-element array holding page 1.
 */
export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
  // Both outcomes produce the same one-page shape; only the text differs.
  const asSinglePage = (body: string): Page[] => [{ page_num: 1, text: body, images: [] }]

  try {
    const extraction = await mammoth.extractRawText({ buffer: buf })
    const rawText = extraction.value || ''
    console.log(`[PARSER_DEBUG] DOCX extracted ${rawText.length} characters`)
    return asSinglePage(rawText)
  } catch (failure) {
    console.error('[PARSER_DEBUG] DOCX parsing error:', failure)
    return asSinglePage(`[DOCX Parse Error: ${failure}]`)
  }
}
/**
 * Guesses a MIME type from a filename's extension (case-insensitive).
 *
 * @param filename Name or path of the file; only its ending is inspected.
 * @returns The PDF or DOCX MIME type for `.pdf` / `.docx`, otherwise
 *          `application/octet-stream`.
 */
export function inferMime(filename: string): string {
  // Checked in order; the first matching suffix wins.
  const suffixToMime: ReadonlyArray<readonly [string, string]> = [
    ['.pdf', 'application/pdf'],
    ['.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
  ]

  const normalized = filename.toLowerCase()
  const hit = suffixToMime.find(([suffix]) => normalized.endsWith(suffix))
  return hit ? hit[1] : 'application/octet-stream'
}
/**
 * Entry point: picks a parser from the filename's extension and returns the
 * document's pages.
 *
 * @param filename Name of the uploaded file; used only to infer the type.
 * @param file Raw bytes of the file.
 * @returns The pages produced by the PDF or DOCX parser.
 * @throws Error (as a rejected promise) when the extension is neither `.pdf`
 *         nor `.docx`.
 */
export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
  const detectedMime = inferMime(filename)
  console.log(`[PARSER_DEBUG] Processing ${filename} as ${detectedMime}`)

  switch (detectedMime) {
    case 'application/pdf':
      return parsePdfBytes(file)
    case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
      return parseDocxBytes(file)
    default:
      throw new Error(`Unsupported file type: ${filename}`)
  }
}