import { PDFDocument } from 'pdf-lib'
import mammoth from 'mammoth'
export type Page = { page_num: number; text: string; images: Buffer[] }
// Simple text extraction from PDF using basic parsing
function extractTextFromPdfBuffer(buffer: Buffer): string {
  try {
    // Convert the buffer to a string and look for text content
    const pdfString = buffer.toString('latin1')
    // Look for text streams (BT ... ET blocks) in the PDF
    const textMatches = pdfString.match(/BT[\s\S]*?ET/g) || []
    const textContent = textMatches
      .map(block => {
        // Extract the parenthesised string literals from each text object
        const stringMatches = block.match(/\([^)]*\)/g) || []
        return stringMatches
          .map(t => t.slice(1, -1)) // Remove the surrounding parentheses
          .join(' ')
      })
      .join(' ')
      .trim()
    return textContent || `[PDF Content - ${buffer.length} bytes - Text extraction limited]`
  } catch (error) {
    return `[PDF Content - ${buffer.length} bytes - Text extraction failed: ${error}]`
  }
}
// Parse a PDF buffer into per-page text (images are not extracted here)
export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
  console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)
  // Optional heavy parser (guarded by env to avoid OOM locally)
  if (process.env.PARSER_USE_PDFJS === '1') {
    try {
      const pdfjs: any = await import('pdfjs-dist/legacy/build/pdf')
      if (pdfjs.GlobalWorkerOptions) {
        // Run without a worker thread in Node
        pdfjs.GlobalWorkerOptions.workerSrc = undefined
      }
      const loadingTask = pdfjs.getDocument({
        data: new Uint8Array(buf),
        disableFontFace: true,
        useSystemFonts: true,
        isEvalSupported: false
      })
      const pdf = await loadingTask.promise
      const pageCount: number = pdf.numPages
      console.log(`[PARSER_DEBUG] pdfjs-dist loaded. Pages: ${pageCount}`)
      const pages: Page[] = []
      for (let i = 1; i <= pageCount; i++) {
        const page = await pdf.getPage(i)
        const textContent = await page.getTextContent()
        const text = (textContent.items || [])
          .map((it: any) => (typeof it.str === 'string' ? it.str : ''))
          .join(' ')
          .replace(/\s+/g, ' ')
          .trim()
        pages.push({ page_num: i, text, images: [] })
      }
      console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via pdfjs-dist`)
      return pages
    } catch (err) {
      console.warn('[PARSER_DEBUG] pdfjs-dist extraction failed, falling back to basic extractor:', err)
    }
  }
  // Fallback: use the lightweight extractor and the page count from pdf-lib
  try {
    const pdfDoc = await PDFDocument.load(buf)
    const pageCount = pdfDoc.getPageCount()
    const extractedText = extractTextFromPdfBuffer(buf)
    const pages: Page[] = []
    if (pageCount > 1) {
      // Split the extracted text evenly across pages (approximate; pdf-lib
      // provides no per-page text mapping)
      const textPerPage = Math.ceil(extractedText.length / pageCount)
      for (let i = 0; i < pageCount; i++) {
        const start = i * textPerPage
        const end = Math.min((i + 1) * textPerPage, extractedText.length)
        const pageText = extractedText.slice(start, end).trim() || `[Page ${i + 1} - Content]`
        pages.push({ page_num: i + 1, text: pageText, images: [] })
      }
    } else {
      pages.push({ page_num: 1, text: extractedText || `[PDF Content - ${buf.length} bytes]`, images: [] })
    }
    console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via fallback`)
    return pages
  } catch (error) {
    console.error('[PARSER_DEBUG] PDF parsing error (fallback):', error)
    return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
  }
}
// Extract raw text from a DOCX buffer; mammoth returns a single text block,
// so the result is reported as one page
export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
  try {
    const { value } = await mammoth.extractRawText({ buffer: buf })
    const text = value || ''
    console.log(`[PARSER_DEBUG] DOCX extracted ${text.length} characters`)
    return [{ page_num: 1, text, images: [] }]
  } catch (error) {
    console.error('[PARSER_DEBUG] DOCX parsing error:', error)
    return [{ page_num: 1, text: `[DOCX Parse Error: ${error}]`, images: [] }]
  }
}
export function inferMime(filename: string): string {
  const lower = filename.toLowerCase()
  if (lower.endsWith('.pdf')) return 'application/pdf'
  if (lower.endsWith('.docx'))
    return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  return 'application/octet-stream'
}
export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
  const mime = inferMime(filename)
  console.log(`[PARSER_DEBUG] Processing ${filename} as ${mime}`)
  if (mime === 'application/pdf') return parsePdfBytes(file)
  if (mime === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') return parseDocxBytes(file)
  throw new Error(`Unsupported file type: ${filename}`)
}
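// Usage sketch (an assumption, not part of the original module): call
// extractPages with a filename and the file's bytes, e.g. from a route
// handler or a script. The './parser' import path and 'example.pdf' file
// are hypothetical placeholders.
//
//   import { readFile } from 'fs/promises'
//   import { extractPages } from './parser'
//
//   const buf = await readFile('example.pdf')
//   const pages = await extractPages('example.pdf', buf)
//   for (const { page_num, text } of pages) {
//     console.log(`Page ${page_num}: ${text.slice(0, 80)}`)
//   }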