File size: 4,757 Bytes
318a549
ec9f00b
 
e36ae95
ec9f00b
d6a2f37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e36ae95
dcc0243
 
55180c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09431f4
55180c4
 
 
 
 
 
 
 
 
09431f4
55180c4
 
 
 
 
 
09431f4
 
 
 
dcc0243
318a549
 
d6a2f37
318a549
d6a2f37
 
318a549
 
d6a2f37
 
09431f4
318a549
d6a2f37
09431f4
318a549
09431f4
50da96f
dcc0243
09431f4
50da96f
dcc0243
ec9f00b
 
e36ae95
50da96f
 
 
 
 
 
 
 
 
ec9f00b
 
 
 
 
 
 
 
 
 
e36ae95
ec9f00b
50da96f
 
ec9f00b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import { PDFDocument } from 'pdf-lib'
import mammoth from 'mammoth'

// One parsed "page" of a source document: 1-based page number, extracted plain
// text, and embedded images (currently always [] — no parser in this file
// populates images yet).
export type Page = { page_num: number; text: string; images: Buffer[] }

// Simple text extraction from PDF using basic parsing.
//
// Best-effort: scans the raw bytes for BT ... ET text objects and pulls out
// parenthesized string literals. Works only for uncompressed text streams;
// compressed/encoded PDFs fall through to the "[PDF Content ...]" placeholder.
//
// Fixes over the previous version:
//  - the inner match variable no longer shadows the outer `textMatches`
//  - PDF string literals may contain escaped parens (`\(`, `\)`), so the
//    literal regex now respects backslash escapes instead of stopping at the
//    first `)`, and common escape sequences are decoded.
function extractTextFromPdfBuffer(buffer: Buffer): string {
  try {
    // latin1 is a lossless byte<->char mapping, safe for scanning raw PDF bytes
    const pdfString = buffer.toString('latin1')

    // Text-showing operators live between BT ... ET markers
    const textObjects = pdfString.match(/BT[\s\S]*?ET/g) || []
    const textContent = textObjects
      .map(textObject => {
        // Literal strings: ( ... ) where `\)` must not end the match
        const literals = textObject.match(/\((?:\\.|[^\\)])*\)/g) || []
        return literals.map(decodePdfStringLiteral).join(' ')
      })
      .join(' ')
      .trim()

    return textContent || `[PDF Content - ${buffer.length} bytes - Text extraction limited]`
  } catch (error) {
    return `[PDF Content - ${buffer.length} bytes - Text extraction failed: ${error}]`
  }
}

// Strip the enclosing parentheses from a PDF literal string and decode the
// common backslash escapes (PDF 1.7 spec, Table 3: \n \r \t \b \f \( \) \\).
function decodePdfStringLiteral(literal: string): string {
  return literal.slice(1, -1).replace(/\\([nrtbf()\\])/g, (_match, esc: string) => {
    switch (esc) {
      case 'n': return '\n'
      case 'r': return '\r'
      case 't': return '\t'
      case 'b': return '\b'
      case 'f': return '\f'
      default: return esc // \( \) \\ -> the character itself
    }
  })
}

/**
 * Parse a PDF buffer into per-page text.
 *
 * Two strategies:
 *  1. When PARSER_USE_PDFJS=1, use pdfjs-dist for real per-page extraction
 *     (heavier; gated by env to avoid OOM locally).
 *  2. Otherwise (or on pdfjs failure), fall back to the lightweight regex
 *     extractor plus pdf-lib for the page count, splitting the extracted text
 *     evenly across pages.
 *
 * Never rejects: parse failures yield a single placeholder page.
 */
export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
  console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)

  // Optional heavy parser (guarded by env to avoid OOM locally)
  if (process.env.PARSER_USE_PDFJS === '1') {
    try {
      const pdfjs: any = await import('pdfjs-dist/legacy/build/pdf')
      // Run without a worker thread in this Node context
      if (pdfjs.GlobalWorkerOptions) {
        pdfjs.GlobalWorkerOptions.workerSrc = undefined
      }
      const doc = await pdfjs.getDocument({
        data: new Uint8Array(buf),
        disableFontFace: true,
        useSystemFonts: true,
        isEvalSupported: false
      }).promise
      const total: number = doc.numPages
      console.log(`[PARSER_DEBUG] pdfjs-dist loaded. Pages: ${total}`)

      const result: Page[] = []
      for (let pageNum = 1; pageNum <= total; pageNum++) {
        const page = await doc.getPage(pageNum)
        const content = await page.getTextContent()
        // Join item fragments, then collapse whitespace runs
        const text = (content.items || [])
          .map((item: any) => (typeof item.str === 'string' ? item.str : ''))
          .join(' ')
          .replace(/\s+/g, ' ')
          .trim()
        result.push({ page_num: pageNum, text, images: [] })
      }
      console.log(`[PARSER_DEBUG] Parsed PDF with ${result.length} pages via pdfjs-dist`)
      return result
    } catch (err) {
      console.warn('[PARSER_DEBUG] pdfjs-dist extraction failed, falling back to basic extractor:', err)
    }
  }

  // Fallback: use lightweight extractor and page-count from pdf-lib
  try {
    const doc = await PDFDocument.load(buf)
    const count = doc.getPageCount()
    const raw = extractTextFromPdfBuffer(buf)

    let result: Page[]
    if (count > 1) {
      // Naive split: distribute the flat text evenly over the page count
      const perPage = Math.ceil(raw.length / count)
      result = Array.from({ length: count }, (_, idx): Page => {
        const begin = idx * perPage
        const finish = Math.min(begin + perPage, raw.length)
        const text = raw.slice(begin, finish).trim() || `[Page ${idx + 1} - Content]`
        return { page_num: idx + 1, text, images: [] }
      })
    } else {
      result = [{ page_num: 1, text: raw || `[PDF Content - ${buf.length} bytes]`, images: [] }]
    }
    console.log(`[PARSER_DEBUG] Parsed PDF with ${result.length} pages via fallback`)
    return result
  } catch (error) {
    console.error('[PARSER_DEBUG] PDF parsing error (fallback):', error)
    return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
  }
}

/**
 * Extract the raw text of a DOCX buffer via mammoth as one pseudo-page.
 * Never rejects: on failure the single page carries an error placeholder.
 */
export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
  // DOCX has no page boundaries in the XML, so everything lands on page 1
  const asSinglePage = (text: string): Page[] => [{ page_num: 1, text, images: [] }]
  try {
    const extraction = await mammoth.extractRawText({ buffer: buf })
    const text = extraction.value || ''
    console.log(`[PARSER_DEBUG] DOCX extracted ${text.length} characters`)
    return asSinglePage(text)
  } catch (error) {
    console.error('[PARSER_DEBUG] DOCX parsing error:', error)
    return asSinglePage(`[DOCX Parse Error: ${error}]`)
  }
}

/**
 * Infer a MIME type from a filename extension (case-insensitive).
 * Only .pdf and .docx are recognized; anything else is octet-stream.
 */
export function inferMime(filename: string): string {
  const mimeByExtension: Record<string, string> = {
    '.pdf': 'application/pdf',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  }
  const lower = filename.toLowerCase()
  for (const [ext, mime] of Object.entries(mimeByExtension)) {
    if (lower.endsWith(ext)) return mime
  }
  return 'application/octet-stream'
}

/**
 * Route a file buffer to the matching parser based on its filename extension.
 *
 * @param filename used only to infer the MIME type (see inferMime)
 * @param file raw file bytes
 * @returns per-page extracted content
 * @throws Error when the extension is neither .pdf nor .docx
 */
export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
  const mime = inferMime(filename)
  // Bug fix: '$(unknown)' was literal text in these messages ($() is not
  // template interpolation); report the actual filename instead.
  console.log(`[PARSER_DEBUG] Processing ${filename} as ${mime}`)

  if (mime === 'application/pdf') return parsePdfBytes(file)
  if (mime === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') return parseDocxBytes(file)
  throw new Error(`Unsupported file type: ${filename}`)
}