Spaces:

BinKhoaLe1812
/

EdSummariser

Sleeping

App Files Files Community

EdSummariser / ingestion_js /lib /parser.ts

LiamKhoaLe

Upd ingest js local loadup #1

55180c4 about 1 month ago

raw

history blame contribute delete

4.76 kB

	import { PDFDocument } from 'pdf-lib'
	import mammoth from 'mammoth'

	/**
	 * One unit of extracted document content, as produced by the parsers in this
	 * module.
	 */
	export type Page = {
	  /** Page index; the parsers in this module number pages starting at 1. */
	  page_num: number
	  /** Plain text for the page, or a bracketed placeholder when extraction fails. */
	  text: string
	  /** Image payloads for the page; the parsers in this module always leave it empty. */
	  images: Buffer[]
	}

	/**
	 * Decodes the escape sequences inside a PDF literal string body (the text
	 * between the outer parentheses).
	 *
	 * Handles `\n`, `\r`, `\t`, `\b`, `\f`, 1-3 digit octal codes, and the
	 * backslash + end-of-line continuation. For any other escaped character the
	 * backslash is dropped and the character kept, which also covers `\(`, `\)`
	 * and `\\`.
	 *
	 * @param raw - Literal string content without the enclosing parentheses.
	 * @returns The content with escape sequences resolved.
	 */
	function decodePdfLiteralString(raw: string): string {
	  return raw.replace(/\\(\r\n|[0-7]{1,3}|[\s\S])/g, (_match: string, esc: string) => {
	    switch (esc) {
	      case 'n':
	        return '\n'
	      case 'r':
	        return '\r'
	      case 't':
	        return '\t'
	      case 'b':
	        return '\b'
	      case 'f':
	        return '\f'
	      case '\n':
	      case '\r':
	      case '\r\n':
	        // Backslash followed by an end-of-line is a line continuation.
	        return ''
	      default:
	        if (/^[0-7]{1,3}$/.test(esc)) {
	          // Octal character code; keep only the low byte on overflow.
	          return String.fromCharCode(parseInt(esc, 8) & 0xff)
	        }
	        return esc
	    }
	  })
	}

	/**
	 * Best-effort text extraction from raw PDF bytes without a real PDF parser.
	 *
	 * The buffer is read as latin1, every `BT ... ET` span is located, and the
	 * parenthesised literal strings inside those spans are collected and joined
	 * with single spaces. Only text that appears verbatim in the byte stream is
	 * found; anything stored in another form yields nothing.
	 *
	 * @param buffer - Raw PDF file contents.
	 * @returns The extracted text, or a bracketed placeholder string when no text
	 *   was found or extraction threw.
	 */
	function extractTextFromPdfBuffer(buffer: Buffer): string {
	  try {
	    // latin1 maps every byte to one character, so no bytes are lost or merged.
	    const pdfString = buffer.toString('latin1')

	    // Text objects are delimited by the BT / ET operators.
	    const textObjects = pdfString.match(/BT[\s\S]*?ET/g) || []
	    const textContent = textObjects
	      .map(textObject => {
	        // Literal strings are "(...)"; a backslash escapes the next character,
	        // so "\)" must not be treated as the end of the string.
	        const literals = textObject.match(/\((?:\\[\s\S]|[^\\)])*\)/g) || []
	        return literals
	          .map(literal => decodePdfLiteralString(literal.slice(1, -1))) // strip outer parentheses
	          .join(' ')
	      })
	      .join(' ')
	      .trim()

	    return textContent || `[PDF Content - ${buffer.length} bytes - Text extraction limited]`
	  } catch (error) {
	    return `[PDF Content - ${buffer.length} bytes - Text extraction failed: ${error}]`
	  }
	}

	/**
	 * Parses PDF bytes into a list of pages.
	 *
	 * Two strategies are used:
	 * 1. When the `PARSER_USE_PDFJS` environment variable is `'1'`, `pdfjs-dist` is
	 *    loaded dynamically and each page's text items are joined into one
	 *    whitespace-normalised string.
	 * 2. Otherwise, or if pdf.js fails, `pdf-lib` supplies the page count and the
	 *    lightweight `extractTextFromPdfBuffer` supplies the text, which is cut
	 *    into equal-length character slices, one per page.
	 *
	 * This function does not throw: a failure in the fallback path is reported as
	 * a single page whose text is a bracketed error placeholder.
	 *
	 * @param buf - Raw PDF file contents.
	 * @returns One `Page` per PDF page (at least one entry), always with empty `images`.
	 */
	export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
	  console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)

	  // Optional heavy parser (guarded by env to avoid OOM locally)
	  if (process.env.PARSER_USE_PDFJS === '1') {
	    // pdfjs-dist is loaded dynamically and left untyped, so the task is `any`.
	    let loadingTask: any
	    try {
	      const pdfjs: any = await import('pdfjs-dist/legacy/build/pdf')
	      if (pdfjs.GlobalWorkerOptions) {
	        pdfjs.GlobalWorkerOptions.workerSrc = undefined
	      }
	      loadingTask = pdfjs.getDocument({
	        data: new Uint8Array(buf),
	        disableFontFace: true,
	        useSystemFonts: true,
	        isEvalSupported: false
	      })
	      const pdf = await loadingTask.promise
	      const pageCount: number = pdf.numPages
	      console.log(`[PARSER_DEBUG] pdfjs-dist loaded. Pages: ${pageCount}`)

	      const pages: Page[] = []
	      // pdf.js page numbers are 1-based.
	      for (let i = 1; i <= pageCount; i++) {
	        const page = await pdf.getPage(i)
	        const textContent = await page.getTextContent()
	        const text = (textContent.items || [])
	          .map((it: any) => (typeof it.str === 'string' ? it.str : ''))
	          .join(' ')
	          .replace(/\s+/g, ' ')
	          .trim()

	        pages.push({ page_num: i, text, images: [] })
	      }
	      console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via pdfjs-dist`)
	      return pages
	    } catch (err) {
	      console.warn('[PARSER_DEBUG] pdfjs-dist extraction failed, falling back to basic extractor:', err)
	    } finally {
	      // Release the loaded document on success and on failure. A cleanup
	      // error must not replace the parse result or block the fallback.
	      if (loadingTask && typeof loadingTask.destroy === 'function') {
	        try {
	          await loadingTask.destroy()
	        } catch (destroyErr) {
	          console.warn('[PARSER_DEBUG] pdfjs-dist cleanup failed:', destroyErr)
	        }
	      }
	    }
	  }

	  // Fallback: use lightweight extractor and page-count from pdf-lib
	  try {
	    const pdfDoc = await PDFDocument.load(buf)
	    const pageCount = pdfDoc.getPageCount()
	    const extractedText = extractTextFromPdfBuffer(buf)
	    const pages: Page[] = []
	    if (pageCount > 1) {
	      // NOTE(review): the split is by character count, not real page
	      // boundaries, and it slices whatever string the extractor returned,
	      // including a placeholder message.
	      const textPerPage = Math.ceil(extractedText.length / pageCount)
	      for (let i = 0; i < pageCount; i++) {
	        const start = i * textPerPage
	        const end = Math.min((i + 1) * textPerPage, extractedText.length)
	        const pageText = extractedText.slice(start, end).trim() || `[Page ${i + 1} - Content]`
	        pages.push({ page_num: i + 1, text: pageText, images: [] })
	      }
	    } else {
	      pages.push({ page_num: 1, text: extractedText || `[PDF Content - ${buf.length} bytes]`, images: [] })
	    }
	    console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via fallback`)
	    return pages
	  } catch (error) {
	    console.error('[PARSER_DEBUG] PDF parsing error (fallback):', error)
	    return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
	  }
	}

	/**
	 * Extracts the raw text of a DOCX file with mammoth and returns it as a
	 * single page.
	 *
	 * This function does not throw: if mammoth fails, the error is logged and a
	 * single page containing a bracketed error placeholder is returned.
	 *
	 * @param buf - Raw DOCX file contents.
	 * @returns An array with exactly one `Page` (`page_num` 1, empty `images`).
	 */
	export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
	  try {
	    const { value } = await mammoth.extractRawText({ buffer: buf })
	    // Fall back to an empty string if mammoth yields no text value.
	    const text = value || ''
	    console.log(`[PARSER_DEBUG] DOCX extracted ${text.length} characters`)
	    return [{ page_num: 1, text, images: [] }]
	  } catch (error) {
	    console.error('[PARSER_DEBUG] DOCX parsing error:', error)
	    return [{ page_num: 1, text: `[DOCX Parse Error: ${error}]`, images: [] }]
	  }
	}

	/**
	 * Maps a filename to a MIME type from its extension, ignoring case.
	 *
	 * @param filename - File name or path; only its trailing suffix is inspected.
	 * @returns The MIME type for `.pdf` or `.docx`, otherwise
	 *   `application/octet-stream`.
	 */
	export function inferMime(filename: string): string {
	  const normalized = filename.toLowerCase()
	  // Ordered suffix -> MIME table; the first matching suffix wins.
	  const knownTypes: ReadonlyArray<readonly [string, string]> = [
	    ['.pdf', 'application/pdf'],
	    ['.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
	  ]
	  for (const [suffix, mime] of knownTypes) {
	    if (normalized.endsWith(suffix)) {
	      return mime
	    }
	  }
	  return 'application/octet-stream'
	}

	/**
	 * Entry point: chooses a parser from the filename's extension and returns the
	 * extracted pages.
	 *
	 * @param filename - Name of the uploaded file; used only to infer its type.
	 * @param file - Raw file contents.
	 * @returns The pages produced by the PDF or DOCX parser.
	 * @throws Error when the filename maps to neither PDF nor DOCX; because the
	 *   function is async, this surfaces as a rejected promise.
	 */
	export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
	  const detectedType = inferMime(filename)
	  console.log(`[PARSER_DEBUG] Processing ${filename} as ${detectedType}`)

	  switch (detectedType) {
	    case 'application/pdf':
	      return parsePdfBytes(file)
	    case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
	      return parseDocxBytes(file)
	    default:
	      throw new Error(`Unsupported file type: ${filename}`)
	  }
	}