LiamKhoaLe's picture
Initial setups refine for ingestion_js #16 (upd pdf-ext)
50da96f
import slugify from 'slugify'
import type { Page } from './parser'
import { cheapSummarize, cleanChunkText } from './summarizer'
// Blocks containing more words than this are split into several chunks by createOverlappingChunks.
const MAX_WORDS = 500
// NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
const MIN_WORDS = 150
// Number of words createOverlappingChunks uses as overlap between consecutive chunks of a split block.
const OVERLAP_WORDS = 50
/**
 * Splits `text` into consecutive segments at heading-like lines.
 *
 * Every recognised heading becomes its own segment, and the text between two
 * headings (or before the first / after the last one) becomes a segment too.
 * Segments are returned in document order; segments that contain only
 * whitespace are dropped.
 *
 * Several patterns can match the same or overlapping text (for example
 * "Introduction\n---" is both an underlined header and a common academic
 * section name). Only the first match covering a region is used — the longest
 * one when several start at the same offset — so no part of the input is
 * emitted twice.
 *
 * @param text Full document text to split.
 * @returns Non-blank segments in the order they occur in `text`.
 */
function byHeadings(text: string): string[] {
  // Enhanced patterns matching Python logic
  const patterns = [
    /^(#{1,6}\s.*)\s*$/gm, // Markdown headers
    /^([0-9]+\.\s+[^\n]+)\s*$/gm, // Numbered sections
    /^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$/gm, // Underlined headers
    /^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$/gm, // Chapter/Section headers
    /^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$/gm, // Common academic sections
  ]

  // Collect the [start, end) span of every heading match from every pattern.
  const spans: Array<{ start: number; end: number }> = []
  for (const pattern of patterns) {
    let match: RegExpExecArray | null
    while ((match = pattern.exec(text)) !== null) {
      spans.push({ start: match.index, end: match.index + match[0].length })
    }
  }

  // Document order; when two matches start at the same offset the longer one wins.
  spans.sort((a, b) => a.start - b.start || b.end - a.end)

  const parts: string[] = []
  let last = 0
  for (const { start, end } of spans) {
    // A match that begins inside an already-emitted heading would duplicate
    // text and move `last` backwards, so it is ignored.
    if (start < last) continue
    if (start > last) {
      parts.push(text.slice(last, start))
    }
    parts.push(text.slice(start, end))
    last = end
  }

  // Trailing text after the final heading (or the whole text when nothing matched).
  if (last < text.length) {
    parts.push(text.slice(last))
  }

  return parts.filter(part => part.trim().length > 0)
}
/**
 * Turns text blocks into chunks of at most `maxWords` words.
 *
 * A block whose word count does not exceed `maxWords` is emitted unchanged
 * (original whitespace preserved). A longer block is cut with a sliding window
 * of `maxWords` words in which each window starts `overlapWords` words before
 * the end of the previous one; the words of such a window are joined with
 * single spaces. Blocks that contain no words are skipped.
 *
 * @param blocks Text blocks in document order.
 * @param maxWords Maximum number of words per chunk; defaults to `MAX_WORDS`.
 * @param overlapWords Words shared by consecutive windows of a split block;
 *   defaults to `OVERLAP_WORDS`.
 * @returns Chunks in document order.
 * @throws RangeError If `maxWords` is not a positive integer, or `overlapWords`
 *   is not an integer in `[0, maxWords)`.
 */
function createOverlappingChunks(
  blocks: string[],
  maxWords: number = MAX_WORDS,
  overlapWords: number = OVERLAP_WORDS
): string[] {
  if (!Number.isInteger(maxWords) || maxWords < 1) {
    throw new RangeError(`maxWords must be a positive integer, got ${maxWords}`)
  }
  if (!Number.isInteger(overlapWords) || overlapWords < 0 || overlapWords >= maxWords) {
    throw new RangeError(`overlapWords must be an integer in [0, ${maxWords}), got ${overlapWords}`)
  }

  // Distance between the starts of two consecutive windows; always >= 1 given the checks above.
  const stride = maxWords - overlapWords
  const chunks: string[] = []

  for (const block of blocks) {
    const words = block.split(/\s+/).filter(w => w.length > 0)
    if (words.length === 0) continue

    // If block is small enough, use as-is
    if (words.length <= maxWords) {
      chunks.push(block)
      continue
    }

    // Split large blocks with overlap. Stepping back by `overlapWords` already
    // provides the overlap, so nothing is prepended from the previous chunk.
    for (let start = 0; ; start += stride) {
      const end = Math.min(start + maxWords, words.length)
      chunks.push(words.slice(start, end).join(' '))
      // Stop once the window has reached the last word; stepping back again
      // from here would re-emit the tail forever.
      if (end >= words.length) break
    }
  }

  return chunks
}
/**
 * Builds one card per chunk of the document formed by `pages`.
 *
 * The page texts are concatenated (each preceded by a `[[Page N]]` marker),
 * split with `byHeadings`, and chunked with `createOverlappingChunks`. Each
 * chunk is then passed through `cleanChunkText` and summarised twice with
 * `cheapSummarize` (second argument 1 for the topic, 3 for the summary).
 * Chunks are processed sequentially and progress is logged to the console.
 *
 * @param pages Parsed pages of the document.
 * @param filename Source file name; stored on each card and slugified into `card_id`.
 * @param user_id Stored on each card as-is.
 * @param project_id Stored on each card as-is.
 * @returns The cards, in chunk order.
 */
export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
  console.log(`[CHUNKER_DEBUG] Building cards from ${pages.length} pages for ${filename}`)

  // Concatenate all pages, tagging each with its page number.
  const full = pages
    .map(page => `\n\n[[Page ${page.page_num}]]\n${(page.text || '').trim()}\n`)
    .join('')
  console.log(`[CHUNKER_DEBUG] Full text length: ${full.length}`)

  const headingBlocks = byHeadings(full)
  console.log(`[CHUNKER_DEBUG] Split into ${headingBlocks.length} heading blocks`)

  const chunks = createOverlappingChunks(headingBlocks)
  console.log(`[CHUNKER_DEBUG] Created ${chunks.length} overlapping chunks`)

  const cards: any[] = []
  let index = 0
  for (const rawChunk of chunks) {
    const ordinal = index + 1
    console.log(`[CHUNKER_DEBUG] Processing chunk ${ordinal}/${chunks.length}`)

    const cleaned = await cleanChunkText(rawChunk)

    // Fall back to the first 80 characters when the one-line summary is empty.
    const oneLiner = await cheapSummarize(cleaned, 1)
    const topic = oneLiner || (cleaned.slice(0, 80) + '...')
    const summary = await cheapSummarize(cleaned, 3)

    // NOTE: the span covers the first and last page of the whole input,
    // so every card carries the same page_span.
    const spanStart = pages[0]?.page_num ?? 1
    const spanEnd = pages[pages.length - 1]?.page_num ?? 1

    const card = {
      user_id,
      project_id,
      filename,
      topic_name: topic.slice(0, 120),
      summary,
      content: cleaned,
      page_span: [spanStart, spanEnd],
      card_id: `${slugify(String(filename))}-c${String(ordinal).padStart(4, '0')}`
    }
    console.log(`[CHUNKER_DEBUG] Created card ${card.card_id} with content length ${cleaned.length}`)
    cards.push(card)
    index += 1
  }

  console.log(`[CHUNKER_DEBUG] Built ${cards.length} cards total`)
  return cards
}