Spaces:
Sleeping
Sleeping
File size: 4,269 Bytes
5411a7d 50da96f 5411a7d 50da96f 5411a7d 50da96f 5411a7d 50da96f 5411a7d 50da96f 5411a7d 50da96f 5411a7d 3fafddc 5411a7d 3fafddc 5411a7d 3fafddc 5411a7d 3fafddc 5411a7d 3fafddc 5411a7d 3fafddc 5411a7d 3fafddc 5411a7d 3fafddc 5411a7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import slugify from 'slugify'
import type { Page } from './parser'
import { cheapSummarize, cleanChunkText } from './summarizer'
// Largest number of whitespace-separated words a block may have before it is split into several chunks.
const MAX_WORDS = 500
// NOTE(review): not referenced anywhere in the visible code — confirm whether a minimum chunk size was meant to be enforced.
const MIN_WORDS = 150
// Number of words used as overlap between neighbouring chunks when a large block is split.
const OVERLAP_WORDS = 50
/**
 * Splits `text` into coarse blocks at heading-like lines.
 *
 * Each heading match becomes a block of its own, and the text between two
 * consecutive headings (or before the first / after the last one) becomes
 * another block. Blocks that are empty or whitespace-only are dropped.
 *
 * Several patterns can match the same stretch of text (for example an
 * underlined "Introduction" is both an underlined header and a common section
 * name). Only the first match covering a position is used, so no part of
 * `text` is ever emitted twice.
 *
 * @param text - Document text to split.
 * @returns The non-blank blocks, in document order.
 */
function byHeadings(text: string): string[] {
  // Heading detectors (the original author notes these mirror a Python implementation).
  // All carry the `g` and `m` flags so `exec` can walk every line of the text.
  const patterns = [
    /^(#{1,6}\s.*)\s*$/gm, // Markdown headers
    /^([0-9]+\.\s+[^\n]+)\s*$/gm, // Numbered sections
    // NOTE(review): `\s` inside the character class also matches newlines, so
    // this match can begin several lines above the underline — confirm intent.
    /^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$/gm, // Underlined headers
    /^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$/gm, // Chapter/Section headers
    /^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$/gm, // Common academic sections
  ]

  // Record the [start, end) span of every match from every pattern.
  const allMatches: Array<{ start: number; end: number }> = []
  for (const pattern of patterns) {
    let match: RegExpExecArray | null
    while ((match = pattern.exec(text)) !== null) {
      allMatches.push({ start: match.index, end: match.index + match[0].length })
    }
  }

  // Earliest match first; when two matches start at the same index the longer
  // one comes first so it is the one that survives the overlap check below.
  allMatches.sort((a, b) => a.start - b.start || b.end - a.end)

  const parts: string[] = []
  let last = 0
  for (const { start, end } of allMatches) {
    // This match begins inside a span that was already emitted; using it would
    // duplicate text and move `last` backwards.
    if (start < last) continue
    if (start > last) {
      parts.push(text.slice(last, start))
    }
    parts.push(text.slice(start, end))
    last = end
  }

  // Whatever follows the final heading (or the whole text when nothing matched).
  if (last < text.length) {
    parts.push(text.slice(last))
  }

  return parts.filter(p => p.trim().length > 0)
}
/**
 * Turns heading blocks into chunks of bounded size.
 *
 * A block with no words is skipped. A block of at most `maxWords` words is
 * kept verbatim. A longer block is cut into windows of `maxWords` words, each
 * window starting `maxWords - overlapWords` words after the previous one, so
 * neighbouring chunks share exactly `overlapWords` words. Split chunks are
 * re-joined with single spaces, so their original whitespace is not preserved.
 *
 * @param blocks - Text blocks to chunk, in document order.
 * @param maxWords - Maximum words per chunk; defaults to `MAX_WORDS`. Values below 1 are treated as 1.
 * @param overlapWords - Words shared by consecutive chunks of a split block;
 *   defaults to `OVERLAP_WORDS`. Clamped to the range `0 .. maxWords - 1` so the window always advances.
 * @returns The chunks, in document order.
 */
function createOverlappingChunks(
  blocks: string[],
  maxWords: number = MAX_WORDS,
  overlapWords: number = OVERLAP_WORDS
): string[] {
  const windowSize = Math.max(1, Math.floor(maxWords))
  const overlap = Math.min(Math.max(0, Math.floor(overlapWords)), windowSize - 1)
  // Always >= 1, which guarantees the sliding window below makes progress.
  const step = windowSize - overlap

  const chunks: string[] = []
  for (const block of blocks) {
    const words = block.split(/\s+/).filter(w => w.length > 0)
    if (words.length === 0) continue

    // Small enough: keep the block exactly as it was written.
    if (words.length <= windowSize) {
      chunks.push(block)
      continue
    }

    // Slide a fixed-size window across the block. The overlap comes solely
    // from the stride, so no word is repeated more than once between
    // neighbouring chunks and no chunk exceeds `windowSize` words.
    for (let start = 0; start < words.length; start += step) {
      const end = Math.min(start + windowSize, words.length)
      chunks.push(words.slice(start, end).join(' '))
      // The window reached the end of the block; a further step would only
      // re-emit words that are already covered.
      if (end === words.length) break
    }
  }
  return chunks
}
/**
 * Builds one card per text chunk from the parsed pages of a document.
 *
 * The page texts are concatenated (each preceded by a `[[Page N]]` marker),
 * split with `byHeadings`, chunked with `createOverlappingChunks`, and every
 * chunk is then passed through `cleanChunkText` and `cheapSummarize` to fill
 * in the card. Chunks are processed one after another, in order.
 *
 * @param pages - Parsed pages; `page_num` and `text` are read from each one.
 * @param filename - Source file name; stored on each card and slugified for the card id.
 * @param user_id - Copied onto every card.
 * @param project_id - Copied onto every card.
 * @returns A promise of the cards, in chunk order. Each card carries `user_id`, `project_id`,
 *   `filename`, `topic_name`, `summary`, `content`, `page_span` and `card_id`.
 */
export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
  console.log(`[CHUNKER_DEBUG] Building cards from ${pages.length} pages for ${filename}`)

  // A missing `text` is treated as an empty string.
  const full = pages
    .map(p => `\n\n[[Page ${p.page_num}]]\n${(p.text || '').trim()}\n`)
    .join('')
  console.log(`[CHUNKER_DEBUG] Full text length: ${full.length}`)

  const coarse = byHeadings(full)
  console.log(`[CHUNKER_DEBUG] Split into ${coarse.length} heading blocks`)

  const chunks = createOverlappingChunks(coarse)
  console.log(`[CHUNKER_DEBUG] Created ${chunks.length} overlapping chunks`)

  // These values do not depend on the chunk, so they are computed once up front.
  // NOTE(review): every card therefore gets the same document-wide page span
  // (first page to last page, or 1 when there are no pages) — confirm whether
  // a per-chunk span was intended.
  const firstPage = pages[0]?.page_num ?? 1
  const lastPage = pages[pages.length - 1]?.page_num ?? 1
  const fileSlug = slugify(String(filename))

  // Kept as `any[]` so the function's inferred return type stays what callers already see.
  const out: any[] = []
  for (let i = 0; i < chunks.length; i++) {
    console.log(`[CHUNKER_DEBUG] Processing chunk ${i + 1}/${chunks.length}`)
    const cleaned = await cleanChunkText(chunks[i])

    // `||` (not `??`) on purpose: an empty summary also falls back to the first 80 characters.
    // NOTE(review): the second argument presumably bounds the summary length — confirm in ./summarizer.
    const topic = (await cheapSummarize(cleaned, 1)) || (cleaned.slice(0, 80) + '...')
    const summary = await cheapSummarize(cleaned, 3)

    const card = {
      user_id,
      project_id,
      filename,
      topic_name: topic.slice(0, 120),
      summary,
      content: cleaned,
      page_span: [firstPage, lastPage],
      // 1-based, zero-padded to four digits, e.g. "<slug>-c0001".
      card_id: `${fileSlug}-c${String(i + 1).padStart(4, '0')}`
    }
    console.log(`[CHUNKER_DEBUG] Created card ${card.card_id} with content length ${cleaned.length}`)
    out.push(card)
  }

  console.log(`[CHUNKER_DEBUG] Built ${out.length} cards total`)
  return out
}
|