# ────────────────────────────── utils/chunker.py ──────────────────────────────
import re
from typing import List, Dict, Any
from utils.service.summarizer import cheap_summarize, clean_chunk_text
from utils.service.common import slugify
from ..logger import get_logger

# Enhanced semantic chunker with overlap and better structure:
# - Split by headings / numbered sections if present
# - Keep each chunk roughly between MIN_WORDS and MAX_WORDS words (configurable)
# - Add overlap between chunks for better context preservation
# - Generate a short summary + topic name
# - Better handling of semantic boundaries

MAX_WORDS = 500
MIN_WORDS = 150
OVERLAP_WORDS = 50  # Overlap between chunks for better context
logger = get_logger("CHUNKER", __name__)


def _by_headings(text: str):
    # Enhanced split on markdown-like or outline headings with better patterns
    patterns = [
        r"(?m)^(#{1,6}\s.*)\s*$",  # Markdown headers
        r"(?m)^([0-9]+\.\s+[^\n]+)\s*$",  # Numbered sections
        r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$",  # Underlined headers
        r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$",  # Chapter/Section headers
        r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$",  # Common academic sections
    ]
    
    parts = []
    last = 0
    all_matches = []
    
    # Find all matches from all patterns
    for pattern in patterns:
        for m in re.finditer(pattern, text):
            all_matches.append((m.start(), m.end(), m.group(1).strip()))
    
    # Sort matches by position
    all_matches.sort(key=lambda x: x[0])
    
    # Split text on the matched headings; skip matches that overlap a heading
    # already consumed by another pattern
    for start, end, _header in all_matches:
        if start < last:
            continue
        if start > last:
            parts.append(text[last:start])
        parts.append(text[start:end])
        last = end
    
    if last < len(text):
        parts.append(text[last:])
    
    if not parts:
        parts = [text]
    
    return parts
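
# Illustrative sketch of what _by_headings produces (made-up input, not a test
# fixture from this repo): heading lines become their own parts, so chunking can
# restart at each section boundary.
#
#   _by_headings("# Intro\nBody text\n## Details\nMore text")
#   -> ["# Intro", "\nBody text\n", "## Details", "\nMore text"]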


def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]:
    """Create overlapping chunks from text blocks for better context preservation."""
    chunks: List[str] = []
    pending = ""  # undersized block (e.g. a bare heading) waiting to be merged with the next one

    for block in text_blocks:
        if pending:
            block = pending + "\n" + block
            pending = ""
        words = block.split()
        if not words:
            continue

        # Too small to stand alone: hold it and prepend it to the next block
        if len(words) < MIN_WORDS:
            pending = block
            continue

        # Small enough to keep as a single chunk
        if len(words) <= MAX_WORDS:
            chunks.append(block)
            continue

        # Split large blocks into MAX_WORDS windows that overlap by OVERLAP_WORDS
        start = 0
        while start < len(words):
            end = min(start + MAX_WORDS, len(words))
            chunks.append(" ".join(words[start:end]))
            if end >= len(words):
                break
            start = end - OVERLAP_WORDS

    # Flush any trailing undersized block
    if pending:
        if chunks:
            chunks[-1] = chunks[-1] + "\n" + pending
        else:
            chunks.append(pending)

    return chunks
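
# Worked example of the overlap arithmetic (using the default constants above):
# with MAX_WORDS = 500 and OVERLAP_WORDS = 50, a single 1,200-word block yields
# three chunks spanning word indices 0-499, 450-949 and 900-1199, so consecutive
# chunks share 50 words of context.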


async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
    # Concatenate pages, embedding [[Page N]] markers so page context survives chunking
    full = ""
    for p in pages:
        full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text', '').strip()}\n"

    # First split by headings
    coarse = _by_headings(full)

    # Create overlapping chunks for better context preservation
    chunks = _create_overlapping_chunks(coarse)

    # Build card dicts
    out = []
    for i, raw_content in enumerate(chunks, 1):
        # Clean with LLM to remove headers/footers and IDs
        cleaned = await clean_chunk_text(raw_content)
        topic = await cheap_summarize(cleaned, max_sentences=1)
        if not topic:
            topic = cleaned[:80] + "..."
        summary = await cheap_summarize(cleaned, max_sentences=3)
        # Estimate page span
        first_page = pages[0]['page_num'] if pages else 1
        last_page = pages[-1]['page_num'] if pages else 1
        out.append({
            "user_id": user_id,
            "project_id": project_id,
            "filename": filename,
            "topic_name": topic[:120],
            "summary": summary,
            "content": cleaned,
            "page_span": [first_page, last_page],
            "card_id": f"{slugify(filename)}-c{i:04d}"
        })
    logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
    return out
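

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline: the sample pages below are
    # made up, and running this requires the summarizer backend used by
    # clean_chunk_text / cheap_summarize to be reachable in your environment.
    import asyncio

    sample_pages = [
        {"page_num": 1, "text": "Introduction\nThis document describes the chunking pipeline."},
        {"page_num": 2, "text": "1. Methods\nWe split text by headings and build overlapping chunks."},
    ]
    cards = asyncio.run(
        build_cards_from_pages(sample_pages, "example.pdf", user_id="demo-user", project_id="demo-project")
    )
    for card in cards:
        print(card["card_id"], "-", card["topic_name"])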