# ────────────────────────────── utils/chunker.py ──────────────────────────────
import re
from typing import List, Dict, Any
from utils.service.summarizer import cheap_summarize, clean_chunk_text
from utils.service.common import split_sentences, slugify
from ..logger import get_logger
# Enhanced semantic chunker with overlap and better structure:
# - Split by headings / numbered sections if present
# - Target chunk size of roughly MIN_WORDS-MAX_WORDS words (configurable below)
# - Add overlap between chunks for better context preservation
# - Generate a short summary + topic name
# - Better handling of semantic boundaries
MAX_WORDS = 500
MIN_WORDS = 150
OVERLAP_WORDS = 50 # Overlap between chunks for better context
logger = get_logger("CHUNKER", __name__)


def _by_headings(text: str):
    # Enhanced split on markdown-like or outline headings with better patterns
    patterns = [
        r"(?m)^(#{1,6}\s.*)\s*$",  # Markdown headers
        r"(?m)^([0-9]+\.\s+[^\n]+)\s*$",  # Numbered sections
        r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$",  # Underlined headers
        r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$",  # Chapter/Section headers
        r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$",  # Common academic sections
    ]
    parts = []
    last = 0
    all_matches = []
    # Find all matches from all patterns
    for pattern in patterns:
        for m in re.finditer(pattern, text):
            all_matches.append((m.start(), m.end(), m.group(1).strip()))
    # Sort matches by position
    all_matches.sort(key=lambda x: x[0])
    # Split text based on matches; skip any match that overlaps a header already
    # consumed (two patterns can hit the same heading line)
    for start, end, header in all_matches:
        if start < last:
            continue
        if start > last:
            parts.append(text[last:start])
        parts.append(text[start:end])
        last = end
    if last < len(text):
        parts.append(text[last:])
    if not parts:
        parts = [text]
    return parts
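
# Illustrative example (not part of the pipeline): on a small markdown snippet the
# splitter returns alternating body/header parts, roughly:
#
#   _by_headings("intro\n# Setup\nsteps")
#   -> ["intro\n", "# Setup", "\nsteps"]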


def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]:
    """Create overlapping chunks from text blocks for better context preservation."""
    chunks = []
    for block in text_blocks:
        words = block.split()
        if not words:
            continue
        # If the block is small enough, use it as-is
        if len(words) <= MAX_WORDS:
            chunks.append(block)
            continue
        # Split large blocks into sliding windows with overlap
        start = 0
        while start < len(words):
            end = min(start + MAX_WORDS, len(words))
            chunk_words = words[start:end]
            # Add overlap from the previous chunk if available
            if start > 0 and len(chunks) > 0:
                prev_words = chunks[-1].split()
                overlap_start = max(0, len(prev_words) - OVERLAP_WORDS)
                overlap_words = prev_words[overlap_start:]
                chunk_words = overlap_words + chunk_words
            chunks.append(" ".join(chunk_words))
            if end >= len(words):
                break  # tail window emitted; stepping back by OVERLAP_WORDS here would loop forever
            start = end - OVERLAP_WORDS  # Overlap with the next chunk
    return chunks
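
# Rough illustration of the windowing (assuming the defaults above): a 1100-word
# block yields windows over words [0:500], [450:950] and [900:1100]; each window
# after the first is additionally prefixed with the last ~50 words of the
# previously emitted chunk.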


async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
    # Concatenate pages but keep page spans for metadata
    full = ""
    page_markers = []
    for p in pages:
        start = len(full)
        full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text', '').strip()}\n"
        page_markers.append((p['page_num'], start, len(full)))
    # First split by headings
    coarse = _by_headings(full)
    # Create overlapping chunks for better context preservation
    cards = _create_overlapping_chunks(coarse)
    # Build card dicts
    out = []
    for i, raw_content in enumerate(cards, 1):
        # Clean with LLM to remove headers/footers and IDs
        cleaned = await clean_chunk_text(raw_content)
        topic = await cheap_summarize(cleaned, max_sentences=1)
        if not topic:
            topic = cleaned[:80] + "..."
        summary = await cheap_summarize(cleaned, max_sentences=3)
        # Estimate page span; for now every card carries the document-level span
        # (page_markers is collected above but not yet used for per-card attribution)
        first_page = pages[0]['page_num'] if pages else 1
        last_page = pages[-1]['page_num'] if pages else 1
        out.append({
            "user_id": user_id,
            "project_id": project_id,
            "filename": filename,
            "topic_name": topic[:120],
            "summary": summary,
            "content": cleaned,
            "page_span": [first_page, last_page],
            "card_id": f"{slugify(filename)}-c{i:04d}"
        })
    logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
    return out
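

# Usage sketch (illustrative; assumes pages come from an upstream PDF/OCR step
# shaped like [{"page_num": 1, "text": "..."}], which is not shown here):
#
#   import asyncio
#
#   pages = [{"page_num": 1, "text": "Introduction\nSome body text ..."}]
#   cards = asyncio.run(
#       build_cards_from_pages(pages, "paper.pdf", user_id="u1", project_id="p1")
#   )
#   # each card dict carries card_id, topic_name, summary, content and page_span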