# ────────────────────────────── utils/chunker.py ──────────────────────────────
import re
from typing import List, Dict, Any
from utils.service.summarizer import cheap_summarize, clean_chunk_text
from utils.service.common import split_sentences, slugify
from ..logger import get_logger
# Enhanced semantic chunker with overlap and better structure:
# - Split by headings / numbered sections if present
# - Ensure each chunk is ~150-500 words (see MIN_WORDS / MAX_WORDS below)
# - Add overlap between chunks for better context preservation
# - Generate a short summary + topic name
# - Better handling of semantic boundaries
MAX_WORDS = 500
MIN_WORDS = 150  # intended lower bound per chunk (not enforced by the splitter below)
OVERLAP_WORDS = 50  # Overlap between chunks for better context
logger = get_logger("CHUNKER", __name__)
def _by_headings(text: str):
# Enhanced split on markdown-like or outline headings with better patterns
patterns = [
r"(?m)^(#{1,6}\s.*)\s*$", # Markdown headers
r"(?m)^([0-9]+\.\s+[^\n]+)\s*$", # Numbered sections
r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$", # Underlined headers
r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$", # Chapter/Section headers
r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$", # Common academic sections
]
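    # Illustrative lines these patterns catch (hypothetical samples):
    #   "## Methods", "2. Related Work", "Overview\n====", "Chapter 3 Results", "Abstract"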
parts = []
last = 0
all_matches = []
# Find all matches from all patterns
for pattern in patterns:
for m in re.finditer(pattern, text):
all_matches.append((m.start(), m.end(), m.group(1).strip()))
# Sort matches by position
all_matches.sort(key=lambda x: x[0])
    # Split text at each header, skipping any match that overlaps a header
    # already consumed (two patterns can hit the same span)
    for start, end, _header in all_matches:
        if start < last:
            continue
        if start > last:
            parts.append(text[last:start])
        parts.append(text[start:end])
        last = end
if last < len(text):
parts.append(text[last:])
if not parts:
parts = [text]
return parts
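# Illustration (hypothetical input): _by_headings alternates header and body
# segments, e.g.
#   _by_headings("# Intro\nHello world.\n## Setup\nSteps here.")
#   -> ['# Intro', '\nHello world.\n', '## Setup', '\nSteps here.']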
def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]:
    """Create overlapping chunks from text blocks for better context preservation."""
    chunks = []
    for block in text_blocks:
        words = block.split()
        if not words:
            continue
        # If the block is small enough, use it as-is
        if len(words) <= MAX_WORDS:
            chunks.append(block)
            continue
        # Slide a MAX_WORDS window over large blocks, stepping back by
        # OVERLAP_WORDS so consecutive chunks share trailing context
        start = 0
        while start < len(words):
            end = min(start + MAX_WORDS, len(words))
            chunks.append(" ".join(words[start:end]))
            if end == len(words):
                break  # tail reached; stepping back here would re-slice it forever
            start = end - OVERLAP_WORDS
    return chunks
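# Illustration of the windowing (with the defaults above): a 1200-word block
# with MAX_WORDS=500 and OVERLAP_WORDS=50 yields word windows [0:500],
# [450:950], [900:1200], each sharing 50 words with its neighbour.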
async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
    # Concatenate pages, embedding [[Page N]] markers so per-card page spans
    # can be recovered after chunking
    full = ""
    for p in pages:
        full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text', '').strip()}\n"
# First split by headings
coarse = _by_headings(full)
# Create overlapping chunks for better context preservation
    chunks = _create_overlapping_chunks(coarse)
# Build card dicts
out = []
    for i, raw_content in enumerate(chunks, 1):
# Clean with LLM to remove headers/footers and IDs
cleaned = await clean_chunk_text(raw_content)
topic = await cheap_summarize(cleaned, max_sentences=1)
        if not topic:
            # Fallback: use the start of the chunk; add an ellipsis only when truncated
            topic = cleaned[:80] + ("..." if len(cleaned) > 80 else "")
summary = await cheap_summarize(cleaned, max_sentences=3)
        # Estimate the page span from the [[Page N]] markers still present in the
        # raw chunk; fall back to the whole-document span if none survive
        marker_pages = [int(n) for n in re.findall(r"\[\[Page (\d+)\]\]", raw_content)]
        first_page = min(marker_pages) if marker_pages else (pages[0]['page_num'] if pages else 1)
        last_page = max(marker_pages) if marker_pages else (pages[-1]['page_num'] if pages else 1)
out.append({
"user_id": user_id,
"project_id": project_id,
"filename": filename,
"topic_name": topic[:120],
"summary": summary,
"content": cleaned,
"page_span": [first_page, last_page],
"card_id": f"{slugify(filename)}-c{i:04d}"
})
logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
return out
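# A minimal usage sketch (hypothetical caller; page dicts follow the
# {'page_num': int, 'text': str} shape consumed above):
#
#   import asyncio
#   pages = [{"page_num": 1, "text": "Introduction\nThis report covers chunking."}]
#   cards = asyncio.run(build_cards_from_pages(pages, "sample.pdf", "user-1", "proj-1"))
#   # each card carries user_id, project_id, filename, topic_name, summary,
#   # content, page_span and a card_id like "<filename-slug>-c0001"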