# ────────────────────────────── utils/chunker.py ──────────────────────────────
import re
from typing import List, Dict, Any
from utils.service.summarizer import cheap_summarize, clean_chunk_text
from utils.service.common import split_sentences, slugify
from ..logger import get_logger
# Enhanced semantic chunker with overlap and better structure:
# - Split by headings / numbered sections if present
# - Ensure each chunk is ~150-500 words (see MIN_WORDS / MAX_WORDS below)
# - Add overlap between chunks for better context preservation
# - Generate a short summary + topic name
# - Better handling of semantic boundaries
MAX_WORDS = 500
MIN_WORDS = 150  # intended lower bound per chunk (not enforced by the splitter below)
OVERLAP_WORDS = 50  # Overlap between chunks for better context
logger = get_logger("CHUNKER", __name__)
def _by_headings(text: str):
# Enhanced split on markdown-like or outline headings with better patterns
patterns = [
r"(?m)^(#{1,6}\s.*)\s*$", # Markdown headers
r"(?m)^([0-9]+\.\s+[^\n]+)\s*$", # Numbered sections
r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$", # Underlined headers
r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$", # Chapter/Section headers
r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$", # Common academic sections
]
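    # Illustrative lines these patterns catch (hypothetical samples):
    #   "## Methods", "2. Related Work", "Overview\n====", "Chapter 3 Results", "Abstract"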
parts = []
last = 0
all_matches = []
# Find all matches from all patterns
for pattern in patterns:
for m in re.finditer(pattern, text):
all_matches.append((m.start(), m.end(), m.group(1).strip()))
# Sort matches by position
all_matches.sort(key=lambda x: x[0])
    # Split text at each header, skipping any match that overlaps a header
    # already consumed (two patterns can hit the same span)
    for start, end, _header in all_matches:
        if start < last:
            continue
        if start > last:
            parts.append(text[last:start])
        parts.append(text[start:end])
        last = end
if last < len(text):
parts.append(text[last:])
if not parts:
parts = [text]
return parts
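# Illustration (hypothetical input): _by_headings alternates header and body
# segments, e.g.
#   _by_headings("# Intro\nHello world.\n## Setup\nSteps here.")
#   -> ['# Intro', '\nHello world.\n', '## Setup', '\nSteps here.']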
def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]:
    """Create overlapping chunks from text blocks for better context preservation."""
    chunks = []
    for block in text_blocks:
        words = block.split()
        if not words:
            continue
        # If the block is small enough, use it as-is
        if len(words) <= MAX_WORDS:
            chunks.append(block)
            continue
        # Slide a MAX_WORDS window over large blocks, stepping back by
        # OVERLAP_WORDS so consecutive chunks share trailing context
        start = 0
        while start < len(words):
            end = min(start + MAX_WORDS, len(words))
            chunks.append(" ".join(words[start:end]))
            if end == len(words):
                break  # tail reached; stepping back here would re-slice it forever
            start = end - OVERLAP_WORDS
    return chunks
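# Illustration of the windowing (with the defaults above): a 1200-word block
# with MAX_WORDS=500 and OVERLAP_WORDS=50 yields word windows [0:500],
# [450:950], [900:1200], each sharing 50 words with its neighbour.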
async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
    # Concatenate pages, embedding [[Page N]] markers so per-card page spans
    # can be recovered after chunking
    full = ""
    for p in pages:
        full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text', '').strip()}\n"
# First split by headings
coarse = _by_headings(full)
# Create overlapping chunks for better context preservation
    chunks = _create_overlapping_chunks(coarse)
# Build card dicts
out = []
    for i, raw_content in enumerate(chunks, 1):
# Clean with LLM to remove headers/footers and IDs
cleaned = await clean_chunk_text(raw_content)
topic = await cheap_summarize(cleaned, max_sentences=1)
        if not topic:
            # Fallback: use the start of the chunk; add an ellipsis only when truncated
            topic = cleaned[:80] + ("..." if len(cleaned) > 80 else "")
summary = await cheap_summarize(cleaned, max_sentences=3)
        # Estimate the page span from the [[Page N]] markers still present in the
        # raw chunk; fall back to the whole-document span if none survive
        marker_pages = [int(n) for n in re.findall(r"\[\[Page (\d+)\]\]", raw_content)]
        first_page = min(marker_pages) if marker_pages else (pages[0]['page_num'] if pages else 1)
        last_page = max(marker_pages) if marker_pages else (pages[-1]['page_num'] if pages else 1)
out.append({
"user_id": user_id,
"project_id": project_id,
"filename": filename,
"topic_name": topic[:120],
"summary": summary,
"content": cleaned,
"page_span": [first_page, last_page],
"card_id": f"{slugify(filename)}-c{i:04d}"
})
logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
return out
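# A minimal usage sketch (hypothetical caller; page dicts follow the
# {'page_num': int, 'text': str} shape consumed above):
#
#   import asyncio
#   pages = [{"page_num": 1, "text": "Introduction\nThis report covers chunking."}]
#   cards = asyncio.run(build_cards_from_pages(pages, "sample.pdf", "user-1", "proj-1"))
#   # each card carries user_id, project_id, filename, topic_name, summary,
#   # content, page_span and a card_id like "<filename-slug>-c0001"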