# ────────────────────────────── utils/chunker.py ──────────────────────────────
import re
from typing import List, Dict, Any
from utils.service.summarizer import cheap_summarize, clean_chunk_text
from utils.service.common import split_sentences, slugify
from ..logger import get_logger
# Enhanced semantic chunker with overlap and better structure:
# - Split by headings / numbered sections if present
# - Target chunk size of roughly MIN_WORDS-MAX_WORDS words (configurable below)
# - Add overlap between chunks for better context preservation
# - Generate a short summary + topic name
# - Better handling of semantic boundaries
MAX_WORDS = 500
MIN_WORDS = 150
OVERLAP_WORDS = 50 # Overlap between chunks for better context
logger = get_logger("CHUNKER", __name__)


def _by_headings(text: str):
    # Enhanced split on markdown-like or outline headings with better patterns
    patterns = [
        r"(?m)^(#{1,6}\s.*)\s*$",  # Markdown headers
        r"(?m)^([0-9]+\.\s+[^\n]+)\s*$",  # Numbered sections
        r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$",  # Underlined headers
        r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$",  # Chapter/Section headers
        r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$",  # Common academic sections
    ]
    parts = []
    last = 0
    all_matches = []
    # Find all matches from all patterns
    for pattern in patterns:
        for m in re.finditer(pattern, text):
            all_matches.append((m.start(), m.end(), m.group(1).strip()))
    # Sort matches by position
    all_matches.sort(key=lambda x: x[0])
    # Split text based on matches; skip any match that overlaps a header already
    # consumed (two patterns can hit the same heading line)
    for start, end, header in all_matches:
        if start < last:
            continue
        if start > last:
            parts.append(text[last:start])
        parts.append(text[start:end])
        last = end
    if last < len(text):
        parts.append(text[last:])
    if not parts:
        parts = [text]
    return parts
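
# Illustrative example (not part of the pipeline): on a small markdown snippet the
# splitter returns alternating body/header parts, roughly:
#
#   _by_headings("intro\n# Setup\nsteps")
#   -> ["intro\n", "# Setup", "\nsteps"]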


def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]:
    """Create overlapping chunks from text blocks for better context preservation."""
    chunks = []
    for block in text_blocks:
        words = block.split()
        if not words:
            continue
        # If the block is small enough, use it as-is
        if len(words) <= MAX_WORDS:
            chunks.append(block)
            continue
        # Split large blocks into sliding windows with overlap
        start = 0
        while start < len(words):
            end = min(start + MAX_WORDS, len(words))
            chunk_words = words[start:end]
            # Add overlap from the previous chunk if available
            if start > 0 and len(chunks) > 0:
                prev_words = chunks[-1].split()
                overlap_start = max(0, len(prev_words) - OVERLAP_WORDS)
                overlap_words = prev_words[overlap_start:]
                chunk_words = overlap_words + chunk_words
            chunks.append(" ".join(chunk_words))
            if end >= len(words):
                break  # tail window emitted; stepping back by OVERLAP_WORDS here would loop forever
            start = end - OVERLAP_WORDS  # Overlap with the next chunk
    return chunks
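
# Rough illustration of the windowing (assuming the defaults above): a 1100-word
# block yields windows over words [0:500], [450:950] and [900:1100]; each window
# after the first is additionally prefixed with the last ~50 words of the
# previously emitted chunk.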


async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
    # Concatenate pages but keep page spans for metadata
    full = ""
    page_markers = []
    for p in pages:
        start = len(full)
        full += f"\n\n[[Page {p['page_num']}]]\n{p.get('text', '').strip()}\n"
        page_markers.append((p['page_num'], start, len(full)))
    # First split by headings
    coarse = _by_headings(full)
    # Create overlapping chunks for better context preservation
    cards = _create_overlapping_chunks(coarse)
    # Build card dicts
    out = []
    for i, raw_content in enumerate(cards, 1):
        # Clean with LLM to remove headers/footers and IDs
        cleaned = await clean_chunk_text(raw_content)
        topic = await cheap_summarize(cleaned, max_sentences=1)
        if not topic:
            topic = cleaned[:80] + "..."
        summary = await cheap_summarize(cleaned, max_sentences=3)
        # Estimate page span; for now every card carries the document-level span
        # (page_markers is collected above but not yet used for per-card attribution)
        first_page = pages[0]['page_num'] if pages else 1
        last_page = pages[-1]['page_num'] if pages else 1
        out.append({
            "user_id": user_id,
            "project_id": project_id,
            "filename": filename,
            "topic_name": topic[:120],
            "summary": summary,
            "content": cleaned,
            "page_span": [first_page, last_page],
            "card_id": f"{slugify(filename)}-c{i:04d}"
        })
    logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
    return out
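

# Usage sketch (illustrative; assumes pages come from an upstream PDF/OCR step
# shaped like [{"page_num": 1, "text": "..."}], which is not shown here):
#
#   import asyncio
#
#   pages = [{"page_num": 1, "text": "Introduction\nSome body text ..."}]
#   cards = asyncio.run(
#       build_cards_from_pages(pages, "paper.pdf", user_id="u1", project_id="p1")
#   )
#   # each card dict carries card_id, topic_name, summary, content and page_span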