Spaces:
Sleeping
Sleeping
| # ────────────────────────────── utils/chunker.py ────────────────────────────── | |
| import re | |
| from typing import List, Dict, Any | |
| from utils.service.summarizer import cheap_summarize, clean_chunk_text | |
| from utils.service.common import split_sentences, slugify | |
| from ..logger import get_logger | |
# Semantic chunker with overlap:
# - Split the text at headings / numbered sections when present
# - Keep blocks of at most MAX_WORDS words as-is; split longer blocks into
#   word windows that overlap by OVERLAP_WORDS words
# - Generate a short summary + topic name for every chunk
MAX_WORDS = 500  # Largest block (in words) that is kept as a single chunk
MIN_WORDS = 150  # NOTE(review): not referenced in the visible code - confirm it is still needed
OVERLAP_WORDS = 50  # Overlap between chunks for better context
logger = get_logger("CHUNKER", __name__)
| def _by_headings(text: str): | |
| # Enhanced split on markdown-like or outline headings with better patterns | |
| patterns = [ | |
| r"(?m)^(#{1,6}\s.*)\s*$", # Markdown headers | |
| r"(?m)^([0-9]+\.\s+[^\n]+)\s*$", # Numbered sections | |
| r"(?m)^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$", # Underlined headers | |
| r"(?m)^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$", # Chapter/Section headers | |
| r"(?m)^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$", # Common academic sections | |
| ] | |
| parts = [] | |
| last = 0 | |
| all_matches = [] | |
| # Find all matches from all patterns | |
| for pattern in patterns: | |
| for m in re.finditer(pattern, text): | |
| all_matches.append((m.start(), m.end(), m.group(1).strip())) | |
| # Sort matches by position | |
| all_matches.sort(key=lambda x: x[0]) | |
| # Split text based on matches | |
| for start, end, header in all_matches: | |
| if start > last: | |
| parts.append(text[last:start]) | |
| parts.append(text[start:end]) | |
| last = end | |
| if last < len(text): | |
| parts.append(text[last:]) | |
| if not parts: | |
| parts = [text] | |
| return parts | |
| def _create_overlapping_chunks(text_blocks: List[str]) -> List[str]: | |
| """Create overlapping chunks from text blocks for better context preservation""" | |
| chunks = [] | |
| for i, block in enumerate(text_blocks): | |
| words = block.split() | |
| if not words: | |
| continue | |
| # If block is small enough, use as-is | |
| if len(words) <= MAX_WORDS: | |
| chunks.append(block) | |
| continue | |
| # Split large blocks with overlap | |
| start = 0 | |
| while start < len(words): | |
| end = min(start + MAX_WORDS, len(words)) | |
| chunk_words = words[start:end] | |
| # Add overlap from previous chunk if available | |
| if start > 0 and len(chunks) > 0: | |
| prev_words = chunks[-1].split() | |
| overlap_start = max(0, len(prev_words) - OVERLAP_WORDS) | |
| overlap_words = prev_words[overlap_start:] | |
| chunk_words = overlap_words + chunk_words | |
| chunks.append(" ".join(chunk_words)) | |
| start = end - OVERLAP_WORDS # Overlap with next chunk | |
| return chunks | |
def _estimate_page_spans(chunks: List[str], pages: List[Dict[str, Any]]) -> List[List[Any]]:
    """Estimate a ``[first_page, last_page]`` span for every chunk from its ``[[Page N]]`` markers."""
    if not pages:
        return [[1, 1] for _ in chunks]

    marker_re = re.compile(r"\[\[Page ([^\]\n]+)\]\]")
    # Marker label -> position of the page in `pages` (first occurrence wins).
    index_by_label = {}
    for idx, page in enumerate(pages):
        index_by_label.setdefault(str(page['page_num']), idx)

    spans = []
    current = 0  # page that is "open" when a chunk contains no marker
    for chunk in chunks:
        markers = [m for m in marker_re.finditer(chunk) if m.group(1) in index_by_label]
        if markers:
            first_idx = index_by_label[markers[0].group(1)]
            last_idx = index_by_label[markers[-1].group(1)]
            # Text in front of the first marker still belongs to the page
            # that precedes it in the document.
            if first_idx > 0 and chunk[:markers[0].start()].strip():
                first_idx -= 1
            first_idx = min(first_idx, last_idx)
            current = last_idx
        else:
            first_idx = last_idx = current
        spans.append([pages[first_idx]['page_num'], pages[last_idx]['page_num']])
    return spans


async def build_cards_from_pages(pages: List[Dict[str, Any]], filename: str, user_id: str, project_id: str) -> List[Dict[str, Any]]:
    """Turn extracted pages into a list of card dicts, one per chunk.

    The page texts are concatenated with a ``[[Page N]]`` marker in front of
    each page, split at headings, then cut into overlapping chunks. Every
    chunk is cleaned and summarised and returned as a dict with the keys
    ``user_id``, ``project_id``, ``filename``, ``topic_name``, ``summary``,
    ``content``, ``page_span`` and ``card_id``.

    Args:
        pages: Page dicts; ``page_num`` is required, ``text`` is optional.
        filename: Source file name; also used to build ``card_id``.
        user_id: Copied into every card.
        project_id: Copied into every card.

    Returns:
        The cards in document order. ``page_span`` is an estimate derived
        from the page markers found in the chunk's raw text.

    Raises:
        KeyError: If a page dict has no ``page_num``.
    """
    # Concatenate pages, marking where each page starts so that the page span
    # of a chunk can be recovered afterwards. `text` may be missing or None.
    full = "".join(
        f"\n\n[[Page {p['page_num']}]]\n{(p.get('text') or '').strip()}\n"
        for p in pages
    )

    # First split by headings
    coarse = _by_headings(full)

    # Create overlapping chunks for better context preservation
    cards = _create_overlapping_chunks(coarse)

    # Spans come from the raw chunks because cleaning may drop the markers.
    page_spans = _estimate_page_spans(cards, pages)

    # Build card dicts
    out = []
    for i, (raw_content, page_span) in enumerate(zip(cards, page_spans), 1):
        # Clean with LLM to remove headers/footers and IDs
        cleaned = await clean_chunk_text(raw_content)
        topic = await cheap_summarize(cleaned, max_sentences=1)
        if not topic:
            topic = cleaned[:80] + "..."
        summary = await cheap_summarize(cleaned, max_sentences=3)

        out.append({
            "user_id": user_id,
            "project_id": project_id,
            "filename": filename,
            "topic_name": topic[:120],
            "summary": summary,
            "content": cleaned,
            "page_span": page_span,
            "card_id": f"{slugify(filename)}-c{i:04d}"
        })
    logger.info(f"Built {len(out)} cards from {len(pages)} pages for {filename}")
    return out