"""Data loading utilities for chunks and JSON files."""

import json
from pathlib import Path
from typing import List, Dict, Any

from langchain.docstore.document import Document


def load_json(filepath: Path | str) -> List[Dict[str, Any]]:
    """
    Load JSON data from file.

    Args:
        filepath: Path to JSON file

    Returns:
        The decoded JSON document. Callers in this module expect a list of
        dictionaries, but the decoded value is returned as-is without any
        shape check.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    filepath = Path(filepath)
    if not filepath.exists():
        raise FileNotFoundError(f"JSON file not found: {filepath}")

    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def open_file(filepath: Path | str) -> str:
    """
    Open and read a text file.

    Args:
        filepath: Path to text file

    Returns:
        File contents as string (decoded as UTF-8)

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    filepath = Path(filepath)
    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()


def load_chunks(chunks_file: Path | str | None = None) -> List[Dict[str, Any]]:
    """
    Load document chunks from JSON file.

    Args:
        chunks_file: Path to chunks JSON file. If None, the default
            ``reports/docling_chunks.json`` is used; being a relative path,
            it is resolved against the current working directory.

    Returns:
        List of chunk dictionaries

    Raises:
        FileNotFoundError: If the chunks file does not exist.
    """
    if chunks_file is None:
        chunks_file = Path("reports/docling_chunks.json")

    return load_json(chunks_file)


def chunks_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
    """
    Convert chunk dictionaries to LangChain Document objects.

    A chunk without a "content" key yields an empty ``page_content``; a chunk
    without a "metadata" key yields empty metadata.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        List of Document objects, one per chunk, in input order
    """
    return [
        Document(
            page_content=chunk.get("content", ""),
            metadata=chunk.get("metadata", {}),
        )
        for chunk in chunks
    ]


def validate_chunks(chunks: List[Dict[str, Any]]) -> bool:
    """
    Validate that chunks have required fields.

    Each chunk must be a dictionary with "content" and "metadata" keys, and
    its metadata must be a dictionary containing a "filename" key. An empty
    list is considered valid.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        True if valid, raises ValueError if invalid

    Raises:
        ValueError: On the first chunk that violates any of the rules above.
    """
    required_fields = ("content", "metadata")

    for i, chunk in enumerate(chunks):
        # Reject non-dict chunks up front: otherwise ``field not in chunk``
        # would do substring/element matching on str/list values and the
        # later ``chunk["metadata"]`` would escape as TypeError instead of
        # the documented ValueError.
        if not isinstance(chunk, dict):
            raise ValueError(f"Chunk {i} must be a dictionary")

        for field in required_fields:
            if field not in chunk:
                raise ValueError(f"Chunk {i} missing required field: {field}")

        # Validate metadata has required fields
        metadata = chunk["metadata"]
        if not isinstance(metadata, dict):
            raise ValueError(f"Chunk {i} metadata must be a dictionary")

        # Check for common metadata fields
        if "filename" not in metadata:
            raise ValueError(f"Chunk {i} metadata missing 'filename' field")

    return True