|
|
"""Data loading utilities for chunks and JSON files.""" |
|
|
|
|
|
import json |
|
|
from pathlib import Path |
|
|
from typing import List, Dict, Any |
|
|
from langchain.docstore.document import Document |
|
|
|
|
|
|
|
|
def load_json(filepath: Path | str) -> List[Dict[str, Any]]: |
|
|
""" |
|
|
Load JSON data from file. |
|
|
|
|
|
Args: |
|
|
filepath: Path to JSON file |
|
|
|
|
|
Returns: |
|
|
List of dictionaries containing the JSON data |
|
|
""" |
|
|
filepath = Path(filepath) |
|
|
|
|
|
if not filepath.exists(): |
|
|
raise FileNotFoundError(f"JSON file not found: {filepath}") |
|
|
|
|
|
with open(filepath, 'r', encoding='utf-8') as f: |
|
|
data = json.load(f) |
|
|
|
|
|
return data |
|
|
|
|
|
|
|
|
def open_file(filepath: Path | str) -> str: |
|
|
""" |
|
|
Open and read a text file. |
|
|
|
|
|
Args: |
|
|
filepath: Path to text file |
|
|
|
|
|
Returns: |
|
|
File contents as string |
|
|
""" |
|
|
filepath = Path(filepath) |
|
|
|
|
|
if not filepath.exists(): |
|
|
raise FileNotFoundError(f"File not found: {filepath}") |
|
|
|
|
|
with open(filepath, 'r', encoding='utf-8') as f: |
|
|
content = f.read() |
|
|
|
|
|
return content |
|
|
|
|
|
|
|
|
def load_chunks(chunks_file: Path | str | None = None) -> List[Dict[str, Any]]:
    """
    Load document chunks from JSON file.

    Args:
        chunks_file: Path to chunks JSON file. If None, uses default path.

    Returns:
        List of chunk dictionaries

    Raises:
        FileNotFoundError: If the chunks file does not exist.
    """
    # Annotation fixed to Path | str | None so the declared type matches the
    # default value actually accepted here.
    if chunks_file is None:
        chunks_file = Path("reports/docling_chunks.json")

    return load_json(chunks_file)
|
|
|
|
|
|
|
|
def chunks_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
    """
    Convert chunk dictionaries to LangChain Document objects.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        List of Document objects
    """
    # Missing keys fall back to an empty body / empty metadata so malformed
    # chunks still produce a valid Document.
    return [
        Document(
            page_content=entry.get("content", ""),
            metadata=entry.get("metadata", {}),
        )
        for entry in chunks
    ]
|
|
|
|
|
|
|
|
def validate_chunks(chunks: List[Dict[str, Any]]) -> bool:
    """
    Validate that chunks have required fields.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        True if valid, raises ValueError if invalid
    """
    for idx, chunk in enumerate(chunks):
        # Report the first absent top-level field, checking "content" before
        # "metadata" so error messages stay deterministic.
        absent = next(
            (name for name in ("content", "metadata") if name not in chunk),
            None,
        )
        if absent is not None:
            raise ValueError(f"Chunk {idx} missing required field: {absent}")

        meta = chunk["metadata"]
        if not isinstance(meta, dict):
            raise ValueError(f"Chunk {idx} metadata must be a dictionary")

        if "filename" not in meta:
            raise ValueError(f"Chunk {idx} metadata missing 'filename' field")

    return True
|
|
|