audit_assistant / src /loader.py
Ara Yeroyan
add src
f5df983
raw
history blame
2.89 kB
"""Data loading utilities for chunks and JSON files."""
import json
from pathlib import Path
from typing import List, Dict, Any
from langchain.docstore.document import Document
def load_json(filepath: Path | str) -> List[Dict[str, Any]]:
"""
Load JSON data from file.
Args:
filepath: Path to JSON file
Returns:
List of dictionaries containing the JSON data
"""
filepath = Path(filepath)
if not filepath.exists():
raise FileNotFoundError(f"JSON file not found: {filepath}")
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
def open_file(filepath: Path | str) -> str:
"""
Open and read a text file.
Args:
filepath: Path to text file
Returns:
File contents as string
"""
filepath = Path(filepath)
if not filepath.exists():
raise FileNotFoundError(f"File not found: {filepath}")
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
return content
def load_chunks(chunks_file: Path | str | None = None) -> List[Dict[str, Any]]:
    """
    Load document chunks from JSON file.

    Args:
        chunks_file: Path to chunks JSON file. If None, uses the default
            path ``reports/docling_chunks.json``.

    Returns:
        List of chunk dictionaries

    Raises:
        FileNotFoundError: If the chunks file does not exist
            (propagated from load_json).
    """
    # Fix: the annotation previously read `Path | str` even though the
    # default is None; `Path | str | None` matches the actual contract.
    if chunks_file is None:
        # Default location where the chunking pipeline writes its output.
        chunks_file = Path("reports/docling_chunks.json")
    return load_json(chunks_file)
def chunks_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
    """
    Convert chunk dictionaries to LangChain Document objects.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        List of Document objects, one per input chunk
    """
    # Missing keys fall back to an empty string / empty metadata dict.
    return [
        Document(
            page_content=entry.get("content", ""),
            metadata=entry.get("metadata", {}),
        )
        for entry in chunks
    ]
def validate_chunks(chunks: List[Dict[str, Any]]) -> bool:
    """
    Validate that chunks have required fields.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        True if valid, raises ValueError if invalid

    Raises:
        ValueError: If any chunk lacks a required field, its metadata is
            not a dict, or the metadata lacks a 'filename' entry.
    """
    for idx, entry in enumerate(chunks):
        # Top-level keys are checked in a fixed order so the first
        # missing one is reported.
        for field in ("content", "metadata"):
            if field not in entry:
                raise ValueError(f"Chunk {idx} missing required field: {field}")
        meta = entry["metadata"]
        if not isinstance(meta, dict):
            raise ValueError(f"Chunk {idx} metadata must be a dictionary")
        # 'filename' is the one metadata key every chunk must carry.
        if "filename" not in meta:
            raise ValueError(f"Chunk {idx} metadata missing 'filename' field")
    return True