Spaces:

akryldigital
/

audit_assistant

Sleeping

audit_assistant / src /reporting /metadata.py

Ara Yeroyan

add src

f5df983 about 1 month ago

6.17 kB

	"""Report metadata management."""

	from typing import Dict, List, Any, Set
	from pathlib import Path


	def get_report_metadata(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
	    """
	    Collect the distinct source, filename and year values found in chunks.

	    Each chunk is expected to be a dictionary that may carry a "metadata"
	    dictionary. A missing or ``None`` "metadata" entry is treated as empty.

	    Args:
	        chunks: List of chunk dictionaries

	    Returns:
	        Empty dictionary when ``chunks`` is empty; otherwise a dictionary
	        with the sorted distinct "sources", "filenames" and "years" plus
	        "total_chunks" (the number of chunks received).
	    """
	    if not chunks:
	        return {}

	    def _ordered(values):
	        # Values of different types (e.g. 2023 and "2023") cannot be compared
	        # with each other; fall back to a deterministic type-name/text order
	        # instead of letting sorted() raise TypeError.
	        try:
	            return sorted(values)
	        except TypeError:
	            return sorted(values, key=lambda v: (type(v).__name__, str(v)))

	    # One bucket of distinct values per metadata key of interest.
	    collected = {"source": set(), "filename": set(), "year": set()}

	    for chunk in chunks:
	        # "metadata" may be absent or explicitly None; both mean "no metadata".
	        metadata = chunk.get("metadata") or {}
	        for key, bucket in collected.items():
	            if key in metadata:
	                bucket.add(metadata[key])

	    return {
	        "sources": _ordered(collected["source"]),
	        "filenames": _ordered(collected["filename"]),
	        "years": _ordered(collected["year"]),
	        "total_chunks": len(chunks),
	    }


	def get_available_sources() -> List[str]:
	    """
	    Return the known report source categories (legacy compatibility).

	    Returns:
	        A new list of source category names on every call
	    """
	    # Hard-coded stand-in for the categories that used to be provided by
	    # the original auditqa_old.reports module.
	    categories = (
	        "Consolidated",
	        "Ministry, Department, Agency and Projects",
	        "Local Government",
	        "Value for Money",
	        "Thematic",
	        "Hospital",
	        "Project",
	    )
	    return list(categories)


	def get_source_subtypes() -> Dict[str, List[str]]:
	    """
	    Return a placeholder mapping from each source category to its subtypes.

	    Returns:
	        A new dictionary mapping source names to lists of subtype names
	    """
	    # Placeholder data; the real mapping was originally imported from
	    # auditqa_old.reports.new_files.
	    subtypes: Dict[str, List[str]] = {}
	    subtypes["Consolidated"] = ["Annual Consolidated OAG 2024", "Annual Consolidated OAG 2023"]
	    subtypes["Local Government"] = ["District Reports", "Municipal Reports"]
	    subtypes["Ministry, Department, Agency and Projects"] = ["Ministry Reports", "Agency Reports"]
	    subtypes["Value for Money"] = ["VFM Reports 2024", "VFM Reports 2023"]
	    subtypes["Thematic"] = ["Thematic Reports 2024", "Thematic Reports 2023"]
	    subtypes["Hospital"] = ["Hospital Reports 2024", "Hospital Reports 2023"]
	    subtypes["Project"] = ["Project Reports 2024", "Project Reports 2023"]
	    return subtypes


	def validate_report_filters(
	    reports: List[str] = None,
	    sources: str = None,
	    subtype: List[str] = None,
	    available_metadata: Dict[str, Any] = None
	) -> Dict[str, Any]:
	    """
	    Check the requested report filters against the available metadata.

	    An unknown source is an error and makes the result invalid. Unknown
	    reports and unknown subtypes only produce warnings; both are looked up
	    in the available filenames.

	    Args:
	        reports: List of specific report filenames
	        sources: Source category
	        subtype: List of subtypes
	        available_metadata: Available metadata for validation; its
	            "sources" and "filenames" entries are consulted

	    Returns:
	        Dictionary with the keys "valid" (bool), "warnings" (list of
	        messages) and "errors" (list of messages)
	    """
	    # Nothing to validate against: report that as a warning, not a failure.
	    if not available_metadata:
	        return {
	            "valid": True,
	            "warnings": ["No metadata available for validation"],
	            "errors": []
	        }

	    known_sources = available_metadata.get("sources", [])
	    known_filenames = available_metadata.get("filenames", [])

	    problems = []
	    notices = []

	    # Source check: the only check that can invalidate the filters.
	    if sources and sources not in known_sources:
	        problems.append(f"Source '{sources}' not found in available sources")

	    # Report check: unknown filenames are reported but tolerated.
	    notices.extend(
	        f"Report '{name}' not found in available reports"
	        for name in (reports or [])
	        if name not in known_filenames
	    )

	    # Subtype check: also matched against the available filenames.
	    notices.extend(
	        f"Subtype '{name}' not found in available reports"
	        for name in (subtype or [])
	        if name not in known_filenames
	    )

	    return {
	        "valid": not problems,
	        "warnings": notices,
	        "errors": problems
	    }


	def get_report_statistics(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
	    """
	    Compute per-source and per-year chunk counts and content-length totals.

	    A chunk whose "content" is missing or ``None`` contributes a length of
	    zero. A chunk whose "metadata" is missing or ``None`` is counted under
	    "Unknown" for both source and year.

	    Args:
	        chunks: List of chunk dictionaries

	    Returns:
	        Empty dictionary when ``chunks`` is empty; otherwise a dictionary
	        with "total_chunks", "sources" (count per source), "years" (count
	        per year), "avg_chunk_length" and "total_content_length".
	    """
	    if not chunks:
	        return {}

	    source_counts: Dict[Any, int] = {}
	    year_counts: Dict[Any, int] = {}
	    total_length = 0

	    for chunk in chunks:
	        # "content" may be absent or explicitly None; both count as empty.
	        total_length += len(chunk.get("content") or "")

	        # "metadata" may be absent or explicitly None; both mean "no metadata".
	        metadata = chunk.get("metadata") or {}

	        # Count by source
	        source = metadata.get("source", "Unknown")
	        source_counts[source] = source_counts.get(source, 0) + 1

	        # Count by year
	        year = metadata.get("year", "Unknown")
	        year_counts[year] = year_counts.get(year, 0) + 1

	    return {
	        "total_chunks": len(chunks),
	        "sources": source_counts,
	        "years": year_counts,
	        # chunks is non-empty here (see the early return), so this is safe.
	        "avg_chunk_length": total_length / len(chunks),
	        "total_content_length": total_length,
	    }


	def filter_chunks_by_metadata(
	    chunks: List[Dict[str, Any]],
	    source_filter: str = None,
	    filename_filter: List[str] = None,
	    year_filter: List[str] = None
	) -> List[Dict[str, Any]]:
	    """
	    Keep only the chunks whose metadata satisfies every supplied filter.

	    A filter that is ``None`` or empty is not applied. A chunk whose
	    "metadata" is missing or ``None`` is treated as having no metadata, so
	    it is dropped by any active filter. The result is always a new list;
	    the input list is never returned or modified.

	    Args:
	        chunks: List of chunk dictionaries
	        source_filter: Source to filter by (exact match on "source")
	        filename_filter: List of filenames to filter by ("filename" must
	            be one of them)
	        year_filter: List of years to filter by ("year" must be one of them)

	    Returns:
	        Filtered list of chunks, in their original order
	    """
	    def _matches(chunk: Dict[str, Any]) -> bool:
	        # "metadata" may be absent or explicitly None; both mean "no metadata".
	        metadata = chunk.get("metadata") or {}
	        if source_filter and metadata.get("source") != source_filter:
	            return False
	        if filename_filter and metadata.get("filename") not in filename_filter:
	            return False
	        if year_filter and metadata.get("year") not in year_filter:
	            return False
	        return True

	    # Single pass over the input; `or []` tolerates chunks being None.
	    return [chunk for chunk in (chunks or []) if _matches(chunk)]