Spaces:

reab5555
/

AI-Data-Cleaner

Sleeping

App Files Files Community

AI-Data-Cleaner / manage_schema.py

reab5555

Upload 5 files

1853d90 verified over 1 year ago

raw

history blame

1.78 kB

	import pandas as pd
	import numpy as np
	import json
	from llm_config import generate_llm_response
	from llm_prompts import DETERMINE_DTYPE_PROMPT

	SAMPLE_SIZE = 200


	def determine_column_type(df, column):
	sample = df[column].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).tolist()
	prompt = DETERMINE_DTYPE_PROMPT.format(sample_values=str(sample))
	response = generate_llm_response(prompt)

	try:
	result = json.loads(response)
	return result['column_type'], result['invalid_indices']
	except (json.JSONDecodeError, KeyError):
	print(f"Error parsing LLM response for column {column}")
	return 'string', []


	def enforce_column_type(df, column, column_type, invalid_indices):
	if column_type == 'float':
	df[column] = pd.to_numeric(df[column], errors='coerce')
	elif column_type == 'integer':
	df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
	elif column_type == 'date':
	df[column] = pd.to_datetime(df[column], errors='coerce')

	# Set invalid values to NaN
	df.loc[invalid_indices, column] = np.nan

	return df


	def process_dataframe(df):
	print("Determining and enforcing column data types...")

	for column in df.columns:
	print(f"\nProcessing column: {column}")
	column_type, invalid_indices = determine_column_type(df, column)
	print(f" Detected type: {column_type}")
	print(f" Number of invalid values: {len(invalid_indices)}")

	df = enforce_column_type(df, column, column_type, invalid_indices)

	valid_percentage = (df[column].count() / len(df)) * 100
	print(f" Percentage of valid values after type enforcement: {valid_percentage:.2f}%")

	return df