Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import json | |
| from llm_config import generate_llm_response | |
| from llm_prompts import DETERMINE_DTYPE_PROMPT | |
# Upper bound on how many values determine_column_type() samples from a column
# and embeds in the LLM prompt.
| SAMPLE_SIZE = 200 | |
def determine_column_type(df, column):
    """Ask the LLM to classify a column's data type from a random sample.

    Up to ``SAMPLE_SIZE`` values are drawn from ``df[column]`` with a fixed
    seed and embedded in ``DETERMINE_DTYPE_PROMPT``. The response is expected
    to be a JSON object with the keys ``column_type`` and ``invalid_indices``.

    The LLM only ever sees the sampled values as a plain list, so any index it
    reports can only be a position within that list. Those positions are
    translated back to the index labels of the sampled rows before being
    returned, so the result can be used directly for label-based selection on
    ``df``.

    Args:
        df: DataFrame that contains the column.
        column: Label of the column to inspect.

    Returns:
        A ``(column_type, invalid_labels)`` tuple, where ``invalid_labels`` is
        a list of ``df`` index labels. Falls back to ``('string', [])`` when
        the response cannot be parsed or lacks the expected keys.
    """
    sampled = df[column].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    prompt = DETERMINE_DTYPE_PROMPT.format(sample_values=str(sampled.tolist()))
    response = generate_llm_response(prompt)

    try:
        result = json.loads(response)
        column_type = result['column_type']
        reported_positions = result['invalid_indices']
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers a None response and a JSON payload that is not an
        # object (e.g. a bare list), which cannot be indexed by key.
        print(f"Error parsing LLM response for column {column}")
        return 'string', []

    if not isinstance(reported_positions, (list, tuple)):
        # A malformed "invalid_indices" value is treated as "nothing flagged".
        reported_positions = []

    # Map positions in the sample list back to the labels of the sampled rows,
    # skipping anything that is not a valid position (wrong type, negative,
    # or beyond the end of the sample).
    sample_labels = sampled.index
    invalid_labels = [
        sample_labels[position]
        for position in reported_positions
        if isinstance(position, int)
        and not isinstance(position, bool)
        and 0 <= position < len(sample_labels)
    ]
    return column_type, invalid_labels
def enforce_column_type(df, column, column_type, invalid_indices):
    """Convert a column to the requested type and blank out flagged rows.

    The column is converted in place on ``df``. Values that cannot be
    converted become missing values. Any ``column_type`` other than
    ``'float'``, ``'integer'`` or ``'date'`` leaves the column unconverted.

    Args:
        df: DataFrame to modify (mutated in place and also returned).
        column: Label of the column to convert.
        column_type: One of ``'float'``, ``'integer'``, ``'date'``; any other
            value skips the conversion step.
        invalid_indices: Index labels of rows whose value in ``column`` should
            be set to missing. Labels that are not present in ``df.index``
            are ignored.

    Returns:
        The same ``df`` object that was passed in.
    """
    if column_type == 'float':
        df[column] = pd.to_numeric(df[column], errors='coerce')
    elif column_type == 'integer':
        numeric = pd.to_numeric(df[column], errors='coerce')
        # Casting to Int64 raises on fractional or infinite values, so treat
        # those like any other unconvertible value and make them missing.
        as_float = numeric.astype('float64')
        is_integral = np.isfinite(as_float) & (as_float == np.floor(as_float))
        df[column] = numeric.where(is_integral).astype('Int64')
    elif column_type == 'date':
        df[column] = pd.to_datetime(df[column], errors='coerce')

    # Set invalid values to NaN. A boolean mask is used instead of passing the
    # labels straight to .loc so that labels missing from the index are
    # skipped rather than raising KeyError.
    invalid_mask = df.index.isin(invalid_indices)
    if invalid_mask.any():
        df.loc[invalid_mask, column] = np.nan
    return df
def process_dataframe(df):
    """Detect and enforce a data type for every column of ``df``.

    For each column, ``determine_column_type`` supplies a type name and a
    list of invalid row indices, and ``enforce_column_type`` applies them.
    Progress and the share of non-missing values after enforcement are
    printed for each column.

    Args:
        df: DataFrame to process. ``enforce_column_type`` modifies it in
            place.

    Returns:
        The processed DataFrame, as returned by the last
        ``enforce_column_type`` call (or ``df`` itself if it has no columns).
    """
    print("Determining and enforcing column data types...")
    for column in df.columns:
        print(f"\nProcessing column: {column}")
        column_type, invalid_indices = determine_column_type(df, column)
        print(f" Detected type: {column_type}")
        print(f" Number of invalid values: {len(invalid_indices)}")

        df = enforce_column_type(df, column, column_type, invalid_indices)

        # Guard against an empty frame: 0 / 0 would yield nan and a NumPy
        # RuntimeWarning instead of a meaningful percentage.
        total_rows = len(df)
        if total_rows:
            valid_percentage = (df[column].count() / total_rows) * 100
        else:
            valid_percentage = 0.0
        print(f" Percentage of valid values after type enforcement: {valid_percentage:.2f}%")
    return df