"""Predict a modernization strategy for every application row in ``input.csv``.

The script loads a saved model, its label encoder and the feature columns
used at training time, normalizes the input column names, one-hot encodes
the features, writes the predictions to ``output.csv`` and finally shows a
bar chart with the distribution of the predicted strategies.
"""
import sys
from difflib import get_close_matches

import joblib
import matplotlib.pyplot as plt
import numpy as np  # noqa: F401  (unused below; kept from the original import list)
import pandas as pd
import seaborn as sns  # noqa: F401  (unused below; kept from the original import list)

# Standard feature column -> alternative header spellings accepted in the input.
# The key order is also the order in which the columns are one-hot encoded.
COLUMN_ALIASES = {
    "App Tech Stack": ["app tech stack", "technology stack", "application stack"],
    "Operating System": ["os", "operating system", "platform"],
    "DB Details": ["db info", "database", "database information", "db"],
    "Authentication Model": ["auth model", "authentication", "authentication type"],
    "Application Components": ["components", "app components", "application parts"],
    "Licence Renewal": ["license", "license renewal", "renewal"],
}

# Minimum difflib similarity ratio for a header to count as a fuzzy match.
FUZZY_MATCH_CUTOFF = 0.8

# Load saved model, encoder, and training columns.
# NOTE: joblib files are pickles and can execute arbitrary code when loaded;
# only load artifacts that come from a trusted source.
model = joblib.load('random_forest_model.pkl')
le = joblib.load('label_encoder.pkl')
training_columns = joblib.load('training_columns.pkl')


# Mapping helper
def map_and_prepare_input_data(input_df):
    """Rename recognized columns to their standard names and keep only those.

    A header is recognized when, ignoring case and surrounding whitespace, it
    equals a standard column name or one of its aliases, or when it is a
    close fuzzy match (``difflib`` ratio >= ``FUZZY_MATCH_CUTOFF``) to one of
    them. If an exact and a fuzzy header resolve to the same standard column,
    the exact one is used and the fuzzy one is ignored.

    Args:
        input_df: DataFrame whose column headers may use alias spellings.

    Returns:
        A new DataFrame containing only the standard feature columns, in the
        order their source columns appear in ``input_df``.

    Raises:
        ValueError: If a standard column has no matching input column, or if
            two equally ranked input columns map to the same standard column.
    """
    # Lower-cased spelling -> standard name. The standard names themselves are
    # included so that e.g. "db details" is accepted as "DB Details".
    lookup = {}
    for std_col, aliases in COLUMN_ALIASES.items():
        lookup[std_col.lower()] = std_col
        for alias in aliases:
            lookup[alias.lower()] = std_col
    candidates = list(lookup)

    # Standard name -> (source header, whether the match was exact).
    claimed = {}
    for col in input_df.columns:
        key = str(col).strip().lower()
        std_col = lookup.get(key)
        is_exact = std_col is not None
        if not is_exact:
            close = get_close_matches(
                key, candidates, n=1, cutoff=FUZZY_MATCH_CUTOFF
            )
            if not close:
                continue
            std_col = lookup[close[0]]

        previous = claimed.get(std_col)
        if previous is None or (is_exact and not previous[1]):
            claimed[std_col] = (col, is_exact)
        elif is_exact == previous[1]:
            # Renaming both would create duplicate column labels, which
            # breaks the column selection and the one-hot encoding later on.
            raise ValueError(
                f"Columns {previous[0]!r} and {col!r} both map to {std_col!r}"
            )
        # Otherwise an exact match is already recorded; skip the fuzzy one.

    missing_columns = set(COLUMN_ALIASES) - set(claimed)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    source_to_std = {source: std_col for std_col, (source, _) in claimed.items()}
    kept = [col for col in input_df.columns if col in source_to_std]
    return input_df[kept].rename(columns=source_to_std)


# Load new input data
try:
    new_data = pd.read_csv('input.csv')
except FileNotFoundError:
    # sys.exit with a message prints it to stderr and exits with status 1,
    # so calling tools can detect the failure.
    sys.exit("Error: 'input.csv' not found.")

new_data = map_and_prepare_input_data(new_data)
# Reassign instead of filling in place on a frame derived from another frame.
new_data = new_data.fillna('Unknown')
if new_data.empty:
    sys.exit("Error: 'input.csv' contains no rows to predict.")

# One-hot encode and align with training columns
encoded_data = pd.get_dummies(new_data, columns=list(COLUMN_ALIASES))
encoded_data = encoded_data.reindex(columns=training_columns, fill_value=0)

# Predict
predicted_labels_encoded = model.predict(encoded_data)
predicted_labels = le.inverse_transform(predicted_labels_encoded)
new_data['Predicted Modernization Strategy'] = predicted_labels

# Save to CSV
new_data.to_csv('output.csv', index=False)
print("✅ Predictions saved to 'output.csv'")

# Visualize
counts = new_data['Predicted Modernization Strategy'].value_counts()
plt.figure(figsize=(10, 6))
counts.plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon', 'plum', 'gold'])
plt.title('Distribution of Predicted Modernization Strategies')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("\n Count of Predicted Modernization Strategies:")
for strategy, count in counts.items():
    print(f"{strategy}: {count}")