|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import joblib
|
|
|
import matplotlib.pyplot as plt
|
|
|
import seaborn as sns
|
|
|
|
|
|
|
|
|
# Load the artifacts produced at training time: the fitted classifier, the
# target-label encoder, and the exact one-hot column layout used in training.
model, le, training_columns = (
    joblib.load(path)
    for path in ('random_forest_model.pkl', 'label_encoder.pkl', 'training_columns.pkl')
)
|
|
|
|
|
|
|
|
|
def map_and_prepare_input_data(input_df):
    """Normalize arbitrary input column names to the canonical training schema.

    Each incoming column is matched against a table of known aliases — first
    exactly (case-insensitive), then fuzzily via ``difflib`` — renamed to its
    canonical name, and the frame is reduced to the canonical columns only.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw input whose column names may be aliases or near-misses of the
        canonical names.

    Returns
    -------
    pandas.DataFrame
        A frame containing exactly the canonical columns (original row data,
        renamed headers).

    Raises
    ------
    ValueError
        If any canonical column cannot be located in the input, even fuzzily.
    """
    from difflib import get_close_matches

    # Canonical column -> accepted alias spellings (all compared lowercase).
    column_aliases = {
        "App Tech Stack": ["app tech stack", "technology stack", "application stack"],
        "Operating System": ["os", "operating system", "platform"],
        "DB Details": ["db info", "database", "database information", "db"],
        "Authentication Model": ["auth model", "authentication", "authentication type"],
        "Application Components": ["components", "app components", "application parts"],
        "Licence Renewal": ["license", "license renewal", "renewal"],
    }

    # Invert to alias -> canonical. Also register each canonical name's own
    # lowercase form: the alias lists omit some of them (e.g. "db details",
    # "licence renewal"), so without this a near-miss of the canonical
    # spelling (e.g. "db detail") could not be fuzzy-matched.
    reverse_aliases = {}
    for std_col, aliases in column_aliases.items():
        reverse_aliases[std_col.lower()] = std_col
        for alias in aliases:
            reverse_aliases[alias.lower()] = std_col

    # Build the rename map: exact (case-insensitive) hit first, then a
    # close-match lookup to tolerate typos such as "databse".
    mapping = {}
    for col in input_df.columns:
        col_lower = col.lower()
        if col_lower in reverse_aliases:
            mapping[col] = reverse_aliases[col_lower]
        else:
            match = get_close_matches(col_lower, reverse_aliases.keys(), n=1, cutoff=0.8)
            if match:
                mapping[col] = reverse_aliases[match[0]]

    input_df_renamed = input_df.rename(columns=mapping)
    # Keep only the canonical columns the model was trained on.
    input_df_filtered = input_df_renamed[
        [col for col in input_df_renamed.columns if col in column_aliases]
    ]

    missing_columns = set(column_aliases) - set(input_df_filtered.columns)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    return input_df_filtered
|
|
|
|
|
|
|
|
|
# Read the raw input and normalize it to the training schema.
try:
    new_data = pd.read_csv('input.csv')
except FileNotFoundError:
    print("Error: 'input.csv' not found.")
    # raise SystemExit instead of the interactive-only exit() helper (which
    # site.py may not install under frozen/embedded interpreters), and signal
    # failure with a nonzero status instead of exiting 0 on an error path.
    raise SystemExit(1)

new_data = map_and_prepare_input_data(new_data)
# Bucket missing values into an explicit 'Unknown' category before encoding.
new_data.fillna('Unknown', inplace=True)
|
|
|
|
|
|
|
|
|
# One-hot encode the categorical features, then align to the exact column
# layout the model was trained on; dummies unseen in this input are filled
# with 0, and columns unknown to the model are dropped.
categorical_features = [
    'App Tech Stack', 'Operating System', 'DB Details',
    'Authentication Model', 'Application Components', 'Licence Renewal',
]
encoded_data = (
    pd.get_dummies(new_data, columns=categorical_features)
    .reindex(columns=training_columns, fill_value=0)
)
|
|
|
|
|
|
|
|
|
# Run inference, translate encoded class ids back into strategy names, and
# attach the result to the (un-encoded) input rows.
predicted_labels = le.inverse_transform(model.predict(encoded_data))
new_data['Predicted Modernization Strategy'] = predicted_labels
|
|
|
|
|
|
|
|
|
# Persist the annotated input rows alongside the predicted strategy.
output_path = 'output.csv'
new_data.to_csv(output_path, index=False)
print(f"✅ Predictions saved to '{output_path}'")
|
|
|
|
|
|
|
|
|
# Visualize how the predicted strategies are distributed, using the
# object-oriented matplotlib API rather than the implicit pyplot state.
counts = new_data['Predicted Modernization Strategy'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
counts.plot(kind='bar', ax=ax, color=['skyblue', 'lightgreen', 'salmon', 'plum', 'gold'])
ax.set_title('Distribution of Predicted Modernization Strategies')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()
|
|
|
|
|
|
# Echo the same distribution to the console as one joined message.
print("\n Count of Predicted Modernization Strategies:")
summary_lines = (f"{strategy}: {count}" for strategy, count in counts.items())
print("\n".join(summary_lines))