import sys
from difflib import get_close_matches

import joblib
import matplotlib.pyplot as plt
import pandas as pd

# Load saved model, encoder, and training columns
model = joblib.load('random_forest_model.pkl')
le = joblib.load('label_encoder.pkl')
training_columns = joblib.load('training_columns.pkl')
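# These three artifacts are assumed to come from the training script, roughly:
#   joblib.dump(model, 'random_forest_model.pkl')
#   joblib.dump(le, 'label_encoder.pkl')                        # fitted LabelEncoder
#   joblib.dump(list(X_train.columns), 'training_columns.pkl')  # post-get_dummies columns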

# Mapping helper: rename incoming columns to the standard training names,
# first via the alias table, then via fuzzy matching for near-miss spellings.
def map_and_prepare_input_data(input_df):
    column_aliases = {
        "App Tech Stack": ["app tech stack", "technology stack", "application stack"],
        "Operating System": ["os", "operating system", "platform"],
        "DB Details": ["db info", "database", "database information", "db"],
        "Authentication Model": ["auth model", "authentication", "authentication type"],
        "Application Components": ["components", "app components", "application parts"],
        "Licence Renewal": ["license", "license renewal", "renewal"],
    }

    # Invert the alias table so each lowercase alias points to its standard name.
    # The standard names themselves are included so exact headers also resolve.
    reverse_aliases = {}
    for std_col, aliases in column_aliases.items():
        reverse_aliases[std_col.lower()] = std_col
        for alias in aliases:
            reverse_aliases[alias.lower()] = std_col

    # Resolve each input column: exact alias match first, then a fuzzy fallback.
    mapping = {}
    for col in input_df.columns:
        col_lower = col.lower()
        if col_lower in reverse_aliases:
            mapping[col] = reverse_aliases[col_lower]
        else:
            match = get_close_matches(col_lower, reverse_aliases.keys(), n=1, cutoff=0.8)
            if match:
                mapping[col] = reverse_aliases[match[0]]

    # Keep only the standard columns; anything unmapped is dropped.
    input_df_renamed = input_df.rename(columns=mapping)
    input_df_filtered = input_df_renamed[[col for col in input_df_renamed.columns if col in column_aliases]]

    missing_columns = set(column_aliases) - set(input_df_filtered.columns)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    return input_df_filtered
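
# Example: a header such as 'db info' is renamed to 'DB Details' via the alias
# table, while a near-miss spelling like 'Platfrom' still resolves to
# 'Operating System' through the fuzzy match (cutoff=0.8).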

# Load new input data
try:
    new_data = pd.read_csv('input.csv')
except FileNotFoundError:
    sys.exit("Error: 'input.csv' not found.")

new_data = map_and_prepare_input_data(new_data)
# Treat missing values as their own 'Unknown' category so rows survive encoding.
new_data = new_data.fillna('Unknown')

# One-hot encode and align with training columns
encoded_data = pd.get_dummies(new_data, columns=[
    'App Tech Stack', 'Operating System', 'DB Details',
    'Authentication Model', 'Application Components', 'Licence Renewal'
])
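# Reindex aligns this batch with the training-time dummy columns: categories
# unseen during training are dropped, and training categories absent from this
# batch are filled with 0, so the model always sees the same feature schema.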
encoded_data = encoded_data.reindex(columns=training_columns, fill_value=0)

# Predict
predicted_labels_encoded = model.predict(encoded_data)
predicted_labels = le.inverse_transform(predicted_labels_encoded)
new_data['Predicted Modernization Strategy'] = predicted_labels
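# Optional sketch: a RandomForestClassifier also exposes predict_proba, so a
# per-row confidence could be attached too (guarded, since the pickled
# estimator is only assumed to support it):
#   if hasattr(model, 'predict_proba'):
#       new_data['Prediction Confidence'] = model.predict_proba(encoded_data).max(axis=1)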

# Save to CSV
new_data.to_csv('output.csv', index=False)
print("✅ Predictions saved to 'output.csv'")

# Visualize
counts = new_data['Predicted Modernization Strategy'].value_counts()
plt.figure(figsize=(10, 6))
counts.plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon', 'plum', 'gold'])
plt.title('Distribution of Predicted Modernization Strategies')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
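# Note: plt.show() requires an interactive backend; on a headless machine,
# calling plt.savefig('strategy_distribution.png') before plt.show() would
# persist the chart instead. (Filename here is illustrative.)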

print("\n Count of Predicted Modernization Strategies:")
for strategy, count in counts.items():
    print(f"{strategy}: {count}")