"""Streamlit helpers that train, persist and report on scikit-learn classifiers."""

import os

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.metrics import classification_report, confusion_matrix


def preprocess(dataset, x_iloc_list, y_iloc, testSize):
    """Split *dataset* into standardized train/test feature arrays and labels.

    Args:
        dataset: DataFrame holding both the feature columns and the label column.
        x_iloc_list: Positional indices of the feature columns.
        y_iloc: Positional index of the label column.
        testSize: Value forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``; the feature arrays are
        standardized, the label arrays are returned unchanged.
    """
    # Imported lazily, as in the rest of this module, so that scikit-learn
    # sub-packages are only loaded when they are actually needed.
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X = dataset.iloc[:, x_iloc_list].values
    y = dataset.iloc[:, y_iloc].values

    # Fixed random_state keeps the split reproducible across reruns.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=testSize, random_state=0
    )

    # Fit the scaler on the training data only, then apply it to the test data,
    # so that no test-set statistics leak into training.
    # NOTE(review): the fitted scaler is discarded here, so models saved by
    # `classification` cannot reproduce this scaling at inference time.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    return X_train, X_test, y_train, y_test


class classification:
    """Train a classifier on a fixed split, save it and render a Streamlit report.

    Each public method (``LR``, ``KNN``, ``SVM``, ``NB``, ``DT``, ``RF``) fits one
    scikit-learn estimator on the training split, dumps it with ``joblib`` and
    writes the classification report, confusion matrix, accuracy and a heatmap
    of the report to the Streamlit page.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        """Store the train/test split used by every model method."""
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def accuracy(self, confusion_matrix):
        """Return the fraction of samples on the diagonal of *confusion_matrix*.

        Args:
            confusion_matrix: 2-D array-like of prediction counts.

        Returns:
            ``trace / total`` as a float, or ``0.0`` when the matrix is empty or
            sums to zero (instead of dividing by zero).
        """
        matrix = np.asarray(confusion_matrix)
        total = matrix.sum()
        if total == 0:
            return 0.0
        return np.trace(matrix) / total

    def classification_report_plot(self, clf_report):
        """Render the classification report dict as a Plotly heatmap.

        The last row of the report frame is dropped before plotting
        (presumably the ``support`` row, whose scale differs from the
        0-1 metrics) and the frame is transposed so each class is a row.
        """
        report_frame = pd.DataFrame(clf_report).iloc[:-1, :].T
        st.plotly_chart(px.imshow(report_frame))

    def _train_and_report(self, model, model_path, title, rule_width):
        """Fit *model*, persist it to *model_path* and write its report.

        Args:
            model: Unfitted scikit-learn style estimator (``fit``/``predict``).
            model_path: Destination file for ``joblib.dump``.
            title: Heading text written between the two divider lines.
            rule_width: Number of ``-`` characters in each divider line.
        """
        model.fit(self.X_train, self.y_train)

        # Make sure the target directory exists; joblib.dump does not create it.
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        joblib.dump(model, model_path)

        y_pred = model.predict(self.X_test)

        rule = "-" * rule_width
        st.write("\n")
        st.write(rule)
        st.write(title)
        st.write(rule)

        st.write('Classification Report: ')
        report = classification_report(self.y_test, y_pred, output_dict=True)
        st.table(pd.DataFrame(report))

        # Compute the confusion matrix once and reuse it for table and accuracy.
        matrix = confusion_matrix(self.y_test, y_pred)
        st.write('Confusion Matrix: ')
        st.table(pd.DataFrame(matrix))
        st.write('Accuracy: ', self.accuracy(matrix) * 100, '%')

        self.classification_report_plot(report)

    def LR(self):
        """Train, save (``model/lr.sav``) and report a logistic regression model."""
        from sklearn.linear_model import LogisticRegression

        self._train_and_report(
            LogisticRegression(),
            "model/lr.sav",
            "### Logistic Regression Classifier ###",
            38,
        )

    def KNN(self):
        """Train, save (``model/knn.sav``) and report a k-neighbors classifier."""
        from sklearn.neighbors import KNeighborsClassifier

        self._train_and_report(
            KNeighborsClassifier(),
            "model/knn.sav",
            "### K-Neighbors Classifier ###",
            31,
        )

    def SVM(self, kernel_type):
        """Train, save (``model/svm.sav``) and report a support vector classifier.

        Args:
            kernel_type: Kernel name passed to ``SVC``; could be ``'linear'`` or
                ``'rbf'`` (Gaussian).
        """
        from sklearn.svm import SVC

        self._train_and_report(
            SVC(kernel=kernel_type),
            "model/svm.sav",
            "### Support Vector Classifier (" + kernel_type + ") ###",
            38,
        )

    def NB(self):
        """Train, save (``model/nb.sav``) and report a Gaussian naive Bayes model."""
        from sklearn.naive_bayes import GaussianNB

        self._train_and_report(
            GaussianNB(),
            "model/nb.sav",
            "### Naive Bayes Classifier ###",
            30,
        )

    def DT(self):
        """Train, save (``model/tree.sav``) and report a decision tree classifier."""
        from sklearn.tree import DecisionTreeClassifier

        self._train_and_report(
            DecisionTreeClassifier(),
            "model/tree.sav",
            "### Decision Tree Classifier ###",
            32,
        )

    def RF(self):
        """Train, save (``model/rf-model.pkl``) and report a random forest."""
        from sklearn.ensemble import RandomForestClassifier

        self._train_and_report(
            RandomForestClassifier(n_estimators=10, criterion='entropy'),
            "model/rf-model.pkl",
            "### Random Forest Classifier ###",
            32,
        )


# primary App interfacing function for classification
def st_classification():
    """Drive the classification page: pick columns, split data, run one model.

    Reads ``temp_data/test.csv``, lets the user tick the columns to keep in the
    sidebar (the last selected column is used as the label, all earlier ones as
    features), then trains and reports the model chosen in the sidebar.
    """
    df = pd.read_csv("temp_data/test.csv")

    # select features/columns
    col_names = []
    st.sidebar.write("Select Column Names from the Dataset")
    for col_name in df.columns:
        if st.sidebar.checkbox(col_name):
            col_names.append(col_name)

    try:
        df = df[col_names]
        st.write(df)
    except Exception as e:
        # Showing the preview is best-effort; report the problem, don't hide it.
        st.write(e)

    test_size = st.sidebar.slider(
        "Enter Test Data Size (default 0.2)", 0.0, 0.4, 0.2, 0.1
    )

    # At least one feature column plus one label column is required.
    if len(col_names) < 2:
        st.warning(
            'Consider selecting the columns in the left bar for classification',
            icon="⚠️",
        )
        return

    # The slider allows 0.0, which train_test_split rejects for a float size.
    if test_size <= 0:
        st.warning('Test data size must be greater than 0', icon="⚠️")
        return

    label_index = len(df.columns) - 1
    x_iloc_list = list(range(label_index))
    y_iloc = label_index

    try:
        X_train, X_test, y_train, y_test = preprocess(
            df, x_iloc_list, y_iloc, test_size
        )
    except Exception as e:
        # Top-level UI boundary: surface the real cause (e.g. non-numeric data).
        st.error(f"Could not preprocess the selected columns: {e}")
        return

    model = st.sidebar.selectbox(
        'Choose Model', ["LR", "KNN", "SVM", "NB", "DT", "RF"]
    )
    classifier = classification(X_train, X_test, y_train, y_test)

    if model == "SVM":
        kernel_choice = st.sidebar.selectbox(
            'Select Kernel Type', ["linear", "rbf"]
        )

        def run_model():
            classifier.SVM(kernel_choice)
    else:
        run_model = {
            "LR": classifier.LR,
            "KNN": classifier.KNN,
            "NB": classifier.NB,
            "DT": classifier.DT,
            "RF": classifier.RF,
        }[model]

    try:
        run_model()
    except Exception as e:
        st.write(e)