# AutomaticDatavisualization / classification.py
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import streamlit as st
import os
import plotly.express as px
def preprocess(dataset, x_iloc_list, y_iloc, testSize):
    """Split *dataset* into standardized train/test feature and target arrays.

    Parameters
    ----------
    dataset : pandas.DataFrame to slice features/target from.
    x_iloc_list : positional column indices used as features.
    y_iloc : positional column index of the target.
    testSize : fraction of rows held out for the test split.

    Returns
    -------
    (X_train, X_test, y_train, y_test) with features scaled to zero mean /
    unit variance (scaler fitted on the training split only).
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    features = dataset.iloc[:, x_iloc_list].values
    target = dataset.iloc[:, y_iloc].values

    # random_state fixed so repeated runs produce the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=testSize, random_state=0
    )

    # Fit the scaler on training data only, then apply to both splits.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
class classification:
    """Train and evaluate scikit-learn classifiers, reporting via Streamlit.

    Each model method (LR, KNN, SVM, NB, DT, RF) fits a classifier on the
    stored training split, persists it with joblib, and renders a
    classification report, confusion matrix, accuracy, and a per-class
    metrics heatmap in the Streamlit app.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        # Pre-split (and pre-scaled — see preprocess()) feature/target arrays.
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def accuracy(self, confusion_matrix):
        """Return overall accuracy from a 2-D confusion-matrix array.

        Accuracy is the sum of the diagonal (correct predictions) divided
        by the total number of samples.
        """
        # Diagonal = correctly classified counts; avoids the previous loop
        # that shadowed the builtin `sum`.
        return np.trace(confusion_matrix) / np.sum(confusion_matrix)

    def classification_report_plot(self, clf_report):
        """Render the per-class rows of a classification_report dict as a heatmap.

        The last DataFrame row (the 'support' row after transposition of the
        report dict) is dropped, matching the original layout.
        """
        fig = px.imshow(pd.DataFrame(clf_report).iloc[:-1, :].T)
        st.plotly_chart(fig)

    def _report(self, title, y_pred):
        # Shared Streamlit reporting for a fitted model's predictions
        # (previously copy-pasted into every model method).
        rule = "-" * (len(title) + 8)
        st.write("\n")
        st.write(rule)
        st.write("### " + title + " ###")
        st.write(rule)
        st.write('Classification Report: ')
        clf = classification_report(self.y_test, y_pred, output_dict=True)
        st.table(pd.DataFrame(clf))
        st.write('Confusion Matrix: ')
        cm = confusion_matrix(self.y_test, y_pred)
        st.table(pd.DataFrame(cm))
        # BUG FIX: this value is accuracy, but was labeled 'Precision'.
        st.write('Accuracy: ', self.accuracy(cm) * 100, '%')
        self.classification_report_plot(clf)

    def LR(self):
        """Fit, persist, and report a logistic-regression classifier."""
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression()
        model.fit(self.X_train, self.y_train)
        joblib.dump(model, "model/lr.sav")
        # BUG FIX: header previously said "Random Forest Classifier".
        self._report("Logistic Regression Classifier",
                     model.predict(self.X_test))

    def KNN(self):
        """Fit, persist, and report a k-nearest-neighbors classifier."""
        from sklearn.neighbors import KNeighborsClassifier
        model = KNeighborsClassifier()
        model.fit(self.X_train, self.y_train)
        joblib.dump(model, "model/knn.sav")
        self._report("K-Neighbors Classifier", model.predict(self.X_test))

    # kernel type could be 'linear' or 'rbf' (Gaussian)
    def SVM(self, kernel_type):
        """Fit, persist, and report a support-vector classifier.

        `kernel_type` is passed straight to SVC (e.g. 'linear' or 'rbf').
        """
        from sklearn.svm import SVC
        model = SVC(kernel=kernel_type)
        model.fit(self.X_train, self.y_train)
        joblib.dump(model, "model/svm.sav")
        self._report("Support Vector Classifier (" + kernel_type + ")",
                     model.predict(self.X_test))

    def NB(self):
        """Fit, persist, and report a Gaussian naive-Bayes classifier."""
        from sklearn.naive_bayes import GaussianNB
        model = GaussianNB()
        model.fit(self.X_train, self.y_train)
        joblib.dump(model, "model/nb.sav")
        self._report("Naive Bayes Classifier", model.predict(self.X_test))

    def DT(self):
        """Fit, persist, and report a decision-tree classifier."""
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier()
        model.fit(self.X_train, self.y_train)
        joblib.dump(model, "model/tree.sav")
        self._report("Decision Tree Classifier", model.predict(self.X_test))

    def RF(self):
        """Fit, persist, and report a random-forest classifier."""
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=10, criterion='entropy')
        model.fit(self.X_train, self.y_train)
        # Note: kept the original (inconsistent) .pkl path for compatibility.
        joblib.dump(model, "model/rf-model.pkl")
        self._report("Random Forest Classifier", model.predict(self.X_test))
# primary App interfacing function for classification
def st_classification():
    """Streamlit page: pick dataset columns and a model, then train/evaluate.

    Reads the previously uploaded dataset from temp_data/test.csv. Among the
    selected columns, the last one is treated as the target and all others
    as features (see the iloc indices below). If no usable selection has
    been made yet, a sidebar warning is shown instead of a traceback.
    """
    df = pd.read_csv("temp_data/test.csv")

    # One sidebar checkbox per column; checked columns form the working frame.
    st.sidebar.write("Select Column Names from the Dataset")
    col_names = [name for name in df.columns if st.sidebar.checkbox(name)]

    # Selecting an (always valid) subset of existing columns cannot raise,
    # so the previous bare `except: pass` around this was removed.
    df = df[col_names]
    st.write(df)

    try:
        # Last selected column is the target; everything before it is a feature.
        x_iloc_list = list(range(0, len(df.columns) - 1))
        y_iloc = len(df.columns) - 1
        test_size = st.sidebar.slider(
            "Enter Test Data Size (default 0.2)", 0.0, 0.4, 0.2, 0.1)
        X_train, X_test, y_train, y_test = preprocess(
            df, x_iloc_list, y_iloc, test_size)

        model = st.sidebar.selectbox(
            'Choose Model', ["LR", "KNN", "SVM", "NB", "DT", "RF"])
        classifier = classification(X_train, X_test, y_train, y_test)

        # Single dispatch replaces six copy-pasted try/except branches.
        try:
            if model == "SVM":
                # BUG FIX: label previously said 'Select Feature Selection
                # Method' although this chooses the SVM kernel.
                kernel_choice = st.sidebar.selectbox(
                    'Select SVM Kernel', ["linear", "rbf"])
                classifier.SVM(kernel_choice)
            else:
                # Model names map 1:1 to classification method names.
                getattr(classifier, model)()
        except Exception as e:
            # Surface training/reporting errors in the app instead of crashing.
            st.write(e)
    except Exception:
        # Typically: empty/insufficient column selection makes preprocess fail.
        st.warning('Consider selecting the columns in the left bar for classification', icon="⚠️")