<aside> 💡 We built the AI algorithm with two different tools. The first version was built in Google Colaboratory using Python; the second was built in Orange. The Python algorithm uses three machine learning models: logistic regression, random forest, and Naive Bayes. Each model was evaluated with a confusion matrix and a classification report, shown below. The Orange algorithm uses four machine learning models: the three from the Python version plus a neural network. All four were evaluated with a confusion matrix and ROC analysis.

</aside>
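
Since the Orange workflow was assembled in the GUI, there is no script to reproduce it here. As a rough sketch only, the same four-model evaluation could be written with Orange's scripting API (assuming a recent Orange 3 release and the same dataset.csv; the exact widget settings from the GUI are not captured):

import Orange

# Load the dataset; Orange infers the features and class variable from the file
data = Orange.data.Table("dataset.csv")

# The four learners used in the Orange workflow
learners = [
    Orange.classification.LogisticRegressionLearner(),
    Orange.classification.RandomForestLearner(),
    Orange.classification.NaiveBayesLearner(),
    Orange.classification.NNClassificationLearner(),  # neural network
]

# 10-fold cross-validation, then AUC (the basis of the ROC analysis) and accuracy
cv = Orange.evaluation.CrossValidation(k=10)
results = cv(data, learners)
print(Orange.evaluation.AUC(results))
print(Orange.evaluation.CA(results))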

Python Algorithm

import pandas as pd

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Data processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.utils.class_weight import compute_sample_weight

# Feature selection
from sklearn.feature_selection import SelectKBest, chi2

# Encoders and model eval libraries
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

somedia = pd.read_csv("dataset.csv")
print(somedia.head())
print(len(somedia))

# Drop rows with missing/empty values
somedia = somedia.dropna()
print(len(somedia))

## Determine Class Imbalance
print(somedia['LD'].value_counts())
print(somedia['PT'].value_counts())
print(somedia['FC'].value_counts())
print(len(somedia))

# Remove LD categories with fewer than 5 rows
ld_vcts = somedia['LD'].value_counts()
to_remove = ld_vcts[ld_vcts < 5].index
somedia = somedia[~somedia['LD'].isin(to_remove)]
print(len(somedia))

## Split the x and y data
# Features = x
# Label = y

X = somedia.drop(['Do'], axis=1)
y = somedia['Do']
print(X.head())

# Encoding the data
oe = OrdinalEncoder()
oe.fit(X)
X_enc = oe.transform(X)
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)

## Feature extraction
selector = SelectKBest(chi2, k=5) #get the best 5 features
newX = selector.fit_transform(X_enc, y_enc)
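
# Note: chi2 scoring requires non-negative feature values; that holds here
# because OrdinalEncoder maps each category to an integer starting at 0.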

cols = selector.get_support(indices=True)
print(cols)
print(X.iloc[:, cols])

## Split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(newX, y_enc, test_size=0.3, random_state=15)
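
# Given the class imbalance noted above, an optional variant (not used in the
# original run) is to stratify the split so both sets keep the class proportions:
#   X_train, X_test, y_train, y_test = train_test_split(
#       newX, y_enc, test_size=0.3, random_state=15, stratify=y_enc)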

## Logistic Regression
logreg = LogisticRegression(solver='lbfgs', class_weight = 'balanced', max_iter=4000)

logreg.fit(X_train, y_train)

y_pred_log = logreg.predict(X_test)
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log, target_names=['Not Do', 'Do']))
scv = StratifiedKFold(n_splits=3)
crosspred = cross_val_predict(logreg, newX, y_enc, cv=scv)
print(confusion_matrix(y_enc, crosspred))
print(classification_report(y_enc, crosspred, target_names=['Not Do', 'Do']))

## Random Forest
randfor = RandomForestClassifier(n_estimators = 10, class_weight='balanced', verbose=3)
randfor.fit(X_train, y_train)
y_pred_rf = randfor.predict(X_test)
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=['Not Do', 'Do']))
scv = StratifiedKFold(n_splits=10)
crosspred = cross_val_predict(randfor, newX, y_enc, cv=scv)
print(confusion_matrix(y_enc, crosspred))
print(classification_report(y_enc, crosspred, target_names=['Not Do', 'Do']))

## Naive Bayes
nb = GaussianNB()
sweight = compute_sample_weight(class_weight='balanced', y=y_train)
nb.fit(X_train, y_train, sample_weight=sweight)
y_pred_nb = nb.predict(X_test)
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=['Not Do', 'Do']))

scv = StratifiedKFold(n_splits = 10)
crosspred = cross_val_predict(nb, newX, y_enc, cv=scv)
print(confusion_matrix(y_enc, crosspred))
print(classification_report(y_enc, crosspred, target_names=['Not Do', 'Do']))

Evaluation

Logistic Regression

Figure: confusion matrix and classification report for logistic regression (https://s3-us-west-2.amazonaws.com/secure.notion-static.com/d46e7619-524b-4c3f-8755-839e62583615/Untitled.png)

Random Forest

Figure: confusion matrix and classification report for random forest (https://s3-us-west-2.amazonaws.com/secure.notion-static.com/1ceaef20-df2a-4fd5-b91b-28815cffad1b/Untitled.png)

Naive Bayes
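
The screenshots above were exported from the Colab notebook. If they do not load, equivalent plots can be regenerated from the fitted models (a minimal sketch using sklearn's ConfusionMatrixDisplay, available from scikit-learn 1.0; it assumes the variables defined in the Python script above):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Re-plot the held-out confusion matrix for each fitted model;
# the label order is assumed to match the target_names used above
for name, model in [("Logistic Regression", logreg),
                    ("Random Forest", randfor),
                    ("Naive Bayes", nb)]:
    ConfusionMatrixDisplay.from_estimator(
        model, X_test, y_test, display_labels=["Not Do", "Do"])
    plt.title(name)
    plt.show()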