Run the following cell to make sure you can import all of the libraries. Then move on to Part 1.
import pandas as pd
from collections import *
import numpy as np
import sklearn
from sklearn import datasets
from collections import Counter as ctr
from sklearn.model_selection import train_test_split
from math import *
%matplotlib inline
# Load the iris dataset: X holds the four measurements per flower, y the class label.
iris = sklearn.datasets.load_iris()
X, y = iris.data, iris.target
X[:5], y[:5]
list(iris.target_names)  # classes 0, 1, 2
# Assemble the features into a DataFrame and attach the label as a column.
cols = ['f1', 'f2', 'f3', 'f4']  # we have four features to deal with
data = pd.DataFrame(X, columns=cols)
data['y'] = y  # add the prediction label as a column
# Hold out half the rows for evaluation (random split, no fixed seed).
train, test = train_test_split(data, test_size=0.5)
train
# Estimate per-class Gaussian parameters for P(F|C): for every class label and
# every feature, record (mean, std) of that feature within the class.
params_fc = {}
for y_val, class_rows in train.groupby('y'):
    for f in cols:
        params_fc['{}-{}'.format(y_val, f)] = (class_rows[f].mean(), class_rows[f].std())
# Estimate Gaussian parameters for the marginal P(F) over the whole training set.
params_f = {f: (train[f].mean(), train[f].std()) for f in cols}
# show the mean/std values for all the combinations
params_fc, params_f
def gaussian(x, mu, sig):
    """Gaussian pdf with mean `mu` and std `sig`, evaluated at `x`.

    Works element-wise on NumPy arrays as well as on scalars.
    """
    z = (x - mu) / sig
    return np.exp(-0.5 * np.power(z, 2.0)) / (sig * sqrt(2.0 * pi))
# P(F|C): Gaussian likelihood of feature value F for feature `feat` given class C,
# using the per-class (mean, std) estimated in params_fc.
def Pfc(feat='f1', F='', C=''):
    key = '{}-{}'.format(C, feat)
    mu, sigma = params_fc[key]
    return gaussian(F, mu, sigma)
# P(F): Gaussian marginal likelihood of feature value F for feature `feat`,
# using the whole-training-set (mean, std) estimated in params_f.
def Pf(feat='f1', F=''):
    mu, sigma = params_f[feat]
    return gaussian(F, mu, sigma)
# P(C): uniform class prior — the iris data has 50 rows of each of the
# three classes, so every class has probability 50/150 = 1/3.
def Pc(C=''):
    return 1 / 3
# Bayes' rule for a single feature: P(C|F) = P(F|C) * P(C) / P(F)
def Pcf(feat='f1', C='', F=''):
    likelihood = Pfc(feat=feat, F=F, C=C)
    prior = Pc(C)
    evidence = Pf(feat=feat, F=F)
    return likelihood * prior / evidence
# Naive Bayes scoring: for each class, compute P(C|f_i) per feature and
# multiply them together, storing the product in a column named after the class.
for y_val in set(test.y):
    per_feature = [
        test[f].map(lambda x, c=y_val, feat=f: Pcf(feat=feat, C=c, F=x))
        for f in cols
    ]
    score = per_feature[0]
    for p in per_feature[1:]:
        score = score * p
    test[str(y_val)] = score
# The predicted class is the column with the highest score.
test['guess'] = test[['0', '1', '2']].idxmax(axis=1)
test
# Fraction of rows where the guess matches the actual class label.
len(test[test.y.map(str) == test.guess]) / len(test)
# Convert the train/test feature frames to NumPy arrays for sklearn.
# NOTE: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
# 0.25 — use to_numpy() instead.
Xtrain = train[cols].to_numpy()
ytrain = train.y
Xtest = test[cols].to_numpy()
ytest = test.y
# Fit sklearn's Gaussian Naive Bayes on the training split and score it on
# the held-out test split.
from sklearn.naive_bayes import GaussianNB
# BUG FIX: `import sklearn` alone does not load the `sklearn.metrics`
# submodule, so `sklearn.metrics.accuracy_score` raises AttributeError.
# Import the function explicitly instead.
from sklearn.metrics import accuracy_score

classifier = GaussianNB()
classifier.fit(Xtrain, ytrain)
preds = classifier.predict(Xtest)
accuracy_score(ytest, preds)