Continuous Naive Bayes Classifier

Run the following cell to make sure you can import all of the libraries. Then move on to Part 1.

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from collections import Counter   # for quick class counts
from math import sqrt, pi         # used by the gaussian density below
%matplotlib inline

First, load the iris data

In [3]:
iris = sklearn.datasets.load_iris()
X = iris.data
y = iris.target

X[:5], y[:5]
Out[3]:
(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]), array([0, 0, 0, 0, 0]))
In [24]:
list(iris.target_names) # classes 0, 1, 2 
Out[24]:
['setosa', 'versicolor', 'virginica']
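
As a quick sanity check (not run as a cell here), the three classes are balanced, which the uniform prior used later relies on:

Counter(y)  # Counter({0: 50, 1: 50, 2: 50})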

Split the data

In [26]:
# put the iris data into a dataframe
cols = ['f1', 'f2', 'f3', 'f4'] # we have four features to deal with
data = pd.DataFrame(X,columns=cols)
data['y'] = y # add the prediction label as a column
# split into train/test data
train, test = train_test_split(data, test_size=0.5)
test = test.copy()  # take a real copy so adding columns later doesn't trigger SettingWithCopyWarning

train
Out[26]:
      f1   f2   f3   f4  y
16   5.4  3.9  1.3  0.4  0
147  6.5  3.0  5.2  2.0  2
68   6.2  2.2  4.5  1.5  1
148  6.2  3.4  5.4  2.3  2
58   6.6  2.9  4.6  1.3  1
..   ...  ...  ...  ... ..
78   6.0  2.9  4.5  1.5  1
39   5.1  3.4  1.5  0.2  0
67   5.8  2.7  4.1  1.0  1
109  7.2  3.6  6.1  2.5  2
54   6.5  2.8  4.6  1.5  1

75 rows × 5 columns
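
Note that train_test_split shuffles but does not stratify by default, so the class counts in each half can be uneven. If you want each class represented proportionally in both halves, a variant (not used in the run above) is:

train, test = train_test_split(data, test_size=0.5, stratify=data.y, random_state=0)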

Training

In [27]:
# estimate Gaussian parameters (mean, std) of P(F|C) for each class/feature pair
# note: pandas .std() is the sample standard deviation (ddof=1)
params_fc = {}
for y_val in set(train.y):
    sub_frame = train[train.y == y_val]
    for f in cols:
        params_fc['{}-{}'.format(y_val, f)] = (sub_frame[f].mean(), sub_frame[f].std())

# estimate gaussian parameters for P(F)
params_f = {}
for f in cols:
    params_f[f] = (train[f].mean(), train[f].std())
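
Equivalently, pandas groupby can produce the same class-conditional means and standard deviations in one shot; this is just a compact alternative to the first loop above:

train.groupby('y')[cols].agg(['mean', 'std'])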
In [28]:
# show the mean/std values for all the combinations
params_fc, params_f
Out[28]:
({'0-f1': (5.031818181818183, 0.3257027035230919),
  '0-f2': (3.4681818181818187, 0.321286644952927),
  '0-f3': (1.4499999999999997, 0.16547190813232435),
  '0-f4': (0.25454545454545463, 0.13354960814430655),
  '1-f1': (6.0, 0.5311712126450916),
  '1-f2': (2.717241379310345, 0.34441354452920747),
  '1-f3': (4.362068965517241, 0.4329415938037372),
  '1-f4': (1.337931034482759, 0.20073755628298123),
  '2-f1': (6.558333333333334, 0.5897063872711621),
  '2-f2': (3.033333333333333, 0.348495732201011),
  '2-f3': (5.508333333333333, 0.5648521673982431),
  '2-f4': (2.1041666666666665, 0.2851074041200227)},
 {'f1': (5.894666666666666, 0.7832526314572358),
  'f2': (3.0386666666666673, 0.4552674691524405),
  'f3': (3.8746666666666676, 1.6983372896894036),
  'f4': (1.2653333333333334, 0.761309919016423)})
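
Each likelihood P(F|C), and each evidence term P(F), is modeled as a normal density using the means and standard deviations estimated above:

p(x | mu, sigma) = 1 / (sigma * sqrt(2 * pi)) * exp(-(x - mu)^2 / (2 * sigma^2))

The gaussian helper in the next cell implements this directly.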
In [29]:
def gaussian(x, mu, sig):
    return 1. / (sqrt(2. * pi) * sig) * np.exp(-np.power((x - mu) / sig, 2.) / 2)

# P(F|C)
def Pfc(feat='f1', F=None, C=None):
    mu, sigma = params_fc['{}-{}'.format(C, feat)]
    return gaussian(F, mu, sigma)

# P(F)
def Pf(feat='f1', F=None):
    mu, sigma = params_f[feat]
    return gaussian(F, mu, sigma)

# P(C) -- the full dataset has 50 flowers of each class, so the prior is 50/150 = 1/3
def Pc(C=None):
    return 1.0 / 3.0

# P(C|F) = P(F|C) * P(C) / P(F)
def Pcf(feat='f1', C=None, F=None):
    return Pfc(feat, F, C) * Pc(C) / Pf(feat, F)
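
Before vectorizing over the whole test frame below, here is a minimal per-row sketch of the same decision rule (the classify helper is illustrative, not one of the notebook's cells):

def classify(row):
    # product of the four per-feature posteriors for each class; argmax wins
    scores = {c: np.prod([Pcf(feat=f, C=c, F=row[f]) for f in cols]) for c in (0, 1, 2)}
    return max(scores, key=scores.get)

classify(test.iloc[0])  # returns a class label: 0, 1, or 2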

Testing

In [30]:
# call P(C|F) on each of the four features and multiply the results together.
# with a uniform prior this ranks the classes the same way the true naive Bayes
# posterior does, but the scores are products of density ratios, not
# probabilities -- which is why some values below exceed 1.

# note: a \ (i.e., a backslash) at the end of a line in python continues the current line of code
for y_val in set(test.y):
    test[str(y_val)] = test.f1.map(lambda x: Pcf(feat='f1', C=y_val, F=x)) *\
    test.f2.map(lambda x: Pcf(feat='f2', C=y_val, F=x)) *\
    test.f3.map(lambda x: Pcf(feat='f3', C=y_val, F=x)) *\
    test.f4.map(lambda x: Pcf(feat='f4', C=y_val, F=x))
    
test['guess'] = test[['0','1','2']].idxmax(axis=1) # take the argmax class label

test
Out[30]:
      f1   f2   f3   f4  y              0             1             2  guess
38   4.4  3.0  1.3  0.2  0   4.119695e+00  2.058186e-19  2.004069e-24      0
55   5.7  2.8  4.5  1.3  1   3.628499e-89  3.509757e-01  2.299131e-04      1
139  6.9  3.1  5.4  2.1  2  3.615726e-172  1.209278e-05  8.799803e-01      2
95   5.7  3.0  4.2  1.2  1   1.522507e-72  1.697706e-01  2.812663e-05      1
144  6.7  3.3  5.7  2.5  2  1.333055e-209  2.124410e-10  5.961066e-01      2
..   ...  ...  ...  ... ..            ...           ...           ...    ...
121  5.6  2.8  4.9  2.0  2  4.748145e-133  1.237729e-03  4.524714e-02      2
1    4.9  3.0  1.4  0.2  0   1.192546e+01  3.730925e-18  3.810489e-23      0
57   4.9  2.4  3.3  1.0  1   7.515879e-36  2.213083e-03  1.132130e-09      1
116  6.5  3.0  5.5  1.8  2  6.226553e-164  1.003889e-03  2.638955e-01      2
35   5.0  3.2  1.2  0.2  0   9.556611e+00  1.176437e-19  4.215638e-24      0

75 rows × 9 columns

Testing accuracy

In [33]:
len(test[test.y.map(str) == test.guess]) / len(test)  # fraction of rows where the guess matches the actual class label
Out[33]:
0.96
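
For a per-class view beyond the single accuracy number, scikit-learn's confusion matrix works directly on these columns (an extra check, not part of the original run):

from sklearn.metrics import confusion_matrix
confusion_matrix(test.y, test.guess.astype(int))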

Check against the scikit classifier

In [34]:
Xtrain = train[cols].values  # .values replaces the deprecated .as_matrix()
ytrain = train.y
Xtest  = test[cols].values
ytest  = test.y
In [35]:
# import and instantiate the classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
classifier = GaussianNB()
In [36]:
classifier.fit(Xtrain, ytrain)
preds = classifier.predict(Xtest)
accuracy_score(ytest, preds)
Out[36]:
0.96
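
The scikit-learn accuracy matches the hand-rolled classifier here. Small disagreements are still possible in general, since pandas .std() uses the sample estimate (ddof=1) while GaussianNB fits maximum-likelihood variances (ddof=0). To compare the two models row by row (a quick check, not in the original run):

(preds == test.guess.astype(int).values).mean()  # fraction of rows where both models agree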