In [1]:
import pandas as pd
import nltk as n
from collections import Counter as ctr
import itertools as i
import numpy as np
In [2]:
# Load the SMS spam dataset. The raw file has an existing header row (skipped)
# and trailing commas that produce three junk columns (c3-c5), dropped below.
# Latin-1 encoding is required: the file is not valid UTF-8.
data = pd.read_csv('spam.csv',skiprows=1, names=['type','text', 'c3', 'c4', 'c5'], delimiter=',', encoding='ISO-8859-1')
In [3]:
# Drop the junk columns produced by trailing commas in the raw CSV.
# Reassignment instead of inplace=True: inplace has no performance benefit,
# prevents method chaining, and makes the cell non-idempotent on re-run.
data = data.drop(columns=['c3', 'c4', 'c5'])
In [4]:
def _tokenize(message):
    """Minimal tokenisation: lowercase, then split on whitespace.

    Punctuation stays attached to tokens (e.g. 'point,'), matching the
    simple bag-of-words model used below.
    """
    return message.lower().split()

data['clean'] = data.text.map(_tokenize)
data[:5]
Out[4]:
type text clean
0 ham Go until jurong point, crazy.. Available only ... [go, until, jurong, point,, crazy.., available...
1 ham Ok lar... Joking wif u oni... [ok, lar..., joking, wif, u, oni...]
2 spam Free entry in 2 a wkly comp to win FA Cup fina... [free, entry, in, 2, a, wkly, comp, to, win, f...
3 ham U dun say so early hor... U c already then say... [u, dun, say, so, early, hor..., u, c, already...
4 ham Nah I don't think he goes to usf, he lives aro... [nah, i, don't, think, he, goes, to, usf,, he,...
In [5]:
# Class balance: heavily skewed towards ham (~87% ham vs ~13% spam).
ctr(data.type)
Out[5]:
Counter({'ham': 4825, 'spam': 747})
In [6]:
# Hold out 10% of the rows for evaluation; everything else is training data.
# random_state pins the split so the notebook reproduces the same accuracy
# under Restart & Run All (the original split changed on every run).
test = data.sample(frac=0.1, random_state=42)
train = data[~data.index.isin(test.index)]

train.shape, test.shape
Out[6]:
((5015, 3), (557, 3))

Prior probability $P(X)$ — how likely each class (ham/spam) is before looking at any words

In [7]:
# Label counts over the training split, used for the class priors.
ham_spam_ctr = ctr(train.type)

def Pa(X=''):
    """Prior P(X): the fraction of training messages whose label is X.

    Unknown labels count as zero occurrences, so they return 0.0.
    """
    return ham_spam_ctr.get(X, 0) / len(train)
In [8]:
# Spot-check the priors; the two values should sum to 1.
Pa(X='ham'), Pa(X='spam')
Out[8]:
(0.8620139581256231, 0.13798604187437688)

Normalization term $P(W)$ — the overall frequency of each word, used as the evidence in Bayes' rule

In [9]:
# Token frequencies across the whole training corpus: the evidence P(W).
words_ctr = ctr([word for row in train.clean for word in row])

# Hoisted out of Pb: the corpus size is loop-invariant, but the original
# recomputed sum(words_ctr.values()) (O(vocabulary)) on every single call.
_total_words = sum(words_ctr.values())

def Pb(W=''):
    """Marginal probability P(W) of token W in the training corpus.

    Tokens never seen in training get a small constant floor (1e-4) so
    downstream divisions by P(W) never hit zero.
    """
    if W not in words_ctr:
        return 0.0001
    return words_ctr[W] / _total_words
In [10]:
# A very common word gets a comparatively large marginal probability.
Pb(W='the')
Out[10]:
0.01532889205096695
In [11]:
# A rarer word scores correspondingly lower.
Pb(W='hello')
Out[11]:
0.00023284392988810556

Likelihood $P(W \mid X)$ — per-class word frequencies, estimated by maximum likelihood with a small floor for unseen words

In [ ]:
# NOTE(review): this cell was never executed (In [ ]) and is superseded by the
# comprehension-based version in the next cell. Fixed here so it at least runs
# on a fresh kernel: the original appended to `words_ham_spam` before that dict
# was ever defined, which raises NameError.
words_ham_spam = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name]
    words_ham_spam[col_name] = []
    for row in sub_df.clean:
        for word in row:
            words_ham_spam[col_name].append(word)
    words_ham_spam[col_name] = ctr(words_ham_spam[col_name])
        
In [23]:
# Per-class token counters: words_ham_spam['ham'] / words_ham_spam['spam'].
words_ham_spam = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name]
    words_ham_spam[col_name] = ctr([word for row in sub_df.clean for word in row])

# Per-class token totals, computed once. The original recomputed
# sum(part.values()) (O(class vocabulary)) on every Pba call.
_class_totals = {label: sum(counts.values()) for label, counts in words_ham_spam.items()}

def Pba(W='', X=''):
    """Likelihood P(W | X): relative frequency of token W within class X.

    Tokens unseen in class X get a small constant floor (1e-6) so the
    posterior product never collapses to exactly zero.
    """
    part = words_ham_spam[X]
    if W not in part:
        return 0.000001
    return part[W] / _class_totals[X]
In [24]:
# Likelihood of 'the' within ham messages.
Pba(W='the', X='ham')
Out[24]:
0.01628086546517932
In [25]:
# Sanity check: within each class, the likelihoods over the observed
# vocabulary should sum to 1 (the 1e-6 floor only applies to unseen
# words, so it does not distort these totals).
for label in words_ham_spam:
    counts = words_ham_spam[label]
    total = sum([Pba(w, label) for w in counts.keys()])
    print(label, round(total, 8))
ham 1.0
spam 1.0
In [26]:
def Pab(X='', W=''):
    """Posterior P(X | W) via Bayes' rule: P(W|X) * P(X) / P(W)."""
    likelihood = Pba(W, X)
    prior = Pa(X)
    evidence = Pb(W)
    return likelihood * prior / evidence
In [27]:
# Posterior check: 'the' is a strong indicator of ham (~0.92).
Pab(X='ham', W='the')
Out[27]:
0.9155477926706843
In [33]:
def Ps(T, X='ham'):
    """Score a tokenised message T for class X as the product of the
    per-token posteriors P(X | w).

    NOTE(review): multiplying posteriors (rather than likelihoods) is a
    non-standard naive-Bayes variant, and np.prod of many small factors
    underflows for long messages — summing logs would be numerically safer.
    """
    token_posteriors = [Pab(X=X, W=word) for word in T]
    return np.prod(token_posteriors)
In [35]:
# Smoke test: the unseen token ('ahoytherebigfella') falls back to the
# smoothing floors instead of zeroing the whole product.
Ps(['the','hello', 'ahoytherebigfella'],'spam')
Out[35]:
1.0572294329884905e-05
In [36]:
# BUG FIX: the arguments were swapped — Ps('ham', x) iterated over the
# characters of the string 'ham' and passed the token LIST as the class
# label, which fails on a fresh kernel (lists are unhashable dict keys).
# It only appeared to work via stale state from an earlier Ps definition
# (note the non-sequential execution counts). Correct order: tokens, label.
test['ham'] = test.clean.map(lambda tokens: Ps(tokens, 'ham'))
In [31]:
# BUG FIX: same swapped-argument bug as the 'ham' scoring cell —
# Ps takes the token list first and the class label second.
test['spam'] = test.clean.map(lambda tokens: Ps(tokens, 'spam'))

test.spam.describe()
Out[31]:
count     5.570000e+02
mean      1.011411e-05
std       2.260772e-04
min      1.706035e-243
25%       5.716325e-36
50%       6.244939e-22
75%       1.231426e-13
max       5.333505e-03
Name: spam, dtype: float64
In [21]:
# Predict the class with the larger score. Comparing the scores directly
# (instead of forming the ratio ham/spam and testing > 1, as before)
# avoids a potential division-by-zero and the temporary reuse of the
# 'winner' column for two different things. Ties still go to 'spam',
# matching the original `x > 1` tie-breaking.
test['winner'] = (test.ham > test.spam).map({True: 'ham', False: 'spam'})
In [22]:
# Accuracy on the held-out split: the fraction of test messages whose
# predicted label matches the true label.
correct = test.winner == test.type

correct.mean()
Out[22]:
0.9461400359066428
In [ ]: