In [1]:
import pandas as pd
import nltk as n
from collections import Counter as ctr
import itertools as i
import numpy as np
In [2]:
# Load the SMS spam dataset. The raw file has an existing header row (skipped)
# and trailing commas that produce three junk columns (c3-c5), dropped below.
# Latin-1 encoding is required: the file is not valid UTF-8.
data = pd.read_csv('spam.csv',skiprows=1, names=['type','text', 'c3', 'c4', 'c5'], delimiter=',', encoding='ISO-8859-1')
In [3]:
# Drop the junk columns produced by trailing commas in the raw CSV.
# Reassignment instead of inplace=True: inplace has no performance benefit,
# prevents method chaining, and makes the cell non-idempotent on re-run.
data = data.drop(columns=['c3', 'c4', 'c5'])
In [4]:
def _tokenize(message):
    """Minimal tokenisation: lowercase, then split on whitespace.

    Punctuation stays attached to tokens (e.g. 'point,'), matching the
    simple bag-of-words model used below.
    """
    return message.lower().split()

data['clean'] = data.text.map(_tokenize)
data[:5]
Out[4]:
type text clean
0 ham Go until jurong point, crazy.. Available only ... [go, until, jurong, point,, crazy.., available...
1 ham Ok lar... Joking wif u oni... [ok, lar..., joking, wif, u, oni...]
2 spam Free entry in 2 a wkly comp to win FA Cup fina... [free, entry, in, 2, a, wkly, comp, to, win, f...
3 ham U dun say so early hor... U c already then say... [u, dun, say, so, early, hor..., u, c, already...
4 ham Nah I don't think he goes to usf, he lives aro... [nah, i, don't, think, he, goes, to, usf,, he,...
In [5]:
# Class balance: heavily skewed towards ham (~87% ham vs ~13% spam).
ctr(data.type)
Out[5]:
Counter({'ham': 4825, 'spam': 747})
In [6]:
# Hold out 10% of the rows for evaluation; everything else is training data.
# random_state pins the split so the notebook reproduces the same accuracy
# under Restart & Run All (the original split changed on every run).
test = data.sample(frac=0.1, random_state=42)
train = data[~data.index.isin(test.index)]

train.shape, test.shape
Out[6]:
((5015, 3), (557, 3))

Prior probability $P(X)$ — how likely each class (ham/spam) is before looking at any words

In [7]:
# Label counts over the training split, used for the class priors.
ham_spam_ctr = ctr(train.type)

def Pa(X=''):
    """Prior P(X): the fraction of training messages whose label is X.

    Unknown labels count as zero occurrences, so they return 0.0.
    """
    return ham_spam_ctr.get(X, 0) / len(train)
In [8]:
# Spot-check the priors; the two values should sum to 1.
Pa(X='ham'), Pa(X='spam')
Out[8]:
(0.8620139581256231, 0.13798604187437688)

Normalization term $P(W)$ — the overall frequency of each word, used as the evidence in Bayes' rule

In [9]:
# Token frequencies across the whole training corpus: the evidence P(W).
words_ctr = ctr([word for row in train.clean for word in row])

# Hoisted out of Pb: the corpus size is loop-invariant, but the original
# recomputed sum(words_ctr.values()) (O(vocabulary)) on every single call.
_total_words = sum(words_ctr.values())

def Pb(W=''):
    """Marginal probability P(W) of token W in the training corpus.

    Tokens never seen in training get a small constant floor (1e-4) so
    downstream divisions by P(W) never hit zero.
    """
    if W not in words_ctr:
        return 0.0001
    return words_ctr[W] / _total_words
In [10]:
# A very common word gets a comparatively large marginal probability.
Pb(W='the')
Out[10]:
0.01532889205096695
In [11]:
# A rarer word scores correspondingly lower.
Pb(W='hello')
Out[11]:
0.00023284392988810556

Likelihood $P(W \mid X)$ — per-class word frequencies, estimated by maximum likelihood with a small floor for unseen words

In [ ]:
# NOTE(review): this cell was never executed (In [ ]) and is superseded by the
# comprehension-based version in the next cell. Fixed here so it at least runs
# on a fresh kernel: the original appended to `words_ham_spam` before that dict
# was ever defined, which raises NameError.
words_ham_spam = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name]
    words_ham_spam[col_name] = []
    for row in sub_df.clean:
        for word in row:
            words_ham_spam[col_name].append(word)
    words_ham_spam[col_name] = ctr(words_ham_spam[col_name])
        
In [23]:
# Per-class token counters: words_ham_spam['ham'] / words_ham_spam['spam'].
words_ham_spam = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name]
    words_ham_spam[col_name] = ctr([word for row in sub_df.clean for word in row])

# Per-class token totals, computed once. The original recomputed
# sum(part.values()) (O(class vocabulary)) on every Pba call.
_class_totals = {label: sum(counts.values()) for label, counts in words_ham_spam.items()}

def Pba(W='', X=''):
    """Likelihood P(W | X): relative frequency of token W within class X.

    Tokens unseen in class X get a small constant floor (1e-6) so the
    posterior product never collapses to exactly zero.
    """
    part = words_ham_spam[X]
    if W not in part:
        return 0.000001
    return part[W] / _class_totals[X]
In [24]:
# Likelihood of 'the' within ham messages.
Pba(W='the', X='ham')
Out[24]:
0.01628086546517932
In [25]:
# Sanity check: within each class, the likelihoods over the observed
# vocabulary should sum to 1 (the 1e-6 floor only applies to unseen
# words, so it does not distort these totals).
for label in words_ham_spam:
    counts = words_ham_spam[label]
    total = sum([Pba(w, label) for w in counts.keys()])
    print(label, round(total, 8))
ham 1.0
spam 1.0
In [26]:
def Pab(X='', W=''):
    """Posterior P(X | W) via Bayes' rule: P(W|X) * P(X) / P(W)."""
    likelihood = Pba(W, X)
    prior = Pa(X)
    evidence = Pb(W)
    return likelihood * prior / evidence
In [27]:
# Posterior check: 'the' is a strong indicator of ham (~0.92).
Pab(X='ham', W='the')
Out[27]:
0.9155477926706843
In [33]:
def Ps(T, X='ham'):
    """Score a tokenised message T for class X as the product of the
    per-token posteriors P(X | w).

    NOTE(review): multiplying posteriors (rather than likelihoods) is a
    non-standard naive-Bayes variant, and np.prod of many small factors
    underflows for long messages — summing logs would be numerically safer.
    """
    token_posteriors = [Pab(X=X, W=word) for word in T]
    return np.prod(token_posteriors)
In [35]:
# Smoke test: the unseen token ('ahoytherebigfella') falls back to the
# smoothing floors instead of zeroing the whole product.
Ps(['the','hello', 'ahoytherebigfella'],'spam')
Out[35]:
1.0572294329884905e-05
In [36]:
# BUG FIX: the arguments were swapped — Ps('ham', x) iterated over the
# characters of the string 'ham' and passed the token LIST as the class
# label, which fails on a fresh kernel (lists are unhashable dict keys).
# It only appeared to work via stale state from an earlier Ps definition
# (note the non-sequential execution counts). Correct order: tokens, label.
test['ham'] = test.clean.map(lambda tokens: Ps(tokens, 'ham'))
In [31]:
# BUG FIX: same swapped-argument bug as the 'ham' scoring cell —
# Ps takes the token list first and the class label second.
test['spam'] = test.clean.map(lambda tokens: Ps(tokens, 'spam'))

test.spam.describe()
Out[31]:
count     5.570000e+02
mean      1.011411e-05
std       2.260772e-04
min      1.706035e-243
25%       5.716325e-36
50%       6.244939e-22
75%       1.231426e-13
max       5.333505e-03
Name: spam, dtype: float64
In [21]:
# Predict the class with the larger score. Comparing the scores directly
# (instead of forming the ratio ham/spam and testing > 1, as before)
# avoids a potential division-by-zero and the temporary reuse of the
# 'winner' column for two different things. Ties still go to 'spam',
# matching the original `x > 1` tie-breaking.
test['winner'] = (test.ham > test.spam).map({True: 'ham', False: 'spam'})
In [22]:
# Accuracy on the held-out split: the fraction of test messages whose
# predicted label matches the true label.
correct = test.winner == test.type

correct.mean()
Out[22]:
0.9461400359066428
In [ ]: