import pandas as pd
import nltk as n
from collections import Counter as ctr
import itertools as i
import numpy as np
# Load the SMS spam dataset; the raw CSV carries three junk trailing columns
# (c3-c5) that we drop immediately. Latin-1 encoding is required by the file.
data = pd.read_csv(
    'spam.csv',
    skiprows=1,
    names=['type', 'text', 'c3', 'c4', 'c5'],
    delimiter=',',
    encoding='ISO-8859-1',
)
data.drop(['c3', 'c4', 'c5'], axis=1, inplace=True)

# Minimal tokenization: lowercase, split on whitespace.
data['clean'] = data.text.map(lambda t: t.lower().split())
data[:5]
ctr(data.type)

# Hold out a random 10% sample for evaluation; train on the remainder.
test = data.sample(frac=0.1)
train = data[~data.index.isin(test.index)]
train.shape, test.shape
# Label counts over the training split (keys: 'ham', 'spam').
ham_spam_ctr = ctr(train.type)

def Pa(X=''):
    """Prior P(class=X): fraction of training rows carrying label X."""
    label_count = ham_spam_ctr[X]
    return label_count / len(train)

Pa(X='ham'), Pa(X='spam')
# Corpus-wide token frequencies across all training messages.
words_ctr = ctr([word for row in train.clean for word in row])
# Hoisted once: the counter is never mutated after this point, so summing
# it inside Pb on every call was pure repeated O(vocabulary) work.
_words_total = sum(words_ctr.values())

def Pb(W=''):
    """Marginal P(word=W) over the training corpus.

    Returns a small floor (1e-4) for words never seen in training so
    downstream divisions by Pb(W) never hit zero.
    """
    if W not in words_ctr:
        return 0.0001
    return words_ctr[W] / _words_total

Pb(W='the')
Pb(W='hello')
# Per-class token counters: words_ham_spam[label] -> Counter of all tokens
# appearing in training messages with that label.
#
# Fix: the original version ran a loop that indexed into words_ham_spam
# BEFORE the dict was created (NameError at runtime), and then rebuilt the
# exact same dict a second time. A single loop with the dict initialized
# up front does the job once, correctly.
words_ham_spam = {}
for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name]
    words_ham_spam[col_name] = ctr([word for row in sub_df.clean for word in row])
# Memoized per-class token totals. Pba is called once per word per message,
# and the original re-summed the entire class vocabulary on every call;
# caching the total makes each lookup O(1). Assumes words_ham_spam is not
# mutated after the first Pba call for a given class.
_class_totals = {}

def Pba(W='', X=''):
    """Likelihood P(W | class X): frequency of word W among class-X tokens.

    Returns a small floor (1e-6) for words never seen under class X so
    products of likelihoods never collapse to exactly zero.
    """
    part = words_ham_spam[X]
    if W not in part:
        return 0.000001
    total = _class_totals.get(X)
    if total is None:
        total = _class_totals[X] = sum(part.values())
    return part[W] / total

Pba(W='the', X='ham')

# Sanity check: likelihoods over each class's own vocabulary should sum to ~1.
for part in words_ham_spam:
    words = words_ham_spam[part]
    print(part, round(sum([Pba(a, part) for a in words.keys()]), 8))
def Pab(X='', W=''):
    """Posterior P(class=X | word=W) via Bayes' rule: P(W|X) * P(X) / P(W)."""
    likelihood = Pba(W, X)
    prior = Pa(X)
    evidence = Pb(W)
    return likelihood * prior / evidence

Pab(X='ham', W='the')
def Ps(T, X='ham'):
    """Naive-Bayes score of token sequence T under class X.

    Product of per-word posteriors Pab(X, w) over every token in T.
    NOTE(review): products of many sub-1 probabilities can underflow to 0.0
    for long messages — a log-space sum would be safer; confirm typical
    message lengths before relying on raw products.
    """
    posteriors = [Pab(X=X, W=word) for word in T]
    return np.prod(posteriors)

Ps(['the', 'hello', 'ahoytherebigfella'], 'spam')
# Score every held-out message under both classes and pick the winner.
#
# Fix: the original passed arguments to Ps in the wrong order —
# Ps('ham', x) binds T='ham' (the label string, iterated char by char) and
# X=x (the token list, an unhashable dict key) and raises TypeError.
# Ps expects (tokens, label).
test['ham'] = test.clean.map(lambda x: Ps(x, 'ham'))
test['spam'] = test.clean.map(lambda x: Ps(x, 'spam'))
test.spam.describe()

# Classify by direct score comparison. Equivalent to the ratio test
# ham/spam > 1 (ties still fall to 'spam'), but avoids a 0/0 -> NaN
# when both scores underflow to zero.
test['winner'] = np.where(test.ham > test.spam, 'ham', 'spam')

# Accuracy: fraction of held-out messages whose predicted label matches.
result = test.winner == test.type
sum(result) / len(test)