Bayesian Spam Detector

Spam detection with Bayesian Networks

These are my notes for the Bayesian Networks section of the Udacity course on artificial intelligence.

In [22]:
# python standard library
from fractions import Fraction
import sys
In [23]:
# it turns out 'reduce' is no longer a built-in function in python 3
if sys.version_info.major >= 3:
    from functools import reduce
In [24]:
spam = 'offer is secret, click secret link, secret sports link'.split(',')
print(len(spam))
3
In [25]:
ham = 'play sports today, went play sports, secret sports event, sports is today, sports costs money'.split(',')
print(len(ham))
5

The terms have to be changed to be either all plural or all singular. In this case I changed 'sport' to 'sports' where needed.
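
I did the substitution by hand in the strings above, but as a rough sketch it could also be automated with a small token mapping (the 'SINGULAR_TO_PLURAL' dictionary and 'normalize' helper below are just illustrative, not part of the course material):

SINGULAR_TO_PLURAL = {'sport': 'sports'}

def normalize(line):
    # map any singular token to its plural form before building the bag of words
    return ' '.join(SINGULAR_TO_PLURAL.get(token, token) for token in line.split())

print(normalize('secret sport link'))
# secret sports link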

The SpamDetector classes

I originally implemented everything as functions, but that approach felt too scattered, so I created these classes after the fact, which is why there's so much duplication below. I left the old code in to validate the classes.

The MailBag

This class holds either spam or ham. It actually holds both, but the idea is that one of them is the class of interest and the other is only used for the vocabulary and sample-size calculations.

In [26]:
class MailBag(object):
    """
    A place to put spam or ham
    """
    def __init__(self, mail, other_mail, k=0):
        """
        :param:
         - `mail`: list of example mail
         - `other_mail`: mail not in this class (e.g. spam if this is ham)
         - `k`: Laplace smoothing constant
        """
        self.mail = mail
        self.other_mail = other_mail
        self.k = k
        
        self._bag = None
        self._probability = None
        self._vocabulary_size = None
        self._sample_size = None
        return

    @property
    def vocabulary_size(self):
        """
        :return: count of unique words in all examples
        """
        if self._vocabulary_size is None:
            self._vocabulary_size = len(set(self.bag) | set(self.bag_boy(self.other_mail)))
        return self._vocabulary_size

    @property
    def bag(self):
        """
        :return: list of words in `mail`
        """
        if self._bag is None:
            self._bag = self.bag_boy(self.mail)
        return self._bag

    @property
    def sample_size(self):
        """
        :return: count of mail in both spam and not spam
        """
        if self._sample_size is None:
            self._sample_size = len(self.mail + self.other_mail)
        return self._sample_size
    
    @property
    def probability(self):
        """
        :return: count of this mail/total sample size
        """
        if self._probability is None:
            SPAM_AND_HAM = 2
            self._probability = self.l_probability(len(self.mail),
                                                   len(self.mail) + len(self.other_mail),
                                                   SPAM_AND_HAM)
        return self._probability

    def bag_boy(self, lines):
        """
        :param:
         - `lines`: list of lines

        :return: list of words taken from the lines
        """
        tokenized = (line.split() for line in lines)
        bag = []
        for tokens in tokenized:
            for token in tokens:
                bag.append(token)
        return bag

    def l_probability(self, event_size, sample_size, classes):
        """
        :param:
         - `event_size`: count of events of interest
         - `sample_size`: count of all events
         - `classes`: count of all classes of events

        :return: probability with Laplace Smoothing
        """        
        return Fraction(event_size + self.k,
                        sample_size + classes * self.k)

    def p_message(self, message):
        """
        :param:
         - `message`: line of mail

        :return: p(message|this class)
        """
        probabilities = (self.p_word(word) for word in message.split())
        return reduce(lambda x, y: x * y, probabilities) * self.probability
        
    def p_word(self, word):
        """
        :param:
         - `word`: string to check for
        :return: fraction of word occurrence in bag
        """
        return self.l_probability(self.word_count(word), len(self.bag), self.vocabulary_size)
    
    def word_count(self, word):
        """
        :param:
         - `word`: string to check for
        :return: number of times word appears in bag
        """
        return sum((1 for token in self.bag if token == word))
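
A quick sanity check of the interface on the example mail ('spam_mail' is just a throwaway instance; the same values are verified against the hand calculations further down):

spam_mail = MailBag(mail=spam, other_mail=ham)
print(spam_mail.probability)
# 3/8
print(spam_mail.p_word('secret'))
# 1/3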

SpamDetector

In [27]:
class SpamDetector(object):
    """
    A bayesian network spam detector
    """
    def __init__(self, spam, ham, k=0):
        """
        :param:
         - `spam`: list of example spam lines
         - `ham`: list of example ham_lines
         - `k`: Laplace smoothing constant
        """
        self.spam = MailBag(mail=spam, k=k, other_mail=ham)
        self.ham = MailBag(mail=ham, k=k, other_mail=spam)
        return

    def p_spam_given_message(self, message):
        """
        :param:
         - `message`: line to check if it's spam
        :return: probability that it's spam
        """        
        p_message_given_spam = self.spam.p_message(message) 
        return p_message_given_spam/ (p_message_given_spam +
                                      self.ham.p_message(message))

# leave this in the same cell so updating the class updates the instance
detector = SpamDetector(spam=spam, ham=ham)
l_detector = SpamDetector(spam=spam, ham=ham, k=1)

What is the size of the vocabulary?

In [28]:
def bagger(mail):
    """
    converts list of lines into list of tokens
    
    :param:
     - `mail`: list of space-separated lines
    :return: list of words in `mail`
    """
    mail_tokenized = (line.split() for line in mail)
    mail_bag = []
    for tokens in mail_tokenized:
        for token in tokens:
            mail_bag.append(token)
    return mail_bag

spam_bag = bagger(spam)
ham_bag = bagger(ham)
            
In [29]:
def assert_equal(expected, actual, description):
    assert expected == actual, \
        "'{2}'\nExpected: {0}, Actual: {1}".format(expected, actual,
                                                   description)
In [30]:
vocabulary_list = set(spam_bag) | set(ham_bag)
vocabulary = len(set(spam_bag) | set(ham_bag))
assert_equal(spam_bag, detector.spam.bag, 'check spam bags')
assert_equal(ham_bag, detector.ham.bag, 'ham bags')
assert_equal(vocabulary, detector.spam.vocabulary_size, 'vocabulary size')
print(vocabulary)
12

What is the probability that a piece of mail is spam?

In [31]:
mail_count = len(ham) + len(spam)
assert_equal(mail_count, detector.spam.sample_size, 'mail count')
p_spam = Fraction(len(spam), mail_count)
assert_equal(p_spam, Fraction(3, 8), 'p-spam known')
assert_equal(p_spam, detector.spam.probability, 'p-spam detector')
print(p_spam)
3/8

What is p('secret'|spam)?

In [32]:
def word_count(bag, word):
    """
    count the number of times a word is in the bag

    :param:
     - `bag`: collection of words
     - `word`: word to count
    :return: number of times word appears in bag
    """
    return sum((1 for token in bag if token == word))
In [33]:
def p_word(bag, word, k=0, sample_space=12):
    """
    fraction of times word appears in the bag

    :param:
     - `bag`: collection of words
     - `word`: word to count in bag
     - `k`: laplace smoothing constant
     - `sample_space`: total number of words in vocabulary
    :return: Fraction of total bag that is word
    """
    return Fraction(word_count(bag, word) + k, len(bag) + k * sample_space)
In [34]:
p_secret_given_spam = p_word(spam_bag, 'secret')
assert p_secret_given_spam == Fraction(3, 9)
assert_equal(p_secret_given_spam, detector.spam.p_word('secret'),
             'secret given spam')
print(p_secret_given_spam)
1/3

What is p('secret'|ham)?

In [35]:
p_secret_given_ham = p_word(ham_bag, 'secret')
assert p_secret_given_ham == Fraction(1, 15)
assert_equal(p_secret_given_ham, detector.ham.p_word('secret'), 'p(secret|ham)')
print(p_secret_given_ham)
1/15

You get a message with one word - 'sports'. What is p(spam|'sports')?

In [36]:
%%latex
$p(spam|`sports') = \frac{p(`sports' | spam)p(spam)}{p(`sports')}$
In [37]:
p_sports_given_spam = p_word(spam_bag, 'sports')
assert p_sports_given_spam == Fraction(1, 9)
assert_equal(p_sports_given_spam, detector.spam.p_word('sports'),
             'p(sports|spam)')
print(p_sports_given_spam)
1/9
In [38]:
p_sports_given_ham = p_word(ham_bag, 'sports')
expected = Fraction(1, 3)
assert p_sports_given_ham == expected
assert_equal(p_sports_given_ham, detector.ham.p_word('sports'),
             'p(sports|ham)')
In [39]:
p_ham = Fraction(len(ham), mail_count)
assert_equal(p_ham, detector.ham.probability, 'p(ham)')
print(p_ham)
5/8
In [40]:
# note: this divides the total 'sports' count by the vocabulary size;
# the next cell doesn't actually use it - the denominator there comes
# from the law of total probability instead
p_sports = Fraction(word_count(spam_bag, 'sports') + word_count(ham_bag, 'sports'),
                    vocabulary)
print(p_sports)
1/2
In [41]:
p_spam_given_sports = (p_sports_given_spam * p_spam)/(p_sports_given_spam * p_spam + p_sports_given_ham * p_ham)
assert p_spam_given_sports == Fraction(3, 18)
assert_equal(p_spam_given_sports, detector.p_spam_given_message('sports'),
             'p(spam|sports)')
print(p_spam_given_sports)
1/6

Given the message 'secret is secret', what is the probability that it is spam?

In [42]:
%%latex
$p(spam|message) = \frac{p(message|spam)p(spam)}{p(message|spam)p(spam) + p(message|ham)p(ham)}$

So the question here is: how do you calculate the probability for the entire message instead of for a single word? The answer turns out to be to multiply the probabilities for the individual words together (treating the words as conditionally independent given the class) - so p('secret is secret'|spam) is the product p('secret'|spam) x p('is'|spam) x p('secret'|spam).
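
With the counts above, that works out to p('secret is secret'|spam) = 1/3 x 1/9 x 1/3 = 1/81 and p('secret is secret'|ham) = 1/15 x 1/15 x 1/15 = 1/3375, which (multiplied by the class priors) is what the cells below compute.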

In [43]:
%%latex
$p(spam|sis) = \frac{p(s|spam)p(i|spam)p(s|spam)p(spam)}{p(s|spam)p(i|spam)p(s|spam)p(spam) + p(s|ham)p(i|ham)p(s|ham)p(ham)}$

Where s = 'secret', i = 'is' and sis='secret is secret'.

In [44]:
p_is_given_spam = p_word(spam_bag, 'is')
assert_equal(p_is_given_spam, detector.spam.p_word('is'), 'p(is|spam)')
p_is_given_ham = p_word(ham_bag, 'is')
assert_equal(p_is_given_ham, detector.ham.p_word('is'), 'p(is|ham)')
In [45]:
def p_message_given_class(message, bag, class_probability, k=0, sample_space=12):
    """
    :param:
     - `message`: string of words
     - `bag`: bag of words
     - `class_probability`: probability for this class (e.g. p(spam))
     - `k`: Laplace smoothing constant
     - `sample_space`: Size of the vocabulary
    :return: p(message|classification) * p(classification)
    """
    probabilities = (p_word(bag, word, k=k, sample_space=sample_space) for word in message.split())
    probability = class_probability
    for p in probabilities:
        probability *= p
    return probability
In [46]:
def p_spam_given_message(message, k=0, sample_space=12):
    """
    :param:
     - `message`: string of words
     - `k`: Laplace Smoothing constant
     - `sample_space`: size of the vocabulary (count of unique words)
    :return: probability message is spam
    """
    # lp_spam and lp_ham are the Laplace-smoothed priors defined further down the notebook
    spam_probability = p_spam if k == 0 else lp_spam
    ham_probability = p_ham if k == 0 else lp_ham
    p_m_given_spam = p_message_given_class(message, spam_bag, spam_probability, k=k, sample_space=sample_space)
    p_m_given_ham = p_message_given_class(message, ham_bag, ham_probability, k=k, sample_space=sample_space)
    return p_m_given_spam/(p_m_given_spam + p_m_given_ham)
In [47]:
message = 'secret is secret'
expected = Fraction(25, 26)
p_sis_given_spam = (p_secret_given_spam * p_is_given_spam * p_secret_given_spam
                    * p_spam)
assert p_message_given_class(message, spam_bag, p_spam) == p_sis_given_spam
assert_equal(p_sis_given_spam, detector.spam.p_message(message), 'p(sis|spam)')

p_sis_given_ham = p_secret_given_ham * p_is_given_ham * p_secret_given_ham * p_ham
assert p_message_given_class(message, ham_bag, p_ham) == p_sis_given_ham
assert_equal(p_sis_given_ham, detector.ham.p_message(message), 'p(sis|ham)')

p_spam_given_sis = p_sis_given_spam / (p_sis_given_spam + p_sis_given_ham)
assert_equal(p_spam_given_sis, detector.p_spam_given_message(message), 'p(spam|sis)')
assert p_spam_given_message(message) == p_spam_given_sis
assert p_spam_given_sis == expected
print(p_spam_given_sis)
25/26

What is the probability that "today is secret" is spam?

In [48]:
%%latex
$p(spam|tis) = \frac{p(t|spam)p(i|spam)p(s|spam)p(spam)}{p(t|spam)p(i|spam)p(s|spam)p(spam) + p(t|ham)p(i|ham)p(s|ham)p(ham)}$
In [49]:
tis = 'today is secret'
p_spam_given_tis = p_spam_given_message(tis)
print(p_spam_given_tis)
assert p_spam_given_tis == 0
assert_equal(p_spam_given_tis, detector.p_spam_given_message(tis),
             'p(spam|tis)')
0
In [50]:
'today' in spam_bag
Out[50]:
False

Since one of the words isn't in the spam bag of words, the numerator is going to be 0 (p('today'|spam) = 0), so the probability overall is 0.
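
A quick check makes the collapse explicit (reusing 'p_word' and the word probabilities computed earlier):

p_today_given_spam = p_word(spam_bag, 'today')
print(p_today_given_spam)
# 0
print(p_today_given_spam * p_is_given_spam * p_secret_given_spam * p_spam)
# 0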

Laplace Smoothing

When a single missing word drops the probability to 0, the model is overfitting the data. To get around this, Laplace Smoothing is used.

In [51]:
%%latex
$p(s) = \frac{s_{count} + k}{total_{count} + k * |classes|}$

Let k = 1.

What is the probability that a message is spam if you have 1 example message and it's spam?

In [52]:
def l_probability(class_count, total_count, k=1, classes=2):
    """
    :param:
     - `class_count`: size of event space
     - `total_count`: size of sample space
     - `k`: constant to prevent 0 probability
     - `classes`: count of all classes of events (e.g. 2 for spam and ham)
    :return: probability of class_count with Laplace Smoothing
    """
    return Fraction(class_count + k, total_count + classes * k)
In [53]:
k = 1
# classes = spam, ham
number_of_classes = 2
In [54]:
messages = 1
spam_messages = 1
actual = Fraction(spam_messages + k, messages + number_of_classes * k)
assert actual == Fraction(2, 3)
                            
print(actual)
2/3

What if you have 10 messages and 6 are spam?

In [55]:
messages, spam_messages = 10, 6
actual = l_probability(spam_messages, messages, k, number_of_classes)
expected = Fraction(spam_messages + k, messages + number_of_classes * k)
assert actual == expected
print(actual)
7/12

What if you have 100 messages and 60 are spam?

In [56]:
messages, spam_messages = 100, 60
print(l_probability(spam_messages, messages, k, number_of_classes))
61/102

Spam/ham with Laplace Smoothing

What are the probabilities that a message is spam or ham with k=1?

In [57]:
lp_spam = l_probability(total_count=mail_count, class_count=len(spam))
assert_equal(lp_spam, l_detector.spam.probability, 'p(spam)')
lp_ham = l_probability(total_count=mail_count, class_count=len(ham))
assert_equal(lp_ham, l_detector.ham.probability, 'p(ham)')
print(lp_spam)
print(lp_ham)
2/5
3/5

What are p('today'|spam) and p('today'|ham)?

In this case the class count isn't 2 (spam or ham) but 12, the number of unique words in the vocabulary, since each word in the vocabulary is one possible outcome when smoothing the word counts.

In [58]:
print(p_word(spam_bag, 'today', k=1, sample_space=vocabulary))
1/21
In [59]:
lp_today_given_spam = l_probability(total_count=len(spam_bag),
                                    class_count=word_count(spam_bag, 'today'),
                                    classes=vocabulary)
assert_equal(lp_today_given_spam, l_detector.spam.p_word('today'), 'p(today|spam)')
lp_today_given_ham = l_probability(total_count=len(ham_bag),
                                   class_count=word_count(ham_bag, 'today'),
                                   classes=vocabulary
)
assert_equal(lp_today_given_ham, l_detector.ham.p_word('today'),
             'p(today|ham)')
assert lp_today_given_spam == Fraction(1, 21)
assert lp_today_given_ham == Fraction(1, 9)
print('p(today|spam) = {0}'.format(lp_today_given_spam))
print('p(today|ham) = {0}'.format(lp_today_given_ham))
p(today|spam) = 1/21
p(today|ham) = 1/9

What is p(spam|m) if m = 'today is secret' and k=1?

In [60]:
tis = 'today is secret'
lp_is_given_spam = p_word(spam_bag, 'is', k=1, sample_space=vocabulary)
assert_equal(lp_is_given_spam, l_detector.spam.p_word('is'), 'p(is|spam)')

lp_is_given_ham = p_word(ham_bag, 'is', k=1, sample_space=vocabulary)
assert_equal(lp_is_given_ham, l_detector.ham.p_word('is'), 'p(is|ham)')

lp_secret_given_spam = p_word(spam_bag, 'secret', k=1, sample_space=vocabulary)
assert_equal(lp_secret_given_spam, l_detector.spam.p_word('secret'), 'p(secret|spam)')

lp_secret_given_ham = p_word(ham_bag, 'secret', k=1, sample_space=vocabulary)
assert_equal(lp_secret_given_ham, l_detector.ham.p_word('secret'), 'p(secret|ham)')

lp_tis_given_spam = lp_today_given_spam * lp_is_given_spam * lp_secret_given_spam * lp_spam
lp_tis_given_ham =  lp_today_given_ham * lp_is_given_ham * lp_secret_given_ham * lp_ham
lp_spam_given_tis = Fraction(lp_tis_given_spam, lp_tis_given_spam + lp_tis_given_ham)

assert_equal(lp_tis_given_spam, l_detector.spam.p_message(tis), 'p(tis|spam)')
assert_equal(lp_tis_given_ham, l_detector.ham.p_message(tis), 'p(tis|ham)')
assert_equal(lp_spam_given_tis, l_detector.p_spam_given_message(tis), 'p(spam|tis)')
print(lp_spam_given_tis)
324/667

This is just more double-checking to make sure that the functions I originally wrote match the hand-calculated answers.

In [61]:
actual = p_message_given_class(tis, ham_bag, lp_ham, k=1, sample_space=vocabulary)
assert lp_tis_given_ham == actual, "Expected: {0} Actual: {1}".format(lp_tis_given_ham, actual)
In [62]:
actual = p_spam_given_message(message=tis, k=1, sample_space=vocabulary)
assert lp_spam_given_tis == actual , "Expected: {0} Actual: {1}".format(lp_spam_given_tis, actual)

Re-do

Since the code ended up being so messy I'm going to re-do the last example using the class-based version only.

In [64]:
spam_detector = SpamDetector(spam=spam, ham=ham, k=1)
message = 'today is secret'
answer = spam_detector.p_spam_given_message(message)
print("p(spam|'today is secret') = {0}".format(answer))
p(spam|'today is secret') = 324/667
In [65]:
assert_equal(lp_spam_given_tis, answer, "p(spam|'today is secret')")