forked from hmason/ml_class
-
Notifications
You must be signed in to change notification settings - Fork 0
/
classify.py
123 lines (92 loc) · 4.37 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
# encoding: utf-8
"""
classify.py
Created by Hilary Mason on 2011-02-17.
Copyright (c) 2011 Hilary Mason. All rights reserved.
"""
import sys, os
import re, string
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
class NaiveBayesClassifier(object):
    """Naive Bayes text classifier built on plain count dictionaries.

    Training records, per category, how many documents were seen and in how
    many of them each feature (stemmed word) occurred.  Scoring multiplies
    the smoothed per-feature probabilities of a document by the category's
    share of the training documents.
    """

    # Matches any single ASCII punctuation character.  Compiled once here
    # instead of being rebuilt on every get_features() call.
    _PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

    def __init__(self):
        # feature -> {category -> number of training docs containing it}
        self.feature_count = {}
        # category -> number of training docs seen for that category
        self.category_count = {}

    def probability(self, item, category):
        """Return the unnormalised score Pr(item | category) * Pr(category).

        item: raw document text.
        category: label to score the document against.

        Returns 0.0 when nothing has been trained yet; without this guard
        the prior would divide by a zero document total.
        """
        total_documents = sum(self.category_count.values())
        if not total_documents:
            return 0.0
        category_prob = self.get_category_count(category) / total_documents
        return self.document_probability(item, category) * category_prob

    def document_probability(self, item, category):
        """Return the product of weighted_prob() over the item's features.

        Each feature's probability is also written to stdout as a
        "feature - category - probability" trace line.
        """
        p = 1
        for feature in self.get_features(item):
            # Compute once and reuse for both the trace line and the product.
            feature_probability = self.weighted_prob(feature, category)
            print("%s - %s - %s" % (feature, category, feature_probability))
            p *= feature_probability
        return p

    def train_from_data(self, data):
        """Train on a mapping of category -> iterable of document strings."""
        for category, documents in data.items():
            for doc in documents:
                self.train(doc, category)

    def get_features(self, document):
        """Turn raw text into a FreqDist of stemmed words.

        Steps: strip ASCII punctuation, lowercase, tokenize, keep tokens of
        4 to 15 characters, then Porter-stem what is left.
        """
        document = self._PUNCTUATION_RE.sub('', document)
        document = document.lower()
        stemmer = PorterStemmer()
        stems = [
            stemmer.stem(word)
            for word in word_tokenize(document)
            if 3 < len(word) < 16
        ]
        return FreqDist(stems)

    def increment_feature(self, feature, category):
        """Add one to the count of `feature` within `category`."""
        per_category = self.feature_count.setdefault(feature, {})
        per_category[category] = per_category.get(category, 0) + 1

    def increment_cat(self, category):
        """Add one to the number of documents seen for `category`."""
        self.category_count[category] = self.category_count.get(category, 0) + 1

    def get_feature_count(self, feature, category):
        """Return the count of `feature` in `category` as a float (0.0 if unseen)."""
        return float(self.feature_count.get(feature, {}).get(category, 0))

    def get_category_count(self, category):
        """Return the number of documents in `category` as a float (0.0 if unseen)."""
        return float(self.category_count.get(category, 0))

    def feature_prob(self, f, category):  # Pr(A|B)
        """Return count(f, category) / count(category), or 0 for an unseen category."""
        category_total = self.get_category_count(category)
        if category_total == 0:
            return 0
        return self.get_feature_count(f, category) / category_total

    def weighted_prob(self, f, category, weight=1.0, ap=0.5):
        """Return feature_prob() blended with an assumed probability.

        f: feature to score.
        category: category to score it in.
        weight: how many observations the assumed probability is worth.
        ap: assumed probability used when there is little or no evidence.

        The result is (weight*ap + totals*basic_prob) / (weight + totals),
        where totals is the count of `f` summed over every known category,
        so a feature never seen in training scores exactly `ap`.
        """
        basic_prob = self.feature_prob(f, category)
        # A distinct loop name keeps the `category` parameter from being
        # shadowed (and, on Python 2, rebound) by the iteration.
        totals = sum(
            self.get_feature_count(f, known_category)
            for known_category in self.category_count
        )
        return ((weight * ap) + (totals * basic_prob)) / (weight + totals)

    def train(self, item, category):
        """Learn from one document: count its features, then the document."""
        # Iterating the feature distribution visits each distinct stem, so a
        # feature is counted per document rather than per occurrence.
        for f in self.get_features(item):
            self.increment_feature(f, category)
        self.increment_cat(category)
if __name__ == '__main__':
    # Demo: train on one file per category, then score a sample sentence
    # against each category.
    labels = ['arts', 'sports']  # these are the categories we want

    # Each label doubles as the path of its training file; every line of
    # the file is treated as one training document.
    data = {}
    for label in labels:
        # `with` closes the file even if reading fails part-way.
        with open(label, 'r') as f:
            data[label] = f.readlines()

    nb = NaiveBayesClassifier()
    nb.train_from_data(data)

    sample = "Early Friday afternoon, the lead negotiators for the N.B.A. and the players union will hold a bargaining session in Beverly Hills — the latest attempt to break a 12-month stalemate on a new labor deal."
    # Single-argument print(...) emits the same text on Python 2 and 3.
    for label in labels:
        print(nb.probability(sample, label))