-
Notifications
You must be signed in to change notification settings - Fork 9
/
bayes.go
119 lines (95 loc) · 2.71 KB
/
bayes.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package multibayes
import (
"math"
)
var (
	// smoother is the Laplace (add-one) smoothing constant added to
	// counts before probabilities are computed in Posterior.
	smoother = 1 // laplace
	// defaultMinClassSize is the minimum number of training examples a
	// class must have before Posterior will score it (smaller classes
	// are skipped).
	defaultMinClassSize = 5
)
// Classifier is a multiclass naive bayes classifier. It tokenizes
// documents with Tokenizer and accumulates token/class co-occurrence
// counts in Matrix. Classes with fewer than MinClassSize training
// examples are ignored by Posterior.
type Classifier struct {
	// Tokenizer splits documents into ngram tokens; excluded from JSON.
	Tokenizer *tokenizer `json:"-"`
	// Matrix holds the sparse token and class occurrence columns.
	Matrix *sparseMatrix `json:"matrix"`
	// MinClassSize is the minimum training-example count for a class
	// to be scored by Posterior.
	MinClassSize int
}
// NewClassifier creates a multibayes classifier with a unigram
// tokenizer, an empty sparse matrix, and the default minimum class
// size.
func NewClassifier() *Classifier {
	// The tokenizer construction error is deliberately discarded,
	// matching the original behavior of this constructor.
	tok, _ := newTokenizer(&tokenizerConf{NGramSize: 1})

	return &Classifier{
		Tokenizer:    tok,
		Matrix:       newSparseMatrix(),
		MinClassSize: defaultMinClassSize,
	}
}
// Add trains the classifier on a single document labeled with the
// given classes: the document is tokenized and the resulting ngrams
// are recorded against each class in the sparse matrix.
func (c *Classifier) Add(document string, classes []string) {
	c.Matrix.Add(c.Tokenizer.Parse(document), classes)
}
// Calculate the posterior probability for a new document on each
// class from the training set. Returns a map from class name to
// P(class|document); classes with fewer than MinClassSize training
// examples are omitted from the result.
func (c *Classifier) Posterior(document string) map[string]float64 {
	tokens := c.Tokenizer.Parse(document)
	predictions := make(map[string]float64)
	for class, classcolumn := range c.Matrix.Classes {
		// Skip classes with too few training examples to score reliably.
		if len(classcolumn.Data) < c.MinClassSize {
			continue
		}
		n := classcolumn.Count()
		// Laplace-smoothed denominator for the per-token conditionals.
		smoothN := n + (smoother * 2)
		priors := []float64{
			float64(n+smoother) / float64(c.Matrix.N+(smoother*2)),            // P(C=Y)
			float64(c.Matrix.N-n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=N)
		}
		// NOTE(review): the log-likelihoods start at 1.0 rather than the
		// conventional 0.0 (= log 1). The constant offset is the same for
		// both hypotheses and cancels when bayesRule normalizes, so the
		// returned probabilities are unaffected — but confirm this was
		// intentional.
		loglikelihood := []float64{1.0, 1.0}
		// check if each token is in our token sparse matrix
		for _, token := range tokens {
			if tokencolumn, ok := c.Matrix.Tokens[token.String()]; ok {
				// conditional probability the token occurs for the class
				joint := intersection(tokencolumn.Data, classcolumn.Data)
				conditional := float64(joint+smoother) / float64(smoothN) // P(F|C=Y)
				loglikelihood[0] += math.Log(conditional)
				// conditional probability the token occurs if the class doesn't apply
				not := len(tokencolumn.Data) - joint
				notconditional := float64(not+smoother) / float64(smoothN) // P(F|C=N)
				loglikelihood[1] += math.Log(notconditional)
			}
		}
		// Exponentiate back to likelihoods before applying Bayes' rule.
		likelihood := []float64{
			math.Exp(loglikelihood[0]),
			math.Exp(loglikelihood[1]),
		}
		prob := bayesRule(priors, likelihood) // P(C|F)
		predictions[class] = prob[0]
	}
	return predictions
}
// bayesRule combines elementwise priors and likelihoods and
// normalizes the products so the returned posteriors sum to one.
// prior and likelihood must have the same length.
func bayesRule(prior, likelihood []float64) []float64 {
	posterior := make([]float64, len(prior))
	sum := 0.0
	// Unnormalized posterior: prior * likelihood per hypothesis.
	for i := range prior {
		posterior[i] = prior[i] * likelihood[i]
		sum += posterior[i]
	}
	// Scale so the posteriors form a probability distribution.
	for i := range posterior {
		posterior[i] /= sum
	}
	return posterior
}
// intersection reports how many elements of array1 also appear in
// array2. Each element of array1 is counted at most once (duplicates
// in array1 each count if present in array2; duplicates in array2 are
// irrelevant), matching the original nested-scan semantics.
func intersection(array1, array2 []int) int {
	// Build a membership set once so the check is O(len1+len2)
	// instead of the previous O(len1*len2) nested scan.
	seen := make(map[int]struct{}, len(array2))
	for _, v := range array2 {
		seen[v] = struct{}{}
	}
	count := 0
	for _, v := range array1 {
		if _, ok := seen[v]; ok {
			count++
		}
	}
	return count
}