-
Notifications
You must be signed in to change notification settings - Fork 9
/
sparse.go
73 lines (62 loc) · 1.62 KB
/
sparse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package multibayes
// sparseMatrix is a sparse, column-oriented document matrix. Rows are
// documents; each column stores the indices of the rows in which its
// token or class occurs.
type sparseMatrix struct {
	// Tokens maps an ngram's string form to the column of row indices
	// whose documents contain that ngram.
	Tokens map[string]*sparseColumn `json:"tokens"`

	// Classes maps a class name to the column of row indices that were
	// labelled with that class.
	Classes map[string]*sparseColumn `json:"classes"`

	// N is the number of rows currently in the matrix.
	N int `json:"n"`
}
// sparseColumn is one column of a sparseMatrix stored sparsely: instead
// of a dense vector it keeps only the indices of the rows in which the
// column is set.
type sparseColumn struct {
	// Data holds the row indices in the order they were added.
	Data []int `json:"data"`
}
// newSparseColumn returns an empty column that is ready for Add.
//
// The backing slice starts with no reserved capacity and grows on demand
// through append. Reserving a large fixed capacity up front would cost
// several kilobytes for every column, no matter how few rows the column
// ends up holding.
//
// Data is deliberately an empty, non-nil slice so that an empty column
// still encodes to JSON as [] rather than null.
func newSparseColumn() *sparseColumn {
	return &sparseColumn{
		Data: []int{},
	}
}
// Add records index as a row in which this column is set. The index is
// appended as-is: no de-duplication or ordering is enforced here.
func (s *sparseColumn) Add(index int) {
	rows := append(s.Data, index)
	s.Data = rows
}
// Count reports how many row indices the column holds, which is the
// number of rows containing the column provided each row was added once.
func (s *sparseColumn) Count() int {
	stored := s.Data
	return len(stored)
}
// Expand converts the column from sparse to dense form: it returns a new
// slice of length n holding 1.0 at every stored row index and 0.0
// everywhere else.
//
// Stored indices outside [0, n) are skipped and a negative n is treated
// as 0, so a column that is out of step with the requested length yields
// a truncated vector instead of an index-out-of-range panic.
func (s *sparseColumn) Expand(n int) []float64 {
	if n < 0 {
		n = 0
	}
	expanded := make([]float64, n)
	for _, index := range s.Data {
		if index < 0 || index >= n {
			continue
		}
		expanded[index] = 1.0
	}
	return expanded
}
// newSparseMatrix returns an empty matrix with zero rows whose token and
// class maps are allocated and ready to be written to.
func newSparseMatrix() *sparseMatrix {
	m := new(sparseMatrix) // N starts at its zero value, 0
	m.Tokens = map[string]*sparseColumn{}
	m.Classes = map[string]*sparseColumn{}
	return m
}
// Add appends one document to the matrix as row s.N and then increments
// s.N.
//
// The row index is recorded exactly once in the column of every distinct
// class in classes and exactly once in the column of every distinct
// ngram in ngrams (keyed by the ngram's String form). Columns are created
// on first use. If either slice is empty the call is a no-op and no row
// index is consumed.
func (s *sparseMatrix) Add(ngrams []ngram, classes []string) {
	if len(ngrams) == 0 || len(classes) == 0 {
		return
	}

	// A zero-value matrix, or one decoded from JSON with null maps, has
	// nil maps; writing to them would panic.
	if s.Classes == nil {
		s.Classes = make(map[string]*sparseColumn)
	}
	if s.Tokens == nil {
		s.Tokens = make(map[string]*sparseColumn)
	}

	row := s.N

	// Mark the row once per distinct class. Without this, a class name
	// repeated in the input would store the same row twice and inflate
	// that column's Count.
	seenClasses := make(map[string]struct{}, len(classes))
	for _, class := range classes {
		if _, dup := seenClasses[class]; dup {
			continue
		}
		seenClasses[class] = struct{}{}

		// A single lookup covers both a missing key and a nil column.
		col := s.Classes[class]
		if col == nil {
			col = newSparseColumn()
			s.Classes[class] = col
		}
		col.Add(row)
	}

	// Mark the row once per distinct ngram.
	seenGrams := make(map[string]struct{}, len(ngrams))
	for _, gram := range ngrams {
		key := gram.String()
		if _, dup := seenGrams[key]; dup {
			continue
		}
		seenGrams[key] = struct{}{}

		col := s.Tokens[key]
		if col == nil {
			col = newSparseColumn()
			s.Tokens[key] = col
		}
		col.Add(row)
	}

	// The row is now complete; advance the row counter.
	s.N++
}