-
Notifications
You must be signed in to change notification settings - Fork 217
/
mlrmap.go
154 lines (140 loc) · 4.88 KB
/
mlrmap.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// ================================================================
// ORDERED MAP FROM STRING TO MLRVAL
//
// This is an implementation of insertion-ordered key-value pairs for Miller's
// fundamental record data structure. It's also an ordered-map data structure,
// suitable for Miller JSON decode/encode.
//
// ----------------------------------------------------------------
// DESIGN
//
// * It keeps a doubly-linked list of key-value pairs.
//
// * By default, no hash functions are computed when the map is written to or
// read from.
//
// * Gets are implemented by sequential scan through the list: given a key,
// the key-value pairs are scanned through until a match is (or is not) found.
//
// * Performance improvement of 25% percent over hash-mapping from key to entry
// was found in the Go implementation. Test data was million-line CSV and
// DKVP, with a dozen columns or so.
//
// Note however that an auxiliary constructor is provided which does use
// a key-to-entry hashmap in place of linear search for get/put/has/delete.
// This may be useful in certain contexts, even though it's not the default
// chosen for stream-records.
//
// ----------------------------------------------------------------
// MOTIVATION
//
// * The use case for records in Miller is that *all* fields are read from
// strings & written to strings (split/join), while only *some* fields are
// operated on.
//
// * Meanwhile there are few repeated accesses to a given record: the
// access-to-construct ratio is quite low for Miller data records. Miller
// instantiates thousands, millions, billions of records (depending on the
// input data) but accesses each record only once per transforming operation.
// (This is in contrast to accumulator hashmaps which are repeatedly accessed
// during a stats run.)
//
// * The hashed impl computes hashsums for *all* fields whether operated on or not,
// for the benefit of the *few* fields looked up during the transforming operation.
//
// * The hashless impl only keeps string pointers. Lookups are done at runtime
// doing prefix search on the key names. Assuming field names are distinct,
// this is just a few char-ptr accesses which (in experiments) turn out to
// offer about a 10-15% performance improvement.
//
// * Added benefit: the field-rename operation (preserving field order) becomes
// trivial.
// ================================================================
package mlrval
// For the C port having this off was a noticeable performance improvement (10-15%).
// For the Go port having it off is a less-noticeable performance improvement (5%).
// Both these figures are for just doing mlr cat. At the moment I'm leaving this
// default-on pending more profiling on more complex record-processing operations
// such as mlr sort.
var hashRecords = false
func HashRecords(onOff bool) {
hashRecords = onOff
}
// ----------------------------------------------------------------
type Mlrmap struct {
FieldCount int64
Head *MlrmapEntry
Tail *MlrmapEntry
// Surprisingly, using this costs about 25% for cat/cut/etc tests
// on million-line data files (CSV, DKVP) with a dozen or so columns.
// So, the constructor allows callsites to use it, or not.
keysToEntries map[string]*MlrmapEntry
}
type MlrmapEntry struct {
Key string
Value *Mlrval
Prev *MlrmapEntry
Next *MlrmapEntry
}
// MlrmapEntryForArray is for use by sorting routines where the Prev/Next pointers
// are irrelevant as well as ephemeral
type MlrmapEntryForArray struct {
Key string
Value *Mlrval
}
// Only used for sorting, map-to-pairs-array and pairs-array-to-map contexts.
type MlrmapPair struct {
Key string
Value *Mlrval
}
// ----------------------------------------------------------------
func NewMlrmapAsRecord() *Mlrmap {
if hashRecords {
return newMlrmapHashed()
} else {
return newMlrmapUnhashed()
}
}
func NewMlrmap() *Mlrmap {
return newMlrmapHashed()
}
// Faster on record-stream data as noted above.
func newMlrmapUnhashed() *Mlrmap {
return &Mlrmap{
FieldCount: 0,
Head: nil,
Tail: nil,
keysToEntries: nil,
}
}
// Intended for use in DSL expressions wherein the access-to-construct ratio
// might be higher (although this needs profiling over a variety of use-cases).
func newMlrmapHashed() *Mlrmap {
return &Mlrmap{
FieldCount: 0,
Head: nil,
Tail: nil,
keysToEntries: make(map[string]*MlrmapEntry),
}
}
func NewMlrmapMaybeHashed(wantHashing bool) *Mlrmap {
if wantHashing {
return newMlrmapHashed()
} else {
return newMlrmapUnhashed()
}
}
func (mlrmap *Mlrmap) isHashed() bool {
return mlrmap.keysToEntries != nil
}
// ----------------------------------------------------------------
// Value-copy is up to the caller -- PutReference and PutCopy
// are in the public Mlrmap API.
func newMlrmapEntry(key string, value *Mlrval) *MlrmapEntry {
return &MlrmapEntry{
key,
value,
nil,
nil,
}
}