Skip to content

Commit

Permalink
perf(query): Update CompressedBin IntersectionAlgo (#9000)
Browse files Browse the repository at this point in the history
Updated algo to intersect. Getting upto 175% improvment overall
```
goos: linux
goarch: amd64
pkg: github.com/dgraph-io/dgraph/algo
cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
                                                                                          │     in1      │                 main_in                 │
                                                                                          │    sec/op    │    sec/op      vs base                  │
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.01:size=100:overlap=0.01:-8       91.21n ± ∞ ¹   387.70n ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.1:size=100:overlap=0.01:-8        336.0n ± ∞ ¹   1733.0n ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=1:size=100:overlap=0.01:-8          1.093µ ± ∞ ¹    3.481µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=10:size=100:overlap=0.01:-8         4.719µ ± ∞ ¹    8.600µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=100:size=100:overlap=0.01:-8        19.76µ ± ∞ ¹    44.32µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.01:size=1000:overlap=0.01:-8      481.7n ± ∞ ¹    784.2n ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.1:size=1000:overlap=0.01:-8       1.502µ ± ∞ ¹    3.667µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=1:size=1000:overlap=0.01:-8         5.454µ ± ∞ ¹   36.977µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=10:size=1000:overlap=0.01:-8        33.14µ ± ∞ ¹    70.73µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=100:size=1000:overlap=0.01:-8       179.8µ ± ∞ ¹    485.1µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.01:size=10000:overlap=0.01:-8     3.214µ ± ∞ ¹    4.459µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.1:size=10000:overlap=0.01:-8      12.00µ ± ∞ ¹    32.78µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=1:size=10000:overlap=0.01:-8        76.90µ ± ∞ ¹   356.05µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=10:size=10000:overlap=0.01:-8       371.6µ ± ∞ ¹    923.1µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=100:size=10000:overlap=0.01:-8      1.894m ± ∞ ¹    5.141m ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.01:size=100000:overlap=0.01:-8    48.13µ ± ∞ ¹    60.59µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.1:size=100000:overlap=0.01:-8     146.7µ ± ∞ ¹    611.9µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=1:size=100000:overlap=0.01:-8       943.7µ ± ∞ ¹   3359.6µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=10:size=100000:overlap=0.01:-8      3.926m ± ∞ ¹    8.849m ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=100:size=100000:overlap=0.01:-8     20.78m ± ∞ ¹
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.01:size=1000000:overlap=0.01:-8   862.4µ ± ∞ ¹   1142.5µ ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=0.1:size=1000000:overlap=0.01:-8    1.599m ± ∞ ¹    7.878m ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=1:size=1000000:overlap=0.01:-8      9.791m ± ∞ ¹   33.871m ± ∞ ¹         ~ (p=1.000 n=1) ²
ListIntersectCompressBin/compressed:IntersectWith:ratio=10:size=1000000:overlap=0.01:-8     44.40m ± ∞ ¹
geomean                                                                                     66.19µ          104.6µ        +175.93%               ³
                    

```
  • Loading branch information
harshil-goel authored Oct 13, 2023
1 parent 6dc93c7 commit 0b84c95
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 34 deletions.
55 changes: 28 additions & 27 deletions algo/uidlist.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ import (
"github.com/dgraph-io/dgraph/protos/pb"
)

const jump = 32 // Jump size in InsersectWithJump.
const jump = 32 // Jump size in InsersectWithJump.
const linVsBinRatio = 10 // When is linear search better than binary

// ApplyFilter applies a filter to our UIDList.
func ApplyFilter(u *pb.List, f func(uint64, int) bool) {
Expand Down Expand Up @@ -60,7 +61,7 @@ func IntersectCompressedWith(pack *pb.UidPack, afterUID uint64, v, o *pb.List) {

// Select appropriate function based on heuristics.
ratio := float64(m) / float64(n)
if ratio < 500 {
if ratio < linVsBinRatio {
IntersectCompressedWithLinJump(&dec, v.Uids, &dst)
} else {
IntersectCompressedWithBin(&dec, v.Uids, &dst)
Expand Down Expand Up @@ -94,7 +95,7 @@ func IntersectCompressedWithLinJump(dec *codec.Decoder, v []uint64, o *[]uint64)
// https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3
// Call seek on dec before calling this function
func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) {
ld := dec.ApproxLen()
ld := codec.ExactLen(dec.Pack)
lq := len(q)

if lq == 0 {
Expand All @@ -105,46 +106,44 @@ func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) {
}

// Pick the shorter list and do binary search
if ld < lq {
if ld <= lq {
for {
blockUids := dec.Uids()
if len(blockUids) == 0 {
break
}
IntersectWithBin(blockUids, q, o)
lastUid := blockUids[len(blockUids)-1]
qidx := sort.Search(len(q), func(idx int) bool {
return q[idx] >= lastUid
})
if qidx >= len(q) {
_, off := IntersectWithJump(blockUids, q, o)
q = q[off:]
if len(q) == 0 {
return
}
q = q[qidx:]
dec.Next()
}
return
}

var uids []uint64
for _, u := range q {
uids := dec.Uids()
qidx := 0
for {
if qidx >= len(q) {
return
}
u := q[qidx]
if len(uids) == 0 || u > uids[len(uids)-1] {
uids = dec.Seek(u, codec.SeekStart)
if lq*linVsBinRatio < ld {
uids = dec.LinearSeek(u)
} else {
uids = dec.SeekToBlock(u, codec.SeekCurrent)
}
if len(uids) == 0 {
return
}
}
uidIdx := sort.Search(len(uids), func(idx int) bool {
return uids[idx] >= u
})
if uidIdx >= len(uids) {
// We know that u < max(uids). If we didn't find it here, it's not here.
continue
}
if uids[uidIdx] == u {
*o = append(*o, u)
uidIdx++
_, off := IntersectWithJump(uids, q[qidx:], o)
if off == 0 {
off = 1 // if v[k] isn't in u, move forward
}
uids = uids[uidIdx:]
qidx += off
}
}

Expand Down Expand Up @@ -233,7 +232,8 @@ func IntersectWithJump(u, v []uint64, o *[]uint64) (int, int) {
// IntersectWithBin is based on the paper
// "Fast Intersection Algorithms for Sorted Sequences"
// https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3
func IntersectWithBin(d, q []uint64, o *[]uint64) {
// Returns where to move the second array(q) to. O means not found
func IntersectWithBin(d, q []uint64, o *[]uint64) int {
ld := len(d)
lq := len(q)

Expand All @@ -242,7 +242,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) {
d, q = q, d
}
if ld == 0 || lq == 0 || d[ld-1] < q[0] || q[lq-1] < d[0] {
return
return 0
}

val := d[0]
Expand All @@ -256,6 +256,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) {
})

binIntersect(d, q[minq:maxq], o)
return maxq
}

// binIntersect is the recursive function used.
Expand Down
60 changes: 53 additions & 7 deletions algo/uidlist_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ func BenchmarkListIntersectCompressBin(b *testing.B) {
for _, r := range rs {
sz1 := sz
sz2 := int(float64(sz) * r)
if sz2 > 1000000 || sz2 == 0 {
if sz2 > 10000000 || sz2 == 0 {
break
}

Expand All @@ -389,8 +389,18 @@ func BenchmarkListIntersectCompressBin(b *testing.B) {
sort.Slice(v1, func(i, j int) bool { return v1[i] < v1[j] })

dst2 := &pb.List{}
dst1 := &pb.List{}
compressedUids := codec.Encode(v1, 256)

b.Run(fmt.Sprintf("linJump:IntersectWith:ratio=%v:size=%d:overlap=%.2f:", r, sz, overlap),
func(b *testing.B) {
for k := 0; k < b.N; k++ {
dec := codec.Decoder{Pack: compressedUids}
dec.Seek(0, codec.SeekStart)
IntersectCompressedWithLinJump(&dec, u1, &dst1.Uids)
}
})

b.Run(fmt.Sprintf("compressed:IntersectWith:ratio=%v:size=%d:overlap=%.2f:", r, sz, overlap),
func(b *testing.B) {
for k := 0; k < b.N; k++ {
Expand All @@ -399,7 +409,6 @@ func BenchmarkListIntersectCompressBin(b *testing.B) {
IntersectCompressedWithBin(&dec, u1, &dst2.Uids)
}
})
fmt.Println()

codec.FreePack(compressedUids)
}
Expand Down Expand Up @@ -493,6 +502,43 @@ func sortUint64(nums []uint64) {
sort.Slice(nums, func(i, j int) bool { return nums[i] < nums[j] })
}

func fillNumsDiff(N1, N2, N3 int) ([]uint64, []uint64, []uint64) {
rand.Seed(time.Now().UnixNano())

commonNums := make([]uint64, N1)
blockNums := make([]uint64, N1+N2)
otherNums := make([]uint64, N1+N3)
allC := make(map[uint64]bool)

for i := 0; i < N1; i++ {
val := rand.Uint64() % 1000
commonNums[i] = val
blockNums[i] = val
otherNums[i] = val
allC[val] = true
}

for i := N1; i < N1+N2; i++ {
val := rand.Uint64() % 1000
blockNums[i] = val
allC[val] = true
}

for i := N1; i < N1+N3; i++ {
val := rand.Uint64()
for ok := true; ok; _, ok = allC[val] {
val = rand.Uint64() % 1000
}
otherNums[i] = val
}

sortUint64(commonNums)
sortUint64(blockNums)
sortUint64(otherNums)

return commonNums, blockNums, otherNums
}

func fillNums(N1, N2 int) ([]uint64, []uint64, []uint64) {
rand.Seed(time.Now().UnixNano())

Expand Down Expand Up @@ -545,12 +591,12 @@ func TestIntersectCompressedWithLinJump(t *testing.T) {
}

func TestIntersectCompressedWithBin(t *testing.T) {
lengths := []int{0, 1, 3, 11, 100}
//lengths := []int{0, 1, 3, 11, 100, 500, 1000}

for _, N1 := range lengths {
for _, N2 := range lengths {
for _, N1 := range []int{11} {
for _, N2 := range []int{3} {
// Intersection of blockNums and otherNums is commonNums.
commonNums, blockNums, otherNums := fillNums(N1, N2)
commonNums, blockNums, otherNums := fillNumsDiff(N1/10, N1, N2)

enc := codec.Encoder{BlockSize: 10}
for _, num := range blockNums {
Expand All @@ -570,7 +616,7 @@ func TestIntersectCompressedWithBin(t *testing.T) {
}

func TestIntersectCompressedWithBinMissingSize(t *testing.T) {
lengths := []int{0, 1, 3, 11, 100}
lengths := []int{0, 1, 3, 11, 100, 500, 1000}

for _, N1 := range lengths {
for _, N2 := range lengths {
Expand Down
58 changes: 58 additions & 0 deletions codec/codec.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,64 @@ func (d *Decoder) ApproxLen() int {

type searchFunc func(int) bool

// SeekToBlock will find the block containing the uid, and unpack it. When we are going to
// intersect the list later, this function is useful. As this function skips the search function
// and returns the entire block, it is faster than Seek. Unlike seek, we don't truncate the uids
// returned, which would be done by the intersect function anyways.
func (d *Decoder) SeekToBlock(uid uint64, whence seekPos) []uint64 {
if d.Pack == nil {
return []uint64{}
}
prevBlockIdx := d.blockIdx
d.blockIdx = 0
if uid == 0 {
return d.UnpackBlock()
}

// If for some reason we are searching an older uid, we need to search the entire pack
if prevBlockIdx > 0 && uid < d.Pack.Blocks[prevBlockIdx].Base {
prevBlockIdx = 0
}

blocksFunc := func() searchFunc {
var f searchFunc
switch whence {
case SeekStart:
f = func(i int) bool { return d.Pack.Blocks[i+prevBlockIdx].Base >= uid }
case SeekCurrent:
f = func(i int) bool { return d.Pack.Blocks[i+prevBlockIdx].Base > uid }
}
return f
}

idx := sort.Search(len(d.Pack.Blocks[prevBlockIdx:]), blocksFunc()) + prevBlockIdx
// The first block.Base >= uid.
if idx == 0 {
return d.UnpackBlock()
}
// The uid is the first entry in the block.
if idx < len(d.Pack.Blocks) && d.Pack.Blocks[idx].Base == uid {
d.blockIdx = idx
return d.UnpackBlock()
}

// Either the idx = len(pack.Blocks) that means it wasn't found in any of the block's base. Or,
// we found the first block index whose base is greater than uid. In these cases, go to the
// previous block and search there.
d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it.
if d.blockIdx != prevBlockIdx {
d.UnpackBlock() // And get all their uids.
}

if uid <= d.uids[len(d.uids)-1] {
return d.uids
}

// Could not find any uid in the block, which is >= uid. The next block might still have valid
// entries > uid.
return d.Next()
}

// Seek will search for uid in a packed block using the specified whence position.
// The value of whence must be one of the predefined values SeekStart or SeekCurrent.
// SeekStart searches uid and includes it as part of the results.
Expand Down

0 comments on commit 0b84c95

Please sign in to comment.