diff --git a/algo/uidlist.go b/algo/uidlist.go index 1465d4ff2fe..cb37232f3bd 100644 --- a/algo/uidlist.go +++ b/algo/uidlist.go @@ -24,7 +24,8 @@ import ( "github.com/dgraph-io/dgraph/protos/pb" ) -const jump = 32 // Jump size in InsersectWithJump. +const jump = 32 // Jump size in InsersectWithJump. +const linVsBinRatio = 10 // When is linear search better than binary // ApplyFilter applies a filter to our UIDList. func ApplyFilter(u *pb.List, f func(uint64, int) bool) { @@ -60,7 +61,7 @@ func IntersectCompressedWith(pack *pb.UidPack, afterUID uint64, v, o *pb.List) { // Select appropriate function based on heuristics. ratio := float64(m) / float64(n) - if ratio < 500 { + if ratio < linVsBinRatio { IntersectCompressedWithLinJump(&dec, v.Uids, &dst) } else { IntersectCompressedWithBin(&dec, v.Uids, &dst) @@ -94,7 +95,7 @@ func IntersectCompressedWithLinJump(dec *codec.Decoder, v []uint64, o *[]uint64) // https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3 // Call seek on dec before calling this function func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) { - ld := dec.ApproxLen() + ld := codec.ExactLen(dec.Pack) lq := len(q) if lq == 0 { @@ -105,46 +106,44 @@ func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) { } // Pick the shorter list and do binary search - if ld < lq { + if ld <= lq { for { blockUids := dec.Uids() if len(blockUids) == 0 { break } - IntersectWithBin(blockUids, q, o) - lastUid := blockUids[len(blockUids)-1] - qidx := sort.Search(len(q), func(idx int) bool { - return q[idx] >= lastUid - }) - if qidx >= len(q) { + _, off := IntersectWithJump(blockUids, q, o) + q = q[off:] + if len(q) == 0 { return } - q = q[qidx:] dec.Next() } return } - var uids []uint64 - for _, u := range q { + uids := dec.Uids() + qidx := 0 + for { + if qidx >= len(q) { + return + } + u := q[qidx] if len(uids) == 0 || u > uids[len(uids)-1] { - uids = dec.Seek(u, codec.SeekStart) + if lq*linVsBinRatio < ld { + uids = dec.LinearSeek(u) + } else { + uids = dec.SeekToBlock(u, codec.SeekCurrent) + } if len(uids) == 0 { return } } - uidIdx := sort.Search(len(uids), func(idx int) bool { - return uids[idx] >= u - }) - if uidIdx >= len(uids) { - // We know that u < max(uids). If we didn't find it here, it's not here. - continue - } - if uids[uidIdx] == u { - *o = append(*o, u) - uidIdx++ + _, off := IntersectWithJump(uids, q[qidx:], o) + if off == 0 { + off = 1 // if v[k] isn't in u, move forward } - uids = uids[uidIdx:] + qidx += off } } @@ -233,7 +232,8 @@ func IntersectWithJump(u, v []uint64, o *[]uint64) (int, int) { // IntersectWithBin is based on the paper // "Fast Intersection Algorithms for Sorted Sequences" // https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3 -func IntersectWithBin(d, q []uint64, o *[]uint64) { +// Returns where to move the second array(q) to. O means not found +func IntersectWithBin(d, q []uint64, o *[]uint64) int { ld := len(d) lq := len(q) @@ -242,7 +242,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) { d, q = q, d } if ld == 0 || lq == 0 || d[ld-1] < q[0] || q[lq-1] < d[0] { - return + return 0 } val := d[0] @@ -256,6 +256,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) { }) binIntersect(d, q[minq:maxq], o) + return maxq } // binIntersect is the recursive function used. diff --git a/algo/uidlist_test.go b/algo/uidlist_test.go index 05eafba68e3..50fddb30b38 100644 --- a/algo/uidlist_test.go +++ b/algo/uidlist_test.go @@ -373,7 +373,7 @@ func BenchmarkListIntersectCompressBin(b *testing.B) { for _, r := range rs { sz1 := sz sz2 := int(float64(sz) * r) - if sz2 > 1000000 || sz2 == 0 { + if sz2 > 10000000 || sz2 == 0 { break } @@ -389,8 +389,18 @@ func BenchmarkListIntersectCompressBin(b *testing.B) { sort.Slice(v1, func(i, j int) bool { return v1[i] < v1[j] }) dst2 := &pb.List{} + dst1 := &pb.List{} compressedUids := codec.Encode(v1, 256) + b.Run(fmt.Sprintf("linJump:IntersectWith:ratio=%v:size=%d:overlap=%.2f:", r, sz, overlap), + func(b *testing.B) { + for k := 0; k < b.N; k++ { + dec := codec.Decoder{Pack: compressedUids} + dec.Seek(0, codec.SeekStart) + IntersectCompressedWithLinJump(&dec, u1, &dst1.Uids) + } + }) + b.Run(fmt.Sprintf("compressed:IntersectWith:ratio=%v:size=%d:overlap=%.2f:", r, sz, overlap), func(b *testing.B) { for k := 0; k < b.N; k++ { @@ -399,7 +409,6 @@ func BenchmarkListIntersectCompressBin(b *testing.B) { IntersectCompressedWithBin(&dec, u1, &dst2.Uids) } }) - fmt.Println() codec.FreePack(compressedUids) } @@ -493,6 +502,43 @@ func sortUint64(nums []uint64) { sort.Slice(nums, func(i, j int) bool { return nums[i] < nums[j] }) } +func fillNumsDiff(N1, N2, N3 int) ([]uint64, []uint64, []uint64) { + rand.Seed(time.Now().UnixNano()) + + commonNums := make([]uint64, N1) + blockNums := make([]uint64, N1+N2) + otherNums := make([]uint64, N1+N3) + allC := make(map[uint64]bool) + + for i := 0; i < N1; i++ { + val := rand.Uint64() % 1000 + commonNums[i] = val + blockNums[i] = val + otherNums[i] = val + allC[val] = true + } + + for i := N1; i < N1+N2; i++ { + val := rand.Uint64() % 1000 + blockNums[i] = val + allC[val] = true + } + + for i := N1; i < N1+N3; i++ { + val := rand.Uint64() + for ok := true; ok; _, ok = allC[val] { + val = rand.Uint64() % 1000 + } + otherNums[i] = val + } + + sortUint64(commonNums) + sortUint64(blockNums) + sortUint64(otherNums) + + return commonNums, blockNums, otherNums +} + func fillNums(N1, N2 int) ([]uint64, []uint64, []uint64) { rand.Seed(time.Now().UnixNano()) @@ -545,12 +591,12 @@ func TestIntersectCompressedWithLinJump(t *testing.T) { } func TestIntersectCompressedWithBin(t *testing.T) { - lengths := []int{0, 1, 3, 11, 100} + //lengths := []int{0, 1, 3, 11, 100, 500, 1000} - for _, N1 := range lengths { - for _, N2 := range lengths { + for _, N1 := range []int{11} { + for _, N2 := range []int{3} { // Intersection of blockNums and otherNums is commonNums. - commonNums, blockNums, otherNums := fillNums(N1, N2) + commonNums, blockNums, otherNums := fillNumsDiff(N1/10, N1, N2) enc := codec.Encoder{BlockSize: 10} for _, num := range blockNums { @@ -570,7 +616,7 @@ func TestIntersectCompressedWithBin(t *testing.T) { } func TestIntersectCompressedWithBinMissingSize(t *testing.T) { - lengths := []int{0, 1, 3, 11, 100} + lengths := []int{0, 1, 3, 11, 100, 500, 1000} for _, N1 := range lengths { for _, N2 := range lengths { diff --git a/codec/codec.go b/codec/codec.go index 4ebc17a341d..60359f474fa 100644 --- a/codec/codec.go +++ b/codec/codec.go @@ -223,6 +223,64 @@ func (d *Decoder) ApproxLen() int { type searchFunc func(int) bool +// SeekToBlock will find the block containing the uid, and unpack it. When we are going to +// intersect the list later, this function is useful. As this function skips the search function +// and returns the entire block, it is faster than Seek. Unlike seek, we don't truncate the uids +// returned, which would be done by the intersect function anyways. +func (d *Decoder) SeekToBlock(uid uint64, whence seekPos) []uint64 { + if d.Pack == nil { + return []uint64{} + } + prevBlockIdx := d.blockIdx + d.blockIdx = 0 + if uid == 0 { + return d.UnpackBlock() + } + + // If for some reason we are searching an older uid, we need to search the entire pack + if prevBlockIdx > 0 && uid < d.Pack.Blocks[prevBlockIdx].Base { + prevBlockIdx = 0 + } + + blocksFunc := func() searchFunc { + var f searchFunc + switch whence { + case SeekStart: + f = func(i int) bool { return d.Pack.Blocks[i+prevBlockIdx].Base >= uid } + case SeekCurrent: + f = func(i int) bool { return d.Pack.Blocks[i+prevBlockIdx].Base > uid } + } + return f + } + + idx := sort.Search(len(d.Pack.Blocks[prevBlockIdx:]), blocksFunc()) + prevBlockIdx + // The first block.Base >= uid. + if idx == 0 { + return d.UnpackBlock() + } + // The uid is the first entry in the block. + if idx < len(d.Pack.Blocks) && d.Pack.Blocks[idx].Base == uid { + d.blockIdx = idx + return d.UnpackBlock() + } + + // Either the idx = len(pack.Blocks) that means it wasn't found in any of the block's base. Or, + // we found the first block index whose base is greater than uid. In these cases, go to the + // previous block and search there. + d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it. + if d.blockIdx != prevBlockIdx { + d.UnpackBlock() // And get all their uids. + } + + if uid <= d.uids[len(d.uids)-1] { + return d.uids + } + + // Could not find any uid in the block, which is >= uid. The next block might still have valid + // entries > uid. + return d.Next() +} + // Seek will search for uid in a packed block using the specified whence position. // The value of whence must be one of the predefined values SeekStart or SeekCurrent. // SeekStart searches uid and includes it as part of the results.