From b9c1a27479e6905bfc5b1f4abb485ee51053e845 Mon Sep 17 00:00:00 2001 From: Yiding Date: Thu, 25 Jul 2024 19:34:46 +0800 Subject: [PATCH 1/5] disjoinset: add generic impl --- pkg/expression/constant_propagation.go | 8 ++-- pkg/util/disjointset/int_set.go | 14 +++--- pkg/util/disjointset/set.go | 66 ++++++++++++++++++++++++++ pkg/util/disjointset/set_test.go | 49 +++++++++++++++++++ 4 files changed, 127 insertions(+), 10 deletions(-) create mode 100644 pkg/util/disjointset/set.go create mode 100644 pkg/util/disjointset/set_test.go diff --git a/pkg/expression/constant_propagation.go b/pkg/expression/constant_propagation.go index 38c4cf3790a63..41ad3637308ed 100644 --- a/pkg/expression/constant_propagation.go +++ b/pkg/expression/constant_propagation.go @@ -32,10 +32,10 @@ var MaxPropagateColsCnt = 100 // nolint:structcheck type basePropConstSolver struct { - colMapper map[int64]int // colMapper maps column to its index - eqList []*Constant // if eqList[i] != nil, it means col_i = eqList[i] - unionSet *disjointset.IntSet // unionSet stores the relations like col_i = col_j - columns []*Column // columns stores all columns appearing in the conditions + colMapper map[int64]int // colMapper maps column to its index + eqList []*Constant // if eqList[i] != nil, it means col_i = eqList[i] + unionSet *disjointset.SimpleIntSet // unionSet stores the relations like col_i = col_j + columns []*Column // columns stores all columns appearing in the conditions ctx exprctx.ExprContext } diff --git a/pkg/util/disjointset/int_set.go b/pkg/util/disjointset/int_set.go index 05846e3840850..db2a29b75d3d8 100644 --- a/pkg/util/disjointset/int_set.go +++ b/pkg/util/disjointset/int_set.go @@ -14,27 +14,29 @@ package disjointset -// IntSet is the int disjoint set. -type IntSet struct { +// SimpleIntSet is the int disjoint set. +// It's not designed for sparse case. You should use it when the elements are continuous. +// Time complexity: O(1) for FindRoot, O(1) for Union. Both are amortized. 
+type SimpleIntSet struct { parent []int } // NewIntSet returns a new int disjoint set. -func NewIntSet(size int) *IntSet { +func NewIntSet(size int) *SimpleIntSet { p := make([]int, size) for i := range p { p[i] = i } - return &IntSet{parent: p} + return &SimpleIntSet{parent: p} } // Union unions two sets in int disjoint set. -func (m *IntSet) Union(a int, b int) { +func (m *SimpleIntSet) Union(a int, b int) { m.parent[m.FindRoot(a)] = m.FindRoot(b) } // FindRoot finds the representative element of the set that `a` belongs to. -func (m *IntSet) FindRoot(a int) int { +func (m *SimpleIntSet) FindRoot(a int) int { if a == m.parent[a] { return a } diff --git a/pkg/util/disjointset/set.go b/pkg/util/disjointset/set.go new file mode 100644 index 0000000000000..98c0e78f2a91b --- /dev/null +++ b/pkg/util/disjointset/set.go @@ -0,0 +1,66 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disjointset + +// Set is the universal implementation of disjoint set. +// It's designed for sparse case or not integer type. +// We hash the original value to an integer index and then apply the core disjoint set algorithm. +// Time complexity: O(1) for findRoot, O(1) for InSameGroup, O(1) for Union. Both are amortized. 
+type Set[T comparable] struct { + parent []int + val2idx map[T]int + tailIdx int +} + +func NewSet[T comparable](size int) *Set[T] { + return &Set[T]{ + parent: make([]int, 0, size), + val2idx: make(map[T]int), + tailIdx: 0, + } +} + +func (s *Set[T]) findRootOrigialVal(a T) int { + idx, ok := s.val2idx[a] + if !ok { + s.parent = append(s.parent, s.tailIdx) + s.val2idx[a] = s.tailIdx + s.tailIdx++ + return s.tailIdx - 1 + } + s.parent[idx] = s.findRoot(s.parent[idx]) + return s.parent[idx] +} + +// findRoot is internal impl. Call it inside the findRootOrig. +func (s *Set[T]) findRoot(a int) int { + if s.parent[a] == a { + return a + } + s.parent[a] = s.findRoot(s.parent[a]) + return s.parent[a] +} + +func (s *Set[T]) InSameGroup(a T, b T) bool { + return s.findRootOrigialVal(a) == s.findRootOrigialVal(b) +} + +func (s *Set[T]) Union(a T, b T) { + rootA := s.findRootOrigialVal(a) + rootB := s.findRootOrigialVal(b) + if rootA != rootB { + s.parent[rootA] = rootB + } +} diff --git a/pkg/util/disjointset/set_test.go b/pkg/util/disjointset/set_test.go new file mode 100644 index 0000000000000..ae7cada175845 --- /dev/null +++ b/pkg/util/disjointset/set_test.go @@ -0,0 +1,49 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package disjointset + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestDisjointSet(t *testing.T) { + set := NewSet[string](10) + assert.False(t, set.InSameGroup("a", "b")) + assert.Len(t, set.parent, 2) + set.Union("a", "b") + assert.True(t, set.InSameGroup("a", "b")) + assert.False(t, set.InSameGroup("a", "c")) + assert.Len(t, set.parent, 3) + assert.False(t, set.InSameGroup("b", "c")) + assert.Len(t, set.parent, 3) + set.Union("b", "c") + assert.True(t, set.InSameGroup("a", "c")) + assert.True(t, set.InSameGroup("b", "c")) + set.Union("d", "e") + set.Union("e", "f") + set.Union("f", "g") + assert.Len(t, set.parent, 7) + assert.False(t, set.InSameGroup("a", "d")) + assert.True(t, set.InSameGroup("d", "g")) + assert.False(t, set.InSameGroup("c", "g")) + set.Union("a", "g") + assert.True(t, set.InSameGroup("a", "d")) + assert.True(t, set.InSameGroup("b", "g")) + assert.True(t, set.InSameGroup("c", "f")) + assert.True(t, set.InSameGroup("a", "e")) + assert.True(t, set.InSameGroup("b", "c")) +} From fab801521fa602c1c67852413aab609b255eb1d3 Mon Sep 17 00:00:00 2001 From: Yiding Date: Thu, 25 Jul 2024 19:42:20 +0800 Subject: [PATCH 2/5] correct time complexity --- pkg/util/disjointset/int_set.go | 2 +- pkg/util/disjointset/set.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/util/disjointset/int_set.go b/pkg/util/disjointset/int_set.go index db2a29b75d3d8..a53b7e6d0a44a 100644 --- a/pkg/util/disjointset/int_set.go +++ b/pkg/util/disjointset/int_set.go @@ -16,7 +16,7 @@ package disjointset // SimpleIntSet is the int disjoint set. // It's not designed for sparse case. You should use it when the elements are continuous. -// Time complexity: O(1) for FindRoot, O(1) for Union. Both are amortized. +// Time complexity: the union operation is inverse ackermann function, which is very close to O(1). 
type SimpleIntSet struct { parent []int } diff --git a/pkg/util/disjointset/set.go b/pkg/util/disjointset/set.go index 98c0e78f2a91b..9eeb1891961ab 100644 --- a/pkg/util/disjointset/set.go +++ b/pkg/util/disjointset/set.go @@ -17,7 +17,7 @@ package disjointset // Set is the universal implementation of disjoint set. // It's designed for sparse case or not integer type. // We hash the original value to an integer index and then apply the core disjoint set algorithm. -// Time complexity: O(1) for findRoot, O(1) for InSameGroup, O(1) for Union. Both are amortized. +// Time complexity: the union operation is inverse ackermann function, which is very close to O(1). type Set[T comparable] struct { parent []int val2idx map[T]int From 7288425e276efaba8c88624e3c385b04b7dcefdb Mon Sep 17 00:00:00 2001 From: Yiding Date: Thu, 25 Jul 2024 21:02:07 +0800 Subject: [PATCH 3/5] fix bazel_prepare and add comment --- pkg/util/disjointset/BUILD.bazel | 6 +++++- pkg/util/disjointset/set.go | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/util/disjointset/BUILD.bazel b/pkg/util/disjointset/BUILD.bazel index 941410ed9d54b..8578cbc54206b 100644 --- a/pkg/util/disjointset/BUILD.bazel +++ b/pkg/util/disjointset/BUILD.bazel @@ -2,7 +2,10 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "disjointset", - srcs = ["int_set.go"], + srcs = [ + "int_set.go", + "set.go", + ], importpath = "github.com/pingcap/tidb/pkg/util/disjointset", visibility = ["//visibility:public"], ) @@ -13,6 +16,7 @@ go_test( srcs = [ "int_set_test.go", "main_test.go", + "set_test.go", ], embed = [":disjointset"], flaky = True, diff --git a/pkg/util/disjointset/set.go b/pkg/util/disjointset/set.go index 9eeb1891961ab..f1153dbf9b611 100644 --- a/pkg/util/disjointset/set.go +++ b/pkg/util/disjointset/set.go @@ -16,6 +16,7 @@ package disjointset // Set is the universal implementation of disjoint set. // It's designed for sparse case or not integer type. 
+// If you are dealing with continuous integer, you should use SimpleIntSet to avoid the cost of hash map. // We hash the original value to an integer index and then apply the core disjoint set algorithm. // Time complexity: the union operation is inverse ackermann function, which is very close to O(1). type Set[T comparable] struct { From 8e7bc0ad573cd6151273d5515cf43c5dfd29b9f6 Mon Sep 17 00:00:00 2001 From: Yiding Date: Thu, 25 Jul 2024 22:01:25 +0800 Subject: [PATCH 4/5] fix static check --- pkg/util/disjointset/set.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/util/disjointset/set.go b/pkg/util/disjointset/set.go index f1153dbf9b611..9502d72a3b35d 100644 --- a/pkg/util/disjointset/set.go +++ b/pkg/util/disjointset/set.go @@ -25,6 +25,7 @@ type Set[T comparable] struct { tailIdx int } +// NewSet creates the disjoint set. func NewSet[T comparable](size int) *Set[T] { return &Set[T]{ parent: make([]int, 0, size), @@ -54,10 +55,12 @@ func (s *Set[T]) findRoot(a int) int { return s.parent[a] } +// InSameGroup checks whether a and b are in the same group. func (s *Set[T]) InSameGroup(a T, b T) bool { return s.findRootOrigialVal(a) == s.findRootOrigialVal(b) } +// Union unions two sets in disjoint set. func (s *Set[T]) Union(a T, b T) { rootA := s.findRootOrigialVal(a) rootB := s.findRootOrigialVal(b) From d3d4fc9d59746d9121eeae61358e66b6353fe241 Mon Sep 17 00:00:00 2001 From: Yiding Date: Fri, 26 Jul 2024 19:25:45 +0800 Subject: [PATCH 5/5] address comments --- pkg/util/disjointset/set.go | 43 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/pkg/util/disjointset/set.go b/pkg/util/disjointset/set.go index 9502d72a3b35d..08b63aec5dd3c 100644 --- a/pkg/util/disjointset/set.go +++ b/pkg/util/disjointset/set.go @@ -14,56 +14,53 @@ package disjointset -// Set is the universal implementation of disjoint set. -// It's designed for sparse case or not integer type. 
-// If you are dealing with continuous integer, you should use SimpleIntSet to avoid the cost of hash map. +// Set is the universal implementation of a disjoint set. +// It's designed for sparse cases or non-integer types. +// If you are dealing with continuous integers, you should use SimpleIntSet to avoid the cost of a hash map. // We hash the original value to an integer index and then apply the core disjoint set algorithm. -// Time complexity: the union operation is inverse ackermann function, which is very close to O(1). +// Time complexity: findRoot compresses paths but Union links roots without rank or size, so operations are amortized O(log n), not inverse Ackermann. type Set[T comparable] struct { parent []int - val2idx map[T]int + val2Idx map[T]int tailIdx int } -// NewSet creates the disjoint set. +// NewSet creates a disjoint set. func NewSet[T comparable](size int) *Set[T] { return &Set[T]{ parent: make([]int, 0, size), - val2idx: make(map[T]int), + val2Idx: make(map[T]int, size), tailIdx: 0, } } - -func (s *Set[T]) findRootOrigialVal(a T) int { - idx, ok := s.val2idx[a] +func (s *Set[T]) findRootOriginalVal(a T) int { + idx, ok := s.val2Idx[a] if !ok { s.parent = append(s.parent, s.tailIdx) - s.val2idx[a] = s.tailIdx + s.val2Idx[a] = s.tailIdx s.tailIdx++ return s.tailIdx - 1 } - s.parent[idx] = s.findRoot(s.parent[idx]) - return s.parent[idx] + return s.findRoot(idx) } -// findRoot is internal impl. Call it inside the findRootOrig. +// findRoot is an internal implementation. Call it inside findRootOriginalVal. func (s *Set[T]) findRoot(a int) int { - if s.parent[a] == a { - return a + if s.parent[a] != a { + s.parent[a] = s.findRoot(s.parent[a]) } - s.parent[a] = s.findRoot(s.parent[a]) return s.parent[a] } // InSameGroup checks whether a and b are in the same group. 
-func (s *Set[T]) InSameGroup(a T, b T) bool { - return s.findRootOrigialVal(a) == s.findRootOrigialVal(b) +func (s *Set[T]) InSameGroup(a, b T) bool { + return s.findRootOriginalVal(a) == s.findRootOriginalVal(b) } -// Union unions two sets in disjoint set. -func (s *Set[T]) Union(a T, b T) { - rootA := s.findRootOrigialVal(a) - rootB := s.findRootOrigialVal(b) +// Union joins two sets in the disjoint set. +func (s *Set[T]) Union(a, b T) { + rootA := s.findRootOriginalVal(a) + rootB := s.findRootOriginalVal(b) if rootA != rootB { s.parent[rootA] = rootB }