lightning: support disable scheduler by key range #34130
Changes from 18 commits
@@ -0,0 +1,24 @@
// Copyright 2022 PingCAP, Inc. Licensed under Apache-2.0.

package errors_test

import (
    "context"
    "net/url"
    "testing"

    "github.com/pingcap/errors"
    berrors "github.com/pingcap/tidb/br/pkg/errors"
    "github.com/stretchr/testify/require"
)

func TestIsContextCanceled(t *testing.T) {
    require.False(t, berrors.IsContextCanceled(nil))
    require.False(t, berrors.IsContextCanceled(errors.New("connection closed")))
    require.True(t, berrors.IsContextCanceled(context.Canceled))
    require.True(t, berrors.IsContextCanceled(context.DeadlineExceeded))
    require.True(t, berrors.IsContextCanceled(errors.Trace(context.Canceled)))
    require.True(t, berrors.IsContextCanceled(errors.Trace(context.DeadlineExceeded)))
    require.True(t, berrors.IsContextCanceled(&url.Error{Err: context.Canceled}))
    require.True(t, berrors.IsContextCanceled(&url.Error{Err: context.DeadlineExceeded}))
}
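For orientation, below is a minimal sketch of the behaviour this new test pins down, assuming IsContextCanceled only needs to unwrap errors.Trace wrappers and *url.Error. It is an illustrative reimplementation, not the actual br/pkg/errors code.

package berrors // hypothetical stand-in package, for illustration only

import (
    "context"
    "net/url"

    "github.com/pingcap/errors"
)

// IsContextCanceled reports whether err is, or wraps, context.Canceled or
// context.DeadlineExceeded.
func IsContextCanceled(err error) bool {
    if err == nil {
        return false
    }
    err = errors.Cause(err) // peel off errors.Trace wrappers
    if urlErr, ok := err.(*url.Error); ok {
        err = urlErr.Err
    }
    return err == context.Canceled || err == context.DeadlineExceeded
}

Under these assumptions every assertion in TestIsContextCanceled passes: plain errors stay false, while cancellation and deadline errors are recognized even when wrapped by errors.Trace or carried inside a *url.Error.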
@@ -19,7 +19,6 @@ import (
    "context"
    "database/sql"
    "math"
    "regexp"
    "runtime"
    "sort"
    "strings"

@@ -332,14 +331,7 @@ func (local *local) SplitAndScatterRegionByRanges(
    }

    startTime := time.Now()
    scatterCount := 0
    for _, region := range scatterRegions {
        local.waitForScatterRegion(ctx, region)
        if time.Since(startTime) > split.ScatterWaitUpperInterval {
            break
        }
        scatterCount++
    }
    scatterCount, err := local.waitForScatterRegions(ctx, scatterRegions)
    if scatterCount == len(scatterRegions) {
        log.FromContext(ctx).Info("waiting for scattering regions done",
            zap.Int("skipped_keys", skippedKeys),

@@ -349,7 +341,8 @@ func (local *local) SplitAndScatterRegionByRanges(
            zap.Int("skipped_keys", skippedKeys),
            zap.Int("scatterCount", scatterCount),
            zap.Int("regions", len(scatterRegions)),
            zap.Duration("take", time.Since(startTime)))
            zap.Duration("take", time.Since(startTime)),
            zap.Error(err))
    }
    return nil
Comment on lines +345 to 347

Do we need to handle this error?

Emm, it's hard to say. Until now, we didn't treat scatter region errors as critical errors.
}

@@ -447,28 +440,38 @@ func (local *local) waitForSplit(ctx context.Context, regionID uint64) {
    }
}

func (local *local) waitForScatterRegion(ctx context.Context, regionInfo *split.RegionInfo) {
    for i := 0; i < split.ScatterWaitMaxRetryTimes; i++ {
        ok, err := local.checkScatterRegionFinishedOrReScatter(ctx, regionInfo)
        if ok {
            return
        }
        if err != nil {
            if !common.IsRetryableError(err) {
                log.FromContext(ctx).Warn("wait for scatter region encountered non-retryable error", logutil.Region(regionInfo.Region), zap.Error(err))
                return
func (local *local) waitForScatterRegions(ctx context.Context, regions []*split.RegionInfo) (scatterCount int, _ error) {
    subCtx, cancel := context.WithTimeout(ctx, split.ScatterWaitUpperInterval)
    defer cancel()

    for len(regions) > 0 {
        var retryRegions []*split.RegionInfo
        for _, region := range regions {
            scattered, err := local.checkRegionScatteredOrReScatter(subCtx, region)
            if scattered {
                scatterCount++
                continue
            }
            log.FromContext(ctx).Warn("wait for scatter region encountered error, will retry again", logutil.Region(regionInfo.Region), zap.Error(err))
            if err != nil {
                if !common.IsRetryableError(err) {
                    log.FromContext(ctx).Warn("wait for scatter region encountered non-retryable error", logutil.Region(region.Region), zap.Error(err))
                    return scatterCount, err
                }
                log.FromContext(ctx).Warn("wait for scatter region encountered error, will retry again", logutil.Region(region.Region), zap.Error(err))
            }
            retryRegions = append(retryRegions, region)
        }
        regions = retryRegions
        select {
        case <-time.After(time.Second):
        case <-ctx.Done():
        case <-subCtx.Done():
            return
        }
    }
    return scatterCount, nil
}

func (local *local) checkScatterRegionFinishedOrReScatter(ctx context.Context, regionInfo *split.RegionInfo) (bool, error) {
func (local *local) checkRegionScatteredOrReScatter(ctx context.Context, regionInfo *split.RegionInfo) (bool, error) {
    resp, err := local.splitCli.GetOperator(ctx, regionInfo.Region.GetId())
    if err != nil {
        return false, err
@@ -478,13 +481,9 @@ func (local *local) checkScatterRegionFinishedOrReScatter(ctx context.Context, regionInfo *split.RegionInfo) (bool, error) {
        if respErr.GetType() == pdpb.ErrorType_REGION_NOT_FOUND {
            return true, nil
        }
        // don't return error if region replicate not complete

We don't need to handle this error now, do we?

It is moved to retriable errors.

Oh, it's caught by the caller.

        // TODO: should add a new error type to avoid this check by string matching
        matches, _ := regexp.MatchString("region \\d+ is not fully replicated", respErr.Message)
        if matches {
            return false, nil
        }
        return false, errors.Errorf("get operator error: %s", respErr.GetType())
        return false, errors.Errorf(
            "failed to get region operator, error type: %s, error message: %s",
            respErr.GetType().String(), respErr.GetMessage())
    }
    // If the current operator of the region is not 'scatter-region', we could assume
    // that 'scatter-operator' has finished.
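The removed regexp special case for "region \d+ is not fully replicated" is, per the review exchange above, now expected to be caught by the caller's retryable-error check. Below is a rough sketch of how such a classifier could absorb that case; the real logic lives in common.IsRetryableError, which is outside this diff, and the helper name here is hypothetical.

package main

import (
    "fmt"
    "regexp"
)

// notFullyReplicatedRe matches the PD message that used to be special-cased
// inside checkScatterRegionFinishedOrReScatter.
var notFullyReplicatedRe = regexp.MustCompile(`region \d+ is not fully replicated`)

// isRetryableScatterError (hypothetical) reports whether a scatter/GetOperator
// error should make waitForScatterRegions retry instead of giving up.
func isRetryableScatterError(err error) bool {
    if err == nil {
        return false
    }
    // A region that is not yet fully replicated will catch up eventually,
    // so the wait loop can simply try the region again on the next pass.
    return notFullyReplicatedRe.MatchString(err.Error())
}

func main() {
    err := fmt.Errorf(
        "failed to get region operator, error type: %s, error message: %s",
        "UNKNOWN", "region 42 is not fully replicated")
    fmt.Println(isRetryableScatterError(err)) // true: keep retrying this region
}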
What if several engines with overlapping key ranges start importing at the same time? Is the new scheduler pausing compatible with this kind of concurrent operation?

It's compatible. Different engines use different label rule IDs, which guarantees the scheduler is disabled on every engine's key range even if the ranges overlap.
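For background on that reply: PD supports region-label rules, and a rule carrying a schedule=deny label over a key range effectively pauses scheduling there. Giving each engine its own rule ID is what lets overlapping ranges coexist. The sketch below shows such a per-engine rule; the struct layout follows PD's region-label rule JSON as I understand it, and the ID scheme and helper name are hypothetical rather than the PR's exact implementation.

package main

import (
    "encoding/hex"
    "encoding/json"
    "fmt"
)

// RegionLabel, KeyRange and LabelRule approximate the JSON body accepted by
// PD's region-label rule API.
type RegionLabel struct {
    Key   string `json:"key"`
    Value string `json:"value"`
}

type KeyRange struct {
    StartKeyHex string `json:"start_key"`
    EndKeyHex   string `json:"end_key"`
}

type LabelRule struct {
    ID       string        `json:"id"`
    Labels   []RegionLabel `json:"labels"`
    RuleType string        `json:"rule_type"`
    Data     []KeyRange    `json:"data"`
}

// newPauseRuleForEngine builds a deny-scheduling rule scoped to one engine's
// key range. A distinct engineID keeps rule IDs unique, so concurrent engines
// with overlapping ranges never overwrite each other's rules.
func newPauseRuleForEngine(engineID string, startKey, endKey []byte) LabelRule {
    return LabelRule{
        ID:       "schedule/pause/" + engineID, // unique per engine (hypothetical naming)
        Labels:   []RegionLabel{{Key: "schedule", Value: "deny"}},
        RuleType: "key-range",
        Data: []KeyRange{{
            StartKeyHex: hex.EncodeToString(startKey),
            EndKeyHex:   hex.EncodeToString(endKey),
        }},
    }
}

func main() {
    rule := newPauseRuleForEngine("engine-1", []byte{0x74, 0x80, 0x00, 0x01}, []byte{0x74, 0x80, 0x00, 0x02})
    body, _ := json.MarshalIndent(rule, "", "  ")
    fmt.Println(string(body))
}

Each such rule would be installed when an engine starts importing and removed (or left to expire, if a TTL is attached) once that engine finishes, so only the ranges currently being written have scheduling paused.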