From e8e870faf647b249d96f3c3964796598f8fb465d Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Wed, 17 Jul 2024 13:23:00 -0400 Subject: [PATCH 1/8] br cli refactor backup error handling logic Signed-off-by: Wenqi Mou --- br/pkg/backup/client.go | 4 +- br/pkg/utils/backoff.go | 4 +- br/pkg/utils/error_handling.go | 181 ++++++++++++++++++++++++++++++++ br/pkg/utils/permission.go | 20 ---- br/pkg/utils/retry.go | 184 --------------------------------- br/pkg/utils/retry_test.go | 58 +++++------ 6 files changed, 209 insertions(+), 242 deletions(-) create mode 100644 br/pkg/utils/error_handling.go delete mode 100644 br/pkg/utils/permission.go diff --git a/br/pkg/backup/client.go b/br/pkg/backup/client.go index 7790dc0ee3858..8125d21f8848a 100644 --- a/br/pkg/backup/client.go +++ b/br/pkg/backup/client.go @@ -1235,9 +1235,9 @@ func (bc *Client) OnBackupResponse( return txnlock.NewLock(lockErr), nil } } - res := errContext.HandleIgnorableError(errPb, storeID) + res := utils.HandleBackupError(errPb, storeID, errContext) switch res.Strategy { - case utils.GiveUpStrategy: + case utils.GiveUp: errMsg := res.Reason if len(errMsg) <= 0 { errMsg = errPb.Msg diff --git a/br/pkg/utils/backoff.go b/br/pkg/utils/backoff.go index fb947980e572e..a3613fea64b1b 100644 --- a/br/pkg/utils/backoff.go +++ b/br/pkg/utils/backoff.go @@ -172,8 +172,8 @@ func (bo *importerBackoffer) NextBackoff(err error) time.Duration { // we don't care storeID here. errs := multierr.Errors(err) lastErr := errs[len(errs)-1] - res := bo.errContext.HandleErrorMsg(lastErr.Error(), 0) - if res.Strategy == RetryStrategy { + res := HandleUnknownError(lastErr.Error(), 0, bo.errContext) + if res.Strategy == Retry { bo.delayTime = 2 * bo.delayTime bo.attempt-- } else { diff --git a/br/pkg/utils/error_handling.go b/br/pkg/utils/error_handling.go new file mode 100644 index 0000000000000..8a5b62c904cf7 --- /dev/null +++ b/br/pkg/utils/error_handling.go @@ -0,0 +1,181 @@ +package utils + +import ( + "fmt" + backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/log" + "go.uber.org/zap" + "strings" + "sync" +) + +// TODO: remove and mapped all to an error type +var retryableErrorMsg = []string{ + "server closed", + "connection refused", + "connection reset by peer", + "channel closed", + "error trying to connect", + "connection closed before message completed", + "body write aborted", + "error during dispatch", + "put object timeout", + "timeout after", + "internalerror", + "not read from or written to within the timeout period", + "requesttimeout", + "invalidpart", + "end of file before message length reached", +} + +// non-retryable error messages +// TODO: remove and mapped all to an error type +var ( + ioMsg = "io" + notFoundMsg = "notfound" + permissionDeniedMsg = "permissiondenied" +) + +// error messages +var ( + UnreachableRetryMsg = "unreachable retry" + RetryOnKvErrorMsg = "retry on kv error" + RetryOnRegionErrorMsg = "retry on region error" + ClusterIdMismatchMsg = "cluster id mismatch" + UnknownErrorMsg = "unknown error" + ContextCancelledMsg = "context canceled" + RetryOnUnknownErrorMsg = "unknown error, retry it for a few times" + NoRetryOnUnknownErrorMsg = "unknown error, retried too many times, give up" + RetryableStorageErrorMsg = "retryable storage error" +) + +type ErrorHandlingResult struct { + Strategy ErrorHandlingStrategy + Reason string +} + +type ErrorHandlingStrategy int + +const ( + // Retry error can be retried but will consume the backoff attempt quota. 
+ Retry ErrorHandlingStrategy = iota + // GiveUp means unrecoverable error happened and the BR should exit + // for example: + // 1. permission not valid. + // 2. data not found. + // 3. retry too many times + GiveUp + // Unknown for Unknown error + Unknown +) + +type ErrorContext struct { + mu sync.Mutex + // encounter times for one context on a store + // we may use this value to determine the retry policy + encounterTimes map[uint64]int + // unknown error retry limitation. + // encounter many times error makes Retry to GiveUp. + encounterTimesLimitation int + description string +} + +func NewErrorContext(scenario string, limitation int) *ErrorContext { + return &ErrorContext{ + description: scenario, + encounterTimes: make(map[uint64]int), + encounterTimesLimitation: limitation, + } +} + +func NewDefaultContext() *ErrorContext { + return &ErrorContext{ + description: "default", + encounterTimes: make(map[uint64]int), + encounterTimesLimitation: 1, + } +} + +func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) ErrorHandlingResult { + if err == nil { + return ErrorHandlingResult{Retry, UnreachableRetryMsg} + } + res := handleBackupProtoError(err) + // try the best effort handle unknown error based on their error message + if res.Strategy == Unknown && len(err.Msg) != 0 { + return HandleUnknownError(err.Msg, storeId, ec) + } + return res +} + +func handleBackupProtoError(e *backuppb.Error) ErrorHandlingResult { + switch e.Detail.(type) { + case *backuppb.Error_KvError: + return ErrorHandlingResult{Retry, RetryOnKvErrorMsg} + case *backuppb.Error_RegionError: + return ErrorHandlingResult{Retry, RetryOnRegionErrorMsg} + case *backuppb.Error_ClusterIdError: + return ErrorHandlingResult{GiveUp, ClusterIdMismatchMsg} + } + return ErrorHandlingResult{Unknown, UnknownErrorMsg} +} + +// HandleUnknownError TODO: remove this method and map all the current unknown errors to an error type +func HandleUnknownError(msg string, uuid uint64, ec *ErrorContext) ErrorHandlingResult { + // UNSAFE! TODO: use meaningful error code instead of unstructured message to find failed to write error. + logger := log.L().With(zap.String("description", ec.description)) + if messageIsNotFoundStorageError(msg) { + reason := fmt.Sprintf("File or directory not found on TiKV Node (store id: %v). "+ + "workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid.", + uuid) + return ErrorHandlingResult{GiveUp, reason} + } + if messageIsPermissionDeniedStorageError(msg) { + reason := fmt.Sprintf("I/O permission denied error occurs on TiKV Node(store id: %v). 
"+ + "workaround: please ensure tikv has permission to read from & write to the storage.", + uuid) + return ErrorHandlingResult{GiveUp, reason} + } + msgLower := strings.ToLower(msg) + if strings.Contains(msgLower, ContextCancelledMsg) { + return ErrorHandlingResult{GiveUp, ContextCancelledMsg} + } + + if MessageIsRetryableStorageError(msg) { + logger.Warn(RetryableStorageErrorMsg, zap.String("error", msg)) + return ErrorHandlingResult{Retry, RetryableStorageErrorMsg} + } + + // retry enough on same store + ec.mu.Lock() + defer ec.mu.Unlock() + ec.encounterTimes[uuid]++ + if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation { + return ErrorHandlingResult{Retry, RetryOnUnknownErrorMsg} + } + return ErrorHandlingResult{GiveUp, NoRetryOnUnknownErrorMsg} +} + +// messageIsNotFoundStorageError checks whether the message returning from TiKV is "NotFound" storage I/O error +func messageIsNotFoundStorageError(msg string) bool { + msgLower := strings.ToLower(msg) + return strings.Contains(msgLower, ioMsg) && strings.Contains(msgLower, notFoundMsg) +} + +// MessageIsPermissionDeniedStorageError checks whether the message returning from TiKV is "PermissionDenied" storage I/O error +func messageIsPermissionDeniedStorageError(msg string) bool { + msgLower := strings.ToLower(msg) + return strings.Contains(msgLower, permissionDeniedMsg) +} + +// MessageIsRetryableStorageError checks whether the message returning from TiKV is retryable ExternalStorageError. +func MessageIsRetryableStorageError(msg string) bool { + msgLower := strings.ToLower(msg) + // UNSAFE! TODO: Add a error type for retryable connection error. + for _, errStr := range retryableErrorMsg { + if strings.Contains(msgLower, errStr) { + return true + } + } + return false +} diff --git a/br/pkg/utils/permission.go b/br/pkg/utils/permission.go deleted file mode 100644 index 3c0795db11c47..0000000000000 --- a/br/pkg/utils/permission.go +++ /dev/null @@ -1,20 +0,0 @@ -package utils - -import "strings" - -var ( - ioNotFoundMsg = "notfound" - permissionDeniedMsg = "permissiondenied" -) - -// messageIsNotFoundStorageError checks whether the message returning from TiKV is "NotFound" storage I/O error -func messageIsNotFoundStorageError(msg string) bool { - msgLower := strings.ToLower(msg) - return strings.Contains(msgLower, "io") && strings.Contains(msgLower, ioNotFoundMsg) -} - -// MessageIsPermissionDeniedStorageError checks whether the message returning from TiKV is "PermissionDenied" storage I/O error -func messageIsPermissionDeniedStorageError(msg string) bool { - msgLower := strings.ToLower(msg) - return strings.Contains(msgLower, permissionDeniedMsg) -} diff --git a/br/pkg/utils/retry.go b/br/pkg/utils/retry.go index 2efe37a8e8dff..6671f16b7842e 100644 --- a/br/pkg/utils/retry.go +++ b/br/pkg/utils/retry.go @@ -5,14 +5,11 @@ package utils import ( "context" stderrs "errors" - "fmt" - "strings" "sync" "time" "github.com/google/uuid" "github.com/pingcap/errors" - backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" tmysql "github.com/pingcap/tidb/pkg/errno" "github.com/pingcap/tidb/pkg/parser/terror" @@ -22,175 +19,6 @@ import ( "go.uber.org/zap" ) -var retryableServerError = []string{ - "server closed", - "connection refused", - "connection reset by peer", - "channel closed", - "error trying to connect", - "connection closed before message completed", - "body write aborted", - "error during dispatch", - "put object timeout", - "timeout after", - "internalerror", - "not read from or written to within the timeout period", 
- "requesttimeout", - "invalidpart", - "end of file before message length reached", -} - -type ErrorResult struct { - Strategy ErrorStrategy - Reason string -} - -type ErrorStrategy int - -const ( - // This type can be retry but consume the backoffer attempts. - RetryStrategy ErrorStrategy = iota - // This type means unrecoverable error and the whole progress should exits - // for example: - // 1. permission not valid. - // 2. data has not found. - // 3. retry too many times - GiveUpStrategy - // This type represents Unknown error - UnknownStrategy -) - -type ErrorContext struct { - mu sync.Mutex - // encounter times for one context on a store - // we may use this value to determine the retry policy - encounterTimes map[uint64]int - // unknown error retry limitation. - // encouter many times error makes Retry to GiveUp. - encounterTimesLimitation int - // whether in backup or restore - scenario string -} - -func NewErrorContext(scenario string, limitation int) *ErrorContext { - return &ErrorContext{ - scenario: scenario, - encounterTimes: make(map[uint64]int), - encounterTimesLimitation: limitation, - } -} - -func NewDefaultContext() *ErrorContext { - return &ErrorContext{ - scenario: "default", - encounterTimes: make(map[uint64]int), - encounterTimesLimitation: 1, - } -} - -func (ec *ErrorContext) HandleError(err *backuppb.Error, uuid uint64) ErrorResult { - if err == nil { - return ErrorResult{RetryStrategy, "unreachable retry"} - } - res := ec.handleErrorPb(err, uuid) - // try the best effort to save progress from error here - if res.Strategy == UnknownStrategy && len(err.Msg) != 0 { - return ec.HandleErrorMsg(err.Msg, uuid) - } - return res -} - -func (ec *ErrorContext) HandleIgnorableError(err *backuppb.Error, uuid uint64) ErrorResult { - if err == nil { - return ErrorResult{RetryStrategy, "unreachable retry"} - } - res := ec.handleIgnorableErrorPb(err, uuid) - // try the best effort to save progress from error here - if res.Strategy == UnknownStrategy && len(err.Msg) != 0 { - return ec.HandleErrorMsg(err.Msg, uuid) - } - return res -} - -func (ec *ErrorContext) HandleErrorMsg(msg string, uuid uint64) ErrorResult { - // UNSAFE! TODO: use meaningful error code instead of unstructured message to find failed to write error. - logger := log.L().With(zap.String("scenario", ec.scenario)) - if messageIsNotFoundStorageError(msg) { - reason := fmt.Sprintf("File or directory not found on TiKV Node (store id: %v). "+ - "work around:please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid.", - uuid) - return ErrorResult{GiveUpStrategy, reason} - } - if messageIsPermissionDeniedStorageError(msg) { - reason := fmt.Sprintf("I/O permission denied error occurs on TiKV Node(store id: %v). 
"+ - "work around:please ensure tikv has permission to read from & write to the storage.", - uuid) - return ErrorResult{GiveUpStrategy, reason} - } - msgLower := strings.ToLower(msg) - if strings.Contains(msgLower, "context canceled") { - return ErrorResult{GiveUpStrategy, "context canceled, give up"} - } - - if MessageIsRetryableStorageError(msg) { - logger.Warn("occur storage error", zap.String("error", msg)) - return ErrorResult{RetryStrategy, "retrable error"} - } - // retry enough on same store - ec.mu.Lock() - defer ec.mu.Unlock() - ec.encounterTimes[uuid]++ - if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation { - return ErrorResult{RetryStrategy, "unknown error, retry it for few times"} - } - return ErrorResult{GiveUpStrategy, "unknown error and retry too many times, give up"} -} - -func (ec *ErrorContext) handleIgnorableErrorPb(e *backuppb.Error, uuid uint64) ErrorResult { - switch e.Detail.(type) { - case *backuppb.Error_KvError: - return ErrorResult{RetryStrategy, "retry outside because the error can be ignored"} - case *backuppb.Error_RegionError: - return ErrorResult{RetryStrategy, "retry outside because the error can be ignored"} - case *backuppb.Error_ClusterIdError: - return ErrorResult{GiveUpStrategy, "cluster ID mismatch"} - } - return ErrorResult{UnknownStrategy, "unreachable code"} -} - -func (ec *ErrorContext) handleErrorPb(e *backuppb.Error, uuid uint64) ErrorResult { - logger := log.L().With(zap.String("scenario", ec.scenario)) - switch v := e.Detail.(type) { - case *backuppb.Error_KvError: - // should not meet error other than KeyLocked. - return ErrorResult{GiveUpStrategy, "unknown kv error"} - - case *backuppb.Error_RegionError: - regionErr := v.RegionError - // Ignore following errors. - if !(regionErr.EpochNotMatch != nil || - regionErr.NotLeader != nil || - regionErr.RegionNotFound != nil || - regionErr.ServerIsBusy != nil || - regionErr.StaleCommand != nil || - regionErr.StoreNotMatch != nil || - regionErr.ReadIndexNotReady != nil || - regionErr.ProposalInMergingMode != nil) { - logger.Error("unexpect region error", zap.Reflect("RegionError", regionErr)) - return ErrorResult{GiveUpStrategy, "unknown kv error"} - } - logger.Warn("occur region error", - zap.Reflect("RegionError", regionErr), - zap.Uint64("uuid", uuid)) - return ErrorResult{RetryStrategy, "retrable error"} - - case *backuppb.Error_ClusterIdError: - logger.Error("occur cluster ID error", zap.Reflect("error", v), zap.Uint64("uuid", uuid)) - return ErrorResult{GiveUpStrategy, "cluster ID mismatch"} - } - return ErrorResult{UnknownStrategy, "unreachable code"} -} - // RetryableFunc presents a retryable operation. type RetryableFunc func() error @@ -281,18 +109,6 @@ func WithRetryReturnLastErr( return lastErr } -// MessageIsRetryableStorageError checks whether the message returning from TiKV is retryable ExternalStorageError. -func MessageIsRetryableStorageError(msg string) bool { - msgLower := strings.ToLower(msg) - // UNSAFE! TODO: Add a error type for retryable connection error. 
- for _, errStr := range retryableServerError { - if strings.Contains(msgLower, errStr) { - return true - } - } - return false -} - func FallBack2CreateTable(err error) bool { switch nerr := errors.Cause(err).(type) { case *terror.Error: diff --git a/br/pkg/utils/retry_test.go b/br/pkg/utils/retry_test.go index 605b59fadd8b8..972f738b4c0c9 100644 --- a/br/pkg/utils/retry_test.go +++ b/br/pkg/utils/retry_test.go @@ -77,34 +77,24 @@ func TestFailNowIf(t *testing.T) { func TestHandleError(t *testing.T) { ec := utils.NewErrorContext("test", 3) // Test case 1: Error is nil - result := ec.HandleError(nil, 123) - require.Equal(t, utils.ErrorResult{utils.RetryStrategy, "unreachable retry"}, result) + result := utils.HandleBackupError(nil, 123, ec) + require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.UnreachableRetryMsg}, result) // Test case 2: Error is KvError and can be ignored kvError := &backuppb.Error_KvError{} - result = ec.HandleIgnorableError(&backuppb.Error{Detail: kvError}, 123) - require.Equal(t, utils.ErrorResult{utils.RetryStrategy, "retry outside because the error can be ignored"}, result) + result = utils.HandleBackupError(&backuppb.Error{Detail: kvError}, 123, ec) + require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryOnKvErrorMsg}, result) - // Test case 3: Error is KvError and cannot be ignored - result = ec.HandleError(&backuppb.Error{Detail: kvError}, 123) - require.Equal(t, utils.ErrorResult{utils.GiveUpStrategy, "unknown kv error"}, result) - - // Test case 4: Error is RegionError and can be ignored + // Test case 3: Error is RegionError and can be ignored regionError := &backuppb.Error_RegionError{ RegionError: &errorpb.Error{NotLeader: &errorpb.NotLeader{RegionId: 1}}} - result = ec.HandleIgnorableError(&backuppb.Error{Detail: regionError}, 123) - require.Equal(t, utils.ErrorResult{utils.RetryStrategy, "retry outside because the error can be ignored"}, result) - - // Test case 5: Error is RegionError and cannot be ignored - regionError = &backuppb.Error_RegionError{ - RegionError: &errorpb.Error{DiskFull: &errorpb.DiskFull{}}} - result = ec.HandleError(&backuppb.Error{Detail: regionError}, 123) - require.Equal(t, utils.ErrorResult{utils.GiveUpStrategy, "unknown kv error"}, result) + result = utils.HandleBackupError(&backuppb.Error{Detail: regionError}, 123, ec) + require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryOnRegionErrorMsg}, result) - // Test case 6: Error is ClusterIdError + // Test case 4: Error is ClusterIdError clusterIdError := &backuppb.Error_ClusterIdError{} - result = ec.HandleError(&backuppb.Error{Detail: clusterIdError}, 123) - require.Equal(t, utils.ErrorResult{utils.GiveUpStrategy, "cluster ID mismatch"}, result) + result = utils.HandleBackupError(&backuppb.Error{Detail: clusterIdError}, 123, ec) + require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: utils.ClusterIdMismatchMsg}, result) } func TestHandleErrorMsg(t *testing.T) { @@ -113,34 +103,34 @@ func TestHandleErrorMsg(t *testing.T) { // Test messageIsNotFoundStorageError msg := "IO: files Notfound error" uuid := uint64(456) - expectedReason := "File or directory not found on TiKV Node (store id: 456). work around:please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid." 
- expectedResult := utils.ErrorResult{utils.GiveUpStrategy, expectedReason} - actualResult := ec.HandleErrorMsg(msg, uuid) + expectedReason := "File or directory not found on TiKV Node (store id: 456). workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid." + expectedResult := utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: expectedReason} + actualResult := utils.HandleUnknownError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test messageIsPermissionDeniedStorageError msg = "I/O permissiondenied error occurs on TiKV Node(store id: 456)." - expectedReason = "I/O permission denied error occurs on TiKV Node(store id: 456). work around:please ensure tikv has permission to read from & write to the storage." - expectedResult = utils.ErrorResult{utils.GiveUpStrategy, expectedReason} - actualResult = ec.HandleErrorMsg(msg, uuid) + expectedReason = "I/O permission denied error occurs on TiKV Node(store id: 456). workaround: please ensure tikv has permission to read from & write to the storage." + expectedResult = utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: expectedReason} + actualResult = utils.HandleUnknownError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test MessageIsRetryableStorageError msg = "server closed" - expectedResult = utils.ErrorResult{utils.RetryStrategy, "retrable error"} - actualResult = ec.HandleErrorMsg(msg, uuid) + expectedResult = utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryableStorageErrorMsg} + actualResult = utils.HandleUnknownError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test unknown error msg = "unknown error" - expectedResult = utils.ErrorResult{utils.RetryStrategy, "unknown error, retry it for few times"} - actualResult = ec.HandleErrorMsg(msg, uuid) + expectedResult = utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryOnUnknownErrorMsg} + actualResult = utils.HandleUnknownError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test retry too many times - _ = ec.HandleErrorMsg(msg, uuid) - _ = ec.HandleErrorMsg(msg, uuid) - expectedResult = utils.ErrorResult{utils.GiveUpStrategy, "unknown error and retry too many times, give up"} - actualResult = ec.HandleErrorMsg(msg, uuid) + _ = utils.HandleUnknownError(msg, uuid, ec) + _ = utils.HandleUnknownError(msg, uuid, ec) + expectedResult = utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: utils.NoRetryOnUnknownErrorMsg} + actualResult = utils.HandleUnknownError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) } From 976ad253430f2b8b142b0300ac5a25b97994727b Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Wed, 17 Jul 2024 13:43:29 -0400 Subject: [PATCH 2/8] refactor test and minor Signed-off-by: Wenqi Mou --- br/pkg/utils/backoff.go | 2 +- br/pkg/utils/error_handling.go | 50 ++++++++++----------- br/pkg/utils/error_handling_test.go | 70 +++++++++++++++++++++++++++++ br/pkg/utils/retry_test.go | 63 -------------------------- 4 files changed, 96 insertions(+), 89 deletions(-) create mode 100644 br/pkg/utils/error_handling_test.go diff --git a/br/pkg/utils/backoff.go b/br/pkg/utils/backoff.go index a3613fea64b1b..ffb58646e1752 100644 --- a/br/pkg/utils/backoff.go +++ b/br/pkg/utils/backoff.go @@ -172,7 +172,7 @@ func (bo *importerBackoffer) NextBackoff(err error) time.Duration { // we don't care storeID here. 
errs := multierr.Errors(err) lastErr := errs[len(errs)-1] - res := HandleUnknownError(lastErr.Error(), 0, bo.errContext) + res := HandleUnknownBackupError(lastErr.Error(), 0, bo.errContext) if res.Strategy == Retry { bo.delayTime = 2 * bo.delayTime bo.attempt-- diff --git a/br/pkg/utils/error_handling.go b/br/pkg/utils/error_handling.go index 8a5b62c904cf7..027ed08146861 100644 --- a/br/pkg/utils/error_handling.go +++ b/br/pkg/utils/error_handling.go @@ -30,23 +30,23 @@ var retryableErrorMsg = []string{ // non-retryable error messages // TODO: remove and mapped all to an error type -var ( +const ( ioMsg = "io" notFoundMsg = "notfound" permissionDeniedMsg = "permissiondenied" ) // error messages -var ( - UnreachableRetryMsg = "unreachable retry" - RetryOnKvErrorMsg = "retry on kv error" - RetryOnRegionErrorMsg = "retry on region error" - ClusterIdMismatchMsg = "cluster id mismatch" - UnknownErrorMsg = "unknown error" - ContextCancelledMsg = "context canceled" - RetryOnUnknownErrorMsg = "unknown error, retry it for a few times" - NoRetryOnUnknownErrorMsg = "unknown error, retried too many times, give up" - RetryableStorageErrorMsg = "retryable storage error" +const ( + unreachableRetryMsg = "unreachable retry" + retryOnKvErrorMsg = "retry on kv error" + retryOnRegionErrorMsg = "retry on region error" + clusterIdMismatchMsg = "cluster id mismatch" + unknownErrorMsg = "unknown error" + contextCancelledMsg = "context canceled" + retryOnUnknownErrorMsg = "unknown error, retry it for a few times" + noRetryOnUnknownErrorMsg = "unknown error, retried too many times, give up" + retryableStorageErrorMsg = "retryable storage error" ) type ErrorHandlingResult struct { @@ -98,12 +98,12 @@ func NewDefaultContext() *ErrorContext { func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) ErrorHandlingResult { if err == nil { - return ErrorHandlingResult{Retry, UnreachableRetryMsg} + return ErrorHandlingResult{Retry, unreachableRetryMsg} } res := handleBackupProtoError(err) // try the best effort handle unknown error based on their error message if res.Strategy == Unknown && len(err.Msg) != 0 { - return HandleUnknownError(err.Msg, storeId, ec) + return HandleUnknownBackupError(err.Msg, storeId, ec) } return res } @@ -111,17 +111,17 @@ func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) Er func handleBackupProtoError(e *backuppb.Error) ErrorHandlingResult { switch e.Detail.(type) { case *backuppb.Error_KvError: - return ErrorHandlingResult{Retry, RetryOnKvErrorMsg} + return ErrorHandlingResult{Retry, retryOnKvErrorMsg} case *backuppb.Error_RegionError: - return ErrorHandlingResult{Retry, RetryOnRegionErrorMsg} + return ErrorHandlingResult{Retry, retryOnRegionErrorMsg} case *backuppb.Error_ClusterIdError: - return ErrorHandlingResult{GiveUp, ClusterIdMismatchMsg} + return ErrorHandlingResult{GiveUp, clusterIdMismatchMsg} } - return ErrorHandlingResult{Unknown, UnknownErrorMsg} + return ErrorHandlingResult{Unknown, unknownErrorMsg} } -// HandleUnknownError TODO: remove this method and map all the current unknown errors to an error type -func HandleUnknownError(msg string, uuid uint64, ec *ErrorContext) ErrorHandlingResult { +// HandleUnknownBackupError TODO: remove this method and map all the current unknown errors to an error type +func HandleUnknownBackupError(msg string, uuid uint64, ec *ErrorContext) ErrorHandlingResult { // UNSAFE! TODO: use meaningful error code instead of unstructured message to find failed to write error. 
logger := log.L().With(zap.String("description", ec.description)) if messageIsNotFoundStorageError(msg) { @@ -137,13 +137,13 @@ func HandleUnknownError(msg string, uuid uint64, ec *ErrorContext) ErrorHandling return ErrorHandlingResult{GiveUp, reason} } msgLower := strings.ToLower(msg) - if strings.Contains(msgLower, ContextCancelledMsg) { - return ErrorHandlingResult{GiveUp, ContextCancelledMsg} + if strings.Contains(msgLower, contextCancelledMsg) { + return ErrorHandlingResult{GiveUp, contextCancelledMsg} } if MessageIsRetryableStorageError(msg) { - logger.Warn(RetryableStorageErrorMsg, zap.String("error", msg)) - return ErrorHandlingResult{Retry, RetryableStorageErrorMsg} + logger.Warn(retryableStorageErrorMsg, zap.String("error", msg)) + return ErrorHandlingResult{Retry, retryableStorageErrorMsg} } // retry enough on same store @@ -151,9 +151,9 @@ func HandleUnknownError(msg string, uuid uint64, ec *ErrorContext) ErrorHandling defer ec.mu.Unlock() ec.encounterTimes[uuid]++ if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation { - return ErrorHandlingResult{Retry, RetryOnUnknownErrorMsg} + return ErrorHandlingResult{Retry, retryOnUnknownErrorMsg} } - return ErrorHandlingResult{GiveUp, NoRetryOnUnknownErrorMsg} + return ErrorHandlingResult{GiveUp, noRetryOnUnknownErrorMsg} } // messageIsNotFoundStorageError checks whether the message returning from TiKV is "NotFound" storage I/O error diff --git a/br/pkg/utils/error_handling_test.go b/br/pkg/utils/error_handling_test.go new file mode 100644 index 0000000000000..7c82972b56690 --- /dev/null +++ b/br/pkg/utils/error_handling_test.go @@ -0,0 +1,70 @@ +package utils + +import ( + backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/kvproto/pkg/errorpb" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestHandleError(t *testing.T) { + ec := NewErrorContext("test", 3) + // Test case 1: Error is nil + result := HandleBackupError(nil, 123, ec) + require.Equal(t, ErrorHandlingResult{Strategy: Retry, Reason: unreachableRetryMsg}, result) + + // Test case 2: Error is KvError and can be ignored + kvError := &backuppb.Error_KvError{} + result = HandleBackupError(&backuppb.Error{Detail: kvError}, 123, ec) + require.Equal(t, ErrorHandlingResult{Strategy: Retry, Reason: retryOnKvErrorMsg}, result) + + // Test case 3: Error is RegionError and can be ignored + regionError := &backuppb.Error_RegionError{ + RegionError: &errorpb.Error{NotLeader: &errorpb.NotLeader{RegionId: 1}}} + result = HandleBackupError(&backuppb.Error{Detail: regionError}, 123, ec) + require.Equal(t, ErrorHandlingResult{Strategy: Retry, Reason: retryOnRegionErrorMsg}, result) + + // Test case 4: Error is ClusterIdError + clusterIdError := &backuppb.Error_ClusterIdError{} + result = HandleBackupError(&backuppb.Error{Detail: clusterIdError}, 123, ec) + require.Equal(t, ErrorHandlingResult{Strategy: GiveUp, Reason: clusterIdMismatchMsg}, result) +} + +func TestHandleErrorMsg(t *testing.T) { + ec := NewErrorContext("test", 3) + + // Test messageIsNotFoundStorageError + msg := "IO: files Notfound error" + uuid := uint64(456) + expectedReason := "File or directory not found on TiKV Node (store id: 456). workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid." 
+ expectedResult := ErrorHandlingResult{Strategy: GiveUp, Reason: expectedReason} + actualResult := HandleUnknownBackupError(msg, uuid, ec) + require.Equal(t, expectedResult, actualResult) + + // Test messageIsPermissionDeniedStorageError + msg = "I/O permissiondenied error occurs on TiKV Node(store id: 456)." + expectedReason = "I/O permission denied error occurs on TiKV Node(store id: 456). workaround: please ensure tikv has permission to read from & write to the storage." + expectedResult = ErrorHandlingResult{Strategy: GiveUp, Reason: expectedReason} + actualResult = HandleUnknownBackupError(msg, uuid, ec) + require.Equal(t, expectedResult, actualResult) + + // Test MessageIsRetryableStorageError + msg = "server closed" + expectedResult = ErrorHandlingResult{Strategy: Retry, Reason: retryableStorageErrorMsg} + actualResult = HandleUnknownBackupError(msg, uuid, ec) + require.Equal(t, expectedResult, actualResult) + + // Test unknown error + msg = "unknown error" + expectedResult = ErrorHandlingResult{Strategy: Retry, Reason: retryOnUnknownErrorMsg} + actualResult = HandleUnknownBackupError(msg, uuid, ec) + require.Equal(t, expectedResult, actualResult) + + // Test retry too many times + _ = HandleUnknownBackupError(msg, uuid, ec) + _ = HandleUnknownBackupError(msg, uuid, ec) + expectedResult = ErrorHandlingResult{Strategy: GiveUp, Reason: noRetryOnUnknownErrorMsg} + actualResult = HandleUnknownBackupError(msg, uuid, ec) + require.Equal(t, expectedResult, actualResult) +} diff --git a/br/pkg/utils/retry_test.go b/br/pkg/utils/retry_test.go index 972f738b4c0c9..c2afe35f47741 100644 --- a/br/pkg/utils/retry_test.go +++ b/br/pkg/utils/retry_test.go @@ -9,8 +9,6 @@ import ( "time" "github.com/pingcap/errors" - backuppb "github.com/pingcap/kvproto/pkg/brpb" - "github.com/pingcap/kvproto/pkg/errorpb" berrors "github.com/pingcap/tidb/br/pkg/errors" "github.com/pingcap/tidb/br/pkg/utils" "github.com/stretchr/testify/require" @@ -73,64 +71,3 @@ func TestFailNowIf(t *testing.T) { assert.Equal(time.Duration(0), bo.NextBackoff(annotatedErr)) assert.Equal(0, bo.Attempt()) } - -func TestHandleError(t *testing.T) { - ec := utils.NewErrorContext("test", 3) - // Test case 1: Error is nil - result := utils.HandleBackupError(nil, 123, ec) - require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.UnreachableRetryMsg}, result) - - // Test case 2: Error is KvError and can be ignored - kvError := &backuppb.Error_KvError{} - result = utils.HandleBackupError(&backuppb.Error{Detail: kvError}, 123, ec) - require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryOnKvErrorMsg}, result) - - // Test case 3: Error is RegionError and can be ignored - regionError := &backuppb.Error_RegionError{ - RegionError: &errorpb.Error{NotLeader: &errorpb.NotLeader{RegionId: 1}}} - result = utils.HandleBackupError(&backuppb.Error{Detail: regionError}, 123, ec) - require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryOnRegionErrorMsg}, result) - - // Test case 4: Error is ClusterIdError - clusterIdError := &backuppb.Error_ClusterIdError{} - result = utils.HandleBackupError(&backuppb.Error{Detail: clusterIdError}, 123, ec) - require.Equal(t, utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: utils.ClusterIdMismatchMsg}, result) -} - -func TestHandleErrorMsg(t *testing.T) { - ec := utils.NewErrorContext("test", 3) - - // Test messageIsNotFoundStorageError - msg := "IO: files Notfound error" - uuid := uint64(456) - expectedReason := "File or 
directory not found on TiKV Node (store id: 456). workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid." - expectedResult := utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: expectedReason} - actualResult := utils.HandleUnknownError(msg, uuid, ec) - require.Equal(t, expectedResult, actualResult) - - // Test messageIsPermissionDeniedStorageError - msg = "I/O permissiondenied error occurs on TiKV Node(store id: 456)." - expectedReason = "I/O permission denied error occurs on TiKV Node(store id: 456). workaround: please ensure tikv has permission to read from & write to the storage." - expectedResult = utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: expectedReason} - actualResult = utils.HandleUnknownError(msg, uuid, ec) - require.Equal(t, expectedResult, actualResult) - - // Test MessageIsRetryableStorageError - msg = "server closed" - expectedResult = utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryableStorageErrorMsg} - actualResult = utils.HandleUnknownError(msg, uuid, ec) - require.Equal(t, expectedResult, actualResult) - - // Test unknown error - msg = "unknown error" - expectedResult = utils.ErrorHandlingResult{Strategy: utils.Retry, Reason: utils.RetryOnUnknownErrorMsg} - actualResult = utils.HandleUnknownError(msg, uuid, ec) - require.Equal(t, expectedResult, actualResult) - - // Test retry too many times - _ = utils.HandleUnknownError(msg, uuid, ec) - _ = utils.HandleUnknownError(msg, uuid, ec) - expectedResult = utils.ErrorHandlingResult{Strategy: utils.GiveUp, Reason: utils.NoRetryOnUnknownErrorMsg} - actualResult = utils.HandleUnknownError(msg, uuid, ec) - require.Equal(t, expectedResult, actualResult) -} From f917357442a4abed06a2de0d1edbaf907a002a22 Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Wed, 17 Jul 2024 14:09:51 -0400 Subject: [PATCH 3/8] fix bazel Signed-off-by: Wenqi Mou --- br/pkg/utils/BUILD.bazel | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/br/pkg/utils/BUILD.bazel b/br/pkg/utils/BUILD.bazel index 1b0272f37d956..6dd301cdf5946 100644 --- a/br/pkg/utils/BUILD.bazel +++ b/br/pkg/utils/BUILD.bazel @@ -7,10 +7,10 @@ go_library( "db.go", "dyn_pprof_other.go", "dyn_pprof_unix.go", + "error_handling.go" "json.go", "key.go", "misc.go", - "permission.go", "pointer.go", "pprof.go", "progress.go", @@ -68,6 +68,7 @@ go_test( srcs = [ "backoff_test.go", "db_test.go", + "error_handling_test.go" "json_test.go", "key_test.go", "main_test.go", From 597dd7151413f9cd8d384c8ff9c7fbce315a22df Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Wed, 17 Jul 2024 16:44:38 -0400 Subject: [PATCH 4/8] fix bazel Signed-off-by: Wenqi Mou --- br/pkg/utils/BUILD.bazel | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/br/pkg/utils/BUILD.bazel b/br/pkg/utils/BUILD.bazel index 6dd301cdf5946..0f4ceade484e6 100644 --- a/br/pkg/utils/BUILD.bazel +++ b/br/pkg/utils/BUILD.bazel @@ -7,7 +7,7 @@ go_library( "db.go", "dyn_pprof_other.go", "dyn_pprof_unix.go", - "error_handling.go" + "error_handling.go", "json.go", "key.go", "misc.go", @@ -68,7 +68,7 @@ go_test( srcs = [ "backoff_test.go", "db_test.go", - "error_handling_test.go" + "error_handling_test.go", "json_test.go", "key_test.go", "main_test.go", From 06426693aad597c16e1f7c4aef8649bd69aa2dff Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Wed, 17 Jul 2024 18:19:41 -0400 Subject: [PATCH 5/8] annotate TODO Signed-off-by: Wenqi Mou --- br/pkg/utils/error_handling.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) 
diff --git a/br/pkg/utils/error_handling.go b/br/pkg/utils/error_handling.go index 027ed08146861..26d24fda851cc 100644 --- a/br/pkg/utils/error_handling.go +++ b/br/pkg/utils/error_handling.go @@ -9,7 +9,7 @@ import ( "sync" ) -// TODO: remove and mapped all to an error type +// UNSAFE! TODO: remove and map them to error types var retryableErrorMsg = []string{ "server closed", "connection refused", @@ -29,7 +29,7 @@ var retryableErrorMsg = []string{ } // non-retryable error messages -// TODO: remove and mapped all to an error type +// UNSAFE! TODO: remove and map them to error types const ( ioMsg = "io" notFoundMsg = "notfound" @@ -120,7 +120,7 @@ func handleBackupProtoError(e *backuppb.Error) ErrorHandlingResult { return ErrorHandlingResult{Unknown, unknownErrorMsg} } -// HandleUnknownBackupError TODO: remove this method and map all the current unknown errors to an error type +// HandleUnknownBackupError UNSAFE! TODO: remove this method and map all the current unknown errors to error types func HandleUnknownBackupError(msg string, uuid uint64, ec *ErrorContext) ErrorHandlingResult { // UNSAFE! TODO: use meaningful error code instead of unstructured message to find failed to write error. logger := log.L().With(zap.String("description", ec.description)) From 62b39d0d4827a8c57a50ba7f0609274ef3ab4c7f Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Thu, 18 Jul 2024 13:14:23 -0400 Subject: [PATCH 6/8] use goimports, add license Signed-off-by: Wenqi Mou --- br/pkg/utils/error_handling.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/br/pkg/utils/error_handling.go b/br/pkg/utils/error_handling.go index 26d24fda851cc..f6d3c0abb13d6 100644 --- a/br/pkg/utils/error_handling.go +++ b/br/pkg/utils/error_handling.go @@ -1,12 +1,27 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package utils import ( "fmt" + "strings" + "sync" + backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" "go.uber.org/zap" - "strings" - "sync" ) // UNSAFE! TODO: remove and map them to error types From 6b994c9fed322b3bb6f1a59648b5fb266edbfaf9 Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Thu, 18 Jul 2024 14:03:34 -0400 Subject: [PATCH 7/8] fix import in test.... 
Signed-off-by: Wenqi Mou --- br/pkg/utils/error_handling_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/br/pkg/utils/error_handling_test.go b/br/pkg/utils/error_handling_test.go index 7c82972b56690..a4c098f528ada 100644 --- a/br/pkg/utils/error_handling_test.go +++ b/br/pkg/utils/error_handling_test.go @@ -1,10 +1,10 @@ package utils import ( - backuppb "github.com/pingcap/kvproto/pkg/brpb" - "github.com/pingcap/kvproto/pkg/errorpb" "testing" + backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/kvproto/pkg/errorpb" "github.com/stretchr/testify/require" ) From 7e3a7e9c978c0ba09d05d02ed596bfbdcdfbbd47 Mon Sep 17 00:00:00 2001 From: Wenqi Mou Date: Mon, 22 Jul 2024 12:14:09 -0400 Subject: [PATCH 8/8] rename enum to be more meaningful Signed-off-by: Wenqi Mou --- br/pkg/backup/client.go | 2 +- br/pkg/utils/backoff.go | 2 +- br/pkg/utils/error_handling.go | 36 ++++++++++++++--------------- br/pkg/utils/error_handling_test.go | 18 +++++++-------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/br/pkg/backup/client.go b/br/pkg/backup/client.go index 8125d21f8848a..602838ba61f21 100644 --- a/br/pkg/backup/client.go +++ b/br/pkg/backup/client.go @@ -1237,7 +1237,7 @@ func (bc *Client) OnBackupResponse( } res := utils.HandleBackupError(errPb, storeID, errContext) switch res.Strategy { - case utils.GiveUp: + case utils.StrategyGiveUp: errMsg := res.Reason if len(errMsg) <= 0 { errMsg = errPb.Msg diff --git a/br/pkg/utils/backoff.go b/br/pkg/utils/backoff.go index ffb58646e1752..89c6196ad953d 100644 --- a/br/pkg/utils/backoff.go +++ b/br/pkg/utils/backoff.go @@ -173,7 +173,7 @@ func (bo *importerBackoffer) NextBackoff(err error) time.Duration { errs := multierr.Errors(err) lastErr := errs[len(errs)-1] res := HandleUnknownBackupError(lastErr.Error(), 0, bo.errContext) - if res.Strategy == Retry { + if res.Strategy == StrategyRetry { bo.delayTime = 2 * bo.delayTime bo.attempt-- } else { diff --git a/br/pkg/utils/error_handling.go b/br/pkg/utils/error_handling.go index f6d3c0abb13d6..6c6c84a2a1884 100644 --- a/br/pkg/utils/error_handling.go +++ b/br/pkg/utils/error_handling.go @@ -72,16 +72,16 @@ type ErrorHandlingResult struct { type ErrorHandlingStrategy int const ( - // Retry error can be retried but will consume the backoff attempt quota. - Retry ErrorHandlingStrategy = iota - // GiveUp means unrecoverable error happened and the BR should exit + // StrategyRetry error can be retried but will consume the backoff attempt quota. + StrategyRetry ErrorHandlingStrategy = iota + // StrategyGiveUp means unrecoverable error happened and the BR should exit // for example: // 1. permission not valid. // 2. data not found. // 3. 
retry too many times - GiveUp - // Unknown for Unknown error - Unknown + StrategyGiveUp + // StrategyUnknown for StrategyUnknown error + StrategyUnknown ) type ErrorContext struct { @@ -113,11 +113,11 @@ func NewDefaultContext() *ErrorContext { func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) ErrorHandlingResult { if err == nil { - return ErrorHandlingResult{Retry, unreachableRetryMsg} + return ErrorHandlingResult{StrategyRetry, unreachableRetryMsg} } res := handleBackupProtoError(err) // try the best effort handle unknown error based on their error message - if res.Strategy == Unknown && len(err.Msg) != 0 { + if res.Strategy == StrategyUnknown && len(err.Msg) != 0 { return HandleUnknownBackupError(err.Msg, storeId, ec) } return res @@ -126,13 +126,13 @@ func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) Er func handleBackupProtoError(e *backuppb.Error) ErrorHandlingResult { switch e.Detail.(type) { case *backuppb.Error_KvError: - return ErrorHandlingResult{Retry, retryOnKvErrorMsg} + return ErrorHandlingResult{StrategyRetry, retryOnKvErrorMsg} case *backuppb.Error_RegionError: - return ErrorHandlingResult{Retry, retryOnRegionErrorMsg} + return ErrorHandlingResult{StrategyRetry, retryOnRegionErrorMsg} case *backuppb.Error_ClusterIdError: - return ErrorHandlingResult{GiveUp, clusterIdMismatchMsg} + return ErrorHandlingResult{StrategyGiveUp, clusterIdMismatchMsg} } - return ErrorHandlingResult{Unknown, unknownErrorMsg} + return ErrorHandlingResult{StrategyUnknown, unknownErrorMsg} } // HandleUnknownBackupError UNSAFE! TODO: remove this method and map all the current unknown errors to error types @@ -143,22 +143,22 @@ func HandleUnknownBackupError(msg string, uuid uint64, ec *ErrorContext) ErrorHa reason := fmt.Sprintf("File or directory not found on TiKV Node (store id: %v). "+ "workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid.", uuid) - return ErrorHandlingResult{GiveUp, reason} + return ErrorHandlingResult{StrategyGiveUp, reason} } if messageIsPermissionDeniedStorageError(msg) { reason := fmt.Sprintf("I/O permission denied error occurs on TiKV Node(store id: %v). 
"+ "workaround: please ensure tikv has permission to read from & write to the storage.", uuid) - return ErrorHandlingResult{GiveUp, reason} + return ErrorHandlingResult{StrategyGiveUp, reason} } msgLower := strings.ToLower(msg) if strings.Contains(msgLower, contextCancelledMsg) { - return ErrorHandlingResult{GiveUp, contextCancelledMsg} + return ErrorHandlingResult{StrategyGiveUp, contextCancelledMsg} } if MessageIsRetryableStorageError(msg) { logger.Warn(retryableStorageErrorMsg, zap.String("error", msg)) - return ErrorHandlingResult{Retry, retryableStorageErrorMsg} + return ErrorHandlingResult{StrategyRetry, retryableStorageErrorMsg} } // retry enough on same store @@ -166,9 +166,9 @@ func HandleUnknownBackupError(msg string, uuid uint64, ec *ErrorContext) ErrorHa defer ec.mu.Unlock() ec.encounterTimes[uuid]++ if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation { - return ErrorHandlingResult{Retry, retryOnUnknownErrorMsg} + return ErrorHandlingResult{StrategyRetry, retryOnUnknownErrorMsg} } - return ErrorHandlingResult{GiveUp, noRetryOnUnknownErrorMsg} + return ErrorHandlingResult{StrategyGiveUp, noRetryOnUnknownErrorMsg} } // messageIsNotFoundStorageError checks whether the message returning from TiKV is "NotFound" storage I/O error diff --git a/br/pkg/utils/error_handling_test.go b/br/pkg/utils/error_handling_test.go index a4c098f528ada..6b82983843560 100644 --- a/br/pkg/utils/error_handling_test.go +++ b/br/pkg/utils/error_handling_test.go @@ -12,23 +12,23 @@ func TestHandleError(t *testing.T) { ec := NewErrorContext("test", 3) // Test case 1: Error is nil result := HandleBackupError(nil, 123, ec) - require.Equal(t, ErrorHandlingResult{Strategy: Retry, Reason: unreachableRetryMsg}, result) + require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: unreachableRetryMsg}, result) // Test case 2: Error is KvError and can be ignored kvError := &backuppb.Error_KvError{} result = HandleBackupError(&backuppb.Error{Detail: kvError}, 123, ec) - require.Equal(t, ErrorHandlingResult{Strategy: Retry, Reason: retryOnKvErrorMsg}, result) + require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnKvErrorMsg}, result) // Test case 3: Error is RegionError and can be ignored regionError := &backuppb.Error_RegionError{ RegionError: &errorpb.Error{NotLeader: &errorpb.NotLeader{RegionId: 1}}} result = HandleBackupError(&backuppb.Error{Detail: regionError}, 123, ec) - require.Equal(t, ErrorHandlingResult{Strategy: Retry, Reason: retryOnRegionErrorMsg}, result) + require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnRegionErrorMsg}, result) // Test case 4: Error is ClusterIdError clusterIdError := &backuppb.Error_ClusterIdError{} result = HandleBackupError(&backuppb.Error{Detail: clusterIdError}, 123, ec) - require.Equal(t, ErrorHandlingResult{Strategy: GiveUp, Reason: clusterIdMismatchMsg}, result) + require.Equal(t, ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: clusterIdMismatchMsg}, result) } func TestHandleErrorMsg(t *testing.T) { @@ -38,33 +38,33 @@ func TestHandleErrorMsg(t *testing.T) { msg := "IO: files Notfound error" uuid := uint64(456) expectedReason := "File or directory not found on TiKV Node (store id: 456). workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid." 
- expectedResult := ErrorHandlingResult{Strategy: GiveUp, Reason: expectedReason} + expectedResult := ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: expectedReason} actualResult := HandleUnknownBackupError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test messageIsPermissionDeniedStorageError msg = "I/O permissiondenied error occurs on TiKV Node(store id: 456)." expectedReason = "I/O permission denied error occurs on TiKV Node(store id: 456). workaround: please ensure tikv has permission to read from & write to the storage." - expectedResult = ErrorHandlingResult{Strategy: GiveUp, Reason: expectedReason} + expectedResult = ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: expectedReason} actualResult = HandleUnknownBackupError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test MessageIsRetryableStorageError msg = "server closed" - expectedResult = ErrorHandlingResult{Strategy: Retry, Reason: retryableStorageErrorMsg} + expectedResult = ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryableStorageErrorMsg} actualResult = HandleUnknownBackupError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test unknown error msg = "unknown error" - expectedResult = ErrorHandlingResult{Strategy: Retry, Reason: retryOnUnknownErrorMsg} + expectedResult = ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnUnknownErrorMsg} actualResult = HandleUnknownBackupError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) // Test retry too many times _ = HandleUnknownBackupError(msg, uuid, ec) _ = HandleUnknownBackupError(msg, uuid, ec) - expectedResult = ErrorHandlingResult{Strategy: GiveUp, Reason: noRetryOnUnknownErrorMsg} + expectedResult = ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: noRetryOnUnknownErrorMsg} actualResult = HandleUnknownBackupError(msg, uuid, ec) require.Equal(t, expectedResult, actualResult) }
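
Below is a minimal usage sketch of the refactored entry point after this series. The classifyStoreError helper and the main scaffolding are illustrative only and are not part of any commit; the utils names it calls (NewErrorContext, HandleBackupError, StrategyGiveUp, StrategyRetry) are the ones introduced and renamed in the patches above, and the routing mirrors the OnBackupResponse change in patch 1/8.

    package main

    import (
    	"fmt"

    	backuppb "github.com/pingcap/kvproto/pkg/brpb"
    	"github.com/pingcap/tidb/br/pkg/utils"
    )

    // classifyStoreError routes a per-store backup error the same way
    // OnBackupResponse does after this series: StrategyGiveUp aborts the
    // backup with the reported reason, anything else leaves the range to be
    // retried by the caller's backoffer.
    func classifyStoreError(errPb *backuppb.Error, storeID uint64, ec *utils.ErrorContext) error {
    	res := utils.HandleBackupError(errPb, storeID, ec)
    	if res.Strategy == utils.StrategyGiveUp {
    		return fmt.Errorf("backup failed on store %d: %s", storeID, res.Reason)
    	}
    	return nil
    }

    func main() {
    	// Allow at most 3 unknown errors per store before giving up, as in the tests.
    	ec := utils.NewErrorContext("backup", 3)
    	// "server closed" matches the retryable storage-error messages, so the
    	// result is StrategyRetry and no error is returned here.
    	fmt.Println(classifyStoreError(&backuppb.Error{Msg: "server closed"}, 1, ec))
    }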