From d08fcd1da105ff3f9d3f6d17710ebbb88cd40b7d Mon Sep 17 00:00:00 2001 From: Modular Magician Date: Fri, 18 Feb 2022 00:43:08 +0000 Subject: [PATCH] Retry GCE's (new?) 403 ReadRequests errors (#5723) Signed-off-by: Modular Magician --- .changelog/5723.txt | 3 +++ google/error_retry_predicates.go | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 .changelog/5723.txt diff --git a/.changelog/5723.txt b/.changelog/5723.txt new file mode 100644 index 00000000000..f75e6b75c77 --- /dev/null +++ b/.changelog/5723.txt @@ -0,0 +1,3 @@ +```release-note:enhancement +provider: added retries for `ReadRequest` errors incorrectly coded as `403` errors, particularly in Google Compute Engine +``` diff --git a/google/error_retry_predicates.go b/google/error_retry_predicates.go index 63907c5305b..01b473dad89 100644 --- a/google/error_retry_predicates.go +++ b/google/error_retry_predicates.go @@ -32,11 +32,20 @@ var defaultErrorRetryPredicates = []RetryErrorPredicateFunc{ // Keeping it as a default for now. is409OperationInProgressError, + // GCE Error codes- we don't have a way to add these to all GCE resources + // easily, so add them globally. + // GCE Subnetworks are considered unready for a brief period when certain // operations are performed on them, and the scope is likely too broad to // apply a mutex. If we attempt an operation w/ an unready subnetwork, retry // it. isSubnetworkUnreadyError, + + // As of February 2022 GCE seems to have added extra quota enforcement on + // reads, causing significant failure for our CI and for large customers. + // GCE returns the wrong error code, as this should be a 429, which we retry + // already. + is403ReadRequestsForMinuteError, } /** END GLOBAL ERROR RETRY PREDICATES HERE **/ @@ -116,6 +125,21 @@ func isSubnetworkUnreadyError(err error) (bool, string) { return false, "" } +// GCE (and possibly other APIs) incorrectly return a 403 rather than a 429 on +// rate limits. 
+func is403ReadRequestsForMinuteError(err error) (bool, string) {
+	gerr, ok := err.(*googleapi.Error) // plain type assertion: an error that merely wraps a *googleapi.Error is not unwrapped here
+	if !ok {
+		return false, "" // not a *googleapi.Error, so this predicate does not match
+	}
+
+	if gerr.Code == 403 && strings.Contains(gerr.Body, "Quota exceeded for quota metric") && strings.Contains(gerr.Body, "Read requests per minute") { // needs the 403 code and both phrases in the body
+		log.Printf("[DEBUG] Dismissed an error as retryable based on error code 403 and error message 'Quota exceeded for quota metric' on metric `Read requests per minute`: %s", err)
+		return true, "Read requests per minute" // presumably the string is a reason label for the caller; confirm against RetryErrorPredicateFunc
+	}
+	return false, "" // any other code, or a 403 missing either phrase, falls through as no match
+}
+
 // Retry on comon googleapi error codes for retryable errors.
 // TODO(#5609): This may not need to be applied globally - figure out
 // what retryable error codes apply to which API.