Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Internal] Availability: Adds partition level retry logic for request timeout #2446

Merged
merged 9 commits into from
May 6, 2021
57 changes: 47 additions & 10 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,15 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
// Received Connection error (HttpRequestException), initiate the endpoint rediscovery
if (exception is HttpRequestException _)
{
DefaultTrace.TraceWarning("Endpoint not reachable. Refresh cache and retry");
DefaultTrace.TraceWarning("ClientRetryPolicy: Gateway HttpRequestException Endpoint not reachable. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark both read and write requests because it gateway exception.
// This means all requests going to the region will fail.
return await this.ShouldRetryOnEndpointFailureAsync(
isReadRequest: this.isReadRequest,
markBothReadAndWriteAsUnavailable: true,
forceRefresh: false,
retryOnPreferredLocations: true);
}
Expand Down Expand Up @@ -152,19 +158,36 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
return null;
}

// Received request timeout
if (statusCode == HttpStatusCode.RequestTimeout)
j82w marked this conversation as resolved.
Show resolved Hide resolved
{
DefaultTrace.TraceWarning("ClientRetryPolicy: RequestTimeout. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark the partition key range as unavailable to retry future request on a new region.
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
j82w marked this conversation as resolved.
Show resolved Hide resolved
this.documentServiceRequest);
}

// Received 403.3 on write region, initiate the endpoint rediscovery
if (statusCode == HttpStatusCode.Forbidden
&& subStatusCode == SubStatusCodes.WriteForbidden)
{
// It's a write forbidden so it safe to retry
if (this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest))
{
return ShouldRetryResult.RetryAfter(TimeSpan.Zero);
}

DefaultTrace.TraceWarning("Endpoint not writable. Refresh cache and retry");
DefaultTrace.TraceWarning("ClientRetryPolicy: Endpoint not writable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

return await this.ShouldRetryOnEndpointFailureAsync(
isReadRequest: false,
markBothReadAndWriteAsUnavailable: false,
forceRefresh: true,
retryOnPreferredLocations: false);
}
Expand All @@ -174,9 +197,13 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
&& subStatusCode == SubStatusCodes.DatabaseAccountNotFound
&& (this.isReadRequest || this.canUseMultipleWriteLocations))
{
DefaultTrace.TraceWarning("Endpoint not available for reads. Refresh cache and retry");
DefaultTrace.TraceWarning("ClientRetryPolicy: Endpoint not available for reads. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

return await this.ShouldRetryOnEndpointFailureAsync(
isReadRequest: this.isReadRequest,
markBothReadAndWriteAsUnavailable: false,
forceRefresh: false,
retryOnPreferredLocations: false);
}
Expand All @@ -191,6 +218,12 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
if (statusCode == HttpStatusCode.ServiceUnavailable
&& subStatusCode == SubStatusCodes.Unknown)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);

Expand All @@ -202,24 +235,28 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(

private async Task<ShouldRetryResult> ShouldRetryOnEndpointFailureAsync(
bool isReadRequest,
bool markBothReadAndWriteAsUnavailable,
bool forceRefresh,
bool retryOnPreferredLocations)
{
if (!this.enableEndpointDiscovery || this.failoverRetryCount > MaxRetryCount)
{
DefaultTrace.TraceInformation("ShouldRetryOnEndpointFailureAsync() Not retrying. Retry count = {0}", this.failoverRetryCount);
DefaultTrace.TraceInformation("ClientRetryPolicy: ShouldRetryOnEndpointFailureAsync() Not retrying. Retry count = {0}, Endpoint = {1}",
this.failoverRetryCount,
this.locationEndpoint?.ToString() ?? string.Empty);
return ShouldRetryResult.NoRetry();
}

this.failoverRetryCount++;

if (this.locationEndpoint != null)
{
if (isReadRequest)
if (isReadRequest || markBothReadAndWriteAsUnavailable)
{
this.globalEndpointManager.MarkEndpointUnavailableForRead(this.locationEndpoint);
}
else

if (!isReadRequest || markBothReadAndWriteAsUnavailable)
{
this.globalEndpointManager.MarkEndpointUnavailableForWrite(this.locationEndpoint);
}
Expand All @@ -228,7 +265,7 @@ private async Task<ShouldRetryResult> ShouldRetryOnEndpointFailureAsync(
TimeSpan retryDelay = TimeSpan.Zero;
if (!isReadRequest)
{
DefaultTrace.TraceInformation("Failover happening. retryCount {0}", this.failoverRetryCount);
DefaultTrace.TraceInformation("ClientRetryPolicy: Failover happening. retryCount {0}", this.failoverRetryCount);

if (this.failoverRetryCount > 1)
{
Expand Down Expand Up @@ -320,7 +357,7 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
{
if (this.serviceUnavailableRetryCount++ >= ClientRetryPolicy.MaxServiceUnavailableRetryCount)
{
DefaultTrace.TraceInformation($"ShouldRetryOnServiceUnavailable() Not retrying. Retry count = {this.serviceUnavailableRetryCount}.");
DefaultTrace.TraceInformation($"ClientRetryPolicy: ShouldRetryOnServiceUnavailable() Not retrying. Retry count = {this.serviceUnavailableRetryCount}.");
return ShouldRetryResult.NoRetry();
}

Expand All @@ -336,11 +373,11 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
if (availablePreferredLocations <= 1)
{
// No other regions to retry on
DefaultTrace.TraceInformation($"ShouldRetryOnServiceUnavailable() Not retrying. No other regions available for the request. AvailablePreferredLocations = {availablePreferredLocations}.");
DefaultTrace.TraceInformation($"ClientRetryPolicy: ShouldRetryOnServiceUnavailable() Not retrying. No other regions available for the request. AvailablePreferredLocations = {availablePreferredLocations}.");
return ShouldRetryResult.NoRetry();
}

DefaultTrace.TraceInformation($"ShouldRetryOnServiceUnavailable() Retrying. Received on endpoint {this.locationEndpoint}, IsReadRequest = {this.isReadRequest}.");
DefaultTrace.TraceInformation($"ClientRetryPolicy: ShouldRetryOnServiceUnavailable() Retrying. Received on endpoint {this.locationEndpoint}, IsReadRequest = {this.isReadRequest}.");

// Retrying on second PreferredLocations
// RetryCount is used as zero-based index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -642,19 +642,28 @@ await BackoffRetryUtility<bool>.ExecuteAsync(
}

[DataTestMethod]
[DataRow(true, false, false, false, DisplayName = "Read request - Single master - no preferred locations - should NOT retry")]
[DataRow(false, false, false, false, DisplayName = "Write request - Single master - no preferred locations - should NOT retry")]
[DataRow(true, true, false, false, DisplayName = "Read request - Multi master - no preferred locations - should NOT retry")]
[DataRow(false, true, false, false, DisplayName = "Write request - Multi master - no preferred locations - should NOT retry")]
[DataRow(true, false, true, true, DisplayName = "Read request - Single master - with preferred locations - should retry")]
[DataRow(false, false, true, false, DisplayName = "Write request - Single master - with preferred locations - should NOT retry")]
[DataRow(true, true, true, true, DisplayName = "Read request - Multi master - with preferred locations - should retry")]
[DataRow(false, true, true, true, DisplayName = "Write request - Multi master - with preferred locations - should retry")]
[DataRow(true, false, false, false, false, DisplayName = "Read request - Single master - no preferred locations - should NOT retry")]
[DataRow(false, false, false, false, false, DisplayName = "Write request - Single master - no preferred locations - should NOT retry")]
[DataRow(true, true, false, false, false, DisplayName = "Read request - Multi master - no preferred locations - should NOT retry")]
[DataRow(false, true, false, false, false, DisplayName = "Write request - Multi master - no preferred locations - should NOT retry")]
[DataRow(true, false, true, true, false, DisplayName = "Read request - Single master - with preferred locations - should retry")]
[DataRow(false, false, true, false, false, DisplayName = "Write request - Single master - with preferred locations - should NOT retry")]
[DataRow(true, true, true, true, false, DisplayName = "Read request - Multi master - with preferred locations - should retry")]
[DataRow(false, true, true, true, false, DisplayName = "Write request - Multi master - with preferred locations - should retry")]
[DataRow(true, false, false, false, true, DisplayName = "Read request - Single master - no preferred locations - should NOT retry")]
[DataRow(false, false, false, false, true, DisplayName = "Write request - Single master - no preferred locations - should NOT retry")]
[DataRow(true, true, false, false, true, DisplayName = "Read request - Multi master - no preferred locations - should NOT retry")]
[DataRow(false, true, false, false, true, DisplayName = "Write request - Multi master - no preferred locations - should NOT retry")]
[DataRow(true, false, true, true, true, DisplayName = "Read request - Single master - with preferred locations - should retry")]
[DataRow(false, false, true, false, true, DisplayName = "Write request - Single master - with preferred locations - should NOT retry")]
[DataRow(true, true, true, true, true, DisplayName = "Read request - Multi master - with preferred locations - should retry")]
[DataRow(false, true, true, true, true, DisplayName = "Write request - Multi master - with preferred locations - should retry")]
public async Task ClientRetryPolicy_ValidateRetryOnServiceUnavailable(
bool isReadRequest,
bool useMultipleWriteLocations,
bool usesPreferredLocations,
bool shouldHaveRetried)
bool shouldHaveRetried,
bool enablePartitionLevelFailover)
{
const bool enableEndpointDiscovery = true;

Expand All @@ -667,6 +676,7 @@ public async Task ClientRetryPolicy_ValidateRetryOnServiceUnavailable(
useMultipleWriteLocations: useMultipleWriteLocations,
enableEndpointDiscovery: enableEndpointDiscovery,
isPreferredLocationsListEmpty: !usesPreferredLocations,
enablePartitionLevelFailover: enablePartitionLevelFailover,
preferedRegionListOverride: preferredList,
enforceSingleMasterSingleWriteLocation: true);

Expand Down Expand Up @@ -765,7 +775,8 @@ private void Initialize(
bool enableEndpointDiscovery,
bool isPreferredLocationsListEmpty,
bool enforceSingleMasterSingleWriteLocation = false, // Some tests depend on the Initialize to create an account with multiple write locations, even when not multi master
ReadOnlyCollection<string> preferedRegionListOverride = null)
ReadOnlyCollection<string> preferedRegionListOverride = null,
bool enablePartitionLevelFailover = false)
{
this.databaseAccount = LocationCacheTests.CreateDatabaseAccount(
useMultipleWriteLocations,
Expand Down Expand Up @@ -811,7 +822,15 @@ private void Initialize(
}

this.endpointManager = new GlobalEndpointManager(mockedClient.Object, connectionPolicy);
this.partitionKeyRangeLocationCache = GlobalPartitionEndpointManagerNoOp.Instance;

if (enablePartitionLevelFailover)
{
this.partitionKeyRangeLocationCache = new GlobalPartitionEndpointManagerCore(this.endpointManager);
}
else
{
this.partitionKeyRangeLocationCache = GlobalPartitionEndpointManagerNoOp.Instance;
}
}

private async Task ValidateLocationCacheAsync(
Expand Down
Loading