From ebf25d6669414b633c4013f309bbcc84e879314b Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Mon, 25 Mar 2024 14:58:50 +1100 Subject: [PATCH 01/11] Retry reading kubeconfig. --- src/Aspire.Hosting/Dcp/DcpOptions.cs | 7 ++++ src/Aspire.Hosting/Dcp/KubernetesService.cs | 37 ++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/Aspire.Hosting/Dcp/DcpOptions.cs b/src/Aspire.Hosting/Dcp/DcpOptions.cs index 0e8226c5b4..feffbf18af 100644 --- a/src/Aspire.Hosting/Dcp/DcpOptions.cs +++ b/src/Aspire.Hosting/Dcp/DcpOptions.cs @@ -77,6 +77,10 @@ internal sealed class DcpOptions /// public bool? RandomizePorts { get; set; } + public int KubernetesConfigReadRetryCount { get; set; } = 10; + + public int KubernetesConfigReadRetryIntervalSeconds { get; set; } = 1; + public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptions, IConfiguration dcpPublisherConfiguration, IConfiguration publishingConfiguration, IConfiguration coreConfiguration) { string? publisher = publishingConfiguration[nameof(PublishingOptions.Publisher)]; @@ -125,6 +129,9 @@ public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptio DependencyCheckTimeout = coreConfiguration.GetValue("DOTNET_ASPIRE_DEPENDENCY_CHECK_TIMEOUT", DependencyCheckTimeout); } + KubernetesConfigReadRetryCount = dcpPublisherConfiguration.GetValue(nameof(KubernetesConfigReadRetryCount), KubernetesConfigReadRetryCount); + KubernetesConfigReadRetryIntervalSeconds = dcpPublisherConfiguration.GetValue(nameof(KubernetesConfigReadRetryIntervalSeconds), KubernetesConfigReadRetryIntervalSeconds); + if (!string.IsNullOrEmpty(dcpPublisherConfiguration[nameof(ResourceNameSuffix)])) { ResourceNameSuffix = dcpPublisherConfiguration[nameof(ResourceNameSuffix)]; diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index c96b761fcc..59d3a86fb0 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -7,6 +7,9 @@ using k8s; using k8s.Exceptions; using k8s.Models; +using Microsoft.Extensions.Logging; +using Polly; +using Polly.Retry; namespace Aspire.Hosting.Dcp; @@ -39,7 +42,7 @@ Task GetLogStreamAsync( CancellationToken cancellationToken = default) where T : CustomResource; } -internal sealed class KubernetesService(Locations locations) : IKubernetesService, IDisposable +internal sealed class KubernetesService(ILogger logger, DcpOptions dcpOptions, Locations locations) : IKubernetesService, IDisposable { private static readonly TimeSpan s_initialRetryDelay = TimeSpan.FromMilliseconds(100); private static GroupVersion GroupVersion => Model.Dcp.GroupVersion; @@ -280,16 +283,42 @@ private async Task ExecuteWithRetry( private static bool IsRetryable(Exception ex) => ex is HttpRequestException || ex is KubeConfigException; + private readonly object _ensureKubernetesLock = new object(); + private void EnsureKubernetes() { if (_kubernetes != null) { return; } - lock (Model.Dcp.Schema) + lock (_ensureKubernetesLock) { if (_kubernetes != null) { return; } - var config = KubernetesClientConfiguration.BuildConfigFromConfigFile(kubeconfigPath: locations.DcpKubeconfigPath, useRelativePaths: false); - _kubernetes = new DcpKubernetesClient(config); + var configurationReadRetry = new RetryStrategyOptions() + { + ShouldHandle = new PredicateBuilder().Handle(e => e.Message.StartsWith("The process cannot access the file")), + BackoffType = DelayBackoffType.Constant, + MaxRetryAttempts = dcpOptions.KubernetesConfigReadRetryCount, + MaxDelay = TimeSpan.FromSeconds(dcpOptions.KubernetesConfigReadRetryIntervalSeconds), + OnRetry = (retry) => + { + logger.LogDebug( + retry.Outcome.Exception, + "Reading Kubernetes configuration file from '{DcpKubeconfigPath}' failed. Retrying. (iteration {Iteration}).", + locations.DcpKubeconfigPath, + retry.AttemptNumber + ); + return ValueTask.CompletedTask; + } + }; + + var pipeline = new ResiliencePipelineBuilder().AddRetry(configurationReadRetry).Build(); + + pipeline.Execute(() => + { + var config = KubernetesClientConfiguration.BuildConfigFromConfigFile(kubeconfigPath: locations.DcpKubeconfigPath, useRelativePaths: false); + _kubernetes = new DcpKubernetesClient(config); + }); + } } } From ca3897ac78aa6a34c383ce1b83a5dbbe9b2f749a Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Mon, 25 Mar 2024 15:01:48 +1100 Subject: [PATCH 02/11] retry reading kubeconfig --- src/Aspire.Hosting/Dcp/KubernetesService.cs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index 59d3a86fb0..a99014b8f8 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -293,6 +293,18 @@ private void EnsureKubernetes() { if (_kubernetes != null) { return; } + // This retry was created in relation to this comment in GitHub: + // + // https://github.com/dotnet/aspire/issues/2422#issuecomment-2016701083 + // + // It looks like it is possible for us to attempt to read the file before it is written/finished + // being written. We rely on DCP to write the configuration file but it may happen in parallel to + // this code executing is its possible the file does not exist, or is still being written by + // the time we get to it. + // + // This retry will retry reading the file 5 times (by default, but configurable) with a pause + // of 3 seconds between each attempt. This means it could take up to 15 seconds to fail. We emit + // debug level logs for each retry attempt should we need to help a customer debug this. var configurationReadRetry = new RetryStrategyOptions() { ShouldHandle = new PredicateBuilder().Handle(e => e.Message.StartsWith("The process cannot access the file")), @@ -310,7 +322,6 @@ private void EnsureKubernetes() return ValueTask.CompletedTask; } }; - var pipeline = new ResiliencePipelineBuilder().AddRetry(configurationReadRetry).Build(); pipeline.Execute(() => From 4b8a994f0b18b94a44554c58f35ae9c022d5099e Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Mon, 25 Mar 2024 16:28:02 +1100 Subject: [PATCH 03/11] WIP --- playground/TestShop/AppHost/appsettings.json | 2 +- src/Aspire.Hosting/Dcp/KubernetesService.cs | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/playground/TestShop/AppHost/appsettings.json b/playground/TestShop/AppHost/appsettings.json index 31c092aa45..80f87c3b44 100644 --- a/playground/TestShop/AppHost/appsettings.json +++ b/playground/TestShop/AppHost/appsettings.json @@ -3,7 +3,7 @@ "LogLevel": { "Default": "Information", "Microsoft.AspNetCore": "Warning", - "Aspire.Hosting.Dcp": "Warning" + "Aspire.Hosting.Dcp": "Debug" } } } diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index a99014b8f8..817f7b5c7f 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -8,6 +8,7 @@ using k8s.Exceptions; using k8s.Models; using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; using Polly; using Polly.Retry; @@ -42,7 +43,7 @@ Task GetLogStreamAsync( CancellationToken cancellationToken = default) where T : CustomResource; } -internal sealed class KubernetesService(ILogger logger, DcpOptions dcpOptions, Locations locations) : IKubernetesService, IDisposable +internal sealed class KubernetesService(ILogger logger, IOptions dcpOptions, Locations locations) : IKubernetesService, IDisposable { private static readonly TimeSpan s_initialRetryDelay = TimeSpan.FromMilliseconds(100); private static GroupVersion GroupVersion => Model.Dcp.GroupVersion; @@ -309,8 +310,8 @@ private void EnsureKubernetes() { ShouldHandle = new PredicateBuilder().Handle(e => e.Message.StartsWith("The process cannot access the file")), BackoffType = DelayBackoffType.Constant, - MaxRetryAttempts = dcpOptions.KubernetesConfigReadRetryCount, - MaxDelay = TimeSpan.FromSeconds(dcpOptions.KubernetesConfigReadRetryIntervalSeconds), + MaxRetryAttempts = dcpOptions.Value.KubernetesConfigReadRetryCount, + MaxDelay = TimeSpan.FromSeconds(dcpOptions.Value.KubernetesConfigReadRetryIntervalSeconds), OnRetry = (retry) => { logger.LogDebug( @@ -326,10 +327,11 @@ private void EnsureKubernetes() pipeline.Execute(() => { + logger.LogDebug("Reading Kubernetes configuration from '{DcpKubeconfigPath}' on thread {ThreadId}.", locations.DcpKubeconfigPath, Environment.CurrentManagedThreadId); var config = KubernetesClientConfiguration.BuildConfigFromConfigFile(kubeconfigPath: locations.DcpKubeconfigPath, useRelativePaths: false); + logger.LogDebug("Successfully read Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath); _kubernetes = new DcpKubernetesClient(config); }); - } } } From 47ba8f94d81b43f055ce421e4399c83e03bf83f2 Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Mon, 25 Mar 2024 21:41:54 +1100 Subject: [PATCH 04/11] Use semaphore to control the race and improve logging for diagnostics down the track. --- playground/TestShop/AppHost/appsettings.json | 2 +- src/Aspire.Hosting/Dcp/DcpOptions.cs | 2 +- src/Aspire.Hosting/Dcp/KubernetesService.cs | 53 ++++++++++++++------ 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/playground/TestShop/AppHost/appsettings.json b/playground/TestShop/AppHost/appsettings.json index 80f87c3b44..31c092aa45 100644 --- a/playground/TestShop/AppHost/appsettings.json +++ b/playground/TestShop/AppHost/appsettings.json @@ -3,7 +3,7 @@ "LogLevel": { "Default": "Information", "Microsoft.AspNetCore": "Warning", - "Aspire.Hosting.Dcp": "Debug" + "Aspire.Hosting.Dcp": "Warning" } } } diff --git a/src/Aspire.Hosting/Dcp/DcpOptions.cs b/src/Aspire.Hosting/Dcp/DcpOptions.cs index feffbf18af..e00a128708 100644 --- a/src/Aspire.Hosting/Dcp/DcpOptions.cs +++ b/src/Aspire.Hosting/Dcp/DcpOptions.cs @@ -79,7 +79,7 @@ internal sealed class DcpOptions public int KubernetesConfigReadRetryCount { get; set; } = 10; - public int KubernetesConfigReadRetryIntervalSeconds { get; set; } = 1; + public int KubernetesConfigReadRetryIntervalSeconds { get; set; } = 3; public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptions, IConfiguration dcpPublisherConfiguration, IConfiguration publishingConfiguration, IConfiguration coreConfiguration) { diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index 817f7b5c7f..b0052e8c7b 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -48,7 +48,7 @@ internal sealed class KubernetesService(ILogger logger, IOpti private static readonly TimeSpan s_initialRetryDelay = TimeSpan.FromMilliseconds(100); private static GroupVersion GroupVersion => Model.Dcp.GroupVersion; - private DcpKubernetesClient? _kubernetes; + private volatile DcpKubernetesClient? _kubernetes; public TimeSpan MaxRetryDuration { get; set; } = TimeSpan.FromSeconds(20); @@ -216,6 +216,7 @@ public Task GetLogStreamAsync( public void Dispose() { + _kubeconfigReadSemaphore?.Dispose(); _kubernetes?.Dispose(); } @@ -258,7 +259,7 @@ private async Task ExecuteWithRetry( { try { - EnsureKubernetes(); + await EnsureKubernetesAsync(cancellationToken).ConfigureAwait(false); return await operation(_kubernetes!).ConfigureAwait(false); } catch (Exception e) when (IsRetryable(e)) @@ -284,16 +285,34 @@ private async Task ExecuteWithRetry( private static bool IsRetryable(Exception ex) => ex is HttpRequestException || ex is KubeConfigException; - private readonly object _ensureKubernetesLock = new object(); + private readonly SemaphoreSlim _kubeconfigReadSemaphore = new(1); - private void EnsureKubernetes() + private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = default) { - if (_kubernetes != null) { return; } + // Return early before waiting for the semaphore if we can. + if (_kubernetes != null) + { + logger.LogDebug("Kubernetes ensured at first chance shortcut."); + return; + } - lock (_ensureKubernetesLock) + while (!await _kubeconfigReadSemaphore.WaitAsync(1000, cancellationToken).ConfigureAwait(false)) { - if (_kubernetes != null) { return; } + logger.LogDebug("Waiting for semaphore to read kubeconfig."); + } + + // Return late after getting the semaphore but also probably after + // another thread has gotten the config loaded. + + if (_kubernetes != null) + { + logger.LogDebug("Kubernetes ensured at second chance shortcut."); + _kubeconfigReadSemaphore.Release(); + return; + } + try + { // This retry was created in relation to this comment in GitHub: // // https://github.com/dotnet/aspire/issues/2422#issuecomment-2016701083 @@ -308,7 +327,7 @@ private void EnsureKubernetes() // debug level logs for each retry attempt should we need to help a customer debug this. var configurationReadRetry = new RetryStrategyOptions() { - ShouldHandle = new PredicateBuilder().Handle(e => e.Message.StartsWith("The process cannot access the file")), + ShouldHandle = new PredicateBuilder().Handle(), BackoffType = DelayBackoffType.Constant, MaxRetryAttempts = dcpOptions.Value.KubernetesConfigReadRetryCount, MaxDelay = TimeSpan.FromSeconds(dcpOptions.Value.KubernetesConfigReadRetryIntervalSeconds), @@ -316,7 +335,7 @@ private void EnsureKubernetes() { logger.LogDebug( retry.Outcome.Exception, - "Reading Kubernetes configuration file from '{DcpKubeconfigPath}' failed. Retrying. (iteration {Iteration}).", + "Reading Kubernetes configuration file from '{DcpKubeconfigPath}' failed. Retry pending. (iteration {Iteration}).", locations.DcpKubeconfigPath, retry.AttemptNumber ); @@ -325,13 +344,19 @@ private void EnsureKubernetes() }; var pipeline = new ResiliencePipelineBuilder().AddRetry(configurationReadRetry).Build(); - pipeline.Execute(() => + _kubernetes = await pipeline.ExecuteAsync(async (cancellationToken) => { - logger.LogDebug("Reading Kubernetes configuration from '{DcpKubeconfigPath}' on thread {ThreadId}.", locations.DcpKubeconfigPath, Environment.CurrentManagedThreadId); - var config = KubernetesClientConfiguration.BuildConfigFromConfigFile(kubeconfigPath: locations.DcpKubeconfigPath, useRelativePaths: false); + logger.LogDebug("Reading Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath); + var fileInfo = new FileInfo(locations.DcpKubeconfigPath); + var config = await KubernetesClientConfiguration.BuildConfigFromConfigFileAsync(kubeconfig: fileInfo, useRelativePaths: false).ConfigureAwait(false); logger.LogDebug("Successfully read Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath); - _kubernetes = new DcpKubernetesClient(config); - }); + + return new DcpKubernetesClient(config); + }, cancellationToken).ConfigureAwait(false); + } + finally + { + _kubeconfigReadSemaphore.Release(); } } } From f1a9afb9f20fb66bebaa7e536735be676897d0f0 Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Mon, 25 Mar 2024 21:47:20 +1100 Subject: [PATCH 05/11] Reduce spin interval. --- src/Aspire.Hosting/Dcp/KubernetesService.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index b0052e8c7b..dc8b0866f9 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -296,7 +296,7 @@ private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = d return; } - while (!await _kubeconfigReadSemaphore.WaitAsync(1000, cancellationToken).ConfigureAwait(false)) + while (!await _kubeconfigReadSemaphore.WaitAsync(100, cancellationToken).ConfigureAwait(false)) { logger.LogDebug("Waiting for semaphore to read kubeconfig."); } From 1c49dbac48fbf2f2af7fb63be26826efd72c6be6 Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Mon, 25 Mar 2024 21:58:00 +1100 Subject: [PATCH 06/11] Does not need to be volatile. --- src/Aspire.Hosting/Dcp/KubernetesService.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index dc8b0866f9..7d75d4496a 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -48,7 +48,7 @@ internal sealed class KubernetesService(ILogger logger, IOpti private static readonly TimeSpan s_initialRetryDelay = TimeSpan.FromMilliseconds(100); private static GroupVersion GroupVersion => Model.Dcp.GroupVersion; - private volatile DcpKubernetesClient? _kubernetes; + private DcpKubernetesClient? _kubernetes; public TimeSpan MaxRetryDuration { get; set; } = TimeSpan.FromSeconds(20); From 3e5d7836b9d9c0b8c0006f3f164f9adb0be5f730 Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Tue, 26 Mar 2024 09:29:09 +1100 Subject: [PATCH 07/11] PR feedback. --- src/Aspire.Hosting/Dcp/KubernetesService.cs | 50 +++++++++++++-------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index 7d75d4496a..0d44cb8a60 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -287,6 +287,36 @@ private async Task ExecuteWithRetry( private readonly SemaphoreSlim _kubeconfigReadSemaphore = new(1); + private ResiliencePipeline? _resiliencePipeline; + + private ResiliencePipeline GetReadKubeconfigResiliencePipeline() + { + if (_resiliencePipeline == null) + { + var configurationReadRetry = new RetryStrategyOptions() + { + ShouldHandle = new PredicateBuilder().Handle(), + BackoffType = DelayBackoffType.Constant, + MaxRetryAttempts = dcpOptions.Value.KubernetesConfigReadRetryCount, + MaxDelay = TimeSpan.FromSeconds(dcpOptions.Value.KubernetesConfigReadRetryIntervalSeconds), + OnRetry = (retry) => + { + logger.LogDebug( + retry.Outcome.Exception, + "Reading Kubernetes configuration file from '{DcpKubeconfigPath}' failed. Retry pending. (iteration {Iteration}).", + locations.DcpKubeconfigPath, + retry.AttemptNumber + ); + return ValueTask.CompletedTask; + } + }; + + _resiliencePipeline = new ResiliencePipelineBuilder().AddRetry(configurationReadRetry).Build(); + } + + return _resiliencePipeline; + } + private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = default) { // Return early before waiting for the semaphore if we can. @@ -325,25 +355,7 @@ private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = d // This retry will retry reading the file 5 times (by default, but configurable) with a pause // of 3 seconds between each attempt. This means it could take up to 15 seconds to fail. We emit // debug level logs for each retry attempt should we need to help a customer debug this. - var configurationReadRetry = new RetryStrategyOptions() - { - ShouldHandle = new PredicateBuilder().Handle(), - BackoffType = DelayBackoffType.Constant, - MaxRetryAttempts = dcpOptions.Value.KubernetesConfigReadRetryCount, - MaxDelay = TimeSpan.FromSeconds(dcpOptions.Value.KubernetesConfigReadRetryIntervalSeconds), - OnRetry = (retry) => - { - logger.LogDebug( - retry.Outcome.Exception, - "Reading Kubernetes configuration file from '{DcpKubeconfigPath}' failed. Retry pending. (iteration {Iteration}).", - locations.DcpKubeconfigPath, - retry.AttemptNumber - ); - return ValueTask.CompletedTask; - } - }; - var pipeline = new ResiliencePipelineBuilder().AddRetry(configurationReadRetry).Build(); - + var pipeline = GetReadKubeconfigResiliencePipeline(); _kubernetes = await pipeline.ExecuteAsync(async (cancellationToken) => { logger.LogDebug("Reading Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath); From bd8ac0e72d7c25d4ecc2fb86c6acfdfda7506b6c Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Tue, 26 Mar 2024 09:42:06 +1100 Subject: [PATCH 08/11] Reduce retry interval and increase number of attempts. --- src/Aspire.Hosting/Dcp/DcpOptions.cs | 4 ++-- src/Aspire.Hosting/Dcp/KubernetesService.cs | 15 +++------------ 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/src/Aspire.Hosting/Dcp/DcpOptions.cs b/src/Aspire.Hosting/Dcp/DcpOptions.cs index e00a128708..fda8bccbb8 100644 --- a/src/Aspire.Hosting/Dcp/DcpOptions.cs +++ b/src/Aspire.Hosting/Dcp/DcpOptions.cs @@ -77,9 +77,9 @@ internal sealed class DcpOptions /// public bool? RandomizePorts { get; set; } - public int KubernetesConfigReadRetryCount { get; set; } = 10; + public int KubernetesConfigReadRetryCount { get; set; } = 30; - public int KubernetesConfigReadRetryIntervalSeconds { get; set; } = 3; + public int KubernetesConfigReadRetryIntervalSeconds { get; set; } = 1; public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptions, IConfiguration dcpPublisherConfiguration, IConfiguration publishingConfiguration, IConfiguration coreConfiguration) { diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index 0d44cb8a60..edd40ce583 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -343,18 +343,9 @@ private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = d try { - // This retry was created in relation to this comment in GitHub: - // - // https://github.com/dotnet/aspire/issues/2422#issuecomment-2016701083 - // - // It looks like it is possible for us to attempt to read the file before it is written/finished - // being written. We rely on DCP to write the configuration file but it may happen in parallel to - // this code executing is its possible the file does not exist, or is still being written by - // the time we get to it. - // - // This retry will retry reading the file 5 times (by default, but configurable) with a pause - // of 3 seconds between each attempt. This means it could take up to 15 seconds to fail. We emit - // debug level logs for each retry attempt should we need to help a customer debug this. + // We retry reading the kubeconfig file because DCP takes a few moments to write + // it to disk. This retry pipeline will only be invoked by a single thread the + // rest will be held at the semaphore. var pipeline = GetReadKubeconfigResiliencePipeline(); _kubernetes = await pipeline.ExecuteAsync(async (cancellationToken) => { From a93fc952762e5c6f453d96142d3397834ab864dc Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Tue, 26 Mar 2024 15:06:38 +1100 Subject: [PATCH 09/11] Adjusted logging and interval to be less noisy but also a bit more responsive. --- src/Aspire.Hosting/Dcp/DcpOptions.cs | 6 ++--- src/Aspire.Hosting/Dcp/KubernetesService.cs | 28 ++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/Aspire.Hosting/Dcp/DcpOptions.cs b/src/Aspire.Hosting/Dcp/DcpOptions.cs index 5c8eb89b2d..ff09df1d79 100644 --- a/src/Aspire.Hosting/Dcp/DcpOptions.cs +++ b/src/Aspire.Hosting/Dcp/DcpOptions.cs @@ -77,9 +77,9 @@ internal sealed class DcpOptions /// public bool? RandomizePorts { get; set; } - public int KubernetesConfigReadRetryCount { get; set; } = 30; + public int KubernetesConfigReadRetryCount { get; set; } = 300; - public int KubernetesConfigReadRetryIntervalSeconds { get; set; } = 1; + public int KubernetesConfigReadRetryIntervalMilliseconds { get; set; } = 100; public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptions, IConfiguration dcpPublisherConfiguration, IConfiguration publishingConfiguration, IConfiguration coreConfiguration) { @@ -130,7 +130,7 @@ public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptio } KubernetesConfigReadRetryCount = dcpPublisherConfiguration.GetValue(nameof(KubernetesConfigReadRetryCount), KubernetesConfigReadRetryCount); - KubernetesConfigReadRetryIntervalSeconds = dcpPublisherConfiguration.GetValue(nameof(KubernetesConfigReadRetryIntervalSeconds), KubernetesConfigReadRetryIntervalSeconds); + KubernetesConfigReadRetryIntervalMilliseconds = dcpPublisherConfiguration.GetValue(nameof(KubernetesConfigReadRetryIntervalMilliseconds), KubernetesConfigReadRetryIntervalMilliseconds); if (!string.IsNullOrEmpty(dcpPublisherConfiguration[nameof(ResourceNameSuffix)])) { diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index edd40ce583..33fd700f72 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Collections.Immutable; +using System.Diagnostics; using System.Runtime.CompilerServices; using Aspire.Hosting.Dcp.Model; using k8s; @@ -298,12 +299,11 @@ private ResiliencePipeline GetReadKubeconfigResiliencePipeline() ShouldHandle = new PredicateBuilder().Handle(), BackoffType = DelayBackoffType.Constant, MaxRetryAttempts = dcpOptions.Value.KubernetesConfigReadRetryCount, - MaxDelay = TimeSpan.FromSeconds(dcpOptions.Value.KubernetesConfigReadRetryIntervalSeconds), + MaxDelay = TimeSpan.FromMilliseconds(dcpOptions.Value.KubernetesConfigReadRetryIntervalMilliseconds), OnRetry = (retry) => { logger.LogDebug( - retry.Outcome.Exception, - "Reading Kubernetes configuration file from '{DcpKubeconfigPath}' failed. Retry pending. (iteration {Iteration}).", + "Waiting for Kubernetes configuration file at '{DcpKubeconfigPath}' (attempt {Iteration}).", locations.DcpKubeconfigPath, retry.AttemptNumber ); @@ -322,21 +322,13 @@ private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = d // Return early before waiting for the semaphore if we can. if (_kubernetes != null) { - logger.LogDebug("Kubernetes ensured at first chance shortcut."); return; } - while (!await _kubeconfigReadSemaphore.WaitAsync(100, cancellationToken).ConfigureAwait(false)) - { - logger.LogDebug("Waiting for semaphore to read kubeconfig."); - } - - // Return late after getting the semaphore but also probably after - // another thread has gotten the config loaded. + await _kubeconfigReadSemaphore.WaitAsync(-1, cancellationToken).ConfigureAwait(false); if (_kubernetes != null) { - logger.LogDebug("Kubernetes ensured at second chance shortcut."); _kubeconfigReadSemaphore.Release(); return; } @@ -346,13 +338,21 @@ private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = d // We retry reading the kubeconfig file because DCP takes a few moments to write // it to disk. This retry pipeline will only be invoked by a single thread the // rest will be held at the semaphore. + var readStopwatch = new Stopwatch(); + readStopwatch.Start(); + var pipeline = GetReadKubeconfigResiliencePipeline(); _kubernetes = await pipeline.ExecuteAsync(async (cancellationToken) => { - logger.LogDebug("Reading Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath); var fileInfo = new FileInfo(locations.DcpKubeconfigPath); var config = await KubernetesClientConfiguration.BuildConfigFromConfigFileAsync(kubeconfig: fileInfo, useRelativePaths: false).ConfigureAwait(false); - logger.LogDebug("Successfully read Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath); + readStopwatch.Stop(); + + logger.LogDebug( + "Successfully read Kubernetes configuration from '{DcpKubeconfigPath}' after {DurationMs} milliseconds.", + locations.DcpKubeconfigPath, + readStopwatch.ElapsedMilliseconds + ); return new DcpKubernetesClient(config); }, cancellationToken).ConfigureAwait(false); From 52cd00821368a5fe807c67a764b9c428eb2b6320 Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Tue, 26 Mar 2024 20:48:57 +1100 Subject: [PATCH 10/11] PR feedback. --- src/Aspire.Hosting/Dcp/KubernetesService.cs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index 33fd700f72..f65cf85189 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -327,14 +327,15 @@ private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = d await _kubeconfigReadSemaphore.WaitAsync(-1, cancellationToken).ConfigureAwait(false); - if (_kubernetes != null) - { - _kubeconfigReadSemaphore.Release(); - return; - } try { + // Second chance shortcut if multiple threads got caught. + if (_kubernetes != null) + { + return; + } + // We retry reading the kubeconfig file because DCP takes a few moments to write // it to disk. This retry pipeline will only be invoked by a single thread the // rest will be held at the semaphore. From d417dc17d7ab2876d2f886d404ddc6e71deac335 Mon Sep 17 00:00:00 2001 From: Mitch Denny Date: Tue, 26 Mar 2024 20:56:07 +1100 Subject: [PATCH 11/11] Excess whitespace. --- src/Aspire.Hosting/Dcp/KubernetesService.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Aspire.Hosting/Dcp/KubernetesService.cs b/src/Aspire.Hosting/Dcp/KubernetesService.cs index f65cf85189..1d88327bdd 100644 --- a/src/Aspire.Hosting/Dcp/KubernetesService.cs +++ b/src/Aspire.Hosting/Dcp/KubernetesService.cs @@ -327,7 +327,6 @@ private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = d await _kubeconfigReadSemaphore.WaitAsync(-1, cancellationToken).ConfigureAwait(false); - try { // Second chance shortcut if multiple threads got caught.