Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make reading Kubeconfig from DCP resilient #3132

Merged
merged 12 commits into from
Mar 26, 2024
7 changes: 7 additions & 0 deletions src/Aspire.Hosting/Dcp/DcpOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ internal sealed class DcpOptions
/// </summary>
public bool? RandomizePorts { get; set; }

public int KubernetesConfigReadRetryCount { get; set; } = 10;

public int KubernetesConfigReadRetryIntervalSeconds { get; set; } = 3;

public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptions, IConfiguration dcpPublisherConfiguration, IConfiguration publishingConfiguration, IConfiguration coreConfiguration)
{
string? publisher = publishingConfiguration[nameof(PublishingOptions.Publisher)];
Expand Down Expand Up @@ -125,6 +129,9 @@ public void ApplyApplicationConfiguration(DistributedApplicationOptions appOptio
DependencyCheckTimeout = coreConfiguration.GetValue<int>("DOTNET_ASPIRE_DEPENDENCY_CHECK_TIMEOUT", DependencyCheckTimeout);
}

KubernetesConfigReadRetryCount = dcpPublisherConfiguration.GetValue<int>(nameof(KubernetesConfigReadRetryCount), KubernetesConfigReadRetryCount);
KubernetesConfigReadRetryIntervalSeconds = dcpPublisherConfiguration.GetValue<int>(nameof(KubernetesConfigReadRetryIntervalSeconds), KubernetesConfigReadRetryIntervalSeconds);

if (!string.IsNullOrEmpty(dcpPublisherConfiguration[nameof(ResourceNameSuffix)]))
{
ResourceNameSuffix = dcpPublisherConfiguration[nameof(ResourceNameSuffix)];
Expand Down
83 changes: 75 additions & 8 deletions src/Aspire.Hosting/Dcp/KubernetesService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
using k8s;
using k8s.Exceptions;
using k8s.Models;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Polly;
using Polly.Retry;

namespace Aspire.Hosting.Dcp;

Expand Down Expand Up @@ -39,7 +43,7 @@ Task<Stream> GetLogStreamAsync<T>(
CancellationToken cancellationToken = default) where T : CustomResource;
}

internal sealed class KubernetesService(Locations locations) : IKubernetesService, IDisposable
internal sealed class KubernetesService(ILogger<KubernetesService> logger, IOptions<DcpOptions> dcpOptions, Locations locations) : IKubernetesService, IDisposable
{
private static readonly TimeSpan s_initialRetryDelay = TimeSpan.FromMilliseconds(100);
private static GroupVersion GroupVersion => Model.Dcp.GroupVersion;
Expand Down Expand Up @@ -212,6 +216,7 @@ public Task<Stream> GetLogStreamAsync<T>(

public void Dispose()
{
_kubeconfigReadSemaphore?.Dispose();
_kubernetes?.Dispose();
}

Expand Down Expand Up @@ -254,7 +259,7 @@ private async Task<TResult> ExecuteWithRetry<TResult>(
{
try
{
EnsureKubernetes();
await EnsureKubernetesAsync(cancellationToken).ConfigureAwait(false);
return await operation(_kubernetes!).ConfigureAwait(false);
}
catch (Exception e) when (IsRetryable(e))
Expand All @@ -280,16 +285,78 @@ private async Task<TResult> ExecuteWithRetry<TResult>(

private static bool IsRetryable(Exception ex) => ex is HttpRequestException || ex is KubeConfigException;

private void EnsureKubernetes()
private readonly SemaphoreSlim _kubeconfigReadSemaphore = new(1);

private async Task EnsureKubernetesAsync(CancellationToken cancellationToken = default)
{
if (_kubernetes != null) { return; }
// Return early before waiting for the semaphore if we can.
if (_kubernetes != null)
{
logger.LogDebug("Kubernetes ensured at first chance shortcut.");
return;
}

while (!await _kubeconfigReadSemaphore.WaitAsync(100, cancellationToken).ConfigureAwait(false))
{
logger.LogDebug("Waiting for semaphore to read kubeconfig.");
}
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved

// Return late after getting the semaphore but also probably after
// another thread has gotten the config loaded.

lock (Model.Dcp.Schema)
if (_kubernetes != null)
{
if (_kubernetes != null) { return; }
logger.LogDebug("Kubernetes ensured at second chance shortcut.");
_kubeconfigReadSemaphore.Release();
return;
}

var config = KubernetesClientConfiguration.BuildConfigFromConfigFile(kubeconfigPath: locations.DcpKubeconfigPath, useRelativePaths: false);
_kubernetes = new DcpKubernetesClient(config);
try
{
// This retry was created in relation to this comment in GitHub:
//
// https://github.com/dotnet/aspire/issues/2422#issuecomment-2016701083
//
// It looks like it is possible for us to attempt to read the file before it is written/finished
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved
// being written. We rely on DCP to write the configuration file but it may happen in parallel to
// this code executing is its possible the file does not exist, or is still being written by
// the time we get to it.
//
// This retry will retry reading the file 5 times (by default, but configurable) with a pause
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved
// of 3 seconds between each attempt. This means it could take up to 15 seconds to fail. We emit
// debug level logs for each retry attempt should we need to help a customer debug this.
var configurationReadRetry = new RetryStrategyOptions()
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved
{
ShouldHandle = new PredicateBuilder().Handle<KubeConfigException>(),
BackoffType = DelayBackoffType.Constant,
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved
MaxRetryAttempts = dcpOptions.Value.KubernetesConfigReadRetryCount,
MaxDelay = TimeSpan.FromSeconds(dcpOptions.Value.KubernetesConfigReadRetryIntervalSeconds),
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved
OnRetry = (retry) =>
{
logger.LogDebug(
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved
retry.Outcome.Exception,
"Reading Kubernetes configuration file from '{DcpKubeconfigPath}' failed. Retry pending. (iteration {Iteration}).",
locations.DcpKubeconfigPath,
retry.AttemptNumber
);
return ValueTask.CompletedTask;
}
};
var pipeline = new ResiliencePipelineBuilder().AddRetry(configurationReadRetry).Build();
mitchdenny marked this conversation as resolved.
Show resolved Hide resolved

_kubernetes = await pipeline.ExecuteAsync<DcpKubernetesClient>(async (cancellationToken) =>
{
logger.LogDebug("Reading Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath);
var fileInfo = new FileInfo(locations.DcpKubeconfigPath);
var config = await KubernetesClientConfiguration.BuildConfigFromConfigFileAsync(kubeconfig: fileInfo, useRelativePaths: false).ConfigureAwait(false);
logger.LogDebug("Successfully read Kubernetes configuration from '{DcpKubeconfigPath}'.", locations.DcpKubeconfigPath);

return new DcpKubernetesClient(config);
}, cancellationToken).ConfigureAwait(false);
}
finally
{
_kubeconfigReadSemaphore.Release();
}
}
}