From b5f9e660986773571a20c9a1accc461535ba4f92 Mon Sep 17 00:00:00 2001 From: Reuben Bond <203839+ReubenBond@users.noreply.github.com> Date: Wed, 16 Oct 2024 09:26:14 -0700 Subject: [PATCH] Strong consistency, distributed, in-memory grain directory (#9103) --- Directory.Packages.props | 16 +- Orleans.sln | 24 + .../ChaoticCluster.AppHost.csproj | 23 + .../ChaoticCluster.AppHost/Program.cs | 10 + .../Properties/launchSettings.json | 29 + .../appsettings.Development.json | 8 + .../ChaoticCluster.AppHost/appsettings.json | 9 + .../ChaoticCluster.ServiceDefaults.csproj | 22 + .../Extensions.cs | 111 +++ .../ChaoticCluster.Silo.csproj | 19 + .../ChaoticCluster.Silo/Program.cs | 151 ++++ .../SiloBuilderConfigurator.cs | 26 + .../Core/IGrainBase.cs | 8 + .../IDs/GrainAddress.cs | 21 +- .../Runtime/MembershipVersion.cs | 2 +- .../Diagnostics/MessagingTrace.cs | 2 + .../Metrics/DirectoryInstruments.cs | 50 +- .../Diagnostics/Metrics/InstrumentNames.cs | 6 + src/Orleans.Core/Networking/Connection.cs | 1 + src/Orleans.Core/Runtime/Constants.cs | 3 + src/Orleans.Core/Runtime/IRuntimeClient.cs | 2 +- .../Runtime/OutsideRuntimeClient.cs | 2 +- .../Statistics/EnvironmentStatistics.cs | 38 +- .../Catalog/ActivationCollector.cs | 90 +- src/Orleans.Runtime/Catalog/ActivationData.cs | 99 +-- .../Catalog/ActivationDirectory.cs | 5 +- .../Catalog/ActivationWorkingSet.cs | 2 + src/Orleans.Runtime/Catalog/Catalog.cs | 134 +-- .../Catalog}/ICatalog.cs | 1 - .../Core/InsideRuntimeClient.cs | 2 +- .../DirectoryMembershipService.cs | 92 +++ .../DirectoryMembershipSnapshot.cs | 306 +++++++ .../GrainDirectory/DirectoryResult.cs | 32 + .../DistributedGrainDirectory.cs | 381 +++++++++ .../GrainDirectoryHandoffManager.cs | 4 +- .../GrainDirectoryPartitionSnapshot.cs | 16 + .../GrainDirectoryReplica.Interface.cs | 112 +++ .../GrainDirectory/GrainDirectoryReplica.cs | 771 ++++++++++++++++++ .../GrainDirectory/GrainDirectoryResolver.cs | 4 +- .../GrainDirectory/GrainLocatorResolver.cs | 2 +- 
.../IGrainDirectoryPartition.cs | 39 + .../GrainDirectory/LocalGrainDirectory.cs | 4 +- ...ion.cs => LocalGrainDirectoryPartition.cs} | 8 +- .../GrainDirectory/RemoteGrainDirectory.cs | 2 +- .../GrainDirectory/RingRange.cs | 241 ++++++ .../GrainDirectory/RingRangeCollection.cs | 224 +++++ .../Hosting/CoreHostingExtensions.cs | 37 +- .../Hosting/DefaultSiloServices.cs | 2 +- .../ClusterMembershipService.cs | 10 +- .../ClusterMembershipSnapshot.cs | 2 + .../InMemoryMembershipTable.cs | 24 +- .../LocalSiloHealthMonitor.cs | 1 + .../SystemTargetBasedMembershipTable.cs | 13 +- .../Messaging/MessageCenter.cs | 42 +- .../Networking/GatewayInboundConnection.cs | 3 +- .../Networking/SiloConnection.cs | 15 +- .../Scheduler/ClosureWorkItem.cs | 24 + .../Scheduler/SchedulerExtensions.cs | 8 + .../Scheduler/WorkItemGroup.cs | 5 +- src/Orleans.Runtime/Silo/SiloControl.cs | 38 +- .../Utilities/SearchAlgorithms.cs | 94 +++ .../Utilities/StripedMpscBuffer.cs | 1 - .../ConfigureDistributedGrainDirectory.cs | 10 + .../InProcess/InProcessMembershipTable.cs | 2 +- src/Orleans.TestingHost/TestCluster.cs | 12 +- src/Orleans.TestingHost/TestClusterBuilder.cs | 1 + .../TestClusterHostFactory.cs | 2 +- test/DefaultCluster.Tests/ObserverTests.cs | 2 +- test/Directory.Build.props | 4 + .../RedisGrainDirectoryTests.cs | 2 +- .../AzureGrainDirectoryTests.cs | 20 +- test/Grains/TestInternalGrains/TimerGrain.cs | 42 +- .../DirectoryMembershipSnapshotTests.cs | 123 +++ .../Directory/RingRangeCollectionTests.cs | 142 ++++ .../NonSilo.Tests/Directory/RingRangeTests.cs | 183 +++++ test/NonSilo.Tests/NonSilo.Tests.csproj | 1 + .../OrleansTaskSchedulerBasicTests.cs | 2 +- .../Orleans.Serialization.FSharp.Tests.fsproj | 4 +- .../Tester/Directories/GrainDirectoryTests.cs | 254 +++--- .../ConsistentRingProviderTests_Silo.cs | 7 +- .../DistributedGrainDirectoryTests.cs | 22 + .../GrainDirectoryResilienceTests.cs | 182 +++++ .../GrainDirectoryPartitionTests.cs | 4 +- .../ConsistentRingProviderTests.cs | 51 
+- .../Hosting/TransactionTestExtensions.cs | 6 +- 85 files changed, 4060 insertions(+), 491 deletions(-) create mode 100644 playground/ChaoticCluster/ChaoticCluster.AppHost/ChaoticCluster.AppHost.csproj create mode 100644 playground/ChaoticCluster/ChaoticCluster.AppHost/Program.cs create mode 100644 playground/ChaoticCluster/ChaoticCluster.AppHost/Properties/launchSettings.json create mode 100644 playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.Development.json create mode 100644 playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.json create mode 100644 playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/ChaoticCluster.ServiceDefaults.csproj create mode 100644 playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/Extensions.cs create mode 100644 playground/ChaoticCluster/ChaoticCluster.Silo/ChaoticCluster.Silo.csproj create mode 100644 playground/ChaoticCluster/ChaoticCluster.Silo/Program.cs create mode 100644 playground/ChaoticCluster/ChaoticCluster.Silo/SiloBuilderConfigurator.cs rename src/{Orleans.Core/SystemTargetInterfaces => Orleans.Runtime/Catalog}/ICatalog.cs (99%) create mode 100644 src/Orleans.Runtime/GrainDirectory/DirectoryMembershipService.cs create mode 100644 src/Orleans.Runtime/GrainDirectory/DirectoryMembershipSnapshot.cs create mode 100644 src/Orleans.Runtime/GrainDirectory/DirectoryResult.cs create mode 100644 src/Orleans.Runtime/GrainDirectory/DistributedGrainDirectory.cs create mode 100644 src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartitionSnapshot.cs create mode 100644 src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.Interface.cs create mode 100644 src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.cs create mode 100644 src/Orleans.Runtime/GrainDirectory/IGrainDirectoryPartition.cs rename src/Orleans.Runtime/GrainDirectory/{GrainDirectoryPartition.cs => LocalGrainDirectoryPartition.cs} (97%) create mode 100644 src/Orleans.Runtime/GrainDirectory/RingRange.cs create mode 100644 
src/Orleans.Runtime/GrainDirectory/RingRangeCollection.cs create mode 100644 src/Orleans.Runtime/Utilities/SearchAlgorithms.cs create mode 100644 src/Orleans.TestingHost/ConfigureDistributedGrainDirectory.cs create mode 100644 test/NonSilo.Tests/Directory/DirectoryMembershipSnapshotTests.cs create mode 100644 test/NonSilo.Tests/Directory/RingRangeCollectionTests.cs create mode 100644 test/NonSilo.Tests/Directory/RingRangeTests.cs create mode 100644 test/TesterInternal/GrainDirectory/DistributedGrainDirectoryTests.cs create mode 100644 test/TesterInternal/GrainDirectory/GrainDirectoryResilienceTests.cs diff --git a/Directory.Packages.props b/Directory.Packages.props index 656513e07f..a084e077cc 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -51,10 +51,17 @@ - - - - + + + + + + + + + + + @@ -68,6 +75,7 @@ + diff --git a/Orleans.sln b/Orleans.sln index 0df56b5714..d5e8786f4a 100644 --- a/Orleans.sln +++ b/Orleans.sln @@ -242,6 +242,14 @@ Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "Orleans.Serialization.FShar EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Orleans.Serialization.MessagePack", "src\Orleans.Serialization.MessagePack\Orleans.Serialization.MessagePack.csproj", "{F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ChaoticCluster", "ChaoticCluster", "{2579A7F6-EBE8-485A-BB20-A5D19DB5612B}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ChaoticCluster.AppHost", "playground\ChaoticCluster\ChaoticCluster.AppHost\ChaoticCluster.AppHost.csproj", "{4E79EC4B-2DC4-41E3-9AE6-17C1FFF17B02}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ChaoticCluster.Silo", "playground\ChaoticCluster\ChaoticCluster.Silo\ChaoticCluster.Silo.csproj", "{76A549FA-69F1-4967-82B6-161A8B52C86B}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ChaoticCluster.ServiceDefaults", 
"playground\ChaoticCluster\ChaoticCluster.ServiceDefaults\ChaoticCluster.ServiceDefaults.csproj", "{4004A79F-B6BB-4472-891B-AD1348AE3E93}" +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TestSerializerExternalModels", "test\Misc\TestSerializerExternalModels\TestSerializerExternalModels.csproj", "{5D587DDE-036D-4694-A314-8DDF270AC031}" EndProject Global @@ -634,6 +642,18 @@ Global {F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}.Debug|Any CPU.Build.0 = Debug|Any CPU {F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}.Release|Any CPU.ActiveCfg = Release|Any CPU {F50F81B6-E9B5-4143-B66B-A1AD913F6E9C}.Release|Any CPU.Build.0 = Release|Any CPU + {4E79EC4B-2DC4-41E3-9AE6-17C1FFF17B02}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4E79EC4B-2DC4-41E3-9AE6-17C1FFF17B02}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4E79EC4B-2DC4-41E3-9AE6-17C1FFF17B02}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4E79EC4B-2DC4-41E3-9AE6-17C1FFF17B02}.Release|Any CPU.Build.0 = Release|Any CPU + {76A549FA-69F1-4967-82B6-161A8B52C86B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {76A549FA-69F1-4967-82B6-161A8B52C86B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {76A549FA-69F1-4967-82B6-161A8B52C86B}.Release|Any CPU.ActiveCfg = Release|Any CPU + {76A549FA-69F1-4967-82B6-161A8B52C86B}.Release|Any CPU.Build.0 = Release|Any CPU + {4004A79F-B6BB-4472-891B-AD1348AE3E93}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4004A79F-B6BB-4472-891B-AD1348AE3E93}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4004A79F-B6BB-4472-891B-AD1348AE3E93}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4004A79F-B6BB-4472-891B-AD1348AE3E93}.Release|Any CPU.Build.0 = Release|Any CPU {5D587DDE-036D-4694-A314-8DDF270AC031}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {5D587DDE-036D-4694-A314-8DDF270AC031}.Debug|Any CPU.Build.0 = Debug|Any CPU {5D587DDE-036D-4694-A314-8DDF270AC031}.Release|Any CPU.ActiveCfg = Release|Any CPU @@ -754,6 +774,10 @@ Global {84B44F1D-B7FE-40E3-82F0-730A55AC8613} = {316CDCC7-323F-4264-9FC9-667662BB1F80} 
{B2D53D3C-E44A-4C9B-AAEE-28FB8C1BDF62} = {A6573187-FD0D-4DF7-91D1-03E07E470C0A} {F50F81B6-E9B5-4143-B66B-A1AD913F6E9C} = {4CD3AA9E-D937-48CA-BB6C-158E12257D23} + {2579A7F6-EBE8-485A-BB20-A5D19DB5612B} = {A41DE3D1-F8AA-4234-BE6F-3C9646A1507A} + {4E79EC4B-2DC4-41E3-9AE6-17C1FFF17B02} = {2579A7F6-EBE8-485A-BB20-A5D19DB5612B} + {76A549FA-69F1-4967-82B6-161A8B52C86B} = {2579A7F6-EBE8-485A-BB20-A5D19DB5612B} + {4004A79F-B6BB-4472-891B-AD1348AE3E93} = {2579A7F6-EBE8-485A-BB20-A5D19DB5612B} {5D587DDE-036D-4694-A314-8DDF270AC031} = {70BCC54E-1618-4742-A079-07588065E361} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution diff --git a/playground/ChaoticCluster/ChaoticCluster.AppHost/ChaoticCluster.AppHost.csproj b/playground/ChaoticCluster/ChaoticCluster.AppHost/ChaoticCluster.AppHost.csproj new file mode 100644 index 0000000000..95ad36182c --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.AppHost/ChaoticCluster.AppHost.csproj @@ -0,0 +1,23 @@ + + + + Exe + net8.0 + enable + enable + + + + + 8cceaca4-1c1f-473f-ac3a-6f220c8791cf + + + + + + + + + + + diff --git a/playground/ChaoticCluster/ChaoticCluster.AppHost/Program.cs b/playground/ChaoticCluster/ChaoticCluster.AppHost/Program.cs new file mode 100644 index 0000000000..a3147fd1ec --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.AppHost/Program.cs @@ -0,0 +1,10 @@ +using Projects; + +var builder = DistributedApplication.CreateBuilder(args); + +/* +// Comment this out once Aspire no longer requires a 'workload' to build. 
+builder.AddProject("silo"); +*/ + +builder.Build().Run(); diff --git a/playground/ChaoticCluster/ChaoticCluster.AppHost/Properties/launchSettings.json b/playground/ChaoticCluster/ChaoticCluster.AppHost/Properties/launchSettings.json new file mode 100644 index 0000000000..de31dd2521 --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.AppHost/Properties/launchSettings.json @@ -0,0 +1,29 @@ +{ + "$schema": "https://json.schemastore.org/launchsettings.json", + "profiles": { + "https": { + "commandName": "Project", + "dotnetRunMessages": true, + "launchBrowser": true, + "applicationUrl": "https://localhost:17213;http://localhost:15139", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development", + "DOTNET_ENVIRONMENT": "Development", + "DOTNET_DASHBOARD_OTLP_ENDPOINT_URL": "https://localhost:21045", + "DOTNET_RESOURCE_SERVICE_ENDPOINT_URL": "https://localhost:22043" + } + }, + "http": { + "commandName": "Project", + "dotnetRunMessages": true, + "launchBrowser": true, + "applicationUrl": "http://localhost:15139", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development", + "DOTNET_ENVIRONMENT": "Development", + "DOTNET_DASHBOARD_OTLP_ENDPOINT_URL": "http://localhost:19150", + "DOTNET_RESOURCE_SERVICE_ENDPOINT_URL": "http://localhost:20085" + } + } + } +} diff --git a/playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.Development.json b/playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.Development.json new file mode 100644 index 0000000000..0c208ae918 --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.Development.json @@ -0,0 +1,8 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning" + } + } +} diff --git a/playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.json b/playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.json new file mode 100644 index 0000000000..31c092aa45 --- /dev/null +++ 
b/playground/ChaoticCluster/ChaoticCluster.AppHost/appsettings.json @@ -0,0 +1,9 @@ +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning", + "Aspire.Hosting.Dcp": "Warning" + } + } +} diff --git a/playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/ChaoticCluster.ServiceDefaults.csproj b/playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/ChaoticCluster.ServiceDefaults.csproj new file mode 100644 index 0000000000..2388aea655 --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/ChaoticCluster.ServiceDefaults.csproj @@ -0,0 +1,22 @@ + + + + net8.0 + enable + enable + true + + + + + + + + + + + + + + + diff --git a/playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/Extensions.cs b/playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/Extensions.cs new file mode 100644 index 0000000000..29dcb42871 --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.ServiceDefaults/Extensions.cs @@ -0,0 +1,111 @@ +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Diagnostics.HealthChecks; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Logging; +using OpenTelemetry; +using OpenTelemetry.Metrics; +using OpenTelemetry.Trace; + +namespace Microsoft.Extensions.Hosting; +// Adds common .NET Aspire services: service discovery, resilience, health checks, and OpenTelemetry. +// This project should be referenced by each service project in your solution. 
+// To learn more about using this project, see https://aka.ms/dotnet/aspire/service-defaults +public static class Extensions +{ + public static IHostApplicationBuilder AddServiceDefaults(this IHostApplicationBuilder builder) + { + builder.ConfigureOpenTelemetry(); + + builder.AddDefaultHealthChecks(); + + builder.Services.AddServiceDiscovery(); + + builder.Services.ConfigureHttpClientDefaults(http => + { + // Turn on resilience by default + http.AddStandardResilienceHandler(); + + // Turn on service discovery by default + http.AddServiceDiscovery(); + }); + + // Uncomment the following to restrict the allowed schemes for service discovery. + // builder.Services.Configure(options => + // { + // options.AllowedSchemes = ["https"]; + // }); + + return builder; + } + + public static IHostApplicationBuilder ConfigureOpenTelemetry(this IHostApplicationBuilder builder) + { + builder.Logging.AddOpenTelemetry(logging => + { + logging.IncludeFormattedMessage = true; + logging.IncludeScopes = true; + }); + + builder.Services.AddOpenTelemetry() + .WithMetrics(metrics => + { + metrics.AddAspNetCoreInstrumentation() + .AddHttpClientInstrumentation() + .AddRuntimeInstrumentation() + .AddMeter("System.Runtime") + .AddMeter("Microsoft.Orleans"); + }); + + builder.AddOpenTelemetryExporters(); + + return builder; + } + + private static IHostApplicationBuilder AddOpenTelemetryExporters(this IHostApplicationBuilder builder) + { + var useOtlpExporter = !string.IsNullOrWhiteSpace(builder.Configuration["OTEL_EXPORTER_OTLP_ENDPOINT"]); + + if (useOtlpExporter) + { + builder.Services.AddOpenTelemetry().UseOtlpExporter(); + } + + // Uncomment the following lines to enable the Azure Monitor exporter (requires the Azure.Monitor.OpenTelemetry.AspNetCore package) + //if (!string.IsNullOrEmpty(builder.Configuration["APPLICATIONINSIGHTS_CONNECTION_STRING"])) + //{ + // builder.Services.AddOpenTelemetry() + // .UseAzureMonitor(); + //} + + return builder; + } + + public static 
IHostApplicationBuilder AddDefaultHealthChecks(this IHostApplicationBuilder builder) + { + builder.Services.AddHealthChecks() + // Add a default liveness check to ensure app is responsive + .AddCheck("self", () => HealthCheckResult.Healthy(), ["live"]); + + return builder; + } + + public static WebApplication MapDefaultEndpoints(this WebApplication app) + { + // Adding health checks endpoints to applications in non-development environments has security implications. + // See https://aka.ms/dotnet/aspire/healthchecks for details before enabling these endpoints in non-development environments. + if (app.Environment.IsDevelopment()) + { + // All health checks must pass for app to be considered ready to accept traffic after starting + app.MapHealthChecks("/health"); + + // Only health checks tagged with the "live" tag must pass for app to be considered alive + app.MapHealthChecks("/alive", new HealthCheckOptions + { + Predicate = r => r.Tags.Contains("live") + }); + } + + return app; + } +} diff --git a/playground/ChaoticCluster/ChaoticCluster.Silo/ChaoticCluster.Silo.csproj b/playground/ChaoticCluster/ChaoticCluster.Silo/ChaoticCluster.Silo.csproj new file mode 100644 index 0000000000..6dfb9074aa --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.Silo/ChaoticCluster.Silo.csproj @@ -0,0 +1,19 @@ + + + + Exe + net8.0 + enable + enable + true + true + + + + + + + + + + diff --git a/playground/ChaoticCluster/ChaoticCluster.Silo/Program.cs b/playground/ChaoticCluster/ChaoticCluster.Silo/Program.cs new file mode 100644 index 0000000000..01869077fd --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.Silo/Program.cs @@ -0,0 +1,151 @@ +using System.Diagnostics; +using ChaoticCluster.Silo; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Orleans.TestingHost; + +var builder = Host.CreateApplicationBuilder(args); +builder.AddServiceDefaults(); // Configure OTel +using var app = 
builder.Build(); +await app.StartAsync(); + +var testClusterBuilder = new InProcessTestClusterBuilder(1); +testClusterBuilder.ConfigureSilo((options, siloBuilder) => new SiloBuilderConfigurator().Configure(siloBuilder)); +testClusterBuilder.ConfigureSiloHost((options, hostBuilder) => +{ + foreach (var provider in app.Services.GetServices()) + { + hostBuilder.Logging.AddProvider(provider); + } +}); + +testClusterBuilder.ConfigureClientHost(hostBuilder => +{ + foreach (var provider in app.Services.GetServices()) + { + hostBuilder.Logging.AddProvider(provider); + } +}); + +var testCluster = testClusterBuilder.Build(); +await testCluster.DeployAsync(); +var log = testCluster.Client.ServiceProvider.GetRequiredService>(); +log.LogInformation($"ServiceId: {testCluster.Options.ServiceId}"); +log.LogInformation($"ClusterId: {testCluster.Options.ClusterId}"); + +var cts = new CancellationTokenSource(TimeSpan.FromMinutes(15)); +var reconfigurationTimer = Stopwatch.StartNew(); +var upperLimit = 10; +var lowerLimit = 1; // Membership is kept on the primary, so we can't go below 1 +var target = upperLimit; +var idBase = 0L; +var client = testCluster.Silos[0].ServiceProvider.GetRequiredService(); +const int CallsPerIteration = 100; +const int MaxGrains = 524_288; // 2**19; + +var loadTask = Task.Run(async () => +{ + while (!cts.IsCancellationRequested) + { + var time = Stopwatch.StartNew(); + var tasks = Enumerable.Range(0, CallsPerIteration).Select(i => client.GetGrain((idBase + i) % MaxGrains).Ping().AsTask()).ToList(); + var workTask = Task.WhenAll(tasks); + using var delayCancellation = new CancellationTokenSource(); + var delay = TimeSpan.FromMilliseconds(90_000); + var delayTask = Task.Delay(delay, delayCancellation.Token); + await Task.WhenAny(workTask, delayTask); + + try + { + await workTask; + } + catch (SiloUnavailableException sue) + { + log.LogInformation(sue, "Swallowed transient exception."); + } + catch (OrleansMessageRejectionException omre) + { + 
log.LogInformation(omre, "Swallowed rejection."); + } + catch (Exception exception) + { + log.LogError(exception, "Unhandled exception."); + throw; + } + + delayCancellation.Cancel(); + idBase += CallsPerIteration; + } +}); + +var chaosTask = Task.Run(async () => +{ + var clusterOperation = Task.CompletedTask; + while (!cts.IsCancellationRequested) + { + try + { + var remaining = TimeSpan.FromSeconds(10) - reconfigurationTimer.Elapsed; + if (remaining <= TimeSpan.Zero) + { + reconfigurationTimer.Restart(); + await clusterOperation; + + clusterOperation = Task.Run(async () => + { + var currentCount = testCluster.Silos.Count; + + if (currentCount > target) + { + // Stop or kill a random silo, but not the primary (since that hosts cluster membership) + var victim = testCluster.Silos[Random.Shared.Next(1, testCluster.Silos.Count - 1)]; + if (currentCount % 2 == 0) + { + log.LogInformation($"Stopping '{victim.SiloAddress}'."); + await testCluster.StopSiloAsync(victim); + log.LogInformation($"Stopped '{victim.SiloAddress}'."); + } + else + { + log.LogInformation($"Killing '{victim.SiloAddress}'."); + await testCluster.KillSiloAsync(victim); + log.LogInformation($"Killed '{victim.SiloAddress}'."); + } + } + else if (currentCount < target) + { + log.LogInformation("Starting new silo."); + var result = await testCluster.StartAdditionalSiloAsync(); + log.LogInformation($"Started '{result.SiloAddress}'."); + } + + if (currentCount <= lowerLimit) + { + target = upperLimit; + } + else if (currentCount >= upperLimit) + { + target = lowerLimit; + } + }); + } + else + { + await Task.Delay(remaining); + } + } + catch (Exception exception) + { + log.LogInformation(exception, "Ignoring chaos exception."); + } + } +}); + +await await Task.WhenAny(loadTask, chaosTask); +cts.Cancel(); +await Task.WhenAll(loadTask, chaosTask); +await testCluster.StopAllSilosAsync(); +await testCluster.DisposeAsync(); + +await app.StopAsync(); \ No newline at end of file diff --git 
a/playground/ChaoticCluster/ChaoticCluster.Silo/SiloBuilderConfigurator.cs b/playground/ChaoticCluster/ChaoticCluster.Silo/SiloBuilderConfigurator.cs new file mode 100644 index 0000000000..aac181b83f --- /dev/null +++ b/playground/ChaoticCluster/ChaoticCluster.Silo/SiloBuilderConfigurator.cs @@ -0,0 +1,26 @@ +using Microsoft.Extensions.DependencyInjection; +using Orleans.Configuration; +using Orleans.TestingHost; + +namespace ChaoticCluster.Silo; + +class SiloBuilderConfigurator : ISiloConfigurator + { + public void Configure(ISiloBuilder siloBuilder) + { +#pragma warning disable ORLEANSEXP002 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + siloBuilder.AddDistributedGrainDirectory(); +#pragma warning restore ORLEANSEXP002 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + } + } + +internal interface IMyTestGrain : IGrainWithIntegerKey +{ + ValueTask Ping(); +} + +[CollectionAgeLimit(Minutes = 1.01)] +internal class MyTestGrain : Grain, IMyTestGrain +{ + public ValueTask Ping() => default; +} diff --git a/src/Orleans.Core.Abstractions/Core/IGrainBase.cs b/src/Orleans.Core.Abstractions/Core/IGrainBase.cs index a0145673f8..25b806c5b3 100644 --- a/src/Orleans.Core.Abstractions/Core/IGrainBase.cs +++ b/src/Orleans.Core.Abstractions/Core/IGrainBase.cs @@ -299,4 +299,12 @@ public enum DeactivationReasonCode : byte /// Migrating, } + + internal static class DeactivationReasonCodeExtensions + { + public static bool IsTransientError(this DeactivationReasonCode reasonCode) + { + return reasonCode is DeactivationReasonCode.DirectoryFailure; + } + } } diff --git a/src/Orleans.Core.Abstractions/IDs/GrainAddress.cs b/src/Orleans.Core.Abstractions/IDs/GrainAddress.cs index 94c1ab9a9d..4224b88425 100644 --- a/src/Orleans.Core.Abstractions/IDs/GrainAddress.cs +++ b/src/Orleans.Core.Abstractions/IDs/GrainAddress.cs @@ 
-1,4 +1,6 @@ using System; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; using System.Text.Json.Serialization; using Orleans.GrainDirectory; @@ -46,8 +48,9 @@ public sealed class GrainAddress : IEquatable, ISpanFormattable public bool Equals(GrainAddress? other) { - return other != null && (SiloAddress?.Equals(other.SiloAddress) ?? other.SiloAddress is null) - && _grainId.Equals(other._grainId) && _activationId.Equals(other._activationId) && MembershipVersion == other.MembershipVersion; + if (ReferenceEquals(this, other)) return true; + return MatchesGrainIdAndSilo(this, other) + && _activationId.Equals(other._activationId); } /// @@ -56,15 +59,21 @@ public bool Equals(GrainAddress? other) /// /// The other to compare this one with. /// Returns true if the two are considered to match. - public bool Matches(GrainAddress other) + public bool Matches(GrainAddress? other) { - return other is not null && _grainId.Equals(other._grainId) && (SiloAddress?.Equals(other.SiloAddress) ?? other.SiloAddress is null) + if (ReferenceEquals(this, other)) return true; + return MatchesGrainIdAndSilo(this, other) && (_activationId.IsDefault || other._activationId.IsDefault || _activationId.Equals(other._activationId)); } - internal static bool MatchesGrainIdAndSilo(GrainAddress address, GrainAddress other) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool MatchesGrainIdAndSilo([NotNullWhen(true)] GrainAddress? address, [NotNullWhen(true)] GrainAddress? other) { - return other is not null && address.GrainId.Equals(other.GrainId) && (address.SiloAddress?.Equals(other.SiloAddress) ?? 
other.SiloAddress is null); + return other is not null + && address is not null + && address.GrainId.Equals(other.GrainId) + && !(address.SiloAddress is null ^ other.SiloAddress is null) + && (address.SiloAddress is null || address.SiloAddress.Equals(other.SiloAddress)); } public override int GetHashCode() => HashCode.Combine(SiloAddress, _grainId, _activationId); diff --git a/src/Orleans.Core.Abstractions/Runtime/MembershipVersion.cs b/src/Orleans.Core.Abstractions/Runtime/MembershipVersion.cs index f3df4e2476..98f47a291f 100644 --- a/src/Orleans.Core.Abstractions/Runtime/MembershipVersion.cs +++ b/src/Orleans.Core.Abstractions/Runtime/MembershipVersion.cs @@ -44,7 +44,7 @@ public MembershipVersion(long version) public override int GetHashCode() => this.Value.GetHashCode(); /// - public override string ToString() => this.Value.ToString(); + public override string ToString() => Value != MinValue.Value ? $"{Value}" : "default"; /// /// Compares the provided operands for equality. diff --git a/src/Orleans.Core/Diagnostics/MessagingTrace.cs b/src/Orleans.Core/Diagnostics/MessagingTrace.cs index aaefa946cf..9f44a421a9 100644 --- a/src/Orleans.Core/Diagnostics/MessagingTrace.cs +++ b/src/Orleans.Core/Diagnostics/MessagingTrace.cs @@ -1,5 +1,7 @@ using System; using System.Diagnostics; +using System.Globalization; +using System.IO; using System.Runtime.CompilerServices; using Microsoft.Extensions.Logging; diff --git a/src/Orleans.Core/Diagnostics/Metrics/DirectoryInstruments.cs b/src/Orleans.Core/Diagnostics/Metrics/DirectoryInstruments.cs index 9bc752ea63..e59e3a9dde 100644 --- a/src/Orleans.Core/Diagnostics/Metrics/DirectoryInstruments.cs +++ b/src/Orleans.Core/Diagnostics/Metrics/DirectoryInstruments.cs @@ -5,21 +5,27 @@ namespace Orleans.Runtime; internal static class DirectoryInstruments { - internal static Counter LookupsLocalIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCAL_ISSUED); - internal static Counter LookupsLocalSuccesses = 
Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCAL_SUCCESSES); + internal static readonly Counter LookupsLocalIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCAL_ISSUED); + internal static readonly Counter LookupsLocalSuccesses = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCAL_SUCCESSES); - internal static Counter LookupsFullIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_FULL_ISSUED); + internal static readonly Counter LookupsFullIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_FULL_ISSUED); - internal static Counter LookupsRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_REMOTE_SENT); - internal static Counter LookupsRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_REMOTE_RECEIVED); + internal static readonly Counter LookupsRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_REMOTE_SENT); + internal static readonly Counter LookupsRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_REMOTE_RECEIVED); - internal static Counter LookupsLocalDirectoryIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCALDIRECTORY_ISSUED); - internal static Counter LookupsLocalDirectorySuccesses = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCALDIRECTORY_SUCCESSES); + internal static readonly Counter LookupsLocalDirectoryIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCALDIRECTORY_ISSUED); + internal static readonly Counter LookupsLocalDirectorySuccesses = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_LOCALDIRECTORY_SUCCESSES); - internal static Counter LookupsCacheIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_CACHE_ISSUED); - internal static Counter LookupsCacheSuccesses = 
Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_CACHE_SUCCESSES); - internal static Counter ValidationsCacheSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_VALIDATIONS_CACHE_SENT); - internal static Counter ValidationsCacheReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_VALIDATIONS_CACHE_RECEIVED); + internal static readonly Counter LookupsCacheIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_CACHE_ISSUED); + internal static readonly Counter LookupsCacheSuccesses = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_LOOKUPS_CACHE_SUCCESSES); + internal static readonly Counter ValidationsCacheSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_VALIDATIONS_CACHE_SENT); + internal static readonly Counter ValidationsCacheReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_VALIDATIONS_CACHE_RECEIVED); + + internal static readonly Counter SnapshotTransferCount = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_RANGE_SNAPSHOT_TRANSFER_COUNT); + internal static readonly Histogram SnapshotTransferDuration = Instruments.Meter.CreateHistogram(InstrumentNames.DIRECTORY_RANGE_SNAPSHOT_TRANSFER_DURATION); + internal static readonly Counter RangeRecoveryCount = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_RANGE_RECOVERY_COUNT); + internal static readonly Histogram RangeRecoveryDuration = Instruments.Meter.CreateHistogram(InstrumentNames.DIRECTORY_RANGE_RECOVERY_DURATION); + internal static readonly Histogram RangeLockHeldDuration = Instruments.Meter.CreateHistogram(InstrumentNames.DIRECTORY_RANGE_LOCK_HELD_DURATION); internal static ObservableGauge DirectoryPartitionSize; internal static void RegisterDirectoryPartitionSizeObserve(Func observeValue) @@ -57,15 +63,15 @@ internal static void RegisterMyPortionAverageRingPercentageObserve(Func o MyPortionAverageRingPercentage = 
Instruments.Meter.CreateObservableGauge(InstrumentNames.DIRECTORY_RING_MYPORTION_AVERAGERINGPERCENTAGE, observeValue); } - internal static Counter RegistrationsSingleActIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_ISSUED); - internal static Counter RegistrationsSingleActLocal = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_LOCAL); - internal static Counter RegistrationsSingleActRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_REMOTE_SENT); - internal static Counter RegistrationsSingleActRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_REMOTE_RECEIVED); - internal static Counter UnregistrationsIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_ISSUED); - internal static Counter UnregistrationsLocal = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_LOCAL); - internal static Counter UnregistrationsRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_REMOTE_SENT); - internal static Counter UnregistrationsRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_REMOTE_RECEIVED); - internal static Counter UnregistrationsManyIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_MANY_ISSUED); - internal static Counter UnregistrationsManyRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_MANY_REMOTE_SENT); - internal static Counter UnregistrationsManyRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_MANY_REMOTE_RECEIVED); + internal static readonly Counter RegistrationsSingleActIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_ISSUED); + internal static readonly Counter RegistrationsSingleActLocal = 
Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_LOCAL); + internal static readonly Counter RegistrationsSingleActRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_REMOTE_SENT); + internal static readonly Counter RegistrationsSingleActRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_REGISTRATIONS_SINGLE_ACT_REMOTE_RECEIVED); + internal static readonly Counter UnregistrationsIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_ISSUED); + internal static readonly Counter UnregistrationsLocal = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_LOCAL); + internal static readonly Counter UnregistrationsRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_REMOTE_SENT); + internal static readonly Counter UnregistrationsRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_REMOTE_RECEIVED); + internal static readonly Counter UnregistrationsManyIssued = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_MANY_ISSUED); + internal static readonly Counter UnregistrationsManyRemoteSent = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_MANY_REMOTE_SENT); + internal static readonly Counter UnregistrationsManyRemoteReceived = Instruments.Meter.CreateCounter(InstrumentNames.DIRECTORY_UNREGISTRATIONS_MANY_REMOTE_RECEIVED); } diff --git a/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs b/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs index f40fe71ff8..cf1a8ec643 100644 --- a/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs +++ b/src/Orleans.Core/Diagnostics/Metrics/InstrumentNames.cs @@ -83,6 +83,12 @@ internal static class InstrumentNames public const string DIRECTORY_UNREGISTRATIONS_MANY_REMOTE_SENT = "orleans-directory-unregistrations-many-remote-sent"; public const string 
DIRECTORY_UNREGISTRATIONS_MANY_REMOTE_RECEIVED = "orleans-directory-unregistrations-many-remote-received"; + public const string DIRECTORY_RANGE_SNAPSHOT_TRANSFER_COUNT = "orleans-directory-snapshot-transfer-count"; + public const string DIRECTORY_RANGE_SNAPSHOT_TRANSFER_DURATION = "orleans-directory-snapshot-transfer-duration"; + public const string DIRECTORY_RANGE_RECOVERY_COUNT = "orleans-directory-recovery-count"; + public const string DIRECTORY_RANGE_RECOVERY_DURATION = "orleans-directory-recovery-duration"; + public const string DIRECTORY_RANGE_LOCK_HELD_DURATION = "orleans-directory-range-lock-held-duration"; + // ConsistentRing public const string CONSISTENTRING_SIZE = "orleans-consistent-ring-size"; public const string CONSISTENTRING_LOCAL_SIZE_PERCENTAGE = "orleans-consistent-ring-range-percentage-local"; diff --git a/src/Orleans.Core/Networking/Connection.cs b/src/Orleans.Core/Networking/Connection.cs index 9892458d3c..88e52018b5 100644 --- a/src/Orleans.Core/Networking/Connection.cs +++ b/src/Orleans.Core/Networking/Connection.cs @@ -58,6 +58,7 @@ protected Connection( this.LocalEndPoint = NormalizeEndpoint(this.Context.LocalEndPoint); } + public ConnectionCommon Shared => shared; public string ConnectionId => this.Context?.ConnectionId; public virtual EndPoint RemoteEndPoint { get; } public virtual EndPoint LocalEndPoint { get; } diff --git a/src/Orleans.Core/Runtime/Constants.cs b/src/Orleans.Core/Runtime/Constants.cs index 39aaa6ac12..198908e73a 100644 --- a/src/Orleans.Core/Runtime/Constants.cs +++ b/src/Orleans.Core/Runtime/Constants.cs @@ -26,6 +26,8 @@ internal static class Constants public static readonly GrainType ManifestProviderType = SystemTargetGrainId.CreateGrainType("manifest"); public static readonly GrainType ActivationMigratorType = SystemTargetGrainId.CreateGrainType("migrator"); public static readonly GrainType ActivationRepartitionerType = SystemTargetGrainId.CreateGrainType("repartitioner"); + public static readonly GrainType 
GrainDirectoryPartition = SystemTargetGrainId.CreateGrainType("dir.grain.part"); + public static readonly GrainType GrainDirectory = SystemTargetGrainId.CreateGrainType("dir.grain"); public static readonly GrainId SiloDirectConnectionId = GrainId.Create( GrainType.Create(GrainTypePrefix.SystemPrefix + "silo"), @@ -53,6 +55,7 @@ internal static class Constants {ManifestProviderType, "ManifestProvider"}, {ActivationMigratorType, "ActivationMigrator"}, {ActivationRepartitionerType, "ActivationRepartitioner"}, + {GrainDirectory, "GrainDirectory"}, }.ToFrozenDictionary(); public static string SystemTargetName(GrainType id) => SingletonSystemTargetNames.TryGetValue(id, out var name) ? name : id.ToString(); diff --git a/src/Orleans.Core/Runtime/IRuntimeClient.cs b/src/Orleans.Core/Runtime/IRuntimeClient.cs index 768684c955..bc18406f24 100644 --- a/src/Orleans.Core/Runtime/IRuntimeClient.cs +++ b/src/Orleans.Core/Runtime/IRuntimeClient.cs @@ -56,7 +56,7 @@ internal interface IRuntimeClient IGrainReferenceRuntime GrainReferenceRuntime { get; } - void BreakOutstandingMessagesToDeadSilo(SiloAddress deadSilo); + void BreakOutstandingMessagesToSilo(SiloAddress deadSilo); // For testing purposes only. 
int GetRunningRequestsCount(GrainInterfaceType grainInterfaceType); diff --git a/src/Orleans.Core/Runtime/OutsideRuntimeClient.cs b/src/Orleans.Core/Runtime/OutsideRuntimeClient.cs index d9dff1864b..bf37494993 100644 --- a/src/Orleans.Core/Runtime/OutsideRuntimeClient.cs +++ b/src/Orleans.Core/Runtime/OutsideRuntimeClient.cs @@ -397,7 +397,7 @@ public void Dispose() disposed = true; } - public void BreakOutstandingMessagesToDeadSilo(SiloAddress deadSilo) + public void BreakOutstandingMessagesToSilo(SiloAddress deadSilo) { foreach (var callback in callbacks) { diff --git a/src/Orleans.Core/Statistics/EnvironmentStatistics.cs b/src/Orleans.Core/Statistics/EnvironmentStatistics.cs index 3a878f3e77..24c3ecc758 100644 --- a/src/Orleans.Core/Statistics/EnvironmentStatistics.cs +++ b/src/Orleans.Core/Statistics/EnvironmentStatistics.cs @@ -36,30 +36,30 @@ public EnvironmentStatisticsProvider() } /// - public EnvironmentStatistics GetEnvironmentStatistics() - { - var memoryInfo = GC.GetGCMemoryInfo(); +public EnvironmentStatistics GetEnvironmentStatistics() +{ + var memoryInfo = GC.GetGCMemoryInfo(); - var cpuUsage = _eventCounterListener.CpuUsage; - var memoryUsage = GC.GetTotalMemory(false) + memoryInfo.FragmentedBytes; + var cpuUsage = _eventCounterListener.CpuUsage; + var memoryUsage = GC.GetTotalMemory(false) + memoryInfo.FragmentedBytes; - var committedOfLimit = memoryInfo.TotalAvailableMemoryBytes - memoryInfo.TotalCommittedBytes; - var unusedLoad = memoryInfo.HighMemoryLoadThresholdBytes - memoryInfo.MemoryLoadBytes; - var systemAvailable = Math.Max(0, Math.Min(committedOfLimit, unusedLoad)); - var processAvailable = memoryInfo.TotalCommittedBytes - memoryInfo.HeapSizeBytes; - var availableMemory = systemAvailable + processAvailable; - var maxAvailableMemory = Math.Min(memoryInfo.TotalAvailableMemoryBytes, memoryInfo.HighMemoryLoadThresholdBytes); + var committedOfLimit = memoryInfo.TotalAvailableMemoryBytes - memoryInfo.TotalCommittedBytes; + var unusedLoad = 
memoryInfo.HighMemoryLoadThresholdBytes - memoryInfo.MemoryLoadBytes; + var systemAvailable = Math.Max(0, Math.Min(committedOfLimit, unusedLoad)); + var processAvailable = memoryInfo.TotalCommittedBytes - memoryInfo.HeapSizeBytes; + var availableMemory = systemAvailable + processAvailable; + var maxAvailableMemory = Math.Min(memoryInfo.TotalAvailableMemoryBytes, memoryInfo.HighMemoryLoadThresholdBytes); - var filteredCpuUsage = _cpuUsageFilter.Filter(cpuUsage); - var filteredMemoryUsage = (long)_memoryUsageFilter.Filter(memoryUsage); - var filteredAvailableMemory = (long)_availableMemoryFilter.Filter(availableMemory); - // no need to filter 'maxAvailableMemory' as it will almost always be a steady value. + var filteredCpuUsage = _cpuUsageFilter.Filter(cpuUsage); + var filteredMemoryUsage = (long)_memoryUsageFilter.Filter(memoryUsage); + var filteredAvailableMemory = (long)_availableMemoryFilter.Filter(availableMemory); + // no need to filter 'maxAvailableMemory' as it will almost always be a steady value. 
- _availableMemoryBytes = filteredAvailableMemory; - _maximumAvailableMemoryBytes = maxAvailableMemory; + _availableMemoryBytes = filteredAvailableMemory; + _maximumAvailableMemoryBytes = maxAvailableMemory; - return new(filteredCpuUsage, filteredMemoryUsage, filteredAvailableMemory, maxAvailableMemory); - } + return new(filteredCpuUsage, filteredMemoryUsage, filteredAvailableMemory, maxAvailableMemory); +} public void Dispose() => _eventCounterListener.Dispose(); diff --git a/src/Orleans.Runtime/Catalog/ActivationCollector.cs b/src/Orleans.Runtime/Catalog/ActivationCollector.cs index 90ba8287cd..20b5eb8d49 100644 --- a/src/Orleans.Runtime/Catalog/ActivationCollector.cs +++ b/src/Orleans.Runtime/Catalog/ActivationCollector.cs @@ -7,7 +7,6 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Orleans.Configuration; -using Orleans.Internal; using Orleans.Runtime.Internal; namespace Orleans.Runtime @@ -15,11 +14,12 @@ namespace Orleans.Runtime /// /// Identifies activations that have been idle long enough to be deactivated. /// - internal class ActivationCollector : IActivationWorkingSetObserver, ILifecycleParticipant + internal class ActivationCollector : IActivationWorkingSetObserver, ILifecycleParticipant, IDisposable { private readonly TimeSpan quantum; private readonly TimeSpan shortestAgeLimit; private readonly ConcurrentDictionary buckets = new(); + private readonly CancellationTokenSource _shutdownCts = new(); private DateTime nextTicket; private static readonly List nothing = new(0); private readonly ILogger logger; @@ -74,7 +74,7 @@ public int GetNumRecentlyUsed(TimeSpan recencyPeriod) /// /// The age limit. /// A representing the work performed. 
- public Task CollectActivations(TimeSpan ageLimit) => CollectActivationsImpl(false, ageLimit); + public Task CollectActivations(TimeSpan ageLimit, CancellationToken cancellationToken) => CollectActivationsImpl(false, ageLimit, cancellationToken); /// /// Schedules the provided grain context for collection if it becomes idle for the specified duration. @@ -213,7 +213,6 @@ public List ScanStale() { var now = DateTime.UtcNow; List condemned = null; - var reason = GetDeactivationReason(); while (DequeueQuantum(out var activations, now)) { // At this point, all tickets associated with activations are cancelled and any attempts to reschedule will fail silently. @@ -225,15 +224,9 @@ public List ScanStale() activation.CollectionTicket = default; if (!activation.IsValid) { + // This is not an error scenario because the activation may have become invalid between the time + // we captured a snapshot in 'DequeueQuantum' and now. We are not be able to observe such changes. // Do nothing: don't collect, don't reschedule. - // The activation can't be in Created or Activating, since we only ScheduleCollection after successfull activation. - // If the activation is already in Deactivating or Invalid state, its already being collected or was collected - // (both mean a bug, this activation should not be in the collector) - // So in any state except for Valid we should just not collect and not reschedule. - logger.LogWarning( - (int)ErrorCode.Catalog_ActivationCollector_BadState_1, - "ActivationCollector found an activation in a non Valid state. All activation inside the ActivationCollector should be in Valid state. Activation: {Activation}", - activation); } else if (activation.KeepAliveUntil > now) { @@ -248,8 +241,8 @@ public List ScanStale() else { // Atomically set Deactivating state, to disallow any new requests or new timer ticks to be dispatched on this activation. 
- activation.Deactivate(reason, cancellationToken: default); - AddActivationToList(activation, ref condemned); + condemned ??= []; + condemned.Add(activation); } } } @@ -267,7 +260,6 @@ public List ScanAll(TimeSpan ageLimit) { List condemned = null; var now = DateTime.UtcNow; - var reason = GetDeactivationReason(); foreach (var kv in buckets) { var bucket = kv.Value; @@ -294,10 +286,10 @@ public List ScanAll(TimeSpan ageLimit) { if (bucket.TryRemove(activation)) { - // we removed the activation from the collector. it's our responsibility to deactivate it. - activation.Deactivate(reason, cancellationToken: default); - AddActivationToList(activation, ref condemned); + condemned ??= []; + condemned.Add(activation); } + // someone else has already deactivated the activation, so there's nothing to do. } else @@ -319,12 +311,6 @@ private static DeactivationReason GetDeactivationReason() return reason; } - private void AddActivationToList(ICollectibleGrainContext activation, ref List condemned) - { - condemned ??= []; - condemned.Add(activation); - } - private void ThrowIfTicketIsInvalid(DateTime ticket) { if (ticket.Ticks == 0) throw new ArgumentException("Empty ticket is not allowed in this context."); @@ -372,9 +358,9 @@ private void Add(ICollectibleGrainContext item, DateTime ticket) void IActivationWorkingSetObserver.OnAdded(IActivationWorkingSetMember member) { - Interlocked.Increment(ref _activationCount); if (member is ICollectibleGrainContext activation) { + Interlocked.Increment(ref _activationCount); if (activation.CollectionTicket == default) { ScheduleCollection(activation, activation.CollectionAgeLimit, DateTime.UtcNow); @@ -410,10 +396,9 @@ void IActivationWorkingSetObserver.OnDeactivating(IActivationWorkingSetMember me void IActivationWorkingSetObserver.OnDeactivated(IActivationWorkingSetMember member) { - Interlocked.Decrement(ref _activationCount); - if (member is ICollectibleGrainContext activation) + if (member is ICollectibleGrainContext activation && 
TryCancelCollection(activation)) { - TryCancelCollection(activation); + Interlocked.Decrement(ref _activationCount); } } @@ -426,6 +411,7 @@ private Task Start(CancellationToken cancellationToken) private async Task Stop(CancellationToken cancellationToken) { + using var registration = cancellationToken.Register(() => _shutdownCts.Cancel()); _collectionTimer.Dispose(); if (_collectionLoopTask is Task task) @@ -439,18 +425,19 @@ void ILifecycleParticipant.Participate(ISiloLifecycle lifecycle) lifecycle.Subscribe( nameof(ActivationCollector), ServiceLifecycleStage.RuntimeServices, - async cancellation => await Start(cancellation), - async cancellation => await Stop(cancellation)); + Start, + Stop); } private async Task RunActivationCollectionLoop() { await Task.CompletedTask.ConfigureAwait(ConfigureAwaitOptions.ForceYielding); + var cancellationToken = _shutdownCts.Token; while (await _collectionTimer.WaitForNextTickAsync()) { try { - await this.CollectActivationsImpl(true); + await this.CollectActivationsImpl(true, ageLimit: default, cancellationToken); } catch (Exception exception) { @@ -459,7 +446,7 @@ private async Task RunActivationCollectionLoop() } } - private async Task CollectActivationsImpl(bool scanStale, TimeSpan ageLimit = default) + private async Task CollectActivationsImpl(bool scanStale, TimeSpan ageLimit, CancellationToken cancellationToken) { var watch = ValueStopwatch.StartNew(); var number = Interlocked.Increment(ref collectionNumber); @@ -478,12 +465,10 @@ private async Task CollectActivationsImpl(bool scanStale, TimeSpan ageLimit = de List list = scanStale ? 
ScanStale() : ScanAll(ageLimit); CatalogInstruments.ActivationCollections.Add(1); - var count = 0; - if (list != null && list.Count > 0) + if (list is { Count: > 0 }) { - count = list.Count; if (logger.IsEnabled(LogLevel.Debug)) logger.LogDebug("CollectActivations {Activations}", list.ToStrings(d => d.GrainId.ToString() + d.ActivationId)); - await DeactivateActivationsFromCollector(list); + await DeactivateActivationsFromCollector(list, cancellationToken); } long memAfter = GC.GetTotalMemory(false) / (1024 * 1024); @@ -497,31 +482,38 @@ private async Task CollectActivationsImpl(bool scanStale, TimeSpan ageLimit = de number, memAfter, _activationCount, - count, + list?.Count ?? 0, ToString(), watch.Elapsed); } } - private async Task DeactivateActivationsFromCollector(List list) + private async Task DeactivateActivationsFromCollector(List list, CancellationToken cancellationToken) { - var mtcs = new MultiTaskCompletionSource(list.Count); - - logger.LogInformation((int)ErrorCode.Catalog_ShutdownActivations_1, "DeactivateActivationsFromCollector: total {Count} to promptly Destroy.", list.Count); + logger.LogInformation((int)ErrorCode.Catalog_ShutdownActivations_1, "Deactivating '{Count}' idle activations.", list.Count); CatalogInstruments.ActivationShutdownViaCollection(); - Action signalCompletion = mtcs.SetOneResult; var reason = GetDeactivationReason(); - for (var i = 0; i < list.Count; i++) + + var options = new ParallelOptions { - var activationData = list[i]; + // Avoid passing the cancellation token, since we want all of these activations to be deactivated, even if cancellation is triggered. + CancellationToken = CancellationToken.None, + MaxDegreeOfParallelism = Environment.ProcessorCount * 512 + }; + await Parallel.ForEachAsync(list, options, async (activationData, token) => + { // Continue deactivation when ready. 
- activationData.Deactivate(reason); - activationData.Deactivated.GetAwaiter().OnCompleted(signalCompletion); - } + activationData.Deactivate(reason, cancellationToken); + await activationData.Deactivated.ConfigureAwait(false); + }).WaitAsync(cancellationToken); + } - await mtcs.Task; + public void Dispose() + { + _collectionTimer.Dispose(); + _shutdownCts.Dispose(); } private class Bucket @@ -568,7 +560,7 @@ public List CancelAll() item.CollectionTicket = default; } - result ??= new List(); + result ??= []; result.Add(pair.Value); } diff --git a/src/Orleans.Runtime/Catalog/ActivationData.cs b/src/Orleans.Runtime/Catalog/ActivationData.cs index 5bb3b41027..345f71490d 100644 --- a/src/Orleans.Runtime/Catalog/ActivationData.cs +++ b/src/Orleans.Runtime/Catalog/ActivationData.cs @@ -3,7 +3,6 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; -using System.Linq; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Threading; @@ -607,8 +606,9 @@ public bool DeactivateCore(DeactivationReason reason, CancellationToken cancella if (state is ActivationState.Creating or ActivationState.Activating or ActivationState.Valid) { CancelPendingOperations(); - SetState(ActivationState.Deactivating); + _shared.InternalRuntime.ActivationWorkingSet.OnDeactivating(this); + SetState(ActivationState.Deactivating); var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); cts.CancelAfter(_shared.InternalRuntime.CollectionOptions.Value.DeactivationTimeout); ScheduleOperation(new Command.Deactivate(cts, state)); @@ -805,9 +805,11 @@ public async ValueTask DisposeAsync() _extras.IsDisposing = true; CancelPendingOperations(); + lock (this) { - State = ActivationState.Invalid; + _shared.InternalRuntime.ActivationWorkingSet.OnDeactivated(this); + SetState(ActivationState.Invalid); } DisposeTimers(); @@ -1177,13 +1179,10 @@ async Task ProcessOperationsAsync() await 
ActivateAsync(command.RequestContext, command.CancellationToken).SuppressThrowing(); break; case Command.Deactivate command: - await FinishDeactivating(command.CancellationToken, command.PreviousState).SuppressThrowing(); + await FinishDeactivating(command.PreviousState, command.CancellationToken).SuppressThrowing(); break; case Command.Delay command: - await Task.Delay(command.Duration, GrainRuntime.TimeProvider, command.CancellationToken); - break; - case Command.UnregisterFromCatalog: - UnregisterMessageTarget(); + await Task.Delay(command.Duration, GrainRuntime.TimeProvider, command.CancellationToken).SuppressThrowing(); break; default: throw new NotSupportedException($"Encountered unknown operation of type {op?.GetType().ToString() ?? "null"} {op}."); @@ -1462,6 +1461,15 @@ private void RerouteAllQueuedMessages() return; } + // If deactivation was caused by a transient failure, allow messages to be forwarded. + if (DeactivationReason.ReasonCode.IsTransientError()) + { + foreach (var msg in msgs) + { + msg.ForwardCount = Math.Max(msg.ForwardCount - 1, 0); + } + } + if (_shared.Logger.IsEnabled(LogLevel.Debug)) { if (ForwardingAddress is { } address) @@ -1497,7 +1505,6 @@ private async Task ActivateAsync(Dictionary? requestContextData, { // A chain of promises that will have to complete in order to complete the activation // Register with the grain directory and call the Activate method on the new activation. - var stopwatch = ValueStopwatch.StartNew(); try { // Currently, the only grain type that is not registered in the Grain Directory is StatelessWorker. @@ -1511,6 +1518,11 @@ private async Task ActivateAsync(Dictionary? requestContextData, { while (true) { + if (_shared.Logger.IsEnabled(LogLevel.Debug)) + { + _shared.Logger.LogDebug("Registering grain '{Grain}' in activation directory. 
Previous known registration is '{PreviousRegistration}'.", this, previousRegistration); + } + var result = await _shared.InternalRuntime.GrainLocator.Register(Address, previousRegistration).WaitAsync(cancellationToken); if (Address.Matches(result)) { @@ -1519,20 +1531,22 @@ private async Task ActivateAsync(Dictionary? requestContextData, } else if (result?.SiloAddress is { } registeredSilo && registeredSilo.Equals(Address.SiloAddress)) { + // Attempt to register this activation again, using the registration of the previous instance of this grain, + // which is registered to this silo. That activation must be a defunct predecessor of this activation, + // since the catalog only allows one activation of a given grain at a time. + // This could occur if the previous activation failed to unregister itself from the grain directory. + previousRegistration = result; + if (_shared.Logger.IsEnabled(LogLevel.Debug)) { _shared.Logger.LogDebug( - "The grain directory has an existing entry pointing to a different activation of this grain on this silo, {PreviousRegistration}." + "The grain directory has an existing entry pointing to a different activation of this grain, '{GrainId}', on this silo: '{PreviousRegistration}'." + " This may indicate that the previous activation was deactivated but the directory was not successfully updated." + " The directory will be updated to point to this activation.", - previousRegistration); + GrainId, + result); } - // Attempt to register this activation again, using the registration of the previous instance of this grain, - // which is registered to this silo. That activation must be a defunct predecessor of this activation, - // since the catalog only allows one activation of a given grain at a time. - // This could occur if the previous activation failed to unregister itself from the grain directory. - previousRegistration = result; continue; } else @@ -1631,14 +1645,13 @@ private async Task ActivateAsync(Dictionary? 
requestContextData, lock (this) { - if (State == ActivationState.Activating) + if (State is ActivationState.Activating) { SetState(ActivationState.Valid); // Activate calls on this activation are finished + _shared.InternalRuntime.ActivationWorkingSet.OnActivated(this); } } - _shared.InternalRuntime.ActivationWorkingSet.OnActivated(this); - if (_shared.Logger.IsEnabled(LogLevel.Debug)) { _shared.Logger.LogDebug((int)ErrorCode.Catalog_AfterCallingActivate, "Finished activating grain {Grain}", this); @@ -1676,11 +1689,6 @@ private async Task ActivateAsync(Dictionary? requestContextData, } finally { - if (cancellationToken.IsCancellationRequested && stopwatch.Elapsed.TotalMilliseconds > 50) - { - _shared.Logger.LogInformation("Cancellation requested for activation {Activation} took {ElapsedMilliseconds:0.0}ms.", this, stopwatch.Elapsed.TotalMilliseconds); - } - _workSignal.Signal(); } } @@ -1691,10 +1699,8 @@ private async Task ActivateAsync(Dictionary? requestContextData, /// /// Completes the deactivation process. /// - /// A cancellation which terminates graceful deactivation when cancelled. - private async Task FinishDeactivating(CancellationToken cancellationToken, ActivationState previousState) + private async Task FinishDeactivating(ActivationState previousState, CancellationToken cancellationToken) { - var stopwatch = ValueStopwatch.StartNew(); var migrated = false; var encounteredError = false; try @@ -1707,13 +1713,6 @@ private async Task FinishDeactivating(CancellationToken cancellationToken, Activ // Stop timers from firing. DisposeTimers(); - // Note: This call is being made from within Scheduler.Queue wrapper, so we are already executing on worker thread - if (_shared.Logger.IsEnabled(LogLevel.Debug)) - _shared.Logger.LogDebug( - (int)ErrorCode.Catalog_BeforeCallingDeactivate, - "About to call OnDeactivateAsync for '{Activation}'", - this); - // If the grain was valid when deactivation started, call OnDeactivateAsync. 
if (previousState == ActivationState.Valid) { @@ -1721,6 +1720,12 @@ private async Task FinishDeactivating(CancellationToken cancellationToken, Activ { try { + if (_shared.Logger.IsEnabled(LogLevel.Debug)) + _shared.Logger.LogDebug( + (int)ErrorCode.Catalog_BeforeCallingDeactivate, + "About to call OnDeactivateAsync for '{Activation}'", + this); + await grainBase.OnDeactivateAsync(DeactivationReason, cancellationToken).WaitAsync(cancellationToken); if (_shared.Logger.IsEnabled(LogLevel.Debug)) @@ -1793,8 +1798,9 @@ private async Task FinishDeactivating(CancellationToken cancellationToken, Activ // If the instance is being deactivated due to a directory failure, we should not unregister it. var isDirectoryFailure = DeactivationReason.ReasonCode is DeactivationReasonCode.DirectoryFailure; + var isShuttingDown = DeactivationReason.ReasonCode is DeactivationReasonCode.ShuttingDown; - if (!migrated && IsUsingGrainDirectory && !cancellationToken.IsCancellationRequested && !isDirectoryFailure) + if (!migrated && IsUsingGrainDirectory && !cancellationToken.IsCancellationRequested && !isDirectoryFailure && !isShuttingDown) { // Unregister from directory. // If the grain was migrated, the new activation will perform a check-and-set on the registration itself. @@ -1804,9 +1810,17 @@ private async Task FinishDeactivating(CancellationToken cancellationToken, Activ } catch (Exception exception) { - _shared.Logger.LogError(exception, "Failed to unregister activation '{Activation}' from directory.", this); + if (!cancellationToken.IsCancellationRequested) + { + _shared.Logger.LogError(exception, "Failed to unregister activation '{Activation}' from directory.", this); + } } } + else if (isDirectoryFailure) + { + // Optimization: forward to the same host to restart activation without needing to invalidate caches. 
+ ForwardingAddress ??= Address.SiloAddress; + } } catch (Exception ex) { @@ -1830,7 +1844,7 @@ private async Task FinishDeactivating(CancellationToken cancellationToken, Activ CatalogInstruments.ActivationShutdownViaCollection(); } - _shared.InternalRuntime.ActivationWorkingSet.OnDeactivated(this); + UnregisterMessageTarget(); try { @@ -1841,15 +1855,9 @@ private async Task FinishDeactivating(CancellationToken cancellationToken, Activ _shared.Logger.LogWarning(exception, "Exception disposing activation '{Activation}'.", this); } - UnregisterMessageTarget(); - // Signal deactivation GetDeactivationCompletionSource().TrySetResult(true); _workSignal.Signal(); - if (cancellationToken.IsCancellationRequested && stopwatch.Elapsed.TotalMilliseconds > 50) - { - _shared.Logger.LogInformation("Cancellation requested for deactivation {Activation} took {ElapsedMilliseconds:0.0}ms.", this, stopwatch.Elapsed.TotalMilliseconds); - } } private TaskCompletionSource GetDeactivationCompletionSource() @@ -2049,11 +2057,6 @@ public sealed class Delay(TimeSpan duration) : Command(new()) { public TimeSpan Duration { get; } = duration; } - - public sealed class UnregisterFromCatalog() : Command(new()) - { - public static readonly UnregisterFromCatalog Instance = new(); - } } internal class ReentrantRequestTracker : Dictionary diff --git a/src/Orleans.Runtime/Catalog/ActivationDirectory.cs b/src/Orleans.Runtime/Catalog/ActivationDirectory.cs index 0d3a9d7fd6..bde0450e69 100644 --- a/src/Orleans.Runtime/Catalog/ActivationDirectory.cs +++ b/src/Orleans.Runtime/Catalog/ActivationDirectory.cs @@ -35,12 +35,15 @@ public void RecordNewTarget(IGrainContext target) } } - public void RemoveTarget(IGrainContext target) + public bool RemoveTarget(IGrainContext target) { if (_activations.TryRemove(KeyValuePair.Create(target.GrainId, target))) { Interlocked.Decrement(ref _activationsCount); + return true; } + + return false; } public IEnumerator> GetEnumerator() => _activations.GetEnumerator(); diff 
--git a/src/Orleans.Runtime/Catalog/ActivationWorkingSet.cs b/src/Orleans.Runtime/Catalog/ActivationWorkingSet.cs index 0f36b959e9..b232c58bb0 100644 --- a/src/Orleans.Runtime/Catalog/ActivationWorkingSet.cs +++ b/src/Orleans.Runtime/Catalog/ActivationWorkingSet.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; +using System.Diagnostics; using System.Linq; using System.Threading; using System.Threading.Tasks; @@ -43,6 +44,7 @@ public ActivationWorkingSet( public void OnActivated(IActivationWorkingSetMember member) { + Debug.Assert(member is not ICollectibleGrainContext collectible || collectible.IsValid); if (_members.TryAdd(member, new MemberState())) { Interlocked.Increment(ref _activeCount); diff --git a/src/Orleans.Runtime/Catalog/Catalog.cs b/src/Orleans.Runtime/Catalog/Catalog.cs index d19e51e64b..56b6dedf27 100644 --- a/src/Orleans.Runtime/Catalog/Catalog.cs +++ b/src/Orleans.Runtime/Catalog/Catalog.cs @@ -22,7 +22,6 @@ internal sealed class Catalog : SystemTarget, ICatalog private readonly ActivationDirectory activations; private readonly IServiceProvider serviceProvider; private readonly ILogger logger; - private readonly IOptions collectionOptions; private readonly GrainContextActivator grainActivator; public Catalog( @@ -32,7 +31,6 @@ public Catalog( ActivationCollector activationCollector, IServiceProvider serviceProvider, ILoggerFactory loggerFactory, - IOptions collectionOptions, GrainContextActivator grainActivator) : base(Constants.CatalogType, localSiloDetails.SiloAddress, loggerFactory) { @@ -40,7 +38,6 @@ public Catalog( this.grainDirectoryResolver = grainDirectoryResolver; this.activations = activationDirectory; this.serviceProvider = serviceProvider; - this.collectionOptions = collectionOptions; this.grainActivator = grainActivator; this.logger = loggerFactory.CreateLogger(); this.activationCollector = activationCollector; @@ -70,20 +67,21 @@ public Catalog( /// public void 
UnregisterMessageTarget(IGrainContext activation) { - if (logger.IsEnabled(LogLevel.Trace)) + if (activations.RemoveTarget(activation)) { - logger.LogTrace("Unregistering activation {Activation}", activation.ToString()); - } + if (logger.IsEnabled(LogLevel.Trace)) + { + logger.LogTrace("Unregistered activation {Activation}", activation); + } - activations.RemoveTarget(activation); + // this should be removed once we've refactored the deactivation code path. For now safe to keep. + if (activation is ICollectibleGrainContext collectibleActivation) + { + activationCollector.TryCancelCollection(collectibleActivation); + } - // this should be removed once we've refactored the deactivation code path. For now safe to keep. - if (activation is ICollectibleGrainContext collectibleActivation) - { - activationCollector.TryCancelCollection(collectibleActivation); + CatalogInstruments.ActivationsDestroyed.Add(1); } - - CatalogInstruments.ActivationsDestroyed.Add(1); } /// @@ -161,7 +159,13 @@ public IGrainContext GetOrCreateActivation( if (!SiloStatusOracle.CurrentStatus.IsTerminating()) { - var address = GrainAddress.GetAddress(Silo, grainId, ActivationId.NewId()); + var address = new GrainAddress + { + SiloAddress = Silo, + GrainId = grainId, + ActivationId = ActivationId.NewId(), + MembershipVersion = MembershipVersion.MinValue, + }; result = this.grainActivator.CreateInstance(address); activations.RecordNewTarget(result); } @@ -182,17 +186,17 @@ public IGrainContext GetOrCreateActivation( } // Initialize the new activation asynchronously. 
- var cancellation = new CancellationTokenSource(collectionOptions.Value.ActivationTimeout); - result.Activate(requestContextData, cancellation.Token); + result.Activate(requestContextData); return result; [MethodImpl(MethodImplOptions.NoInlining)] static IGrainContext UnableToCreateActivation(Catalog self, GrainId grainId) { // Did not find and did not start placing new + var isTerminating = self.SiloStatusOracle.CurrentStatus.IsTerminating(); if (self.logger.IsEnabled(LogLevel.Debug)) { - if (self.SiloStatusOracle.CurrentStatus.IsTerminating()) + if (isTerminating) { self.logger.LogDebug((int)ErrorCode.CatalogNonExistingActivation2, "Unable to create activation for grain {GrainId} because this silo is terminating", grainId); } @@ -206,14 +210,17 @@ static IGrainContext UnableToCreateActivation(Catalog self, GrainId grainId) var grainLocator = self.serviceProvider.GetRequiredService(); grainLocator.InvalidateCache(grainId); + if (!isTerminating) + { + // Unregister the target activation so we don't keep getting spurious messages. + // The time delay (one minute, as of this writing) is to handle the unlikely but possible race where + // this request snuck ahead of another request, with new placement requested, for the same activation. + // If the activation registration request from the new placement somehow sneaks ahead of this deregistration, + // we want to make sure that we don't unregister the activation we just created. + var address = new GrainAddress { SiloAddress = self.Silo, GrainId = grainId }; + _ = self.UnregisterNonExistentActivation(address); + } - // Unregister the target activation so we don't keep getting spurious messages. - // The time delay (one minute, as of this writing) is to handle the unlikely but possible race where - // this request snuck ahead of another request, with new placement requested, for the same activation. 
- // If the activation registration request from the new placement somehow sneaks ahead of this deregistration, - // we want to make sure that we don't unregister the activation we just created. - var address = new GrainAddress { SiloAddress = self.Silo, GrainId = grainId }; - _ = self.UnregisterNonExistentActivation(address); return null; } } @@ -245,7 +252,7 @@ private bool TryGetGrainContext(GrainId grainId, out IGrainContext data) } /// - /// Gracefully deletes activations, putting it into a shutdown state to + /// Gracefully deactivates activations, waiting for them to /// complete and commit outstanding transactions before deleting it. /// To be called not from within Activation context, so can be awaited. /// @@ -254,26 +261,21 @@ internal async Task DeactivateActivations(DeactivationReason reason, List(list.Count); - foreach (var activation in list) + var options = new ParallelOptions { - activation.Deactivate(reason, cancellationToken); - tasks.Add(activation.Deactivated); - } - - await Task.WhenAll(tasks); - } - - internal void StartDeactivatingActivations(DeactivationReason reason, List list, CancellationToken cancellationToken) - { - if (list == null || list.Count == 0) return; - - if (logger.IsEnabled(LogLevel.Debug)) logger.LogDebug("DeactivateActivations: {Count} activations.", list.Count); - - foreach (var activation in list) + CancellationToken = CancellationToken.None, + MaxDegreeOfParallelism = Environment.ProcessorCount * 512 + }; + await Parallel.ForEachAsync(list, options, (activation, _) => { + if (activation.GrainId.Type.IsSystemTarget()) + { + return ValueTask.CompletedTask; + } + activation.Deactivate(reason, cancellationToken); - } + return new (activation.Deactivated); + }).WaitAsync(cancellationToken); } public async Task DeactivateAllActivations(CancellationToken cancellationToken) @@ -283,14 +285,24 @@ public async Task DeactivateAllActivations(CancellationToken cancellationToken) 
logger.LogDebug((int)ErrorCode.Catalog_DeactivateAllActivations, "DeactivateAllActivations."); } - var activationsToShutdown = new List(); - foreach (var pair in activations) + if (logger.IsEnabled(LogLevel.Debug)) logger.LogDebug("DeactivateActivations: {Count} activations.", activations.Count); + var reason = new DeactivationReason(DeactivationReasonCode.ShuttingDown, "This process is terminating."); + var options = new ParallelOptions { - activationsToShutdown.Add(pair.Value); - } + CancellationToken = CancellationToken.None, + MaxDegreeOfParallelism = Environment.ProcessorCount * 512 + }; + await Parallel.ForEachAsync(activations, options, (kv, _) => + { + if (kv.Key.IsSystemTarget()) + { + return ValueTask.CompletedTask; + } - var reason = new DeactivationReason(DeactivationReasonCode.ShuttingDown, "This process is terminating."); - await DeactivateActivations(reason, activationsToShutdown, cancellationToken).WaitAsync(cancellationToken); + var activation = kv.Value; + activation.Deactivate(reason, cancellationToken); + return new (activation.Deactivated); + }).WaitAsync(cancellationToken); } public SiloStatus LocalSiloStatus @@ -301,20 +313,20 @@ public SiloStatus LocalSiloStatus } } - public Task DeleteActivations(List addresses, DeactivationReasonCode reasonCode, string reasonText) + public async Task DeleteActivations(List addresses, DeactivationReasonCode reasonCode, string reasonText) { var tasks = new List(addresses.Count); var deactivationReason = new DeactivationReason(reasonCode, reasonText); - foreach (var activationAddress in addresses) + await Parallel.ForEachAsync(addresses, (activationAddress, cancellationToken) => { if (TryGetGrainContext(activationAddress.GrainId, out var grainContext)) { grainContext.Deactivate(deactivationReason); - tasks.Add(grainContext.Deactivated); + return new ValueTask(grainContext.Deactivated); } - } - return Task.WhenAll(tasks); + return ValueTask.CompletedTask; + }); } // TODO move this logic in the 
LocalGrainDirectory @@ -330,7 +342,7 @@ internal void OnSiloStatusChange(SiloAddress updatedSilo, SiloStatus status) if (!status.IsTerminating()) return; if (status == SiloStatus.Dead) { - this.RuntimeClient.BreakOutstandingMessagesToDeadSilo(updatedSilo); + this.RuntimeClient.BreakOutstandingMessagesToSilo(updatedSilo); } var activationsToShutdown = new List(); @@ -347,7 +359,7 @@ internal void OnSiloStatusChange(SiloAddress updatedSilo, SiloStatus status) var activationData = activation.Value; var placementStrategy = activationData.GetComponent(); var isUsingGrainDirectory = placementStrategy is { IsUsingGrainDirectory: true }; - if (!isUsingGrainDirectory || !grainDirectoryResolver.IsUsingDhtDirectory(activationData.GrainId.Type)) continue; + if (!isUsingGrainDirectory || !grainDirectoryResolver.IsUsingDefaultDirectory(activationData.GrainId.Type)) continue; if (!updatedSilo.Equals(directory.GetPrimaryForGrain(activationData.GrainId))) continue; activationsToShutdown.Add(activationData); @@ -381,6 +393,18 @@ internal void OnSiloStatusChange(SiloAddress updatedSilo, SiloStatus status) StartDeactivatingActivations(reason, activationsToShutdown, CancellationToken.None); } } + + void StartDeactivatingActivations(DeactivationReason reason, List list, CancellationToken cancellationToken) + { + if (list == null || list.Count == 0) return; + + if (logger.IsEnabled(LogLevel.Debug)) logger.LogDebug("DeactivateActivations: {Count} activations.", list.Count); + + foreach (var activation in list) + { + activation.Deactivate(reason, cancellationToken); + } + } } } } diff --git a/src/Orleans.Core/SystemTargetInterfaces/ICatalog.cs b/src/Orleans.Runtime/Catalog/ICatalog.cs similarity index 99% rename from src/Orleans.Core/SystemTargetInterfaces/ICatalog.cs rename to src/Orleans.Runtime/Catalog/ICatalog.cs index b8fcbc1bc9..fd716c27b7 100644 --- a/src/Orleans.Core/SystemTargetInterfaces/ICatalog.cs +++ b/src/Orleans.Runtime/Catalog/ICatalog.cs @@ -1,7 +1,6 @@ using 
System.Collections.Generic; using System.Threading.Tasks; - namespace Orleans.Runtime { /// diff --git a/src/Orleans.Runtime/Core/InsideRuntimeClient.cs b/src/Orleans.Runtime/Core/InsideRuntimeClient.cs index 922fe223d2..d3069636fc 100644 --- a/src/Orleans.Runtime/Core/InsideRuntimeClient.cs +++ b/src/Orleans.Runtime/Core/InsideRuntimeClient.cs @@ -506,7 +506,7 @@ private Task OnRuntimeInitializeStart(CancellationToken tc) return Task.CompletedTask; } - public void BreakOutstandingMessagesToDeadSilo(SiloAddress deadSilo) + public void BreakOutstandingMessagesToSilo(SiloAddress deadSilo) { foreach (var callback in callbacks) { diff --git a/src/Orleans.Runtime/GrainDirectory/DirectoryMembershipService.cs b/src/Orleans.Runtime/GrainDirectory/DirectoryMembershipService.cs new file mode 100644 index 0000000000..cdf3585dc6 --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/DirectoryMembershipService.cs @@ -0,0 +1,92 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using Orleans.Internal; +using Orleans.Runtime.Internal; +using Orleans.Runtime.Utilities; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +internal sealed partial class DirectoryMembershipService : IAsyncDisposable +{ + private readonly IInternalGrainFactory _grainFactory; + private readonly ILogger _logger; + private readonly CancellationTokenSource _shutdownCts = new(); + private readonly Task _runTask; + private readonly AsyncEnumerable _viewUpdates; + + public DirectoryMembershipSnapshot CurrentView { get; private set; } = DirectoryMembershipSnapshot.Default; + + public IAsyncEnumerable ViewUpdates => _viewUpdates; + + public ClusterMembershipService ClusterMembershipService { get; } + + public async ValueTask RefreshViewAsync(MembershipVersion version, CancellationToken cancellationToken) + { + _ = ClusterMembershipService.Refresh(version, cancellationToken); + if 
(CurrentView.Version <= version) + { + await foreach (var view in _viewUpdates.WithCancellation(cancellationToken)) + { + if (view.Version >= version) + { + break; + } + } + } + + return CurrentView; + } + + public DirectoryMembershipService(ClusterMembershipService clusterMembershipService, IInternalGrainFactory grainFactory, ILogger logger) + { + _viewUpdates = new( + DirectoryMembershipSnapshot.Default, + (previous, proposed) => proposed.Version >= previous.Version, + update => CurrentView = update); + ClusterMembershipService = clusterMembershipService; + _grainFactory = grainFactory; + _logger = logger; + using var _ = new ExecutionContextSuppressor(); + _runTask = Task.Run(ProcessMembershipUpdates); + } + + private async Task ProcessMembershipUpdates() + { + try + { + while (!_shutdownCts.IsCancellationRequested) + { + try + { + await foreach (var update in ClusterMembershipService.MembershipUpdates.WithCancellation(_shutdownCts.Token)) + { + var view = new DirectoryMembershipSnapshot(update, _grainFactory); + _viewUpdates.Publish(view); + } + } + catch (Exception exception) + { + if (!_shutdownCts.IsCancellationRequested) + { + _logger.LogError(exception, "Error processing membership updates."); + } + } + } + } + finally + { + _viewUpdates.Dispose(); + } + } + + public async ValueTask DisposeAsync() + { + _shutdownCts.Cancel(); + await _runTask.SuppressThrowing(); + } +} diff --git a/src/Orleans.Runtime/GrainDirectory/DirectoryMembershipSnapshot.cs b/src/Orleans.Runtime/GrainDirectory/DirectoryMembershipSnapshot.cs new file mode 100644 index 0000000000..62e80a3179 --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/DirectoryMembershipSnapshot.cs @@ -0,0 +1,306 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using 
Microsoft.CodeAnalysis; +using Orleans.Configuration; +using Orleans.Runtime.Utilities; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +internal sealed class DirectoryMembershipSnapshot +{ + internal const int PartitionsPerSilo = ConsistentRingOptions.DEFAULT_NUM_VIRTUAL_RING_BUCKETS; + private readonly ImmutableArray<(uint Start, int MemberIndex, int PartitionIndex)> _ringBoundaries; + private readonly RingRangeCollection[] _rangesByMember; + private readonly ImmutableArray> _partitionsByMember; + private readonly ImmutableArray> _rangesByMemberPartition; + + public DirectoryMembershipSnapshot(ClusterMembershipSnapshot snapshot, IInternalGrainFactory grainFactory) : this(snapshot, grainFactory, static (silo, count) => silo.GetUniformHashCodes(count)) + { + } + + internal DirectoryMembershipSnapshot(ClusterMembershipSnapshot snapshot, IInternalGrainFactory grainFactory, Func getRingBoundaries) + { + var sortedActiveMembers = ImmutableArray.CreateBuilder(snapshot.Members.Count(static m => m.Value.Status == SiloStatus.Active)); + foreach (var member in snapshot.Members) + { + // Only active members are part of directory membership. 
+ if (member.Value.Status == SiloStatus.Active) + { + sortedActiveMembers.Add(member.Key); + } + } + + sortedActiveMembers.Sort(static (left, right) => left.CompareTo(right)); + var hashIndexPairs = ImmutableArray.CreateBuilder<(uint Hash, int MemberIndex, int PartitionIndex)>(PartitionsPerSilo * sortedActiveMembers.Count); + var memberPartitions = ImmutableArray.CreateBuilder>(); + for (var memberIndex = 0; memberIndex < sortedActiveMembers.Count; memberIndex++) + { + var activeMember = sortedActiveMembers[memberIndex]; + var hashCodes = getRingBoundaries(activeMember, PartitionsPerSilo).ToList(); + hashCodes.Sort(); + Debug.Assert(hashCodes.Count == PartitionsPerSilo); + var partitionReferences = ImmutableArray.CreateBuilder(PartitionsPerSilo); + for (var partitionIndex = 0; partitionIndex < hashCodes.Count; partitionIndex++) + { + var hashCode = hashCodes[partitionIndex]; + hashIndexPairs.Add((hashCode, memberIndex, partitionIndex)); + partitionReferences.Add(grainFactory?.GetSystemTarget(GrainDirectoryReplica.CreateGrainId(activeMember, partitionIndex).GrainId)!); + } + + memberPartitions.Add(partitionReferences.ToImmutable()); + } + + _partitionsByMember = memberPartitions.ToImmutable(); + + hashIndexPairs.Sort(static (left, right) => + { + var hashCompare = left.Hash.CompareTo(right.Hash); + if (hashCompare != 0) + { + return hashCompare; + } + + var partitionCompare = left.PartitionIndex.CompareTo(right.PartitionIndex); + if (partitionCompare != 0) + { + return partitionCompare; + } + + return left.MemberIndex.CompareTo(right.MemberIndex); + }); + + Dictionary.Builder> rangesByMemberPartitionBuilders = []; + for (var i = 0; i < hashIndexPairs.Count; i++) + { + var (_, memberIndex, _) = hashIndexPairs[i]; + ref var builder = ref CollectionsMarshal.GetValueRefOrAddDefault(rangesByMemberPartitionBuilders, memberIndex, out _); + builder ??= ImmutableArray.CreateBuilder(PartitionsPerSilo); + var (entryStart, _, _) = hashIndexPairs[i]; + var (nextStart, _, _) = 
hashIndexPairs[(i + 1) % hashIndexPairs.Count]; + var range = (entryStart == nextStart) switch + { + true when hashIndexPairs.Count == 1 => RingRange.Full, + true => RingRange.Empty, + _ => RingRange.Create(entryStart, nextStart) + }; + builder.Add(range); + } + + var rangesByMemberPartition = ImmutableArray.CreateBuilder>(sortedActiveMembers.Count); + for (var i = 0; i < sortedActiveMembers.Count; i++) + { + rangesByMemberPartition.Add(rangesByMemberPartitionBuilders[i].ToImmutable()); + } + + _rangesByMemberPartition = rangesByMemberPartition.ToImmutable(); + + // Remove empty ranges. + if (hashIndexPairs.Count > 1) + { + for (var i = 1; i < hashIndexPairs.Count;) + { + if (hashIndexPairs[i].Hash == hashIndexPairs[i - 1].Hash) + { + hashIndexPairs.RemoveAt(i); + } + else + { + i++; + } + } + } + + _ringBoundaries = hashIndexPairs.ToImmutable(); + + Members = sortedActiveMembers.ToImmutable(); + + _rangesByMember = new RingRangeCollection[Members.Length]; + ClusterMembershipSnapshot = snapshot; + } + + public static DirectoryMembershipSnapshot Default { get; } = new DirectoryMembershipSnapshot( + new ClusterMembershipSnapshot(ImmutableDictionary.Empty, MembershipVersion.MinValue), null!); + + public MembershipVersion Version => ClusterMembershipSnapshot.Version; + + public ImmutableArray Members { get; } + + public RingRange GetRange(SiloAddress address, int partitionIndex) + { + ArgumentOutOfRangeException.ThrowIfLessThan(partitionIndex, 0); + ArgumentOutOfRangeException.ThrowIfGreaterThan(partitionIndex, PartitionsPerSilo - 1); + + var memberIndex = TryGetMemberIndex(address); + if (memberIndex < 0) + { + return RingRange.Empty; + } + + var ranges = GetMemberRangesByPartition(memberIndex); + if (partitionIndex >= ranges.Length) + { + return RingRange.Empty; + } + + return ranges[partitionIndex]; + } + + public RingRangeCollection GetMemberRanges(SiloAddress address) + { + var memberIndex = TryGetMemberIndex(address); + + if (memberIndex < 0) + { + return 
RingRangeCollection.Empty; + } + + var range = _rangesByMember[memberIndex]; + if (range.IsDefault) + { + range = _rangesByMember[memberIndex] = RingRangeCollection.Create(GetMemberRangesByPartition(memberIndex)); + } + + return range; + } + + public ImmutableArray GetMemberRangesByPartition(SiloAddress address) + { + var memberIndex = TryGetMemberIndex(address); + + if (memberIndex < 0) + { + return []; + } + + return GetMemberRangesByPartition(memberIndex); + } + + private ImmutableArray GetMemberRangesByPartition(int memberIndex) + { + ArgumentOutOfRangeException.ThrowIfLessThan(memberIndex, 0); + ArgumentOutOfRangeException.ThrowIfGreaterThanOrEqual(memberIndex, _rangesByMemberPartition.Length); + return _rangesByMemberPartition[memberIndex]; + } + + public RangeCollection RangeOwners => new(this); + + public ClusterMembershipSnapshot ClusterMembershipSnapshot { get; } + + private (RingRange Range, int MemberIndex, int PartitionIndex) GetRangeInfo(int index) + { + ArgumentOutOfRangeException.ThrowIfGreaterThanOrEqual(index, _ringBoundaries.Length); + ArgumentOutOfRangeException.ThrowIfLessThan(index, 0); + + var range = GetRangeCore(index); + var boundary = _ringBoundaries[index]; + return (range, boundary.MemberIndex, boundary.PartitionIndex); + } + + private RingRange GetRangeCore(int index) + { + ArgumentOutOfRangeException.ThrowIfGreaterThanOrEqual(index, _ringBoundaries.Length); + ArgumentOutOfRangeException.ThrowIfLessThan(index, 0); + + var (entryStart, _, _) = _ringBoundaries[index]; + var (nextStart, _, _) = _ringBoundaries[(index + 1) % _ringBoundaries.Length]; + if (entryStart == nextStart) + { + // A boundary equal to its successor is either the only boundary on the ring or a hash collision. + if (_ringBoundaries.Length == 1) + { + return RingRange.Full; + } + else + { + // Handle hash collisions by making subsequent adjacent ranges empty. 
+ return RingRange.Empty; + } + } + + return RingRange.Create(entryStart, nextStart); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private int TryGetMemberIndex(SiloAddress? address) + { + if (address is null) + { + return -1; + } + + return SearchAlgorithms.BinarySearch( + Members.Length, + (this, address), + static (index, state) => + { + var (snapshot, address) = state; + var candidate = snapshot.Members[index]; + return candidate.CompareTo(address); + }); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryGetOwner(GrainId grainId, [NotNullWhen(true)] out SiloAddress? owner, [NotNullWhen(true)] out IGrainDirectoryPartition? partitionReference) => TryGetOwner(grainId.GetUniformHashCode(), out owner, out partitionReference); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryGetOwner(uint hashCode, [NotNullWhen(true)] out SiloAddress? owner, [NotNullWhen(true)] out IGrainDirectoryPartition? partitionReference) + { + var index = SearchAlgorithms.RingRangeBinarySearch( + _ringBoundaries.Length, + this, + static (collection, index) => collection.GetRangeCore(index), + hashCode); + if (index >= 0) + { + var (_, memberIndex, partitionIndex) = _ringBoundaries[index]; + owner = Members[memberIndex]; + partitionReference = _partitionsByMember[memberIndex][partitionIndex]; + return true; + } + + Debug.Assert(Members.Length == 0); + owner = null; + partitionReference = null; + return false; + } + + public readonly struct RangeCollection(DirectoryMembershipSnapshot snapshot) : IReadOnlyList<(RingRange Range, int MemberIndex, int PartitionIndex)> + { + public int Count => snapshot._ringBoundaries.Length; + + public (RingRange Range, int MemberIndex, int PartitionIndex) this[int index] => snapshot.GetRangeInfo(index); + + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); + IEnumerator<(RingRange Range, int MemberIndex, int PartitionIndex)> IEnumerable<(RingRange Range, int MemberIndex, int 
PartitionIndex)>.GetEnumerator() => GetEnumerator(); + public RangeCollectionEnumerator GetEnumerator() => new(snapshot); + + public struct RangeCollectionEnumerator(DirectoryMembershipSnapshot snapshot) : IEnumerator<(RingRange Range, int MemberIndex, int PartitionIndex)> + { + private int _index = 0; + public readonly (RingRange Range, int MemberIndex, int PartitionIndex) Current => snapshot.GetRangeInfo(_index - 1); + readonly (RingRange Range, int MemberIndex, int PartitionIndex) IEnumerator<(RingRange Range, int MemberIndex, int PartitionIndex)>.Current => Current; + readonly object IEnumerator.Current => Current; + + public void Dispose() => _index = int.MaxValue; + public bool MoveNext() + { + if (_index >= 0 && _index++ < snapshot._ringBoundaries.Length) + { + return true; + } + + return false; + } + + public void Reset() => _index = 0; + } + } +} diff --git a/src/Orleans.Runtime/GrainDirectory/DirectoryResult.cs b/src/Orleans.Runtime/GrainDirectory/DirectoryResult.cs new file mode 100644 index 0000000000..13489730e0 --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/DirectoryResult.cs @@ -0,0 +1,32 @@ +using System.Diagnostics.CodeAnalysis; + +#nullable enable +namespace Orleans.Runtime; + +internal static class DirectoryResult +{ + public static DirectoryResult FromResult(T result, MembershipVersion version) => new DirectoryResult(result, version); + public static DirectoryResult RefreshRequired(MembershipVersion version) => new DirectoryResult(default, version); +} + +[GenerateSerializer, Alias("DirectoryResult`1"), Immutable] +internal readonly struct DirectoryResult(T? result, MembershipVersion version) +{ + [Id(0)] + private readonly T? _result = result; + + [Id(1)] + public readonly MembershipVersion Version = version; + + public bool TryGetResult(MembershipVersion version, [NotNullWhen(true)] out T? 
result) + { + if (Version == version) + { + result = _result!; + return true; + } + + result = default; + return false; + } +} diff --git a/src/Orleans.Runtime/GrainDirectory/DistributedGrainDirectory.cs b/src/Orleans.Runtime/GrainDirectory/DistributedGrainDirectory.cs new file mode 100644 index 0000000000..3d2612946c --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/DistributedGrainDirectory.cs @@ -0,0 +1,381 @@ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Orleans.Concurrency; +using Orleans.GrainDirectory; +using Orleans.Internal; +using Orleans.Runtime.Internal; +using Orleans.Runtime.Scheduler; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +internal sealed partial class DistributedGrainDirectory : SystemTarget, IGrainDirectory, IGrainDirectoryClient, ILifecycleParticipant, DistributedGrainDirectory.ITestHooks +{ + private readonly DirectoryMembershipService _membershipService; + private readonly ILogger _logger; + private readonly IServiceProvider _serviceProvider; + private readonly ImmutableArray _partitions; + private readonly CancellationTokenSource _stoppedCts = new(); + + internal CancellationToken OnStoppedToken => _stoppedCts.Token; + internal ClusterMembershipSnapshot ClusterMembershipSnapshot => _membershipService.CurrentView.ClusterMembershipSnapshot; + + // The recovery membership value is used to avoid a race between concurrent registration & recovery operations which could lead to lost registrations. + // This could occur when a new activation is created and begins registering itself with a host which crashes. Concurrently, the new owner initiates + // recovery and asks all silos for their activations. 
When this silo processes this request, it will have the activation in its internal + // 'ActivationDirectory' even though these activations may not yet have completed registration. Therefore, multiple silos may return an entry for the same + // grain. By ensuring that any registration occurred at a version at least as high as the recovery version, we avoid this issue. This could be made more + // precise by also tracking the sets of ranges which need to be recovered, but that complicates things somewhat since it would require tracking the ranges + // for each recovery version. + private long _recoveryMembershipVersion; + private Task _runTask = Task.CompletedTask; + + public DistributedGrainDirectory( + DirectoryMembershipService membershipService, + ILogger logger, + ILocalSiloDetails localSiloDetails, + ILoggerFactory loggerFactory, + IServiceProvider serviceProvider, + IInternalGrainFactory grainFactory) : base(Constants.GrainDirectory, localSiloDetails.SiloAddress, loggerFactory) + { + _serviceProvider = serviceProvider; + _membershipService = membershipService; + _logger = logger; + var partitions = ImmutableArray.CreateBuilder(DirectoryMembershipSnapshot.PartitionsPerSilo); + for (var i = 0; i < DirectoryMembershipSnapshot.PartitionsPerSilo; i++) + { + partitions.Add(new GrainDirectoryReplica(i, this, localSiloDetails, loggerFactory, serviceProvider, grainFactory)); + } + + _partitions = partitions.ToImmutable(); + } + + public async Task Lookup(GrainId grainId) => await InvokeAsync( + grainId, + static (replica, version, grainId, cancellationToken) => replica.LookupAsync(version, grainId), + grainId, + CancellationToken.None); + + public async Task Register(GrainAddress address) => await InvokeAsync( + address.GrainId, + static (replica, version, address, cancellationToken) => replica.RegisterAsync(version, address, null), + address, + CancellationToken.None); + + public async Task Unregister(GrainAddress address) => await InvokeAsync( + address.GrainId, + 
static (replica, version, address, cancellationToken) => replica.DeregisterAsync(version, address), + address, + CancellationToken.None); + + public async Task Register(GrainAddress address, GrainAddress? previousAddress) => await InvokeAsync( + address.GrainId, + static (replica, version, state, cancellationToken) => replica.RegisterAsync(version, state.Address, state.PreviousAddress), + (Address: address, PreviousAddress: previousAddress), + CancellationToken.None); + + public Task UnregisterSilos(List siloAddresses) => Task.CompletedTask; + + private async Task InvokeAsync( + GrainId grainId, + Func>> func, + TState state, + CancellationToken cancellationToken, + [CallerMemberName] string operation = "") + { + DirectoryResult invokeResult; + var view = _membershipService.CurrentView; + var attempts = 0; + const int MaxAttempts = 10; + var delay = TimeSpan.FromMilliseconds(10); + while (true) + { + cancellationToken.ThrowIfCancellationRequested(); + var initialRecoveryMembershipVersion = _recoveryMembershipVersion; + if (view.Version.Value < initialRecoveryMembershipVersion || !view.TryGetOwner(grainId, out var owner, out var partitionReference)) + { + // If there are no members, bail out with the default return value. 
+ if (view.Members.Length == 0 && view.Version.Value > 0) + { + return default!; + } + + var targetVersion = Math.Max(view.Version.Value + 1, initialRecoveryMembershipVersion); + view = await _membershipService.RefreshViewAsync(new(targetVersion), cancellationToken); + continue; + } + +#if false + if (logger.IsEnabled(LogLevel.Trace)) + { + logger.LogTrace("Invoking '{Operation}' on '{Owner}' for grain '{GrainId}'.", operation, owner, grainId); + } +#endif + + try + { + RequestContext.Set("gid", partitionReference.GetGrainId()); + invokeResult = await func(partitionReference, view.Version, state, cancellationToken); + } + catch (OrleansMessageRejectionException) when (attempts < MaxAttempts && !cancellationToken.IsCancellationRequested) + { + // This likely indicates that the target silo has been declared dead. + ++attempts; + await Task.Delay(delay); + delay *= 1.5; + continue; + } + + if (initialRecoveryMembershipVersion != _recoveryMembershipVersion) + { + // If the recovery version changed, perform a view refresh and re-issue the operation. + // See the comment on the declaration of '_recoveryMembershipVersion' for more details. + continue; + } + + if (!invokeResult.TryGetResult(view.Version, out var result)) + { + // The remote replica has a newer view of membership and is no longer the owner of the grain specified in the request. + // Refresh membership and re-evaluate. 
+ view = await _membershipService.RefreshViewAsync(invokeResult.Version, cancellationToken); + continue; + } + + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("Invoked '{Operation}' on '{Owner}' for grain '{GrainId}' and received result '{Result}'.", operation, owner, grainId, result); + } + + return result; + } + } + + public async ValueTask>> GetRegisteredActivations(MembershipVersion membershipVersion, RingRange range, bool isValidation) + { + if (!isValidation && _logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Collecting registered activations for range {Range} at version {MembershipVersion}.", range, membershipVersion); + } + + var recoveryMembershipVersion = _recoveryMembershipVersion; + if (recoveryMembershipVersion < membershipVersion.Value) + { + // Ensure that the value is immediately visible to any thread registering an activation. + Interlocked.CompareExchange(ref _recoveryMembershipVersion, membershipVersion.Value, recoveryMembershipVersion); + } + + var localActivations = _serviceProvider.GetRequiredService(); + var grainDirectoryResolver = _serviceProvider.GetRequiredService(); + List result = []; + List deactivationTasks = []; + var stopwatch = CoarseStopwatch.StartNew(); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + foreach (var (grainId, activation) in localActivations) + { + var directory = GetGrainDirectory(activation, grainDirectoryResolver); + if (directory is not null && directory == this) + { + var address = activation.Address; + if (!range.Contains(address.GrainId)) + { + continue; + } + + if (address.MembershipVersion == MembershipVersion.MinValue + || activation is ActivationData activationData && !activationData.IsValid) + { + // Validation does not require that the grain is deactivated, skip it instead. + //if (isValidation) continue; + + try + { + // This activation has not completed registration or is not currently active. 
+ // Abort the activation with a pre-canceled cancellation token so that it skips directory deregistration. + // TODO: Expand validity check to non-ActivationData activations. + //logger.LogWarning("Deactivating activation '{Activation}' due to failure of a directory range owner.", activation); + activation.Deactivate(new DeactivationReason(DeactivationReasonCode.DirectoryFailure, "This activation's directory partition was salvaged while registration status was in-doubt."), cts.Token); + deactivationTasks.Add(activation.Deactivated); + } + catch (Exception exception) + { + _logger.LogWarning(exception, "Failed to deactivate activation {Activation}", activation); + } + } + else + { + if (!isValidation) + { + _logger.LogTrace("Sending activation '{Activation}' for recovery because its in the requested range {Range} (version {Version}).", activation.GrainId, range, membershipVersion); + } + + result.Add(activation.Address); + } + } + } + + await Task.WhenAll(deactivationTasks); + + if (!isValidation && _logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug( + "Submitting {Count} registered activations for range {Range} at version {MembershipVersion}. Deactivated {DeactivationCount} in-doubt registrations. Took {ElapsedMilliseconds}ms", + result.Count, + range, + membershipVersion, + deactivationTasks.Count, + stopwatch.ElapsedMilliseconds); + } + + return result.AsImmutable(); + + static IGrainDirectory? 
GetGrainDirectory(IGrainContext grainContext, GrainDirectoryResolver grainDirectoryResolver) + { + if (grainContext is ActivationData activationData) + { + return activationData.Shared.GrainDirectory; + } + else if (grainContext is SystemTarget systemTarget) + { + return null; + } + else if (grainContext.GetComponent() is { IsUsingGrainDirectory: true }) + { + return grainDirectoryResolver.Resolve(grainContext.GrainId.Type); + } + + return null; + } + } + + internal ValueTask RefreshViewAsync(MembershipVersion version, CancellationToken cancellationToken) => _membershipService.RefreshViewAsync(version, cancellationToken); + + void ILifecycleParticipant.Participate(ISiloLifecycle observer) + { + observer.Subscribe(nameof(DistributedGrainDirectory), ServiceLifecycleStage.RuntimeInitialize, OnRuntimeInitializeStart, OnRuntimeInitializeStop); + + // Transition into 'ShuttingDown'/'Stopping' stage, removing ourselves from directory membership, but allow some time for hand-off before transitioning to 'Dead'. + observer.Subscribe(nameof(DistributedGrainDirectory), ServiceLifecycleStage.BecomeActive - 1, _ => Task.CompletedTask, OnShuttingDown); + + Task OnRuntimeInitializeStart(CancellationToken cancellationToken) + { + var catalog = _serviceProvider.GetRequiredService(); + catalog.RegisterSystemTarget(this); + foreach (var partition in _partitions) + { + catalog.RegisterSystemTarget(partition); + } + + using var _ = new ExecutionContextSuppressor(); + WorkItemGroup.QueueAction(() => _runTask = ProcessMembershipUpdates()); + + return Task.CompletedTask; + } + + async Task OnRuntimeInitializeStop(CancellationToken cancellationToken) + { + _stoppedCts.Cancel(); + if (_runTask is { } task) + { + // Try to wait for hand-off to complete. 
+ await this.RunOrQueueTask(async () => await task.WaitAsync(cancellationToken).SuppressThrowing()); + } + } + + async Task OnShuttingDown(CancellationToken token) + { + var tasks = new List(_partitions.Length); + foreach (var partition in _partitions) + { + tasks.Add(partition.OnShuttingDown(token)); + } + await Task.WhenAll(tasks).SuppressThrowing(); + } + } + + private async Task ProcessMembershipUpdates() + { + // Ensure all child tasks are completed before exiting, tracking them here. + List tasks = []; + var previousUpdate = ClusterMembershipSnapshot.Default; + while (!_stoppedCts.IsCancellationRequested) + { + try + { + await foreach (var update in _membershipService.ViewUpdates.WithCancellation(_stoppedCts.Token)) + { + tasks.RemoveAll(t => t.IsCompleted); + var changes = update.ClusterMembershipSnapshot.CreateUpdate(previousUpdate); + + foreach (var change in changes.Changes) + { + if (change.Status == SiloStatus.Dead) + { + foreach (var partition in _partitions) + { + tasks.Add(partition.OnSiloRemovedFromClusterAsync(change)); + } + } + } + + var current = update; + + foreach (var partition in _partitions) + { + tasks.Add(partition.ProcessMembershipUpdateAsync(current)); + } + + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Updated view from '{PreviousVersion}' to '{Version}'.", previousUpdate.Version, update.Version); + } + + previousUpdate = update.ClusterMembershipSnapshot; + } + } + catch (Exception exception) + { + if (!_stoppedCts.IsCancellationRequested) + { + _logger.LogError(exception, "Error processing membership updates."); + } + } + } + + await Task.WhenAll(tasks).SuppressThrowing(); + } + + SiloAddress? 
ITestHooks.GetPrimaryForGrain(GrainId grainId) + { + _membershipService.CurrentView.TryGetOwner(grainId, out var owner, out _); + return owner; + } + + async Task ITestHooks.GetLocalRecord(GrainId grainId) + { + var view = _membershipService.CurrentView; + if (view.TryGetOwner(grainId, out var owner, out var partitionReference) && Silo.Equals(owner)) + { + var result = await partitionReference.LookupAsync(view.Version, grainId); + if (result.TryGetResult(view.Version, out var address)) + { + return address; + } + } + + return null; + } + + internal interface ITestHooks + { + SiloAddress? GetPrimaryForGrain(GrainId grainId); + Task GetLocalRecord(GrainId grainId); + } +} diff --git a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryHandoffManager.cs b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryHandoffManager.cs index 75a5d249ac..4fb70750f7 100644 --- a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryHandoffManager.cs +++ b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryHandoffManager.cs @@ -21,7 +21,7 @@ internal sealed class GrainDirectoryHandoffManager private readonly ISiloStatusOracle siloStatusOracle; private readonly IInternalGrainFactory grainFactory; private readonly ILogger logger; - private readonly Factory createPartion; + private readonly Factory createPartion; private readonly Queue<(string name, object state, Func action)> pendingOperations = new(); private readonly AsyncLock executorLock = new AsyncLock(); @@ -29,7 +29,7 @@ internal GrainDirectoryHandoffManager( LocalGrainDirectory localDirectory, ISiloStatusOracle siloStatusOracle, IInternalGrainFactory grainFactory, - Factory createPartion, + Factory createPartion, ILoggerFactory loggerFactory) { logger = loggerFactory.CreateLogger(); diff --git a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartitionSnapshot.cs b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartitionSnapshot.cs new file mode 100644 index 0000000000..05445df2ab --- /dev/null +++ 
b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartitionSnapshot.cs @@ -0,0 +1,16 @@ +using System.Collections.Generic; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +[GenerateSerializer, Alias(nameof(GrainDirectoryPartitionSnapshot)), Immutable] +internal sealed class GrainDirectoryPartitionSnapshot( + MembershipVersion directoryMembershipVersion, + List grainAddresses) +{ + [Id(0)] + public MembershipVersion DirectoryMembershipVersion { get; } = directoryMembershipVersion; + + [Id(1)] + public List GrainAddresses { get; } = grainAddresses; +} diff --git a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.Interface.cs b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.Interface.cs new file mode 100644 index 0000000000..cb40a7cf83 --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.Interface.cs @@ -0,0 +1,112 @@ +using System; +using System.Diagnostics; +using System.Runtime.InteropServices; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; + +#nullable enable + +namespace Orleans.Runtime.GrainDirectory; + +internal sealed partial class GrainDirectoryReplica +{ + async ValueTask> IGrainDirectoryPartition.RegisterAsync(MembershipVersion version, GrainAddress address, GrainAddress? currentRegistration) + { + ArgumentNullException.ThrowIfNull(address); + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("RegisterAsync('{Version}', '{Address}', '{ExistingAddress}')", version, address, currentRegistration); + } + + // Ensure that the current membership version is new enough. 
+ await WaitForRange(address.GrainId, version); + if (!IsOwner(CurrentView, address.GrainId)) + { + return DirectoryResult.RefreshRequired(CurrentView.Version); + } + + DebugAssertOwnership(address.GrainId); + return DirectoryResult.FromResult(RegisterCore(address, currentRegistration), version); + } + + async ValueTask> IGrainDirectoryPartition.LookupAsync(MembershipVersion version, GrainId grainId) + { + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("LookupAsync('{Version}', '{GrainId}')", version, grainId); + } + + // Ensure we can serve the request. + await WaitForRange(grainId, version); + if (!IsOwner(CurrentView, grainId)) + { + return DirectoryResult.RefreshRequired(CurrentView.Version); + } + + return DirectoryResult.FromResult(LookupCore(grainId), version); + } + + async ValueTask> IGrainDirectoryPartition.DeregisterAsync(MembershipVersion version, GrainAddress address) + { + ArgumentNullException.ThrowIfNull(address); + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("DeregisterAsync('{Version}', '{Address}')", version, address); + } + + await WaitForRange(address.GrainId, version); + if (!IsOwner(CurrentView, address.GrainId)) + { + return DirectoryResult.RefreshRequired(CurrentView.Version); + } + + DebugAssertOwnership(address.GrainId); + return DirectoryResult.FromResult(DeregisterCore(address), version); + } + + private bool DeregisterCore(GrainAddress address) + { + if (_directory.TryGetValue(address.GrainId, out var existing) && (existing.Matches(address) || IsSiloDead(existing))) + { + return _directory.Remove(address.GrainId); + } + + return false; + } + + internal GrainAddress? LookupCore(GrainId grainId) + { + if (_directory.TryGetValue(grainId, out var existing) && !IsSiloDead(existing)) + { + return existing; + } + + return null; + } + + private GrainAddress RegisterCore(GrainAddress newAddress, GrainAddress? 
existingAddress) + { + ref var existing = ref CollectionsMarshal.GetValueRefOrAddDefault(_directory, newAddress.GrainId, out _); + + if (existing is null || existing.Matches(existingAddress) || IsSiloDead(existing)) + { + if (newAddress.MembershipVersion != CurrentView.Version) + { + // Set the membership version to match the view number in which it was registered. + newAddress = new() + { + GrainId = newAddress.GrainId, + SiloAddress = newAddress.SiloAddress, + ActivationId = newAddress.ActivationId, + MembershipVersion = CurrentView.Version + }; + } + + existing = newAddress; + } + + return existing; + } + + private bool IsSiloDead(GrainAddress existing) => _owner.ClusterMembershipSnapshot.GetSiloStatus(existing.SiloAddress) == SiloStatus.Dead; +} diff --git a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.cs b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.cs new file mode 100644 index 0000000000..00a28cba4d --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryReplica.cs @@ -0,0 +1,771 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Globalization; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.CodeAnalysis; +using Microsoft.Extensions.Logging; +using Orleans.Concurrency; +using Orleans.Internal; +using Orleans.Runtime.Scheduler; +using Orleans.Runtime.Utilities; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +/// +/// Represents a single contiguous partition of the distributed grain directory. +/// +/// The index of this partition on this silo. Each silo hosts a fixed number of dynamically sized partitions. 
+internal sealed partial class GrainDirectoryReplica( + int partitionIndex, + DistributedGrainDirectory owner, + ILocalSiloDetails localSiloDetails, + ILoggerFactory loggerFactory, + IServiceProvider serviceProvider, + IInternalGrainFactory grainFactory) + : SystemTarget(CreateGrainId(localSiloDetails.SiloAddress, partitionIndex), localSiloDetails.SiloAddress, loggerFactory), IGrainDirectoryPartition, IGrainDirectoryTestHooks +{ + internal static SystemTargetGrainId CreateGrainId(SiloAddress siloAddress, int partitionIndex) => SystemTargetGrainId.Create(Constants.GrainDirectoryPartition, siloAddress, partitionIndex.ToString(CultureInfo.InvariantCulture)); + private readonly Dictionary _directory = []; + private readonly int _partitionIndex = partitionIndex; + private readonly DistributedGrainDirectory _owner = owner; + private readonly IServiceProvider _serviceProvider = serviceProvider; + private readonly IInternalGrainFactory _grainFactory = grainFactory; + private readonly CancellationTokenSource _drainSnapshotsCts = new(); + private readonly SiloAddress _id = localSiloDetails.SiloAddress; + private readonly ILogger _logger = loggerFactory.CreateLogger(); + private readonly TaskCompletionSource _snapshotsDrainedTcs = new(TaskCreationOptions.RunContinuationsAsynchronously); + private readonly AsyncEnumerable _viewUpdates = new( + DirectoryMembershipSnapshot.Default, + (previous, proposed) => proposed.Version >= previous.Version, + _ => { }); + + // Ranges which cannot be served currently, eg because the replica is currently transferring them from a previous owner. + // Requests in these ranges must wait for the range to become available. + private readonly List<(RingRange Range, MembershipVersion Version, TaskCompletionSource Completion)> _rangeLocks = []; + + // Ranges which were previously at least partially owned by this replica, but which are pending transfer to a new replica. 
+ private readonly List _partitionSnapshots = []; + + // Tracked for diagnostic purposes only. + private readonly List _viewChangeTasks = []; + private CancellationToken ShutdownToken => _owner.OnStoppedToken; + + private RingRange _currentRange; + + // The current directory membership snapshot. + public DirectoryMembershipSnapshot CurrentView { get; private set; } = DirectoryMembershipSnapshot.Default; + + public async ValueTask RefreshViewAsync(MembershipVersion version, CancellationToken cancellationToken) + { + _ = _owner.RefreshViewAsync(version, cancellationToken); + if (CurrentView.Version <= version) + { + await foreach (var view in _viewUpdates.WithCancellation(cancellationToken)) + { + if (view.Version >= version) + { + break; + } + } + } + + return CurrentView; + } + + async ValueTask IGrainDirectoryPartition.GetSnapshotAsync(MembershipVersion version, MembershipVersion rangeVersion, RingRange range) + { + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("GetSnapshotAsync('{Version}', '{RangeVersion}', '{Range}')", version, rangeVersion, range); + } + + // Wait for the range to be unlocked. + await WaitForRange(range, version); + + ShutdownToken.ThrowIfCancellationRequested(); + List partitionAddresses = []; + foreach (var partitionSnapshot in _partitionSnapshots) + { + if (partitionSnapshot.DirectoryMembershipVersion != rangeVersion) + { + continue; + } + + // Only include addresses which are in the requested range. 
+ foreach (var address in partitionSnapshot.GrainAddresses) + { + if (range.Contains(address.GrainId)) + { + partitionAddresses.Add(address); + } + } + + var rangeSnapshot = new GrainDirectoryPartitionSnapshot(rangeVersion, partitionAddresses); + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Transferring '{Count}' entries in range '{Range}' from version '{Version}' snapshot.", partitionAddresses.Count, range, rangeVersion); + } + + return rangeSnapshot; + } + + _logger.LogWarning("Received a request for a snapshot which this replica does not have, version '{Version}', range version '{RangeVersion}', range '{Range}'.", version, rangeVersion, range); + return null; + } + + ValueTask IGrainDirectoryPartition.AcknowledgeSnapshotTransferAsync(SiloAddress silo, int partitionIndex, MembershipVersion rangeVersion) + { + RemoveSnapshotTransferPartner((silo, partitionIndex), rangeVersion); + return new(true); + } + + private void RemoveSnapshotTransferPartner((SiloAddress Silo, int PartitionIndex) owner, MembershipVersion? rangeVersion) + { + for (var i = 0; i < _partitionSnapshots.Count; ++i) + { + var partitionSnapshot = _partitionSnapshots[i]; + if (rangeVersion.HasValue && partitionSnapshot.DirectoryMembershipVersion != rangeVersion.Value) + { + continue; + } + + var partners = partitionSnapshot.TransferPartners; + partners.RemoveWhere(p => p.SiloAddress.Equals(owner.Silo) && (owner.PartitionIndex < 0 || p.PartitionIndex == owner.PartitionIndex)); + if (partners.Count == 0) + { + _partitionSnapshots.RemoveAt(i); + --i; + + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Removing version '{Version}' snapshot. Current snapshots: [{CurrentSnapshots}].", partitionSnapshot.DirectoryMembershipVersion, string.Join(", ", _partitionSnapshots.Select(s => s.DirectoryMembershipVersion))); + } + + // If shutdown has been requested and there are no more pending snapshots, signal completion. 
+ if (_drainSnapshotsCts.IsCancellationRequested && _partitionSnapshots.Count == 0) + { + _snapshotsDrainedTcs.TrySetResult(); + } + } + } + } + + [Conditional("DEBUG")] + private void DebugAssertOwnership(GrainId grainId) => DebugAssertOwnership(CurrentView, grainId); + + [Conditional("DEBUG")] + private void DebugAssertOwnership(DirectoryMembershipSnapshot view, GrainId grainId) + { + if (!view.TryGetOwner(grainId, out var owner, out var partitionReference)) + { + Debug.Fail($"Could not find owner for grain '{grainId}' in view '{view}'."); + } + + if (!_id.Equals(owner)) + { + Debug.Fail($"'{_id}' expected to be the owner of grain '{grainId}', but the owner is '{owner}'."); + } + + if (!GrainId.Equals(partitionReference.GetGrainId())) + { + Debug.Fail($"'{GrainId}' expected to be the owner of grain '{grainId}', but the owner is '{partitionReference.GetGrainId()}'."); + } + } + + private bool IsOwner(DirectoryMembershipSnapshot view, GrainId grainId) => view.TryGetOwner(grainId, out _, out var partitionReference) && GrainId.Equals(partitionReference.GetGrainId()); + + private ValueTask WaitForRange(GrainId grainId, MembershipVersion version) => WaitForRange(RingRange.FromPoint(grainId.GetUniformHashCode()), version); + + private ValueTask WaitForRange(RingRange range, MembershipVersion version) + { + GrainRuntime.CheckRuntimeContext(this); + Task? completion = null; + if (CurrentView.Version < version || TryGetIntersectingLock(range, version, out completion)) + { + return WaitForRangeCore(range, version, completion); + } + + return ValueTask.CompletedTask; + + bool TryGetIntersectingLock(RingRange range, MembershipVersion version, [NotNullWhen(true)] out Task? 
completion) + { + foreach (var rangeLock in _rangeLocks) + { + if (rangeLock.Version <= version && range.Intersects(rangeLock.Range)) + { + completion = rangeLock.Completion.Task; + return true; + } + } + + completion = null; + return false; + } + + async ValueTask WaitForRangeCore(RingRange range, MembershipVersion version, Task? task) + { + if (task is not null) + { + await task; + } + + if (CurrentView.Version < version) + { + await RefreshViewAsync(version, ShutdownToken); + } + + while (TryGetIntersectingLock(range, version, out var completion)) + { + await completion.WaitAsync(ShutdownToken); + } + } + } + + public IGrainDirectoryPartition GetReplicaReference(SiloAddress address, int partitionIndex) => _grainFactory.GetSystemTarget(CreateGrainId(address, partitionIndex).GrainId); + + internal async Task OnShuttingDown(CancellationToken token) + { + await this.RunOrQueueTask(async () => + { + _drainSnapshotsCts.Cancel(); + if (_partitionSnapshots.Count > 0) + { + await _snapshotsDrainedTcs.Task.WaitAsync(token).SuppressThrowing(); + } + }); + } + internal Task OnSiloRemovedFromClusterAsync(ClusterMember change) => + this.QueueAction( + static state => state.Self.OnSiloRemovedFromCluster(state.Change), + (Self: this, Change: change), + nameof(OnSiloRemovedFromCluster)); + + private void OnSiloRemovedFromCluster(ClusterMember change) + { + GrainRuntime.CheckRuntimeContext(this); + var toRemove = new List(); + foreach (var entry in _directory) + { + if (change.SiloAddress.Equals(entry.Value.SiloAddress)) + { + toRemove.Add(entry.Value); + } + } + + if (toRemove.Count > 0) + { + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Deleting '{Count}' entries located on now-defunct silo '{SiloAddress}'.", toRemove.Count, change.SiloAddress); + } + + foreach (var grainAddress in toRemove) + { +#if false + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Deleting '{GrainAddress}' located on now-defunct silo '{SiloAddress}'.", grainAddress, 
change.SiloAddress); + } +#endif + DeregisterCore(grainAddress); + } + } + + RemoveSnapshotTransferPartner((change.SiloAddress, -1), rangeVersion: null); + } + + internal Task ProcessMembershipUpdateAsync(DirectoryMembershipSnapshot current) => + this.QueueAction( + static state => state.Self.ProcessMembershipUpdate(state.Current), + (Self: this, Current: current), + nameof(ProcessMembershipUpdate)); + + private void ProcessMembershipUpdate(DirectoryMembershipSnapshot current) + { + GrainRuntime.CheckRuntimeContext(this); + + _viewChangeTasks.RemoveAll(task => task.IsCompleted); + + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("Observed membership version '{Version}'.", current.Version); + } + + var previous = CurrentView; + CurrentView = current; + + var previousRange = previous.GetRange(_id, _partitionIndex); + _currentRange = current.GetRange(_id, _partitionIndex); + + // It is important that this method is synchronous, to ensure that updates are atomic. + var deltaSize = _currentRange.SizePercent - previousRange.SizePercent; + var meanSizePercent = current.Members.Length > 0 ? 100.0 / current.Members.Length : 0f; + var deviationFromMean = Math.Abs(meanSizePercent - _currentRange.SizePercent); + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Updating view from '{PreviousVersion}' to '{Version}'. Now responsible for '{Range}' (Δ {DeltaPercent:0.00}%. {DeviationFromMean:0.00}% from ideal share).", previous.Version, current.Version, _currentRange, deltaSize, deviationFromMean); + } + + var removedRange = previousRange.Difference(_currentRange).SingleOrDefault(); + var addedRange = _currentRange.Difference(previousRange).SingleOrDefault(); + +#if DEBUG + Debug.Assert(addedRange.IsEmpty ^ removedRange.IsEmpty || addedRange.IsEmpty && removedRange.IsEmpty); // Either the range grew or it shrank, but not both. 
+ Debug.Assert(previousRange.Difference(_currentRange).Count() < 2); + Debug.Assert(_currentRange.Difference(previousRange).Count() < 2); + Debug.Assert(_currentRange.Size == previousRange.Size + addedRange.Size - removedRange.Size); + Debug.Assert(!removedRange.Intersects(addedRange)); + Debug.Assert(!removedRange.Intersects(_currentRange)); + Debug.Assert(removedRange.IsEmpty || removedRange.Intersects(previousRange)); + Debug.Assert(!addedRange.Intersects(removedRange)); + Debug.Assert(addedRange.IsEmpty || addedRange.Intersects(_currentRange)); + Debug.Assert(!addedRange.Intersects(previousRange)); + Debug.Assert(previousRange.IsEmpty || _currentRange.IsEmpty || previousRange.Start == _currentRange.Start); +#endif + + if (!removedRange.IsEmpty) + { + _viewChangeTasks.Add(ReleaseRangeAsync(previous, current, removedRange)); + } + + if (!addedRange.IsEmpty) + { + _viewChangeTasks.Add(AcquireRangeAsync(previous, current, addedRange)); + } + + _viewUpdates.Publish(current); + } + + private async Task ReleaseRangeAsync(DirectoryMembershipSnapshot previous, DirectoryMembershipSnapshot current, RingRange removedRange) + { + GrainRuntime.CheckRuntimeContext(this); + var (tcs, sw) = LockRange(removedRange, current.Version); + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Relinquishing ownership of range '{Range}'.", removedRange); + } + + try + { + // Snapshot & remove everything not in the current range. + // The new owner will have the opportunity to retrieve the snapshot as they take ownership. + List removedAddresses = []; + HashSet<(SiloAddress, int)> transferPartners = []; + + // Wait for the range being removed to become valid. 
+ await WaitForRange(removedRange, previous.Version); + + GrainRuntime.CheckRuntimeContext(this); + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("Relinquishing ownership of range '{Range}'.", removedRange); + } + + foreach (var (range, ownerIndex, partitionIndex) in current.RangeOwners) + { + if (range.Intersects(removedRange)) + { + var owner = current.Members[ownerIndex]; + Debug.Assert(!_id.Equals(owner)); + transferPartners.Add((owner, partitionIndex)); + } + } + + // Collect all addresses which fall within the range being relinquished. + foreach (var entry in _directory) + { + if (removedRange.Contains(entry.Key)) + { + removedAddresses.Add(entry.Value); + } + } + + // Remove these addresses from the partition. + foreach (var address in removedAddresses) + { + if (transferPartners.Count > 0) + { + _logger.LogTrace("Evicting entry '{Address}' to snapshot.", address); + } + + _directory.Remove(address.GrainId); + } + + if (transferPartners.Count > 0) + { + _partitionSnapshots.Add(new PartitionSnapshotState(previous.Version, removedAddresses, transferPartners)); + } + else + { + _logger.LogDebug("Dropping snapshot since there are no transfer partners."); + } + } + finally + { + UnlockRange(removedRange, current.Version, tcs, sw.Elapsed, "release"); + } + } + + private async Task AcquireRangeAsync(DirectoryMembershipSnapshot previous, DirectoryMembershipSnapshot current, RingRange addedRange) + { + GrainRuntime.CheckRuntimeContext(this); + // Suspend the range and transfer state from the previous owners. + // If the predecessor becomes unavailable or membership advances quickly, we will declare data loss and unlock the range. 
+ var (tcs, sw) = LockRange(addedRange, current.Version); + + try + { + CoarseStopwatch stopwatch = default; + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Acquiring range '{Range}'.", addedRange); + stopwatch = CoarseStopwatch.StartNew(); + } + + // The view change is contiguous if the new version is exactly one greater than the previous version. + // If not, we have missed some updates, so we must declare a potential data loss event. + var isContiguous = current.Version.Value == previous.Version.Value + 1; + bool success; + if (isContiguous) + { + // Transfer subranges from previous owners. + var tasks = new List>(); + foreach (var previousOwner in previous.Members) + { + var previousOwnerRanges = previous.GetMemberRangesByPartition(previousOwner); + for (var partitionIndex = 0; partitionIndex < previousOwnerRanges.Length; partitionIndex++) + { + var previousOwnerRange = previousOwnerRanges[partitionIndex]; + if (previousOwnerRange.Intersects(addedRange)) + { + tasks.Add(TransferSnapshotAsync(current, addedRange, previousOwner, partitionIndex, previous.Version)); + } + } + } + + // Note: there should be no 'await' points before this point. + // An await before this point would result in ranges not being locked synchronously. + await Task.WhenAll(tasks).WaitAsync(ShutdownToken).SuppressThrowing(); + if (ShutdownToken.IsCancellationRequested) + { + return; + } + + success = tasks.All(t => t.Result); + } + else + { + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug( + "Non-contiguous view change detected: '{PreviousVersion}' to '{CurrentVersion}'. Performing recovery.", + previous.Version, + current.Version); + } + + success = false; + } + + var recovered = false; + if (!success) + { + // Wait for previous versions to be unlocked before proceeding. 
+ await WaitForRange(addedRange, previous.Version); + + await RecoverPartitionRange(current, addedRange); + recovered = true; + } + + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Completed transferring entries for range '{Range}' at version '{Version}' took {Elapsed}ms.{Recovered}", addedRange, current.Version, stopwatch.ElapsedMilliseconds, recovered ? " Recovered" : ""); + } + } + finally + { + UnlockRange(addedRange, current.Version, tcs, sw.Elapsed, "acquire"); + } + } + + private (TaskCompletionSource Lock, ValueStopwatch Stopwatch) LockRange(RingRange range, MembershipVersion version) + { + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + _rangeLocks.Add((range, version, tcs)); + return (tcs, ValueStopwatch.StartNew()); + } + + private void UnlockRange(RingRange range, MembershipVersion version, TaskCompletionSource tcs, TimeSpan heldDuration, string operationName) + { + DirectoryInstruments.RangeLockHeldDuration.Record((long)heldDuration.TotalMilliseconds); + if (ShutdownToken.IsCancellationRequested) + { + // If the replica is stopped, the range is never unlocked and the task is cancelled instead. + tcs.SetCanceled(ShutdownToken); + } + else + { + tcs.SetResult(); + _rangeLocks.Remove((range, version, tcs)); + } + } + + private async Task TransferSnapshotAsync(DirectoryMembershipSnapshot current, RingRange addedRange, SiloAddress previousOwner, int partitionIndex, MembershipVersion previousVersion) + { + try + { + var stopwatch = ValueStopwatch.StartNew(); + if (_logger.IsEnabled(LogLevel.Trace)) + { + _logger.LogTrace("Requesting entries for ranges '{Range}' from '{PreviousOwner}' at version '{PreviousVersion}'.", addedRange, previousOwner, previousVersion); + } + + var replica = GetReplicaReference(previousOwner, partitionIndex); + + // Alternatively, the previous owner could push the snapshot. The pull-based approach is used here because it is simpler. 
+ var snapshot = await replica.GetSnapshotAsync(current.Version, previousVersion, addedRange).AsTask().WaitAsync(ShutdownToken); + + if (snapshot is null) + { + _logger.LogWarning("Expected a valid snapshot from previous owner '{PreviousOwner}' for part of ranges '{Range}', but found none.", previousOwner, addedRange); + return false; + } + + // The acknowledgement step lets the previous owner know that the snapshot has been received so that it can proceed. + InvokeOnClusterMember( + previousOwner, + async () => await replica.AcknowledgeSnapshotTransferAsync(_id, _partitionIndex, previousVersion), + false, + nameof(IGrainDirectoryPartition.AcknowledgeSnapshotTransferAsync)).Ignore(); + + // Wait for previous versions to be unlocked before proceeding. + await WaitForRange(addedRange, previousVersion); + + // Incorporate the values into the grain directory. + foreach (var entry in snapshot.GrainAddresses) + { + DebugAssertOwnership(current, entry.GrainId); + + _logger.LogTrace("Received '{Entry}' via snapshot from '{PreviousOwner}' for version '{Version}'.", entry, previousOwner, previousVersion); + _directory[entry.GrainId] = entry; + } + + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Transferred '{Count}' entries for range '{Range}' from '{PreviousOwner}'.", snapshot.GrainAddresses.Count, addedRange, previousOwner); + } + + DirectoryInstruments.SnapshotTransferCount.Add(1); + DirectoryInstruments.SnapshotTransferDuration.Record((long)stopwatch.Elapsed.TotalMilliseconds); + + return true; + } + catch (Exception exception) + { + if (exception is SiloUnavailableException) + { + _logger.LogWarning("Remote host became unavailable while transferring ownership of range '{Range}'. Recovery will be performed.", addedRange); + } + else + { + _logger.LogWarning(exception, "Error transferring ownership of range '{Range}'. 
Recovery will be performed.", addedRange); + } + + return false; + } + } + + private async Task RecoverPartitionRange(DirectoryMembershipSnapshot current, RingRange addedRange) + { + var stopwatch = ValueStopwatch.StartNew(); + GrainRuntime.CheckRuntimeContext(this); + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Recovering activations from range '{Range}' at version '{Version}'.", addedRange, current.Version); + } + + await foreach (var activations in GetRegisteredActivations(current, addedRange, isValidation: false)) + { + GrainRuntime.CheckRuntimeContext(this); + foreach (var entry in activations) + { + DebugAssertOwnership(current, entry.GrainId); + _logger.LogTrace("Recovered '{Entry}' for version '{Version}'.", entry, current.Version); + _directory[entry.GrainId] = entry; + } + } + + DirectoryInstruments.RangeRecoveryCount.Add(1); + DirectoryInstruments.RangeRecoveryDuration.Record((long)stopwatch.Elapsed.TotalMilliseconds); + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Completed recovering activations from range '{Range}' at version '{Version}' took '{Elapsed}'.", addedRange, current.Version, stopwatch.Elapsed); + } + } + + private async IAsyncEnumerable> GetRegisteredActivations(DirectoryMembershipSnapshot current, RingRange range, bool isValidation) + { + // Membership is guaranteed to be at least as recent as the current view. 
+ var clusterMembershipSnapshot = _owner.ClusterMembershipSnapshot; + Debug.Assert(clusterMembershipSnapshot.Version >= current.Version); + + var tasks = new List>>(); + foreach (var member in clusterMembershipSnapshot.Members.Values) + { + if (member.Status is not (SiloStatus.Active or SiloStatus.Joining or SiloStatus.ShuttingDown)) + { + continue; + } + + tasks.Add(GetRegisteredActivationsFromClusterMember(current.Version, range, member.SiloAddress, isValidation)); + } + + await Task.WhenAll(tasks).WaitAsync(ShutdownToken).SuppressThrowing(); + if (ShutdownToken.IsCancellationRequested) + { + yield break; + } + + foreach (var task in tasks) + { + yield return await task; + } + + async Task> GetRegisteredActivationsFromClusterMember(MembershipVersion version, RingRange range, SiloAddress siloAddress, bool isValidation) + { + var stopwatch = ValueStopwatch.StartNew(); + var client = _grainFactory.GetSystemTarget(Constants.GrainDirectory, siloAddress); + var result = await InvokeOnClusterMember( + siloAddress, + async () => await client.GetRegisteredActivations(version, range, isValidation), + new Immutable>([]), + nameof(GetRegisteredActivations)); + + if (_logger.IsEnabled(LogLevel.Debug)) + { + _logger.LogDebug("Recovered '{Count}' entries from silo '{SiloAddress}' for ranges '{Range}' at version '{Version}' in {ElapsedMilliseconds}ms.", result.Value.Count, siloAddress, range, version, stopwatch.Elapsed.TotalMilliseconds); + } + + return result.Value; + } + } + + private async Task InvokeOnClusterMember(SiloAddress siloAddress, Func> func, T defaultValue, string operationName) + { + GrainRuntime.CheckRuntimeContext(this); + var clusterMembershipSnapshot = _owner.ClusterMembershipSnapshot; + while (!ShutdownToken.IsCancellationRequested) + { + if (clusterMembershipSnapshot.GetSiloStatus(siloAddress) is not (SiloStatus.Active or SiloStatus.Joining or SiloStatus.ShuttingDown)) + { + break; + } + + try + { + return await func(); + } + catch (Exception ex) + { + if 
(ex is not OrleansMessageRejectionException) + { + _logger.LogError(ex, "Error invoking operation '{Operation}' on silo '{SiloAddress}'.", operationName, siloAddress); + } + + await _owner.RefreshViewAsync(default, CancellationToken.None); + if (_owner.ClusterMembershipSnapshot.Version == clusterMembershipSnapshot.Version) + { + await Task.Delay(TimeSpan.FromMilliseconds(100)); + } + + clusterMembershipSnapshot = _owner.ClusterMembershipSnapshot; + } + } + + ShutdownToken.ThrowIfCancellationRequested(); + return defaultValue; + } + + // Test hook: locks the full ring at the current view version, checks that every local entry falls within this partition's range, + // then compares the local directory against the activations reported by the cluster members. + async ValueTask IGrainDirectoryTestHooks.CheckIntegrityAsync() + { + GrainRuntime.CheckRuntimeContext(this); + var current = CurrentView; + var range = _currentRange; + Debug.Assert(range.Equals(current.GetRange(_id, _partitionIndex))); + + await WaitForRange(RingRange.Full, current.Version); + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + _rangeLocks.Add((RingRange.Full, current.Version, tcs)); + try + { + foreach (var entry in _directory) + { + if (!range.Contains(entry.Key)) + { + Debug.Fail($"Invariant violated. This host is not the owner of grain '{entry.Key}'."); + } + + DebugAssertOwnership(current, entry.Key); + } + + var missing = 0; + var mismatched = 0; + var total = 0; + await foreach (var activationList in GetRegisteredActivations(current, range, isValidation: true)) + { + total += activationList.Count; + foreach (var entry in activationList) + { + if (!IsOwner(current, entry.GrainId)) + { + // Only treat this as an invariant violation if the live view has not advanced past the view captured above; + // otherwise ownership may have changed since the request for registered activations was made. + if (CurrentView.Version <= current.Version) + { + Debug.Fail("Invariant violated. 
This host was sent a registration which it should not have been."); + } + + continue; + } + + if (_directory.TryGetValue(entry.GrainId, out var existingEntry)) + { + if (!existingEntry.Equals(entry)) + { + ++mismatched; + _logger.LogError("Integrity violation: Recovered entry '{RecoveredRecord}' does not match existing entry '{LocalRecord}'.", entry, existingEntry); + Debug.Fail($"Integrity violation: Recovered entry '{entry}' does not match existing entry '{existingEntry}'."); + } + } + else + { + ++missing; + _logger.LogError("Integrity violation: Recovered entry '{RecoveredRecord}' not found in directory.", entry); + Debug.Fail($"Integrity violation: Recovered entry '{entry}' not found in directory."); + } + } + } + } + finally + { + if (ShutdownToken.IsCancellationRequested) + { + tcs.SetCanceled(ShutdownToken); + } + else + { + tcs.SetResult(); + } + + _rangeLocks.Remove((RingRange.Full, current.Version, tcs)); + } + } + + private sealed record class PartitionSnapshotState( + MembershipVersion DirectoryMembershipVersion, + List GrainAddresses, + HashSet<(SiloAddress SiloAddress, int PartitionIndex)> TransferPartners); +} diff --git a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryResolver.cs b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryResolver.cs index c608686e73..164dc6477e 100644 --- a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryResolver.cs +++ b/src/Orleans.Runtime/GrainDirectory/GrainDirectoryResolver.cs @@ -29,7 +29,7 @@ public GrainDirectoryResolver( var services = serviceProvider.GetGrainDirectories(); foreach (var svc in services) { - this.directoryPerName.Add(svc.Name, serviceProvider.GetRequiredKeyedService(svc.Name)); + this.directoryPerName[svc.Name] = serviceProvider.GetRequiredKeyedService(svc.Name); } this.directoryPerName.TryGetValue(GrainDirectoryAttribute.DEFAULT_GRAIN_DIRECTORY, out var defaultDirectory); @@ -43,7 +43,7 @@ public GrainDirectoryResolver( public IGrainDirectory Resolve(GrainType grainType) => 
this.directoryPerType.GetOrAdd(grainType, this.getGrainDirectoryInternal); - public bool IsUsingDhtDirectory(GrainType grainType) => Resolve(grainType) == null; + public bool IsUsingDefaultDirectory(GrainType grainType) => Resolve(grainType) == null; private IGrainDirectory GetGrainDirectoryPerType(GrainType grainType) { diff --git a/src/Orleans.Runtime/GrainDirectory/GrainLocatorResolver.cs b/src/Orleans.Runtime/GrainDirectory/GrainLocatorResolver.cs index c5e736ebb3..8b44041ca6 100644 --- a/src/Orleans.Runtime/GrainDirectory/GrainLocatorResolver.cs +++ b/src/Orleans.Runtime/GrainDirectory/GrainLocatorResolver.cs @@ -37,7 +37,7 @@ public IGrainLocator GetGrainLocatorInternal(GrainType grainType) { result = this._clientGrainLocator ??= _servicesProvider.GetRequiredService(); } - else if (this.grainDirectoryResolver.IsUsingDhtDirectory(grainType)) + else if (this.grainDirectoryResolver.IsUsingDefaultDirectory(grainType)) { result = this.dhtGrainLocator; } diff --git a/src/Orleans.Runtime/GrainDirectory/IGrainDirectoryPartition.cs b/src/Orleans.Runtime/GrainDirectory/IGrainDirectoryPartition.cs new file mode 100644 index 0000000000..f42df98812 --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/IGrainDirectoryPartition.cs @@ -0,0 +1,39 @@ +using System.Collections.Generic; +using System.Threading.Tasks; +using Orleans.Concurrency; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +[Alias("IGrainDirectoryReplica")] +internal interface IGrainDirectoryPartition : ISystemTarget +{ + [Alias("RegisterAsync")] + ValueTask> RegisterAsync(MembershipVersion version, GrainAddress address, GrainAddress? 
currentRegistration); + + [Alias("LookupAsync")] + ValueTask> LookupAsync(MembershipVersion version, GrainId grainId); + + [Alias("DeregisterAsync")] + ValueTask> DeregisterAsync(MembershipVersion version, GrainAddress address); + + [Alias("GetSnapshotAsync")] + ValueTask GetSnapshotAsync(MembershipVersion version, MembershipVersion rangeVersion, RingRange range); + + [Alias("AcknowledgeSnapshotTransferAsync")] + ValueTask AcknowledgeSnapshotTransferAsync(SiloAddress silo, int partitionIndex, MembershipVersion version); +} + +[Alias("IGrainDirectoryReplicaClient")] +internal interface IGrainDirectoryClient : ISystemTarget +{ + [Alias("GetRegisteredActivations")] + ValueTask>> GetRegisteredActivations(MembershipVersion membershipVersion, RingRange range, bool isValidation); +} + +[Alias("IGrainDirectoryReplicaTestHooks")] +internal interface IGrainDirectoryTestHooks : ISystemTarget +{ + [Alias("CheckIntegrityAsync")] + ValueTask CheckIntegrityAsync(); +} diff --git a/src/Orleans.Runtime/GrainDirectory/LocalGrainDirectory.cs b/src/Orleans.Runtime/GrainDirectory/LocalGrainDirectory.cs index be5ec56ffc..48a0eff4c9 100644 --- a/src/Orleans.Runtime/GrainDirectory/LocalGrainDirectory.cs +++ b/src/Orleans.Runtime/GrainDirectory/LocalGrainDirectory.cs @@ -34,7 +34,7 @@ internal sealed class LocalGrainDirectory : ILocalGrainDirectory, ISiloStatusLis internal SiloAddress MyAddress { get; } internal IGrainDirectoryCache DirectoryCache { get; } - internal GrainDirectoryPartition DirectoryPartition { get; } + internal LocalGrainDirectoryPartition DirectoryPartition { get; } public RemoteGrainDirectory RemoteGrainDirectory { get; } public RemoteGrainDirectory CacheValidator { get; } @@ -46,7 +46,7 @@ public LocalGrainDirectory( ILocalSiloDetails siloDetails, ISiloStatusOracle siloStatusOracle, IInternalGrainFactory grainFactory, - Factory grainDirectoryPartitionFactory, + Factory grainDirectoryPartitionFactory, IOptions developmentClusterMembershipOptions, IOptions 
grainDirectoryOptions, ILoggerFactory loggerFactory) diff --git a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartition.cs b/src/Orleans.Runtime/GrainDirectory/LocalGrainDirectoryPartition.cs similarity index 97% rename from src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartition.cs rename to src/Orleans.Runtime/GrainDirectory/LocalGrainDirectoryPartition.cs index b2a29753ca..f445055950 100644 --- a/src/Orleans.Runtime/GrainDirectory/GrainDirectoryPartition.cs +++ b/src/Orleans.Runtime/GrainDirectory/LocalGrainDirectoryPartition.cs @@ -102,7 +102,7 @@ public bool RemoveActivation(ActivationId act, UnregistrationCause cause, TimeSp } } - internal sealed class GrainDirectoryPartition + internal sealed class LocalGrainDirectoryPartition { // Should we change this to SortedList<> or SortedDictionary so we can extract chunks better for shipping the full // partition to a follower, or should we leave it as a Dictionary to get O(1) lookups instead of O(log n), figuring we do @@ -118,11 +118,11 @@ internal sealed class GrainDirectoryPartition internal int Count { get { return partitionData.Count; } } - public GrainDirectoryPartition(ISiloStatusOracle siloStatusOracle, IOptions grainDirectoryOptions, ILoggerFactory loggerFactory) + public LocalGrainDirectoryPartition(ISiloStatusOracle siloStatusOracle, IOptions grainDirectoryOptions, ILoggerFactory loggerFactory) { partitionData = new Dictionary(); lockable = new object(); - log = loggerFactory.CreateLogger(); + log = loggerFactory.CreateLogger(); this.siloStatusOracle = siloStatusOracle; this.grainDirectoryOptions = grainDirectoryOptions; } @@ -260,7 +260,7 @@ internal int GetGrainETag(GrainId grain) /// /// /// Activations which must be deactivated. - internal Dictionary>? Merge(GrainDirectoryPartition other) + internal Dictionary>? Merge(LocalGrainDirectoryPartition other) { Dictionary>? 
activationsToRemove = null; lock (lockable) diff --git a/src/Orleans.Runtime/GrainDirectory/RemoteGrainDirectory.cs b/src/Orleans.Runtime/GrainDirectory/RemoteGrainDirectory.cs index 3dfe2eb20c..05b418c1da 100644 --- a/src/Orleans.Runtime/GrainDirectory/RemoteGrainDirectory.cs +++ b/src/Orleans.Runtime/GrainDirectory/RemoteGrainDirectory.cs @@ -11,7 +11,7 @@ namespace Orleans.Runtime.GrainDirectory internal sealed class RemoteGrainDirectory : SystemTarget, IRemoteGrainDirectory { private readonly LocalGrainDirectory router; - private readonly GrainDirectoryPartition partition; + private readonly LocalGrainDirectoryPartition partition; private readonly ILogger logger; internal RemoteGrainDirectory(LocalGrainDirectory r, GrainType grainType, ILoggerFactory loggerFactory) diff --git a/src/Orleans.Runtime/GrainDirectory/RingRange.cs b/src/Orleans.Runtime/GrainDirectory/RingRange.cs new file mode 100644 index 0000000000..1b583a2317 --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/RingRange.cs @@ -0,0 +1,241 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +/// +/// Represents a contiguous range of zero or more values on a ring. +/// +[GenerateSerializer, Immutable, Alias(nameof(RingRange))] +internal readonly struct RingRange : IEquatable, ISpanFormattable, IComparable +{ + // The exclusive starting point for the range. + // Note that _start == _end == 1 is used as a special value to represent a full range. + [Id(0)] + private readonly uint _start; + + // The inclusive ending point for the range. + // Note that _start == _end == 1 is used as a special value to represent a full range. + [Id(1)] + private readonly uint _end; + + public bool IsEmpty => _start == _end && _start == 0; + + public bool IsFull => _start == _end && _start != 0; + + // Whether the range includes uint.MaxValue. 
+ internal bool IsWrapped => _start >= _end && _start != 0; + + public static RingRange Full { get; } = new (1, 1); + + public static RingRange Empty { get; } = new (0, 0); + + public uint Start => IsFull ? 0 : _start; + + public uint End => IsFull ? 0 : _end; + + private RingRange(uint start, uint end) + { + _start = start == end && start > 1 ? 1 : start; + _end = start == end && start > 1 ? 1 : end; + } + + // For internal use only. + internal static RingRange Create(uint start, uint end) => new (start, end); + + /// + /// Creates a range representing a single point. + /// + /// The point which the range will include. + /// A range including only . + public static RingRange FromPoint(uint point) => new (unchecked(point - 1), point); + + /// + /// Gets the size of the range. + /// + public uint Size + { + get + { + if (_start == _end) + { + // Empty + if (_start == 0) return 0; + + // Full + return uint.MaxValue; + } + + // Normal + if (_end > _start) return _end - _start; + + // Wrapped + return uint.MaxValue - _start + _end; + } + } + + public int CompareTo(uint point) + { + if (Contains(point)) + { + return 0; + } + + var start = Start; + if (IsWrapped) + { + // Start > End (wrap-around case) + if (point <= start) + { + // Range starts after N (range > N) + return -1; + } + + // n > _end + // Range starts & ends before N (range < N) + return 1; + } + + if (point <= start) + { + // Range starts after N (range > N) + return 1; + } + + // n > _end + // Range starts & ends before N (range < N) + return -1; + } + + /// + /// Checks if n is element of (Start, End], while remembering that the ranges are on a ring + /// + /// true if n is in (Start, End], false otherwise + internal bool Contains(GrainId grainId) => Contains(grainId.GetUniformHashCode()); + + /// + /// checks if n is element of (Start, End], while remembering that the ranges are on a ring + /// + /// + /// true if n is in (Start, End], false otherwise + public bool Contains(uint point) + { + if 
(IsEmpty) + { + return false; + } + + var num = point; + if (Start < End) + { + return num > Start && num <= End; + } + + // Start > End + return num > Start || num <= End; + } + + public float SizePercent => Size * (100.0f / uint.MaxValue); + + public bool Equals(RingRange other) => _start == other._start && _end == other._end; + + public override bool Equals(object? obj) => obj is RingRange other && Equals(other); + + public override int GetHashCode() => HashCode.Combine(_start, _end); + + public override string ToString() => $"{this}"; + + string IFormattable.ToString(string? format, IFormatProvider? formatProvider) => ToString(); + + bool ISpanFormattable.TryFormat(Span destination, out int charsWritten, ReadOnlySpan format, IFormatProvider? provider) + { + return IsEmpty + ? destination.TryWrite($"(0, 0) 0.00%", out charsWritten) + : IsFull + ? destination.TryWrite($"(0, 0] (100.00%)", out charsWritten) + : destination.TryWrite($"(0x{Start:X8}, 0x{End:X8}] ({SizePercent:0.00}%)", out charsWritten); + } + + public bool Intersects(RingRange other) => !IsEmpty && !other.IsEmpty && (Equals(other) || Contains(other.End) || other.Contains(End)); + + internal RingRange Complement() + { + if (IsEmpty) + { + return Full; + } + + if (IsFull) + { + return Empty; + } + + return new RingRange(End, Start); + } + + internal IEnumerable Intersections(RingRange other) + { + if (!Intersects(other)) + { + // No intersections. + yield break; + } + + if (IsFull) + { + // One intersection, the other range. + yield return other; + } + else if (other.IsFull) + { + yield return this; + } + else if (IsWrapped ^ other.IsWrapped) + { + var wrapped = IsWrapped ? this : other; + var normal = IsWrapped ? other : this; + var (normalStart, normalEnd) = (normal.Start, normal.End); + var (wrappedStart, wrappedEnd) = (wrapped.Start, wrapped.End); + + // There are possibly two intersections, between the normal and wrapped range. + // low high + // ...---NB====WE----WB====NE----... 
+ + // Intersection at the low side. + if (wrappedEnd > normalStart) + { + // ---NB====WE--- + yield return new RingRange(normalStart, wrappedEnd); + } + + // Intersection at the high side. + if (wrappedStart < normalEnd) + { + // ---WB====NE--- + yield return new RingRange(wrappedStart, normalEnd); + } + } + else + { + yield return new RingRange(Math.Max(Start, other.Start), Math.Min(End, other.End)); + } + } + + // Gets the set difference: the sub-ranges which are in this range but are not in the 'other' range. + internal IEnumerable Difference(RingRange other) + { + // Additions are the intersections between this range and the inverse of the previous range. + foreach (var addition in Intersections(other.Complement())) + { + Debug.Assert(!addition.Intersects(other)); + Debug.Assert(addition.Intersects(this)); + yield return addition; + } + } + + public static bool operator ==(RingRange left, RingRange right) => left.Equals(right); + + public static bool operator !=(RingRange left, RingRange right) => !(left == right); +} diff --git a/src/Orleans.Runtime/GrainDirectory/RingRangeCollection.cs b/src/Orleans.Runtime/GrainDirectory/RingRangeCollection.cs new file mode 100644 index 0000000000..0772d2784e --- /dev/null +++ b/src/Orleans.Runtime/GrainDirectory/RingRangeCollection.cs @@ -0,0 +1,224 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Diagnostics; +using System.Linq; +using Orleans.Runtime.Utilities; + +#nullable enable +namespace Orleans.Runtime.GrainDirectory; + +// Read-only, sorted collection of non-overlapping ranges. +[GenerateSerializer, Immutable, Alias(nameof(RingRangeCollection))] +internal readonly struct RingRangeCollection : IEquatable, ISpanFormattable, IEnumerable +{ + public RingRangeCollection(ImmutableArray ranges) + { +#if DEBUG + Debug.Assert(!ranges.IsDefault); + + // Ranges must be in sorted order and must not overlap with each other. 
+ for (var i = 1; i < ranges.Length; i++) + { + var prev = ranges[i - 1]; + var curr = ranges[i]; + Debug.Assert(!curr.IsEmpty); + Debug.Assert(!prev.Intersects(curr)); + Debug.Assert(curr.Start >= prev.Start); + } + + if (ranges.Length > 1) + { + Debug.Assert(!ranges[0].Intersects(ranges[^1])); + } +#endif + Ranges = ranges; + } + + public static RingRangeCollection Create(TCollection ranges) where TCollection : ICollection + { + ArgumentNullException.ThrowIfNull(ranges); + var result = ImmutableArray.CreateBuilder(ranges.Count); + foreach (var range in ranges) + { + if (range.IsEmpty) + { + continue; + } + + result.AddRange(range); + } + + result.Sort((l, r) => l.Start.CompareTo(r.Start)); + return new(result.ToImmutable()); + } + + public static RingRangeCollection Empty { get; } = new([]); + + [Id(0)] + public ImmutableArray Ranges { get; } + + public bool IsDefault => Ranges.IsDefault; + + public bool IsEmpty => Ranges.Length == 0 || Ranges.All(r => r.IsEmpty); + + public bool IsFull => !IsEmpty && Ranges.Sum(r => r.Size) == uint.MaxValue; + + public uint Size => (uint)Ranges.Sum(static r => r.Size); + + public float SizePercent => Size * (100.0f / uint.MaxValue); + + public bool Contains(GrainId grainId) => Contains(grainId.GetUniformHashCode()); + + public bool Contains(uint value) + { + return SearchAlgorithms.RingRangeBinarySearch( + Ranges.Length, + Ranges, + static (ranges, index) => ranges[index], + value) >= 0; + } + + public bool Intersects(RingRange other) + { + if (IsEmpty || other.IsEmpty) + { + return false; + } + + if (Contains(other.End)) + { + return true; + } + + foreach (var range in Ranges) + { + if (other.Contains(range.End)) + { + return true; + } + } + + return false; + } + + public bool Intersects(RingRangeCollection other) + { + if (IsEmpty || other.IsEmpty) + { + return false; + } + + foreach (var range in Ranges) + { + if (other.Contains(range.End)) + { + return true; + } + } + + foreach (var otherRange in other.Ranges) + { + if 
(Contains(otherRange.End)) + { + return true; + } + } + + return false; + } + + // Returns the sub-ranges by which each range in this collection extends past the end of the corresponding range in 'previous'. + // If 'previous' has no ranges, this collection is returned as-is; if this collection has no ranges, the result is empty. + public RingRangeCollection Difference(RingRangeCollection previous) + { + // Ranges in 'Ranges' must not overlap with each other. + // Ranges in 'previous.Ranges' must not overlap with each other. + // Corresponding ranges in 'Ranges' and 'previous.Ranges' have the same starting points. + // The number of ranges in each of 'Ranges' and 'previous.Ranges' is either zero or the configured number of ranges, + // i.e., if both collections have more than zero ranges, they both have the same number of ranges. + if (Ranges.Length == previous.Ranges.Length) + { + var result = ImmutableArray.CreateBuilder(Ranges.Length); + for (var i = 0; i < Ranges.Length; i++) + { + var c = Ranges[i]; + var p = previous.Ranges[i]; + Debug.Assert(c.Start == p.Start); + if (c.Size > p.Size) + { + result.Add(RingRange.Create(p.End, c.End)); + } + } + + // If the last range wrapped around but its extension does not wrap around, move it to the front. + // This preserves sort order. + if (result.Count > 1 && result[^1].Start < result[^2].Start) + { + var last = result[^1]; + result.RemoveAt(result.Count - 1); + result.Insert(0, last); + } + + return new(result.ToImmutable()); + } + else + { + if (Ranges.Length > previous.Ranges.Length) + { + Debug.Assert(previous.Ranges.Length == 0); + return this; + } + else + { + Debug.Assert(Ranges.Length == 0 ^ previous.Ranges.Length == 0); + return Empty; + } + } + } + + public bool Equals(RingRangeCollection other) + { + if (IsEmpty && other.IsEmpty) + { + return true; + } + + if (IsEmpty ^ other.IsEmpty) + { + return false; + } + + return Ranges.SequenceEqual(other.Ranges); + } + + public static bool operator ==(RingRangeCollection left, RingRangeCollection right) => left.Equals(right); + + public static bool operator !=(RingRangeCollection left, RingRangeCollection right) => !(left == right); + + public override bool Equals(object? 
obj) => obj is RingRangeCollection range && Equals(range); + + public override int GetHashCode() + { + var result = new HashCode(); + result.Add(Ranges.Length); + if (!Ranges.IsDefaultOrEmpty) + { + foreach (var range in Ranges) + { + result.Add(range); + } + } + + return result.ToHashCode(); + } + + public ImmutableArray.Enumerator GetEnumerator() => Ranges.GetEnumerator(); + + public override string ToString() => $"{this}"; + string IFormattable.ToString(string? format, IFormatProvider? formatProvider) => ToString(); + + bool ISpanFormattable.TryFormat(Span destination, out int charsWritten, ReadOnlySpan format, IFormatProvider? provider) + => destination.TryWrite($"({Ranges.Length} subranges), {SizePercent:0.00}%", out charsWritten); + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)Ranges).GetEnumerator(); + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)Ranges).GetEnumerator(); +} \ No newline at end of file diff --git a/src/Orleans.Runtime/Hosting/CoreHostingExtensions.cs b/src/Orleans.Runtime/Hosting/CoreHostingExtensions.cs index de51a41b56..4d95b623cc 100644 --- a/src/Orleans.Runtime/Hosting/CoreHostingExtensions.cs +++ b/src/Orleans.Runtime/Hosting/CoreHostingExtensions.cs @@ -1,12 +1,17 @@ +#nullable enable using System; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Net; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.DependencyInjection.Extensions; using Microsoft.Extensions.Options; using Orleans.Configuration; using Orleans.Configuration.Internal; +using Orleans.GrainDirectory; using Orleans.Runtime; +using Orleans.Runtime.GrainDirectory; +using Orleans.Runtime.Hosting; using Orleans.Runtime.MembershipService; namespace Orleans.Hosting @@ -16,6 +21,8 @@ namespace Orleans.Hosting /// public static class CoreHostingExtensions { + private static readonly ServiceDescriptor DirectoryDescriptor = ServiceDescriptor.Singleton(); + /// /// Add propagation through grain calls. 
/// Note: according to activity will be created only when any listener for activity exists and returns . @@ -47,7 +54,7 @@ public static ISiloBuilder UseLocalhostClustering( this ISiloBuilder builder, int siloPort = EndpointOptions.DEFAULT_SILO_PORT, int gatewayPort = EndpointOptions.DEFAULT_GATEWAY_PORT, - IPEndPoint primarySiloEndpoint = null, + IPEndPoint? primarySiloEndpoint = null, string serviceId = ClusterOptions.DevelopmentServiceId, string clusterId = ClusterOptions.DevelopmentClusterId) { @@ -127,7 +134,7 @@ public static ISiloBuilder UseDevelopmentClustering( }); } - private static void ConfigurePrimarySiloEndpoint(OptionsBuilder optionsBuilder, IPEndPoint primarySiloEndpoint) + private static void ConfigurePrimarySiloEndpoint(OptionsBuilder optionsBuilder, IPEndPoint? primarySiloEndpoint) { optionsBuilder.Configure((DevelopmentClusterMembershipOptions options, IOptions endpointOptions) => { @@ -139,5 +146,31 @@ private static void ConfigurePrimarySiloEndpoint(OptionsBuilder + /// Opts-in to the experimental distributed grain directory. + /// + /// The silo builder to register the directory implementation with. + /// The name of the directory to register, or null to register the directory as the default. + /// The provided silo builder. + [Experimental("ORLEANSEXP002")] + public static ISiloBuilder AddDistributedGrainDirectory(this ISiloBuilder siloBuilder, string? 
name = null) + { + var services = siloBuilder.Services; + if (string.IsNullOrEmpty(name)) + { + name = GrainDirectoryAttribute.DEFAULT_GRAIN_DIRECTORY; + } + + // Distributed Grain Directory + services.TryAddSingleton(); + if (!services.Contains(DirectoryDescriptor)) + { + services.Add(DirectoryDescriptor); + services.AddGrainDirectory(name, (sp, name) => sp.GetRequiredService()); + } + + return siloBuilder; + } } } \ No newline at end of file diff --git a/src/Orleans.Runtime/Hosting/DefaultSiloServices.cs b/src/Orleans.Runtime/Hosting/DefaultSiloServices.cs index 8f9d8892f3..b8b3d7e6de 100644 --- a/src/Orleans.Runtime/Hosting/DefaultSiloServices.cs +++ b/src/Orleans.Runtime/Hosting/DefaultSiloServices.cs @@ -193,7 +193,7 @@ internal static void AddDefaultServices(ISiloBuilder builder) services.TryAddSingleton(); - services.TryAddSingleton(FactoryUtility.Create); + services.TryAddSingleton(FactoryUtility.Create); // Placement services.AddSingleton(); diff --git a/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs b/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs index 80da556090..db926a662f 100644 --- a/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs +++ b/src/Orleans.Runtime/MembershipService/ClusterMembershipService.cs @@ -50,25 +50,27 @@ public ClusterMembershipSnapshot CurrentSnapshot public IAsyncEnumerable MembershipUpdates => this.updates; - public ValueTask Refresh(MembershipVersion targetVersion) + public ValueTask Refresh(MembershipVersion targetVersion) => Refresh(targetVersion, CancellationToken.None); + public ValueTask Refresh(MembershipVersion targetVersion, CancellationToken cancellationToken) { if (targetVersion != default && targetVersion != MembershipVersion.MinValue && this.snapshot.Version >= targetVersion) return default; - return RefreshAsync(targetVersion); + return RefreshAsync(targetVersion, cancellationToken); - async ValueTask RefreshAsync(MembershipVersion v) + async ValueTask 
RefreshAsync(MembershipVersion v, CancellationToken cancellationToken) { var didRefresh = false; do { + cancellationToken.ThrowIfCancellationRequested(); if (!didRefresh || this.membershipTableManager.MembershipTableSnapshot.Version < v) { await this.membershipTableManager.Refresh(); didRefresh = true; } - await Task.Delay(TimeSpan.FromMilliseconds(10)); + await Task.Delay(TimeSpan.FromMilliseconds(10), cancellationToken); } while (this.snapshot.Version < v || this.snapshot.Version < this.membershipTableManager.MembershipTableSnapshot.Version); } } diff --git a/src/Orleans.Runtime/MembershipService/ClusterMembershipSnapshot.cs b/src/Orleans.Runtime/MembershipService/ClusterMembershipSnapshot.cs index 90b91e3e57..55c227950f 100644 --- a/src/Orleans.Runtime/MembershipService/ClusterMembershipSnapshot.cs +++ b/src/Orleans.Runtime/MembershipService/ClusterMembershipSnapshot.cs @@ -21,6 +21,8 @@ public ClusterMembershipSnapshot(ImmutableDictionary this.Version = version; } + internal static ClusterMembershipSnapshot Default => new(ImmutableDictionary.Empty, MembershipVersion.MinValue); + /// /// Gets the cluster members. 
/// diff --git a/src/Orleans.Runtime/MembershipService/InMemoryMembershipTable.cs b/src/Orleans.Runtime/MembershipService/InMemoryMembershipTable.cs index bd75b3f176..29059b391a 100644 --- a/src/Orleans.Runtime/MembershipService/InMemoryMembershipTable.cs +++ b/src/Orleans.Runtime/MembershipService/InMemoryMembershipTable.cs @@ -32,7 +32,7 @@ public MembershipTableData Read(SiloAddress key) public MembershipTableData ReadAll() { - return new MembershipTableData(siloTable.Values.Select(tuple => + return new MembershipTableData(siloTable.Values.Select(tuple => new Tuple(this.deepCopier.Copy(tuple.Item1), tuple.Item2)).ToList(), tableVersion); } @@ -47,7 +47,7 @@ public bool Insert(MembershipEntry entry, TableVersion version) siloTable.TryGetValue(entry.SiloAddress, out data); if (data != null) return false; if (!tableVersion.VersionEtag.Equals(version.VersionEtag)) return false; - + siloTable[entry.SiloAddress] = new Tuple( entry, lastETagCounter++.ToString(CultureInfo.InvariantCulture)); tableVersion = new TableVersion(version.Version, NewETag()); @@ -60,7 +60,7 @@ public bool Update(MembershipEntry entry, string etag, TableVersion version) siloTable.TryGetValue(entry.SiloAddress, out data); if (data == null) return false; if (!data.Item2.Equals(etag) || !tableVersion.VersionEtag.Equals(version.VersionEtag)) return false; - + siloTable[entry.SiloAddress] = new Tuple( entry, lastETagCounter++.ToString(CultureInfo.InvariantCulture)); tableVersion = new TableVersion(version.Version, NewETag()); @@ -83,5 +83,23 @@ private string NewETag() { return lastETagCounter++.ToString(CultureInfo.InvariantCulture); } + + public void CleanupDefunctSiloEntries(DateTimeOffset beforeDate) + { + var removedEnties = new List(); + foreach (var (key, (value, etag)) in siloTable) + { + if (value.Status == SiloStatus.Dead + && new DateTime(Math.Max(value.IAmAliveTime.Ticks, value.StartTime.Ticks), DateTimeKind.Utc) < beforeDate) + { + removedEnties.Add(key); + } + } + + foreach (var 
removedEntry in removedEnties) + { + siloTable.Remove(removedEntry); + } + } } } diff --git a/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs b/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs index 0a93c7058e..1f6a639796 100644 --- a/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs +++ b/src/Orleans.Runtime/MembershipService/LocalSiloHealthMonitor.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; using System.Collections.Immutable; +using System.Diagnostics; using System.Linq; using System.Threading; using System.Threading.Tasks; diff --git a/src/Orleans.Runtime/MembershipService/SystemTargetBasedMembershipTable.cs b/src/Orleans.Runtime/MembershipService/SystemTargetBasedMembershipTable.cs index eb77a9bdae..c932dc6bcb 100644 --- a/src/Orleans.Runtime/MembershipService/SystemTargetBasedMembershipTable.cs +++ b/src/Orleans.Runtime/MembershipService/SystemTargetBasedMembershipTable.cs @@ -4,12 +4,9 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using Orleans; using Orleans.Concurrency; using Orleans.Configuration; -using Orleans.Hosting; using Orleans.Internal; -using Orleans.Providers; using Orleans.Serialization; namespace Orleans.Runtime.MembershipService @@ -105,10 +102,7 @@ private async Task WaitForTableGrainToInit(IMembershipTableSystemTarget membersh public Task UpdateIAmAlive(MembershipEntry entry) => this.grain.UpdateIAmAlive(entry); - public Task CleanupDefunctSiloEntries(DateTimeOffset beforeDate) - { - throw new NotImplementedException(); - } + public Task CleanupDefunctSiloEntries(DateTimeOffset beforeDate) => this.grain.CleanupDefunctSiloEntries(beforeDate); } [Reentrant] @@ -199,7 +193,8 @@ public Task UpdateIAmAlive(MembershipEntry entry) public Task CleanupDefunctSiloEntries(DateTimeOffset beforeDate) { - throw new NotImplementedException(); + table.CleanupDefunctSiloEntries(beforeDate); + return 
Task.CompletedTask; } void ILifecycleParticipant.Participate(ISiloLifecycle lifecycle) @@ -207,4 +202,4 @@ void ILifecycleParticipant.Participate(ISiloLifecycle lifecycle) // Do nothing, just ensure that this instance is created so that it can register itself in the catalog. } } -} \ No newline at end of file +} diff --git a/src/Orleans.Runtime/Messaging/MessageCenter.cs b/src/Orleans.Runtime/Messaging/MessageCenter.cs index ddf8539414..1c20347ce0 100644 --- a/src/Orleans.Runtime/Messaging/MessageCenter.cs +++ b/src/Orleans.Runtime/Messaging/MessageCenter.cs @@ -178,8 +178,8 @@ public void SendMessage(Message msg) if (msg.TargetSilo is not { } targetSilo) { - log.LogError((int)ErrorCode.Runtime_Error_100113, "Message does not have a target silo: " + msg + " -- Call stack is: " + Utils.GetStackTrace()); - SendRejection(msg, Message.RejectionTypes.Unrecoverable, "Message to be sent does not have a target silo"); + log.LogError((int)ErrorCode.Runtime_Error_100113, "Message does not have a target silo: '{Message}'. 
Call stack: {StackTrace}", msg, Utils.GetStackTrace()); + SendRejection(msg, Message.RejectionTypes.Unrecoverable, "Message to be sent does not have a target silo."); return; } @@ -198,13 +198,6 @@ public void SendMessage(Message msg) } else { - if (stopped) - { - log.LogInformation((int)ErrorCode.Runtime_Error_100115, "Message was queued for sending after outbound queue was stopped: {Message}", msg); - SendRejection(msg, Message.RejectionTypes.Unrecoverable, "Message was queued for sending after outbound queue was stopped"); - return; - } - if (this.connectionManager.TryGetConnection(targetSilo, out var existingConnection)) { existingConnection.Send(msg); @@ -213,8 +206,12 @@ public void SendMessage(Message msg) else if (this.siloStatusOracle.IsDeadSilo(targetSilo)) { // Do not try to establish - this.messagingTrace.OnRejectSendMessageToDeadSilo(_siloAddress, msg); - this.SendRejection(msg, Message.RejectionTypes.Transient, "Target silo is known to be dead"); + if (msg.Direction is Message.Directions.Request or Message.Directions.OneWay) + { + this.messagingTrace.OnRejectSendMessageToDeadSilo(_siloAddress, msg); + this.SendRejection(msg, Message.RejectionTypes.Transient, "Target silo is known to be dead", new SiloUnavailableException()); + } + return; } else @@ -373,6 +370,7 @@ private void TryForwardRequest(Message message, GrainAddress? oldAddress, GrainA message.AddToCacheInvalidationHeader(oldAddress, validAddress: destination); } + if (log.IsEnabled(LogLevel.Debug)) log.LogDebug(exc, "Forwarding {Message} to '{ForwardingAddress}' after '{FailedOperation}'", message, forwardingAddress, failedOperation); forwardingSucceeded = this.TryForwardMessage(message, forwardingAddress); } catch (Exception exc2) @@ -422,6 +420,7 @@ private bool TryForwardMessage(Message message, SiloAddress? 
forwardingAddress) message.ForwardCount = message.ForwardCount + 1; MessagingProcessingInstruments.OnDispatcherMessageForwared(message); + ResendMessageImpl(message, forwardingAddress); return true; } @@ -574,7 +573,7 @@ private void ProcessMessageToNonExistentActivation(Message msg) { MessagingInstruments.OnRejectedMessage(msg); this.log.LogWarning( - (int) ErrorCode.MessagingMessageFromUnknownActivation, + (int)ErrorCode.MessagingMessageFromUnknownActivation, "Received a message {Message} for an unknown SystemTarget: {Target}", msg, msg.TargetGrain); @@ -593,17 +592,20 @@ private void ProcessMessageToNonExistentActivation(Message msg) else { // Activation does not exists and is not a new placement. - log.LogInformation( - (int)ErrorCode.Dispatcher_Intermediate_GetOrCreateActivation, - "Intermediate NonExistentActivation for message {Message}", - msg); + if (log.IsEnabled(LogLevel.Debug)) + { + log.LogDebug( + (int)ErrorCode.Dispatcher_Intermediate_GetOrCreateActivation, + "Unable to create local activation for message {Message}.", + msg); + } - var nonExistentActivation = new GrainAddress { SiloAddress = msg.TargetSilo, GrainId = msg.TargetGrain }; - ProcessRequestToInvalidActivation(msg, nonExistentActivation, null, "Non-existent activation"); + var partialAddress = new GrainAddress { SiloAddress = msg.TargetSilo, GrainId = msg.TargetGrain }; + ProcessRequestToInvalidActivation(msg, partialAddress, null, "Unable to create local activation"); } } - internal void SendRejection(Message msg, Message.RejectionTypes rejectionType, string reason) + internal void SendRejection(Message msg, Message.RejectionTypes rejectionType, string reason, Exception? 
exception = null) { MessagingInstruments.OnRejectedMessage(msg); @@ -616,7 +618,7 @@ internal void SendRejection(Message msg, Message.RejectionTypes rejectionType, s else { if (string.IsNullOrEmpty(reason)) reason = $"Rejection from silo {this._siloAddress} - Unknown reason."; - var error = this.messageFactory.CreateRejectionResponse(msg, rejectionType, reason); + var error = this.messageFactory.CreateRejectionResponse(msg, rejectionType, reason, exception); // rejection msgs are always originated in the local silo, they are never remote. this.ReceiveMessage(error); } diff --git a/src/Orleans.Runtime/Networking/GatewayInboundConnection.cs b/src/Orleans.Runtime/Networking/GatewayInboundConnection.cs index b3e62b279b..ffd4799163 100644 --- a/src/Orleans.Runtime/Networking/GatewayInboundConnection.cs +++ b/src/Orleans.Runtime/Networking/GatewayInboundConnection.cs @@ -172,7 +172,8 @@ public void FailMessage(Message msg, string reason) this.messageCenter.SendRejection( msg, Message.RejectionTypes.Transient, - $"Silo {this.myAddress} is rejecting message: {msg}. Reason = {reason}"); + $"Silo {this.myAddress} is rejecting message: {msg}. 
Reason = {reason}", + new SiloUnavailableException()); } else { diff --git a/src/Orleans.Runtime/Networking/SiloConnection.cs b/src/Orleans.Runtime/Networking/SiloConnection.cs index 7580107287..50916c4988 100644 --- a/src/Orleans.Runtime/Networking/SiloConnection.cs +++ b/src/Orleans.Runtime/Networking/SiloConnection.cs @@ -6,6 +6,7 @@ using System.Text; using System.Threading.Tasks; using Microsoft.AspNetCore.Connections; +using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Orleans.Configuration; using Orleans.Messaging; @@ -96,7 +97,7 @@ protected override void OnReceivedMessage(Message msg) } MessagingInstruments.OnRejectedMessage(msg); - var rejection = this.MessageFactory.CreateRejectionResponse(msg, Message.RejectionTypes.Unrecoverable, "Silo stopping"); + var rejection = this.MessageFactory.CreateRejectionResponse(msg, Message.RejectionTypes.Unrecoverable, "Silo stopping", new SiloUnavailableException()); this.Send(rejection); return; } @@ -197,7 +198,7 @@ protected override async Task RunInternal() } finally { - if (!(this.RemoteSiloAddress is null)) + if (this.RemoteSiloAddress is not null) { this.connectionManager.OnConnectionTerminated(this.RemoteSiloAddress, this, error); } @@ -243,11 +244,11 @@ protected override bool PrepareMessageForSend(Message msg) // Don't send messages that have already timed out if (msg.IsExpired) { - this.MessagingTrace.OnDropExpiredMessage(msg, MessagingInstruments.Phase.Send); + this.MessagingTrace.OnDropExpiredMessage(msg, MessagingInstruments.Phase.Send); if (msg.IsPing()) { - this.Log.LogWarning("Droppping expired ping message {Message}", msg); + this.Log.LogWarning("Dropping expired ping message {Message}", msg); } return false; @@ -286,7 +287,11 @@ public void FailMessage(Message msg, string reason) if (this.Log.IsEnabled(LogLevel.Debug)) this.Log.LogDebug((int)ErrorCode.MessagingSendingRejection, "Silo {SiloAddress} is rejecting message: {Message}. 
Reason = {Reason}", this.LocalSiloAddress, msg, reason); // Done retrying, send back an error instead - this.messageCenter.SendRejection(msg, Message.RejectionTypes.Transient, $"Silo {this.LocalSiloAddress} is rejecting message: {msg}. Reason = {reason}"); + this.messageCenter.SendRejection( + msg, + Message.RejectionTypes.Transient, + $"Silo {this.LocalSiloAddress} is rejecting message: {msg}. Reason = {reason}", + new SiloUnavailableException()); } else { diff --git a/src/Orleans.Runtime/Scheduler/ClosureWorkItem.cs b/src/Orleans.Runtime/Scheduler/ClosureWorkItem.cs index 070461f845..4ed505332f 100644 --- a/src/Orleans.Runtime/Scheduler/ClosureWorkItem.cs +++ b/src/Orleans.Runtime/Scheduler/ClosureWorkItem.cs @@ -82,4 +82,28 @@ public override async void Execute() public override IGrainContext GrainContext { get; } } + + internal sealed class ClosureWorkItem(Action closure, TState state, string name, IGrainContext grainContext) : WorkItemBase + { + private readonly TaskCompletionSource _completion = new(TaskCreationOptions.RunContinuationsAsynchronously); + + public override string Name => name ?? 
AsyncClosureWorkItem.GetMethodName(closure); + public Task Task => _completion.Task; + + public override void Execute() + { + try + { + RequestContext.Clear(); + closure(state); + _completion.TrySetResult(true); + } + catch (Exception exception) + { + _completion.TrySetException(exception); + } + } + + public override IGrainContext GrainContext { get; } = grainContext; + } } diff --git a/src/Orleans.Runtime/Scheduler/SchedulerExtensions.cs b/src/Orleans.Runtime/Scheduler/SchedulerExtensions.cs index 1d3a705334..ee3cc7510e 100644 --- a/src/Orleans.Runtime/Scheduler/SchedulerExtensions.cs +++ b/src/Orleans.Runtime/Scheduler/SchedulerExtensions.cs @@ -1,3 +1,4 @@ +#nullable enable using System; using System.Threading.Tasks; @@ -19,6 +20,13 @@ internal static Task QueueTask(this WorkItemGroup scheduler, Func taskFunc return workItem.Task; } + internal static Task QueueAction(this IGrainContext targetContext, Action action, TState state, string? name = null) + { + var workItem = new ClosureWorkItem(action, state, name, targetContext); + targetContext.Scheduler.QueueWorkItem(workItem); + return workItem.Task; + } + internal static Task RunOrQueueTask(this IGrainContext targetContext, Func taskFunc) { var currentContext = RuntimeContext.Current; diff --git a/src/Orleans.Runtime/Scheduler/WorkItemGroup.cs b/src/Orleans.Runtime/Scheduler/WorkItemGroup.cs index cc24ad28c8..9eca80c08a 100644 --- a/src/Orleans.Runtime/Scheduler/WorkItemGroup.cs +++ b/src/Orleans.Runtime/Scheduler/WorkItemGroup.cs @@ -10,11 +10,10 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Orleans.Configuration; -using Orleans.Internal; namespace Orleans.Runtime.Scheduler; -[DebuggerDisplay("WorkItemGroup Name={Name} State={state}")] +[DebuggerDisplay("WorkItemGroup Context={GrainContext} State={state}")] internal sealed class WorkItemGroup : IThreadPoolWorkItem, IWorkItemScheduler { private enum WorkGroupStatus : byte @@ -263,7 +262,7 @@ private void 
LogLongRunningTurn(Task task, long taskDurationMs) _log.LogWarning( (int)ErrorCode.SchedulerTurnTooLong3, "Task {Task} in WorkGroup {GrainContext} took elapsed time {Duration} for execution, which is longer than {TurnWarningLengthThreshold}. Running on thread {Thread}", - task, + task.AsyncState ?? task, GrainContext.ToString(), taskDuration.ToString("g"), _schedulingOptions.TurnWarningLengthThreshold, diff --git a/src/Orleans.Runtime/Silo/SiloControl.cs b/src/Orleans.Runtime/Silo/SiloControl.cs index 5bf9c2b73e..c8415c8e43 100644 --- a/src/Orleans.Runtime/Silo/SiloControl.cs +++ b/src/Orleans.Runtime/Silo/SiloControl.cs @@ -2,11 +2,13 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Orleans.Configuration; +using Orleans.GrainDirectory; using Orleans.Metadata; using Orleans.Providers; using Orleans.Runtime.GrainDirectory; @@ -47,7 +49,7 @@ public SiloControl( ILocalSiloDetails localSiloDetails, DeploymentLoadPublisher deploymentLoadPublisher, Catalog catalog, - CachedVersionSelectorManager cachedVersionSelectorManager, + CachedVersionSelectorManager cachedVersionSelectorManager, CompatibilityDirectorManager compatibilityDirectorManager, VersionSelectorManager selectorManager, IServiceProvider services, @@ -97,7 +99,7 @@ public Task ForceGarbageCollection() public Task ForceActivationCollection(TimeSpan ageLimit) { logger.LogInformation("ForceActivationCollection"); - return _activationCollector.CollectActivations(ageLimit); + return _activationCollector.CollectActivations(ageLimit, CancellationToken.None); } public Task ForceRuntimeStatisticsCollection() @@ -185,7 +187,7 @@ public Task GetSimpleGrainStatistics() new SimpleGrainStatistic { SiloAddress = this.localSiloDetails.SiloAddress, GrainType = p.Key, ActivationCount = (int)p.Value }).ToArray()); } - public Task 
GetDetailedGrainReport(GrainId grainId) + public async Task GetDetailedGrainReport(GrainId grainId) { logger.LogInformation("DetailedGrainReport for grain id {GrainId}", grainId); string? grainClassName; @@ -205,19 +207,39 @@ public Task GetDetailedGrainReport(GrainId grainId) var a => a?.ToString() }; - var directory = services.GetRequiredService(); + var resolver = services.GetRequiredService(); + var defaultDirectory = services.GetService(); + var dir = resolver.Resolve(grainId.Type) ?? defaultDirectory; + GrainAddress? localCacheActivationAddress = null; + GrainAddress? localDirectoryActivationAddress = null; + SiloAddress? primaryForGrain = null; + if (dir is DistributedGrainDirectory distributedGrainDirectory) + { + var grainLocator = services.GetRequiredService(); + grainLocator.TryLookupInCache(grainId, out localCacheActivationAddress); + localDirectoryActivationAddress = await ((DistributedGrainDirectory.ITestHooks)distributedGrainDirectory).GetLocalRecord(grainId); + primaryForGrain = ((DistributedGrainDirectory.ITestHooks)distributedGrainDirectory).GetPrimaryForGrain(grainId); + } + else if (dir is null && services.GetService() is { } localGrainDirectory) + { + localCacheActivationAddress = localGrainDirectory.GetLocalCacheData(grainId); + localDirectoryActivationAddress = localGrainDirectory.GetLocalDirectoryData(grainId).Address; + primaryForGrain = localGrainDirectory.GetPrimaryForGrain(grainId); + } + var report = new DetailedGrainReport() { Grain = grainId, SiloAddress = localSiloDetails.SiloAddress, SiloName = localSiloDetails.Name, - LocalCacheActivationAddress = directory.GetLocalCacheData(grainId), - LocalDirectoryActivationAddress = directory.GetLocalDirectoryData(grainId).Address, - PrimaryForGrain = directory.GetPrimaryForGrain(grainId), + LocalCacheActivationAddress = localCacheActivationAddress, + LocalDirectoryActivationAddress = localDirectoryActivationAddress, + PrimaryForGrain = primaryForGrain, GrainClassTypeName = grainClassName, 
LocalActivation = activation, }; - return Task.FromResult(report); + + return report; } public Task GetActivationCount() diff --git a/src/Orleans.Runtime/Utilities/SearchAlgorithms.cs b/src/Orleans.Runtime/Utilities/SearchAlgorithms.cs new file mode 100644 index 0000000000..3bf7c690b6 --- /dev/null +++ b/src/Orleans.Runtime/Utilities/SearchAlgorithms.cs @@ -0,0 +1,94 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace Orleans.Runtime.Utilities; + +internal static class SearchAlgorithms +{ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int BinarySearch(int length, TState state, Func comparer) + { + var left = 0; + var right = length - 1; + + while (left <= right) + { + var mid = left + (right - left) / 2; + var comparison = comparer(mid, state); + + if (comparison == 0) + { + return mid; + } + else if (comparison < 0) + { + left = mid + 1; + } + else + { + right = mid - 1; + } + } + + return -1; + } + + // Binary search for collections of ranges along a ring (eg, a consistent hash ring), sorted by the starting point of each range. + // This differs from a standard binary search in that the search can wrap around from the start to the last element in the collection. + // This is accommodated by checking the last element in the collection before returning a negative result, to handle the case where a + // range wraps around from end to start. See RingRange + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int RingRangeBinarySearch( + int length, + TCollection collection, + Func getEntry, + TKey key) where TElement : IComparable + { + if (length == 0) return -1; + + var left = 0; + var right = length - 1; + + TElement entry; + while (left <= right) + { + var mid = left + (right - left) / 2; + entry = getEntry(collection, mid); + var comparison = entry.CompareTo(key); + + if (comparison == 0) + { + return mid; + } + else if (comparison < 0) + { + // Go right. 
+ left = mid + 1; + } + else + { + // Go left. + right = mid - 1; + } + } + + // Try the last element. + entry = getEntry(collection, length - 1); + if (entry.CompareTo(key) == 0) + { + return length - 1; + } + +#if DEBUG + // Try the first element. + entry = getEntry(collection, 0); + if (entry.CompareTo(key) == 0) + { + Debug.Fail("Sort order invariant violated."); + } +#endif + + return -1; + } +} diff --git a/src/Orleans.Runtime/Utilities/StripedMpscBuffer.cs b/src/Orleans.Runtime/Utilities/StripedMpscBuffer.cs index 8b0eb9d95d..57420c90d6 100644 --- a/src/Orleans.Runtime/Utilities/StripedMpscBuffer.cs +++ b/src/Orleans.Runtime/Utilities/StripedMpscBuffer.cs @@ -425,4 +425,3 @@ internal class Padding internal const int CACHE_LINE_SIZE = 64; #endif } - diff --git a/src/Orleans.TestingHost/ConfigureDistributedGrainDirectory.cs b/src/Orleans.TestingHost/ConfigureDistributedGrainDirectory.cs new file mode 100644 index 0000000000..f2beaaa0b5 --- /dev/null +++ b/src/Orleans.TestingHost/ConfigureDistributedGrainDirectory.cs @@ -0,0 +1,10 @@ +using Orleans.Hosting; + +namespace Orleans.TestingHost; + +internal class ConfigureDistributedGrainDirectory : ISiloConfigurator +{ +#pragma warning disable ORLEANSEXP002 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + public void Configure(ISiloBuilder siloBuilder) => siloBuilder.AddDistributedGrainDirectory(); +#pragma warning restore ORLEANSEXP002 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. 
+} \ No newline at end of file diff --git a/src/Orleans.TestingHost/InProcess/InProcessMembershipTable.cs b/src/Orleans.TestingHost/InProcess/InProcessMembershipTable.cs index bfefa09531..b89cafd241 100644 --- a/src/Orleans.TestingHost/InProcess/InProcessMembershipTable.cs +++ b/src/Orleans.TestingHost/InProcess/InProcessMembershipTable.cs @@ -11,7 +11,7 @@ namespace Orleans.TestingHost.InProcess; /// /// An in-memory implementation of for testing purposes. /// -internal sealed class InProcessMembershipTable(string clusterId) : IMembershipTableSystemTarget, IGatewayListProvider +internal sealed class InProcessMembershipTable(string clusterId) : IMembershipTable, IGatewayListProvider { private readonly Table _table = new(); private readonly string _clusterId = clusterId; diff --git a/src/Orleans.TestingHost/TestCluster.cs b/src/Orleans.TestingHost/TestCluster.cs index b67021df35..7d0a59084b 100644 --- a/src/Orleans.TestingHost/TestCluster.cs +++ b/src/Orleans.TestingHost/TestCluster.cs @@ -776,20 +776,20 @@ public async ValueTask DisposeAsync() await Task.Run(async () => { - foreach (var handle in this.SecondarySilos) + foreach (var handle in SecondarySilos) { - await DisposeAsync(handle).ConfigureAwait(false); + await DisposeAsync(handle).ConfigureAwait(ConfigureAwaitOptions.SuppressThrowing); } - if (this.Primary is not null) + if (Primary is not null) { - await DisposeAsync(Primary).ConfigureAwait(false); + await DisposeAsync(Primary).ConfigureAwait(ConfigureAwaitOptions.SuppressThrowing); } - await DisposeAsync(ClientHost).ConfigureAwait(false); + await DisposeAsync(ClientHost).ConfigureAwait(ConfigureAwaitOptions.SuppressThrowing); ClientHost = null; - this.PortAllocator?.Dispose(); + PortAllocator?.Dispose(); }); _disposed = true; diff --git a/src/Orleans.TestingHost/TestClusterBuilder.cs b/src/Orleans.TestingHost/TestClusterBuilder.cs index 0a55fb3030..439eafcd63 100644 --- a/src/Orleans.TestingHost/TestClusterBuilder.cs +++ 
b/src/Orleans.TestingHost/TestClusterBuilder.cs @@ -44,6 +44,7 @@ public TestClusterBuilder(short initialSilosCount) AssumeHomogenousSilosForTesting = true }; + AddSiloBuilderConfigurator(); this.AddSiloBuilderConfigurator(); this.ConfigureBuilder(ConfigureDefaultPorts); } diff --git a/src/Orleans.TestingHost/TestClusterHostFactory.cs b/src/Orleans.TestingHost/TestClusterHostFactory.cs index 10ee2d8c99..5e87c5b081 100644 --- a/src/Orleans.TestingHost/TestClusterHostFactory.cs +++ b/src/Orleans.TestingHost/TestClusterHostFactory.cs @@ -179,7 +179,7 @@ private static void TryConfigureFileLogging(IConfiguration configuration, IServi bool.TryParse(configuration[nameof(TestClusterOptions.ConfigureFileLogging)], out bool configureFileLogging); if (configureFileLogging) { - var fileName = TestingUtils.CreateTraceFileName(name, configuration[nameof(TestClusterOptions.ClusterId)]); + var fileName = TestingUtils.CreateTraceFileName(name, configuration["Orleans:ClusterId"]); services.AddLogging(loggingBuilder => loggingBuilder.AddFile(fileName)); } } diff --git a/test/DefaultCluster.Tests/ObserverTests.cs b/test/DefaultCluster.Tests/ObserverTests.cs index efb719dc8f..4863cffc1e 100644 --- a/test/DefaultCluster.Tests/ObserverTests.cs +++ b/test/DefaultCluster.Tests/ObserverTests.cs @@ -18,7 +18,7 @@ public class ObserverTests : HostedTestClusterEnsureDefaultStarted private readonly bool[] callbacksReceived = new bool[2]; // we keep the observer objects as instance variables to prevent them from - // being garbage collected permaturely (the runtime stores them as weak references). + // being garbage collected prematurely (the runtime stores them as weak references). 
private SimpleGrainObserver observer1; private SimpleGrainObserver observer2; diff --git a/test/Directory.Build.props b/test/Directory.Build.props index 58489d6e57..c2b7dedb73 100644 --- a/test/Directory.Build.props +++ b/test/Directory.Build.props @@ -23,4 +23,8 @@ + + + + diff --git a/test/Extensions/Tester.Redis/GrainDirectory/RedisGrainDirectoryTests.cs b/test/Extensions/Tester.Redis/GrainDirectory/RedisGrainDirectoryTests.cs index b65ee7fe92..f81b77479c 100644 --- a/test/Extensions/Tester.Redis/GrainDirectory/RedisGrainDirectoryTests.cs +++ b/test/Extensions/Tester.Redis/GrainDirectory/RedisGrainDirectoryTests.cs @@ -18,7 +18,7 @@ public RedisGrainDirectoryTests(ITestOutputHelper testOutput) : base(testOutput) { } - protected override RedisGrainDirectory GetGrainDirectory() + protected override RedisGrainDirectory CreateGrainDirectory() { TestUtils.CheckForRedis(); var configuration = TestDefaultConfiguration.RedisConnectionString; diff --git a/test/Extensions/TesterAzureUtils/AzureGrainDirectoryTests.cs b/test/Extensions/TesterAzureUtils/AzureGrainDirectoryTests.cs index 6fa27aef0d..1b966da485 100644 --- a/test/Extensions/TesterAzureUtils/AzureGrainDirectoryTests.cs +++ b/test/Extensions/TesterAzureUtils/AzureGrainDirectoryTests.cs @@ -1,7 +1,7 @@ +#nullable enable using Microsoft.Extensions.Options; using Orleans.Configuration; using Orleans.GrainDirectory.AzureStorage; -using Orleans.Runtime; using Orleans.TestingHost.Utils; using Tester.Directories; using Xunit; @@ -9,14 +9,10 @@ namespace Tester.AzureUtils { - [TestCategory("AzureStorage"), TestCategory("Storage")] - public class AzureTableGrainDirectoryTests : GrainDirectoryTests + [TestCategory("AzureStorage"), TestCategory("Directory")] + public class AzureTableGrainDirectoryTests(ITestOutputHelper testOutput) : GrainDirectoryTests(testOutput) { - public AzureTableGrainDirectoryTests(ITestOutputHelper testOutput) : base(testOutput) - { - } - - protected override AzureTableGrainDirectory 
GetGrainDirectory() + protected override AzureTableGrainDirectory CreateGrainDirectory() { TestUtils.CheckForAzureStorage(); StorageEmulatorUtilities.EnsureEmulatorIsNotUsed(); @@ -56,7 +52,7 @@ public async Task UnregisterMany() MembershipVersion = new MembershipVersion(51) }; addresses.Add(addr); - await this.grainDirectory.Register(addr, previousAddress: null); + await GrainDirectory.Register(addr, previousAddress: null); } // Modify the Rth entry locally, to simulate another activation tentative by another silo @@ -71,20 +67,20 @@ public async Task UnregisterMany() }; // Batch unregister - await this.grainDirectory.UnregisterMany(addresses); + await GrainDirectory.UnregisterMany(addresses); // Now we should only find the old Rth entry for (int i = 0; i < N; i++) { if (i == R) { - var addr = await this.grainDirectory.Lookup(addresses[i].GrainId); + var addr = await GrainDirectory.Lookup(addresses[i].GrainId); Assert.NotNull(addr); Assert.Equal(oldActivation, addr.ActivationId); } else { - Assert.Null(await this.grainDirectory.Lookup(addresses[i].GrainId)); + Assert.Null(await GrainDirectory.Lookup(addresses[i].GrainId)); } } } diff --git a/test/Grains/TestInternalGrains/TimerGrain.cs b/test/Grains/TestInternalGrains/TimerGrain.cs index 0e6540407f..8d2c2f3f94 100644 --- a/test/Grains/TestInternalGrains/TimerGrain.cs +++ b/test/Grains/TestInternalGrains/TimerGrain.cs @@ -221,7 +221,7 @@ public async Task RunSelfDisposingTimer() timer[0].Dispose(); Assert.True(ct.IsCancellationRequested); await Task.Delay(100); - tcs.SetResult(); + tcs.TrySetResult(); } catch (Exception ex) { @@ -538,7 +538,7 @@ public Task StartStuckTimer(TimeSpan dueTime) private Task TimerTick() { - this.completionSource.SetResult(1); + this.completionSource.TrySetResult(1); return Task.CompletedTask; } @@ -556,7 +556,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(() => { - tasks[0].SetResult(("NONE", CancellationToken.None)); + 
tasks[0].TrySetResult(("NONE", CancellationToken.None)); return Task.CompletedTask; }, new GrainTimerCreationOptions(TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10)) { Interleave = true })); @@ -564,7 +564,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(() => { - tasks[1].SetResult(("NONE", CancellationToken.None)); + tasks[1].TrySetResult(("NONE", CancellationToken.None)); return Task.CompletedTask; }, TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10))); @@ -572,7 +572,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(state => { - tasks[2].SetResult((state, CancellationToken.None)); + tasks[2].TrySetResult((state, CancellationToken.None)); return Task.CompletedTask; }, "STATE", @@ -582,7 +582,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(state => { - tasks[3].SetResult((state, CancellationToken.None)); + tasks[3].TrySetResult((state, CancellationToken.None)); return Task.CompletedTask; }, "STATE", @@ -593,7 +593,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(ct => { - tasks[4].SetResult(("NONE", ct)); + tasks[4].TrySetResult(("NONE", ct)); return Task.CompletedTask; }, new GrainTimerCreationOptions(TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10)) { Interleave = true })); @@ -601,7 +601,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(ct => { - tasks[5].SetResult(("NONE", ct)); + tasks[5].TrySetResult(("NONE", ct)); return Task.CompletedTask; }, TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10))); @@ -609,7 +609,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer((state, ct) => { - tasks[6].SetResult((state, ct)); + tasks[6].TrySetResult((state, ct)); return Task.CompletedTask; }, "STATE", @@ -619,7 +619,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); 
timers.Add(this.RegisterGrainTimer((state, ct) => { - tasks[7].SetResult((state, ct)); + tasks[7].TrySetResult((state, ct)); return Task.CompletedTask; }, "STATE", @@ -878,7 +878,7 @@ public async Task RunSelfDisposingTimer() timer[0].Dispose(); Assert.True(ct.IsCancellationRequested); await Task.Delay(100); - tcs.SetResult(); + tcs.TrySetResult(); } catch (Exception ex) { @@ -1042,7 +1042,7 @@ public Task StartStuckTimer(TimeSpan dueTime) private Task TimerTick() { - this.completionSource.SetResult(1); + this.completionSource.TrySetResult(1); return Task.CompletedTask; } @@ -1060,7 +1060,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(() => { - tasks[0].SetResult(("NONE", CancellationToken.None)); + tasks[0].TrySetResult(("NONE", CancellationToken.None)); return Task.CompletedTask; }, new GrainTimerCreationOptions(TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10)) { Interleave = true })); @@ -1068,7 +1068,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(() => { - tasks[1].SetResult(("NONE", CancellationToken.None)); + tasks[1].TrySetResult(("NONE", CancellationToken.None)); return Task.CompletedTask; }, TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10))); @@ -1076,7 +1076,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(state => { - tasks[2].SetResult((state, CancellationToken.None)); + tasks[2].TrySetResult((state, CancellationToken.None)); return Task.CompletedTask; }, "STATE", @@ -1086,7 +1086,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(state => { - tasks[3].SetResult((state, CancellationToken.None)); + tasks[3].TrySetResult((state, CancellationToken.None)); return Task.CompletedTask; }, "STATE", @@ -1097,7 +1097,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(ct => { - tasks[4].SetResult(("NONE", ct)); + 
tasks[4].TrySetResult(("NONE", ct)); return Task.CompletedTask; }, new GrainTimerCreationOptions(TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10)) { Interleave = true })); @@ -1105,7 +1105,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer(ct => { - tasks[5].SetResult(("NONE", ct)); + tasks[5].TrySetResult(("NONE", ct)); return Task.CompletedTask; }, TimeSpan.FromMilliseconds(25), TimeSpan.FromSeconds(10))); @@ -1113,7 +1113,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer((state, ct) => { - tasks[6].SetResult((state, ct)); + tasks[6].TrySetResult((state, ct)); return Task.CompletedTask; }, "STATE", @@ -1123,7 +1123,7 @@ public Task TestAllTimerOverloads() tasks.Add(new()); timers.Add(this.RegisterGrainTimer((state, ct) => { - tasks[7].SetResult((state, ct)); + tasks[7].TrySetResult((state, ct)); return Task.CompletedTask; }, "STATE", @@ -1206,7 +1206,7 @@ public async Task RunSelfDisposingTimer() { Assert.NotNull(timer[0]); timer[0].Dispose(); - tcs.SetResult(); + tcs.TrySetResult(); await Task.Delay(100); } catch (Exception ex)
diff --git a/test/NonSilo.Tests/Directory/DirectoryMembershipSnapshotTests.cs b/test/NonSilo.Tests/Directory/DirectoryMembershipSnapshotTests.cs
new file mode 100644
index 0000000000..40264e5b5a
--- /dev/null
+++ b/test/NonSilo.Tests/Directory/DirectoryMembershipSnapshotTests.cs
@@ -0,0 +1,123 @@
+using System.Collections.Immutable;
+using Orleans.Runtime.GrainDirectory;
+using CsCheck;
+using Xunit;
+using Orleans.Configuration;
+
+namespace NonSilo.Tests.Directory;
+
+/// <summary>
+/// Property-based (CsCheck) tests for DirectoryMembershipSnapshot: owner lookup,
+/// non-intersection of range owners, and coverage of the whole ring by member ranges.
+/// </summary>
+[TestCategory("BVT")]
+public sealed class DirectoryMembershipSnapshotTests
+{
+    // Builds a ClusterMembershipSnapshot (version 1) from a random array of (hash, status) pairs
+    // whose length is drawn from Gen.Int[1, 30]. Each pair becomes a loopback silo on its own
+    // port (1, 2, ...), so every generated address is distinct even when hashes repeat.
+    private static readonly Gen GenClusterMembershipSnapshot = Gen.Select(Gen.UInt, Gen.Enum(), (hash, status) => (hash, status))
+        .Array[Gen.Int[1, 30]].Select((tuple) =>
+        {
+            var dict = ImmutableDictionary.CreateBuilder();
+            var port = 1;
+            foreach (var item in tuple)
+            {
+                var (hash, status) = item;
+                var addr = SiloAddress.New(new System.Net.IPEndPoint(System.Net.IPAddress.Loopback, port++), (int)hash);
+                dict.Add(addr, new ClusterMember(addr, status, $"Silo_{hash}"));
+            }
+
+            return new ClusterMembershipSnapshot(dict.ToImmutable(), new(1));
+        });
+
+    // Wraps a generated cluster snapshot in a DirectoryMembershipSnapshot. One array of
+    // DEFAULT_NUM_VIRTUAL_RING_BUCKETS random uints is generated per cluster member, and the
+    // delegate passed as the third constructor argument hands them out one array per call, in call order.
+    // NOTE(review): presumably the constructor invokes that delegate at most once per member - confirm;
+    // any additional call would index past the end of `hashes`.
+    private static readonly Gen GenDirectoryMembershipSnapshot =
+        GenClusterMembershipSnapshot.SelectMany(snapshot => Gen.UInt.Array[ConsistentRingOptions.DEFAULT_NUM_VIRTUAL_RING_BUCKETS].Array[snapshot.Members.Count].Select(hashes =>
+        {
+            var i = 0;
+            return new DirectoryMembershipSnapshot(snapshot, null!, (_, _) => hashes[i++]);
+        }));
+
+    [Fact]
+    public void GetOwnerTest()
+    {
+        // As long as the cluster has at least one member, we should be able to find an owner.
+        // TryGetOwner must succeed exactly when the directory snapshot's Members array is non-empty.
+        Gen.Select(GenDirectoryMembershipSnapshot, Gen.UInt)
+            .Sample((snapshot, hash) => Assert.Equal(snapshot.Members.Length > 0, snapshot.TryGetOwner(hash, out var owner, out _)));
+    }
+
+    [Fact]
+    public void MembersDoNotIntersectTest()
+    {
+        // Member ranges should not intersect.
+        // Snapshots with no directory members are filtered out; every pair of distinct RangeOwners entries is compared.
+        GenDirectoryMembershipSnapshot.Where(s => s.Members.Length > 0)
+            .Sample(snapshot =>
+            {
+                foreach (var range in snapshot.RangeOwners)
+                {
+                    foreach (var otherRange in snapshot.RangeOwners)
+                    {
+                        if (range == otherRange)
+                        {
+                            continue;
+                        }
+
+                        Assert.False(range.Range.Intersects(otherRange.Range));
+                    }
+                }
+            });
+    }
+
+    [Fact]
+    public void ViewCoversRingTest()
+    {
+        // The union of all member ranges (GetMemberRanges) should cover the entire ring.
+        GenDirectoryMembershipSnapshot.Where(s => s.Members.Length > 0)
+            .Sample(snapshot =>
+            {
+                uint sum = 0;
+                var allRanges = new List();
+                foreach (var member in snapshot.Members)
+                {
+                    // Both views of a member's ranges must add up to the same total size.
+                    Assert.Equal(snapshot.GetMemberRanges(member).Sum(range => range.Size), snapshot.GetMemberRangesByPartition(member).Sum(range => range.Size));
+                    foreach (var range in snapshot.GetMemberRanges(member))
+                    {
+                        allRanges.Add(range);
+                        sum += range.Size;
+                    }
+                }
+
+
+                Assert.Equal(uint.MaxValue, sum);
+
+                var allRangesCollection = RingRangeCollection.Create(allRanges);
+
+                Assert.Equal(uint.MaxValue, allRangesCollection.Size);
+                Assert.Equal(100f, allRangesCollection.SizePercent);
+                Assert.False(allRangesCollection.IsEmpty);
+                Assert.False(allRangesCollection.IsDefault);
+                Assert.True(allRangesCollection.IsFull);
+            });
+    }
+
+    [Fact]
+    public void MemberRangesCoverRingTest()
+    {
+        // The union of all per-partition member ranges (GetMemberRangesByPartition) should cover the entire ring.
+        GenDirectoryMembershipSnapshot.Where(s => s.Members.Length > 0)
+            .Sample(snapshot =>
+            {
+                uint sum = 0;
+                var allRanges = new List();
+                foreach (var member in snapshot.Members)
+                {
+                    foreach (var range in snapshot.GetMemberRangesByPartition(member))
+                    {
+                        allRanges.Add(range);
+                        sum += range.Size;
+                    }
+                }
+
+                Assert.Equal(uint.MaxValue, sum);
+                var allRangesCollection = RingRangeCollection.Create(allRanges);
+                Assert.Equal(uint.MaxValue, allRangesCollection.Size);
+                Assert.Equal(100f, allRangesCollection.SizePercent);
+                Assert.False(allRangesCollection.IsEmpty);
+                Assert.False(allRangesCollection.IsDefault);
+                Assert.True(allRangesCollection.IsFull);
+            });
+    }
+}
diff --git a/test/NonSilo.Tests/Directory/RingRangeCollectionTests.cs b/test/NonSilo.Tests/Directory/RingRangeCollectionTests.cs new file mode 100644 index 0000000000..22041191a9 --- /dev/null +++ b/test/NonSilo.Tests/Directory/RingRangeCollectionTests.cs @@ -0,0 +1,142 @@ +using System.Collections.Immutable; +using Orleans.Runtime.GrainDirectory; +using CsCheck; +using Xunit; +
+namespace NonSilo.Tests.Directory;
+
+/// <summary>
+/// Property-based and example-based tests for <see cref="RingRangeCollection"/>:
+/// point membership, intersection with its own ranges, and set difference.
+/// </summary>
+[TestCategory("BVT")]
+public sealed class RingRangeCollectionTests
+{
+    /// <summary>
+    /// Generates collections of disjoint, non-wrapping ranges. Up to 100 random boundaries are sorted
+    /// ascending; each consecutive pair becomes a range when the later element's flag is set.
+    /// </summary>
+    private static readonly Gen<RingRangeCollection> GenRingRangeCollection = Gen.Int[0, 100].SelectMany(count => Gen.Select(Gen.UInt, Gen.Bool, static (boundary, included) => (boundary, included)).Array[count].Select(elements =>
+    {
+        // Sorting first guarantees that consecutive boundaries describe ranges which neither overlap nor wrap.
+        var sorted = elements.OrderBy(static element => element.boundary).ToArray();
+        var arr = ImmutableArray.CreateBuilder<RingRange>(sorted.Length);
+
+        // Iterate over the input (not the still-empty builder) and always advance the index.
+        for (var i = 1; i < sorted.Length; i++)
+        {
+            var prev = sorted[i - 1];
+            var (boundary, included) = sorted[i];
+
+            // Skip excluded segments and duplicate boundaries (start == end does not describe an ordinary range).
+            if (!included || boundary == prev.boundary)
+            {
+                continue;
+            }
+
+            arr.Add(RingRange.Create(prev.boundary, boundary));
+        }
+
+        return RingRangeCollection.Create(arr);
+    }));
+
+    /// <summary>The collection contains a point exactly when one of its ranges does.</summary>
+    [Fact]
+    public void Contains()
+    {
+        Gen.Select(GenRingRangeCollection, Gen.UInt).Sample((ranges, point) =>
+        {
+            var doesContain = ranges.Ranges.Any(r => r.Contains(point));
+            Assert.Equal(doesContain, ranges.Contains(point));
+        });
+    }
+
+    /// <summary>The collection intersects every one of its own ranges.</summary>
+    [Fact]
+    public void Intersects()
+    {
+        GenRingRangeCollection.Sample(ranges =>
+        {
+            foreach (var range in ranges.Ranges)
+            {
+                Assert.True(ranges.Intersects(range));
+            }
+        });
+    }
+
+    /// <summary>
+    /// Moves the end of every range and checks that the differences in both directions contain only
+    /// what was added (present in the update, absent from the original) or removed (the reverse).
+    /// </summary>
+    [Fact]
+    public void Difference()
+    {
+        var ringWithUpdates = GenRingRangeCollection.SelectMany(original => Gen.Float[0f, 1f].Array[original.Ranges.Length].Select(diffs =>
+        {
+            // Move the end of each range somewhere between just past its start and the start of the next range.
+            // NOTE(review): assumes RingRangeCollection.Ranges is ordered by Start, as the generator produces - confirm.
+            var arr = ImmutableArray.CreateBuilder<RingRange>(original.Ranges.Length);
+            for (var i = 0; i < diffs.Length; i++)
+            {
+                var orig = original.Ranges[i];
+
+                // The last range may grow up to the top of the ring; it is never made to wrap.
+                var limit = i + 1 < original.Ranges.Length ? original.Ranges[i + 1].Start : uint.MaxValue;
+
+                // diffs[i] is in [0, 1], so the new end lands in [orig.Start + 1, limit].
+                var offset = (uint)((limit - orig.Start - 1) * (double)diffs[i]);
+                arr.Add(RingRange.Create(orig.Start, orig.Start + 1 + offset));
+            }
+
+            return (original, RingRangeCollection.Create(arr));
+        }));
+
+        ringWithUpdates.Sample((original, updated) =>
+        {
+            var additions = updated.Difference(original);
+
+            foreach (var addition in additions)
+            {
+                Assert.True(updated.Intersects(addition));
+                Assert.False(original.Intersects(addition));
+            }
+
+            // Removals are what the original had and the update no longer has.
+            var removals = original.Difference(updated);
+
+            foreach (var removal in removals)
+            {
+                Assert.False(updated.Intersects(removal));
+                Assert.True(original.Intersects(removal));
+            }
+        });
+    }
+
+    /// <summary>A point is contained by at most one range of the collection.</summary>
+    [Fact]
+    public void ContainsTest()
+    {
+        Gen.Select(GenRingRangeCollection, Gen.UInt).Sample((collection, point) =>
+        {
+            var allRanges = collection.Ranges.ToList();
+            var expectedContains = allRanges.Any(r => r.Contains(point));
+            Assert.Equal(expectedContains, collection.Contains(point));
+            var numContains = collection.Count(r => r.Contains(point));
+            Assert.Equal(expectedContains ? 1 : 0, numContains);
+        });
+    }
+
+    /// <summary>Fixed regression data: membership checks at and around range boundaries.</summary>
+    [Fact]
+    public void ContainsWrappedTest()
+    {
+        var ranges = new RingRange[]
+        {
+            RingRange.Create(0x10930012, 0x179C5AD4),
+            RingRange.Create(0x287844C7, 0x2B5DCCCB),
+            RingRange.Create(0x32AC80C2, 0x36F72978),
+            RingRange.Create(0x6F5C3AAC, 0x7776E202),
+            RingRange.Create(0x7D2B02F3, 0x7DF52810),
+            RingRange.Create(0xA18205D1, 0xA3A44031),
+            RingRange.Create(0xA847CD39, 0xAD6C28D0),
+            RingRange.Create(0xAF60D42F, 0xB278D2BE),
+            RingRange.Create(0xBB8EA837, 0xC61DA5E1),
+            RingRange.Create(0xF08C2237, 0xF3030A5A)
+        }.ToImmutableArray();
+        var collection = new RingRangeCollection(ranges);
+        uint point = 0x16F4037C;
+        Assert.True(ranges[0].Contains(point));
+        Assert.True(collection.Contains(point));
+
+        // Just outside the last range.
+        point = 0xF3030A5A + 1;
+        Assert.False(ranges[^1].Contains(point));
+        Assert.False(collection.Contains(point));
+
+        // Just inside the last range.
+        point = 0xF3030A5A;
+        Assert.True(ranges[^1].Contains(point));
+        Assert.True(collection.Contains(point));
+
+        // Between ranges.
+        point = 0xF08C2237 - 1;
+        Assert.False(collection.Contains(point));
+
+        // In an interior range.
+ point = 0x7D2B02F3 + 1; + Assert.True(collection.Contains(point)); + } +}
diff --git a/test/NonSilo.Tests/Directory/RingRangeTests.cs b/test/NonSilo.Tests/Directory/RingRangeTests.cs
new file mode 100644
index 0000000000..e689b015d4
--- /dev/null
+++ b/test/NonSilo.Tests/Directory/RingRangeTests.cs
@@ -0,0 +1,183 @@
+using Orleans.Runtime.GrainDirectory;
+using CsCheck;
+using Xunit;
+
+namespace NonSilo.Tests.Directory;
+
+/// <summary>
+/// Tests for RingRange: difference, complement, containment, and invariants of
+/// equally divided ranges produced by <c>CreateEquallyDividedRange</c>.
+/// </summary>
+[TestCategory("BVT")]
+public sealed class RingRangeTests
+{
+    // Arbitrary ranges: two independent random uints passed to RingRange.Create as (start, end).
+    internal static Gen GenRingRange => Gen.Select(Gen.UInt, Gen.UInt, RingRange.Create);
+
+    [Fact]
+    public void RingRangeDifference_EquallyDividedRange()
+    {
+        var previous = RingRange.Empty;
+        var current = CreateEquallyDividedRange(2, 0);
+        // A range minus itself leaves nothing.
+        Assert.Empty(current.Difference(current));
+
+        Assert.Equal(current, Assert.Single(current.Difference(previous)));
+        Assert.Empty(previous.Difference(current));
+
+        var firstHalf = CreateEquallyDividedRange(2, 0);
+        var secondHalf = CreateEquallyDividedRange(2, 1);
+
+        // The two halves do not overlap, so subtracting one from the other changes nothing.
+        Assert.Equal(firstHalf, Assert.Single(firstHalf.Difference(secondHalf)));
+        Assert.Equal(secondHalf, Assert.Single(secondHalf.Difference(firstHalf)));
+    }
+
+    [Fact]
+    public void ComplementDoesNotIntersect()
+    {
+        // Empty and full ranges are excluded from this property.
+        GenRingRange.Where(range => !range.IsEmpty && !range.IsFull)
+            .Sample((sample) =>
+            {
+                var inverse = sample.Complement();
+                Assert.False(sample.Intersects(inverse));
+                Assert.Empty(sample.Intersections(inverse));
+                Assert.False(sample.Contains(inverse.End));
+                var difference = Assert.Single(sample.Difference(inverse));
+                Assert.Equal(sample, difference);
+                var inverseDifference = Assert.Single(inverse.Difference(sample));
+                Assert.Equal(inverse, inverseDifference);
+            });
+    }
+
+    [Fact]
+    public void ComplementComplementIsEqual()
+    {
+        // Taking the complement twice must give back the original range.
+        GenRingRange
+            .Sample((sample) =>
+            {
+                var inverse = sample.Complement();
+                var inverseInverse = inverse.Complement();
+                Assert.True(sample.Equals(inverseInverse));
+            });
+    }
+
+    [Fact]
+    public void RingRangeDifference_HolePunch()
+    {
+        var first = CreateEquallyDividedRange(8, 0);
+        var second = CreateEquallyDividedRange(8, 1);
+        var third = CreateEquallyDividedRange(8, 2);
+        var fullRange = RingRange.Create(first.Start, third.End);
+
+        // Removing the middle eighth from the span of three eighths must leave the first and third, in that order.
+        var midPunch = fullRange.Difference(second);
+        Assert.Equal(2, midPunch.Count());
+        Assert.Equal(first, midPunch.First());
+        Assert.Equal(third, midPunch.Last());
+    }
+
+    [Fact]
+    public void RingRangeDifference_Empty()
+    {
+        var current = RingRange.Create(0x33333334, 0x66666667);
+        var result = current.Difference(RingRange.Empty);
+        Assert.Equal(current, Assert.Single(result));
+    }
+
+    [Fact]
+    public void RingRangeDifference_Empty_Two()
+    {
+        var current = RingRange.Create(0x33333334, 0x66666667);
+        // `previous` has start > end, i.e. it is created across the top of the uint space.
+        var previous = RingRange.Create(uint.MaxValue - 1, 1);
+        var result = Assert.Single(current.Difference(previous));
+        Assert.Equal(current, result);
+        Assert.Equal(previous, Assert.Single(previous.Difference(current)));
+    }
+
+    // NOTE(review): despite its name, this test only exercises Difference on the Empty/Full
+    // combinations; it makes no Intersects/Intersections call.
+    [Fact]
+    public void RingRangeIntersection()
+    {
+        Assert.Empty(RingRange.Empty.Difference(RingRange.Empty));
+
+        Assert.Empty(RingRange.Full.Difference(RingRange.Full));
+
+        Assert.Equal(RingRange.Full, Assert.Single(RingRange.Full.Difference(RingRange.Empty)));
+
+        Assert.Empty(RingRange.Empty.Difference(RingRange.Full));
+    }
+
+    [Fact]
+    public void RingRangeContains()
+    {
+        Assert.False(RingRange.Empty.Contains(0));
+        Assert.False(RingRange.Empty.Contains(1));
+        Assert.False(RingRange.Empty.Contains(uint.MaxValue));
+        Assert.False(RingRange.Empty.Contains(uint.MaxValue / 2));
+
+        Assert.True(RingRange.Full.Contains(0));
+        Assert.True(RingRange.Full.Contains(1));
+        Assert.True(RingRange.Full.Contains(uint.MaxValue));
+        Assert.True(RingRange.Full.Contains(uint.MaxValue / 2));
+
+        // A range created with start > end is expected to contain the values on both sides of the wrap point.
+        var wrapped = RingRange.Create(uint.MaxValue - 10, 10);
+        Assert.True(wrapped.Contains(0));
+        Assert.True(wrapped.Contains(1));
+        Assert.True(wrapped.Contains(uint.MaxValue));
+        Assert.False(wrapped.Contains(uint.MaxValue / 2));
+    }
+
+    [InlineData(1)]
+    [InlineData(2)]
+    [InlineData(3)]
+    [InlineData(17)]
+    [InlineData(33)]
+    [Theory]
+    public void EqualRangeInvariants(int count)
+    {
+        // Neighbouring slices must not intersect, and the slice sizes must add up to uint.MaxValue.
+        var sum = 0ul;
+        var previous = RingRange.Empty;
+        for (var i = 0; i < count; i++)
+        {
+            var range = CreateEquallyDividedRange(count, i);
+            Assert.False(previous.Intersects(range));
+            sum += range.Size;
+            previous = range;
+        }
+
+        Assert.Equal(uint.MaxValue, sum);
+    }
+
+    // Returns the index-th of `count` consecutive slices of the 2^32 value space, starting at 0.
+    // Throws ArgumentOutOfRangeException when index >= count or count < 1.
+    private static RingRange CreateEquallyDividedRange(int count, int index)
+    {
+        ArgumentOutOfRangeException.ThrowIfGreaterThanOrEqual(index, count, nameof(index));
+        ArgumentOutOfRangeException.ThrowIfLessThan(count, 1);
+        return Core((uint)count, (uint)index);
+        static RingRange Core(uint count, uint index)
+        {
+            // Repeated on the unsigned values: a negative index casts to a large uint and is rejected here.
+            ArgumentOutOfRangeException.ThrowIfGreaterThanOrEqual(index, count, nameof(index));
+
+            if (count == 1 && index == 0)
+            {
+                return RingRange.Full;
+            }
+
+            // 2^32 values in total; the first `remainder` slices are one value longer than the rest.
+            var rangeSize = (ulong)uint.MaxValue + 1;
+            var portion = rangeSize / count;
+            var remainder = rangeSize - portion * count;
+            var start = 0u;
+            for (var i = 0; i < count; i++)
+            {
+                // (Start, End]
+                // The unchecked cast truncates to 32 bits, so the last slice's end comes out as 0.
+                var end = unchecked((uint)(start + portion));
+
+                if (remainder > 0)
+                {
+                    end++;
+                    remainder--;
+                }
+
+                if (i == index)
+                {
+                    return RingRange.Create(start, end);
+                }
+
+                start = end;
+            }
+
+            // Not reached for index < count; kept so every code path returns or throws.
+            throw new ArgumentException(null, nameof(index));
+        }
+    }
+}
diff --git a/test/NonSilo.Tests/NonSilo.Tests.csproj b/test/NonSilo.Tests/NonSilo.Tests.csproj index 1f4a4221a1..3868e1121e 100644 --- a/test/NonSilo.Tests/NonSilo.Tests.csproj +++ b/test/NonSilo.Tests/NonSilo.Tests.csproj @@ -17,6 +17,7 @@ + diff --git a/test/NonSilo.Tests/SchedulerTests/OrleansTaskSchedulerBasicTests.cs b/test/NonSilo.Tests/SchedulerTests/OrleansTaskSchedulerBasicTests.cs index d388aaa23e..2e38765d6f 100644 --- a/test/NonSilo.Tests/SchedulerTests/OrleansTaskSchedulerBasicTests.cs +++ b/test/NonSilo.Tests/SchedulerTests/OrleansTaskSchedulerBasicTests.cs @@ -407,7 +407,7 @@ internal static ILoggerFactory InitSchedulerLogging() var filters =
new LoggerFilterOptions(); filters.AddFilter("Scheduler", LogLevel.Trace); filters.AddFilter("Scheduler.WorkerPoolThread", LogLevel.Trace); - var loggerFactory = TestingUtils.CreateDefaultLoggerFactory(TestingUtils.CreateTraceFileName("Silo", DateTime.Now.ToString("yyyyMMdd_hhmmss")), filters); + // "HH" is the 24-hour clock; "hh" would give morning and afternoon runs the same trace file name. + var loggerFactory = TestingUtils.CreateDefaultLoggerFactory(TestingUtils.CreateTraceFileName("Silo", DateTime.UtcNow.ToString("yyyyMMdd_HHmmss")), filters); return loggerFactory; } } diff --git a/test/Orleans.Serialization.FSharp.Tests/Orleans.Serialization.FSharp.Tests.fsproj b/test/Orleans.Serialization.FSharp.Tests/Orleans.Serialization.FSharp.Tests.fsproj index 3c52da56d9..adfb7b0328 100644 --- a/test/Orleans.Serialization.FSharp.Tests/Orleans.Serialization.FSharp.Tests.fsproj +++ b/test/Orleans.Serialization.FSharp.Tests/Orleans.Serialization.FSharp.Tests.fsproj @@ -1,4 +1,4 @@ - + latest $(TestTargetFrameworks) @@ -7,6 +7,8 @@ + + diff --git a/test/Tester/Directories/GrainDirectoryTests.cs b/test/Tester/Directories/GrainDirectoryTests.cs index 2f4486d002..d95f5bd6ca 100644 --- a/test/Tester/Directories/GrainDirectoryTests.cs +++ b/test/Tester/Directories/GrainDirectoryTests.cs @@ -1,151 +1,151 @@ +#nullable enable using Microsoft.Extensions.Logging; using Orleans.GrainDirectory; -using Orleans.Runtime; using TestExtensions; using Xunit; using Xunit.Abstractions; -namespace Tester.Directories +namespace Tester.Directories; + +// Base tests for custom Grain Directory +public abstract class GrainDirectoryTests where TGrainDirectory : IGrainDirectory { - // Base tests for custom Grain Directory - public abstract class GrainDirectoryTests where T : IGrainDirectory + protected readonly ILoggerFactory loggerFactory; + private TGrainDirectory? 
_directory; + + protected GrainDirectoryTests(ITestOutputHelper testOutput) { - protected T grainDirectory; - protected readonly ILoggerFactory loggerFactory; + this.loggerFactory = new LoggerFactory(); + this.loggerFactory.AddProvider(new XunitLoggerProvider(testOutput)); + } + + protected TGrainDirectory GrainDirectory => _directory ??= CreateGrainDirectory(); - protected GrainDirectoryTests(ITestOutputHelper testOutput) + protected abstract TGrainDirectory CreateGrainDirectory(); + + [SkippableFact] + public async Task RegisterLookupUnregisterLookup() + { + var expected = new GrainAddress { - this.loggerFactory = new LoggerFactory(); - this.loggerFactory.AddProvider(new XunitLoggerProvider(testOutput)); - this.grainDirectory = GetGrainDirectory(); - } + ActivationId = ActivationId.NewId(), + GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), + SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), + MembershipVersion = new MembershipVersion(51) + }; + + Assert.Equal(expected, await GrainDirectory.Register(expected, null)); + + Assert.Equal(expected, await GrainDirectory.Lookup(expected.GrainId)); - protected abstract T GetGrainDirectory(); + await GrainDirectory.Unregister(expected); - [SkippableFact] - public async Task RegisterLookupUnregisterLookup() + Assert.Null(await GrainDirectory.Lookup(expected.GrainId)); + } + + [SkippableFact] + public async Task DoNotOverwriteEntry() + { + var expected = new GrainAddress { - var expected = new GrainAddress - { - ActivationId = ActivationId.NewId(), - GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), - SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), - MembershipVersion = new MembershipVersion(51) - }; + ActivationId = ActivationId.NewId(), + GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), + SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), + MembershipVersion = new 
MembershipVersion(51) + }; - Assert.Equal(expected, await this.grainDirectory.Register(expected, null)); + var differentActivation = new GrainAddress + { + ActivationId = ActivationId.NewId(), + GrainId = expected.GrainId, + SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), + MembershipVersion = new MembershipVersion(51) + }; - Assert.Equal(expected, await this.grainDirectory.Lookup(expected.GrainId)); + var differentSilo = new GrainAddress + { + ActivationId = expected.ActivationId, + GrainId = expected.GrainId, + SiloAddress = SiloAddress.FromParsableString("10.0.23.14:1000@4583"), + MembershipVersion = new MembershipVersion(51) + }; - await this.grainDirectory.Unregister(expected); + Assert.Equal(expected, await GrainDirectory.Register(expected, null)); + Assert.Equal(expected, await GrainDirectory.Register(differentActivation, null)); + Assert.Equal(expected, await GrainDirectory.Register(differentSilo, null)); - Assert.Null(await this.grainDirectory.Lookup(expected.GrainId)); - } + Assert.Equal(expected, await GrainDirectory.Lookup(expected.GrainId)); + } - [SkippableFact] - public async Task DoNotOverwriteEntry() + /// + /// Overwrite an existing entry if the register call includes a matching "previousAddress" parameter. 
+ /// + [SkippableFact] + public async Task OverwriteEntryIfMatch() + { + var initial = new GrainAddress { - var expected = new GrainAddress - { - ActivationId = ActivationId.NewId(), - GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), - SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), - MembershipVersion = new MembershipVersion(51) - }; - - var differentActivation = new GrainAddress - { - ActivationId = ActivationId.NewId(), - GrainId = expected.GrainId, - SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), - MembershipVersion = new MembershipVersion(51) - }; - - var differentSilo = new GrainAddress - { - ActivationId = expected.ActivationId, - GrainId = expected.GrainId, - SiloAddress = SiloAddress.FromParsableString("10.0.23.14:1000@4583"), - MembershipVersion = new MembershipVersion(51) - }; - - Assert.Equal(expected, await this.grainDirectory.Register(expected, null)); - Assert.Equal(expected, await this.grainDirectory.Register(differentActivation, null)); - Assert.Equal(expected, await this.grainDirectory.Register(differentSilo, null)); - - Assert.Equal(expected, await this.grainDirectory.Lookup(expected.GrainId)); - } - - /// - /// Overwrite an existing entry if the register call includes a matching "previousAddress" parameter. 
- /// - [SkippableFact] - public async Task OverwriteEntryIfMatch() + ActivationId = ActivationId.NewId(), + GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), + SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), + MembershipVersion = new MembershipVersion(51) + }; + + var differentActivation = new GrainAddress + { + ActivationId = ActivationId.NewId(), + GrainId = initial.GrainId, + SiloAddress = initial.SiloAddress, + MembershipVersion = initial.MembershipVersion + }; + + var differentSilo = new GrainAddress { - var initial = new GrainAddress - { - ActivationId = ActivationId.NewId(), - GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), - SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), - MembershipVersion = new MembershipVersion(51) - }; - - var differentActivation = new GrainAddress - { - ActivationId = ActivationId.NewId(), - GrainId = initial.GrainId, - SiloAddress = initial.SiloAddress, - MembershipVersion = initial.MembershipVersion - }; - - var differentSilo = new GrainAddress - { - ActivationId = initial.ActivationId, - GrainId = initial.GrainId, - SiloAddress = SiloAddress.FromParsableString("10.0.23.14:1000@4583"), - MembershipVersion = initial.MembershipVersion - }; - - // Success, no registration exists, so the previous address is ignored. - Assert.Equal(initial, await this.grainDirectory.Register(initial, differentSilo)); - - // Success, the previous address matches the existing registration. - Assert.Equal(differentActivation, await this.grainDirectory.Register(differentActivation, initial)); - - // Failure, the previous address does not match the existing registration. 
- Assert.Equal(differentActivation, await this.grainDirectory.Register(differentSilo, initial)); - - Assert.Equal(differentActivation, await this.grainDirectory.Lookup(initial.GrainId)); - } - - [SkippableFact] - public async Task DoNotDeleteDifferentActivationIdEntry() + ActivationId = initial.ActivationId, + GrainId = initial.GrainId, + SiloAddress = SiloAddress.FromParsableString("10.0.23.14:1000@4583"), + MembershipVersion = initial.MembershipVersion + }; + + // Success, no registration exists, so the previous address is ignored. + Assert.Equal(initial, await GrainDirectory.Register(initial, differentSilo)); + + // Success, the previous address matches the existing registration. + Assert.Equal(differentActivation, await GrainDirectory.Register(differentActivation, initial)); + + // Failure, the previous address does not match the existing registration. + Assert.Equal(differentActivation, await GrainDirectory.Register(differentSilo, initial)); + + Assert.Equal(differentActivation, await GrainDirectory.Lookup(initial.GrainId)); + } + + [SkippableFact] + public async Task DoNotDeleteDifferentActivationIdEntry() + { + var expected = new GrainAddress { - var expected = new GrainAddress - { - ActivationId = ActivationId.NewId(), - GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), - SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), - MembershipVersion = new MembershipVersion(51) - }; - - var otherEntry = new GrainAddress - { - ActivationId = ActivationId.NewId(), - GrainId = expected.GrainId, - SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), - MembershipVersion = new MembershipVersion(51) - }; - - Assert.Equal(expected, await this.grainDirectory.Register(expected, null)); - await this.grainDirectory.Unregister(otherEntry); - Assert.Equal(expected, await this.grainDirectory.Lookup(expected.GrainId)); - } - - [SkippableFact] - public async Task LookupNotFound() + ActivationId = ActivationId.NewId(), 
+ GrainId = GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")), + SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), + MembershipVersion = new MembershipVersion(51) + }; + + var otherEntry = new GrainAddress { - Assert.Null(await this.grainDirectory.Lookup(GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")))); - } + ActivationId = ActivationId.NewId(), + GrainId = expected.GrainId, + SiloAddress = SiloAddress.FromParsableString("10.0.23.12:1000@5678"), + MembershipVersion = new MembershipVersion(51) + }; + + Assert.Equal(expected, await GrainDirectory.Register(expected, null)); + await GrainDirectory.Unregister(otherEntry); + Assert.Equal(expected, await GrainDirectory.Lookup(expected.GrainId)); + } + + [SkippableFact] + public async Task LookupNotFound() + { + Assert.Null(await GrainDirectory.Lookup(GrainId.Parse("user/somerandomuser_" + Guid.NewGuid().ToString("N")))); } } diff --git a/test/TesterInternal/General/ConsistentRingProviderTests_Silo.cs b/test/TesterInternal/General/ConsistentRingProviderTests_Silo.cs index b11bfc1349..149da1f0c8 100644 --- a/test/TesterInternal/General/ConsistentRingProviderTests_Silo.cs +++ b/test/TesterInternal/General/ConsistentRingProviderTests_Silo.cs @@ -2,7 +2,6 @@ using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Logging; using Orleans.Configuration; -using Orleans.Runtime; using Orleans.Runtime.ReminderService; using Orleans.TestingHost; using TestExtensions; @@ -19,7 +18,7 @@ public class ConsistentRingProviderTests_Silo : TestClusterPerTest private readonly TimeSpan endWait = TimeSpan.FromMinutes(5); private enum Fail { First, Random, Last } - + protected override void ConfigureTestCluster(TestClusterBuilder builder) { builder.AddSiloBuilderConfigurator(); @@ -157,7 +156,7 @@ public async Task Ring_1F1J() // kill a silo and join a new one in parallel logger.LogInformation("Killing silo {SiloAddress} and joining a silo", failures[0].SiloAddress); - + 
var tasks = new Task[2] { Task.Factory.StartNew(() => this.HostedCluster.StopSiloAsync(failures[0])), @@ -282,7 +281,7 @@ private async Task> getSilosToFail(Fail fail, int numOfFailures await tableGrain.ReadRows(tableGrainId); SiloAddress reminderTableGrainPrimaryDirectoryAddress = (await TestUtils.GetDetailedGrainReport(this.HostedCluster.InternalGrainFactory, tableGrainId, this.HostedCluster.Primary)).PrimaryForGrain; - // ask a detailed report from the directory partition owner, and get the actionvation addresses + // ask a detailed report from the directory partition owner, and get the activation addresses var address = (await TestUtils.GetDetailedGrainReport(this.HostedCluster.InternalGrainFactory, tableGrainId, this.HostedCluster.GetSiloForAddress(reminderTableGrainPrimaryDirectoryAddress))).LocalDirectoryActivationAddress; GrainAddress reminderGrainActivation = address; diff --git a/test/TesterInternal/GrainDirectory/DistributedGrainDirectoryTests.cs b/test/TesterInternal/GrainDirectory/DistributedGrainDirectoryTests.cs new file mode 100644 index 0000000000..092c64de4a --- /dev/null +++ b/test/TesterInternal/GrainDirectory/DistributedGrainDirectoryTests.cs @@ -0,0 +1,22 @@ +#nullable enable +using Microsoft.Extensions.DependencyInjection; +using Orleans.GrainDirectory; +using Orleans.Runtime.GrainDirectory; +using Orleans.TestingHost; +using Tester.Directories; +using TestExtensions; +using Xunit; +using Xunit.Abstractions; + +namespace UnitTests.GrainDirectory; + +[TestCategory("BVT"), TestCategory("Directory")] +public sealed class DefaultGrainDirectoryTests(DefaultClusterFixture fixture, ITestOutputHelper output) + : GrainDirectoryTests(output), IClassFixture +{ + private readonly TestCluster _testCluster = fixture.HostedCluster; + private InProcessSiloHandle Primary => (InProcessSiloHandle)_testCluster.Primary; + + protected override IGrainDirectory CreateGrainDirectory() => + Primary.SiloHost.Services.GetRequiredService().DefaultGrainDirectory; +} diff 
--git a/test/TesterInternal/GrainDirectory/GrainDirectoryResilienceTests.cs b/test/TesterInternal/GrainDirectory/GrainDirectoryResilienceTests.cs new file mode 100644 index 0000000000..b026061baf --- /dev/null +++ b/test/TesterInternal/GrainDirectory/GrainDirectoryResilienceTests.cs @@ -0,0 +1,182 @@ +#nullable enable +using System.Diagnostics; +using System.Globalization; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Orleans.Configuration; +using Orleans.Runtime.GrainDirectory; +using Orleans.Serialization; +using Orleans.Storage; +using Orleans.TestingHost; +using Xunit; +using Xunit.Abstractions; + +namespace UnitTests.GrainDirectory; + +internal interface IMyDirectoryTestGrain : IGrainWithIntegerKey +{ + ValueTask Ping(); +} + +[CollectionAgeLimit(Minutes = 1.01)] +internal class MyDirectoryTestGrain : Grain, IMyDirectoryTestGrain +{ + public ValueTask Ping() => default; +} + +[TestCategory("SlowBVT"), TestCategory("Directory")] +public sealed class GrainDirectoryResilienceTests +{ + /// + /// Cluster chaos test: tests directory functionality & integrity while starting/stopping/killing silos frequently. 
+ /// + /// + [Fact] + public async Task ElasticChaos() + { + var testClusterBuilder = new TestClusterBuilder(1); + testClusterBuilder.AddSiloBuilderConfigurator(); + var testCluster = testClusterBuilder.Build(); + await testCluster.DeployAsync(); + var log = testCluster.ServiceProvider.GetRequiredService>(); + log.LogInformation("ServiceId: '{ServiceId}'", testCluster.Options.ServiceId); + log.LogInformation("ClusterId: '{ClusterId}'.", testCluster.Options.ClusterId); + + var cts = new CancellationTokenSource(TimeSpan.FromMinutes(5)); + var reconfigurationTimer = CoarseStopwatch.StartNew(); + var upperLimit = 10; + var lowerLimit = 1; // Membership is kept on the primary, so we can't go below 1 + var target = upperLimit; + var idBase = 0L; + var client = ((InProcessSiloHandle)testCluster.Primary).SiloHost.Services.GetRequiredService(); + const int CallsPerIteration = 100; + var loadTask = Task.Run(async () => + { + while (!cts.IsCancellationRequested) + { + var time = Stopwatch.StartNew(); + var tasks = Enumerable.Range(0, CallsPerIteration).Select(i => client.GetGrain(idBase + i).Ping().AsTask()).ToList(); + var workTask = Task.WhenAll(tasks); + + try + { + await workTask; + } + catch (SiloUnavailableException sue) + { + log.LogInformation(sue, "Swallowed transient exception."); + } + catch (OrleansMessageRejectionException omre) + { + log.LogInformation(omre, "Swallowed rejection."); + } + catch (Exception exception) + { + log.LogError(exception, "Unhandled exception."); + throw; + } + + idBase += CallsPerIteration; + } + }); + + var chaosTask = Task.Run(async () => + { + var clusterOperation = Task.CompletedTask; + while (!cts.IsCancellationRequested) + { + try + { + var remaining = TimeSpan.FromSeconds(10) - reconfigurationTimer.Elapsed; + if (remaining <= TimeSpan.Zero) + { + reconfigurationTimer.Restart(); + await clusterOperation; + + // Check integrity + var integrityChecks = new List(); + foreach (var silo in testCluster.Silos) + { + var address = 
silo.SiloAddress; + for (var partitionIndex = 0; partitionIndex < DirectoryMembershipSnapshot.PartitionsPerSilo; partitionIndex++) + { + var replica = ((IInternalGrainFactory)client).GetSystemTarget(GrainDirectoryReplica.CreateGrainId(address, partitionIndex).GrainId); + integrityChecks.Add(replica.CheckIntegrityAsync().AsTask()); + } + } + + await Task.WhenAll(integrityChecks); + foreach (var task in integrityChecks) + { + await task; + } + + clusterOperation = Task.Run(async () => + { + var currentCount = testCluster.Silos.Count; + + if (currentCount > target) + { + // Stop or kill a random silo, but not the primary (since that hosts cluster membership) + var victim = testCluster.SecondarySilos[Random.Shared.Next(testCluster.SecondarySilos.Count)]; + if (currentCount % 2 == 0) + { + log.LogInformation("Stopping '{Silo}'.", victim.SiloAddress); + await testCluster.StopSiloAsync(victim); + log.LogInformation("Stopped '{Silo}'.", victim.SiloAddress); + } + else + { + log.LogInformation("Killing '{Silo}'.", victim.SiloAddress); + await testCluster.KillSiloAsync(victim); + log.LogInformation("Killed '{Silo}'.", victim.SiloAddress); + } + } + else if (currentCount < target) + { + log.LogInformation("Starting new silo."); + var result = await testCluster.StartAdditionalSiloAsync(); + log.LogInformation("Started '{Silo}'.", result.SiloAddress); + } + + if (currentCount <= lowerLimit) + { + target = upperLimit; + } + else if (currentCount >= upperLimit) + { + target = lowerLimit; + } + }); + } + else + { + await Task.Delay(remaining); + } + } + catch (Exception exception) + { + log.LogInformation(exception, "Ignoring chaos exception."); + } + } + }); + + await await Task.WhenAny(loadTask, chaosTask); + cts.Cancel(); + await Task.WhenAll(loadTask, chaosTask); + await testCluster.StopAllSilosAsync(); + await testCluster.DisposeAsync(); + } + + private class SiloBuilderConfigurator : ISiloConfigurator + { + public void Configure(ISiloBuilder siloBuilder) + { + 
siloBuilder.Configure(o => o.ResponseTimeout = o.SystemResponseTimeout = TimeSpan.FromMinutes(2)); +#pragma warning disable ORLEANSEXP002 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + siloBuilder.AddDistributedGrainDirectory(); +#pragma warning restore ORLEANSEXP002 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. + } + } +} + diff --git a/test/TesterInternal/GrainDirectoryPartitionTests.cs b/test/TesterInternal/GrainDirectoryPartitionTests.cs index 1e9f373295..6436c00526 100644 --- a/test/TesterInternal/GrainDirectoryPartitionTests.cs +++ b/test/TesterInternal/GrainDirectoryPartitionTests.cs @@ -11,7 +11,7 @@ namespace UnitTests; [TestCategory("BVT"), TestCategory("GrainDirectory")] public class GrainDirectoryPartitionTests { - private readonly GrainDirectoryPartition _target; + private readonly LocalGrainDirectoryPartition _target; private readonly MockSiloStatusOracle _siloStatusOracle; private static readonly SiloAddress LocalSiloAddress = SiloAddress.FromParsableString("127.0.0.1:11111@123"); private static readonly SiloAddress OtherSiloAddress = SiloAddress.FromParsableString("127.0.0.2:11111@456"); @@ -19,7 +19,7 @@ public class GrainDirectoryPartitionTests public GrainDirectoryPartitionTests() { _siloStatusOracle = new MockSiloStatusOracle(); - _target = new GrainDirectoryPartition( + _target = new LocalGrainDirectoryPartition( _siloStatusOracle, Options.Create(new GrainDirectoryOptions()), new LoggerFactory()); diff --git a/test/TesterInternal/LivenessTests/ConsistentRingProviderTests.cs b/test/TesterInternal/LivenessTests/ConsistentRingProviderTests.cs index d4c536251c..035063eead 100644 --- a/test/TesterInternal/LivenessTests/ConsistentRingProviderTests.cs +++ b/test/TesterInternal/LivenessTests/ConsistentRingProviderTests.cs @@ -1,44 +1,31 @@ +using System.Collections.Immutable; +using 
System.Net; using Microsoft.Extensions.Logging.Abstractions; using Orleans.Configuration; -using Orleans.Runtime; using Orleans.Runtime.ConsistentRing; using Orleans.Streams; +using TestExtensions; using Xunit; using Xunit.Abstractions; -using TestExtensions; -using System.Net; -using System.Collections.Immutable; namespace UnitTests.LivenessTests { - public class ConsistentRingProviderTests : IClassFixture + public class ConsistentRingProviderTests(ITestOutputHelper output) { - private readonly ITestOutputHelper output; - - public class Fixture - { - public Fixture() - { - } - } - - public ConsistentRingProviderTests(ITestOutputHelper output) - { - this.output = output; - } + private readonly ITestOutputHelper _output = output; [Fact, TestCategory("Functional"), TestCategory("Liveness"), TestCategory("Ring"), TestCategory("RingStandalone")] public void ConsistentRingProvider_Test1() { SiloAddress silo1 = SiloAddressUtils.NewLocalSiloAddress(0); ConsistentRingProvider ring = new ConsistentRingProvider(silo1, NullLoggerFactory.Instance, new FakeSiloStatusOracle()); - output.WriteLine("Silo1 range: {0}. The whole ring is: {1}", ring.GetMyRange(), ring.ToString()); + _output.WriteLine("Silo1 range: {0}. The whole ring is: {1}", ring.GetMyRange(), ring.ToString()); ring.AddServer(SiloAddressUtils.NewLocalSiloAddress(1)); - output.WriteLine("Silo1 range: {0}. The whole ring is: {1}", ring.GetMyRange(), ring.ToString()); + _output.WriteLine("Silo1 range: {0}. The whole ring is: {1}", ring.GetMyRange(), ring.ToString()); ring.AddServer(SiloAddressUtils.NewLocalSiloAddress(2)); - output.WriteLine("Silo1 range: {0}. The whole ring is: {1}", ring.GetMyRange(), ring.ToString()); + _output.WriteLine("Silo1 range: {0}. 
The whole ring is: {1}", ring.GetMyRange(), ring.ToString()); } [Fact, TestCategory("Functional"), TestCategory("Liveness"), TestCategory("Ring"), TestCategory("RingStandalone")] @@ -46,12 +33,12 @@ public void ConsistentRingProvider_Test2() { SiloAddress silo1 = SiloAddressUtils.NewLocalSiloAddress(0); VirtualBucketsRingProvider ring = new VirtualBucketsRingProvider(silo1, NullLoggerFactory.Instance, 30, new FakeSiloStatusOracle()); - output.WriteLine("\n\n*** Silo1 range: {0}.\n*** The whole ring with 1 silo is:\n{1}\n\n", ring.GetMyRange(), ring.ToString()); + _output.WriteLine("\n\n*** Silo1 range: {0}.\n*** The whole ring with 1 silo is:\n{1}\n\n", ring.GetMyRange(), ring.ToString()); for (int i = 1; i <= 10; i++) { ring.SiloStatusChangeNotification(SiloAddressUtils.NewLocalSiloAddress(i), SiloStatus.Active); - output.WriteLine("\n\n*** Silo1 range: {0}.\n*** The whole ring with {1} silos is:\n{2}\n\n", ring.GetMyRange(), i + 1, ring.ToString()); + _output.WriteLine("\n\n*** Silo1 range: {0}.\n*** The whole ring with {1} silos is:\n{2}\n\n", ring.GetMyRange(), i + 1, ring.ToString()); } } @@ -65,12 +52,12 @@ public void ConsistentRingProvider_Test3() Random random = new Random(); SiloAddress silo1 = SiloAddressUtils.NewLocalSiloAddress(random.Next(100000)); VirtualBucketsRingProvider ring = new VirtualBucketsRingProvider(silo1, NullLoggerFactory.Instance, 50, new FakeSiloStatusOracle()); - + for (int i = 1; i <= NUM_SILOS - 1; i++) { ring.SiloStatusChangeNotification(SiloAddressUtils.NewLocalSiloAddress(random.Next(100000)), SiloStatus.Active); } - + var siloRanges = ring.GetRanges(); var sortedSiloRanges = siloRanges.ToList(); sortedSiloRanges.Sort((t1, t2) => t1.Item2.RangePercentage().CompareTo(t2.Item2.RangePercentage())); @@ -79,7 +66,7 @@ public void ConsistentRingProvider_Test3() foreach (var siloRange in siloRanges) { List agentRanges = new List(); - for(int i=0; i < NUM_AGENTS; i++) + for (int i = 0; i < NUM_AGENTS; i++) { IRingRangeInternal 
agentRange = (IRingRangeInternal)RangeFactory.GetEquallyDividedSubRange(siloRange.Value, NUM_AGENTS, i); agentRanges.Add(agentRange); @@ -89,18 +76,18 @@ public void ConsistentRingProvider_Test3() Dictionary> queueHistogram = GetQueueHistogram(allAgentRanges, (int)NUM_QUEUES); string str = Utils.EnumerableToString(sortedSiloRanges, - tuple => string.Format("Silo {0} -> Range {1:0.000}%, {2} queues: {3}", + tuple => string.Format("Silo {0} -> Range {1:0.000}%, {2} queues: {3}", tuple.Item1, tuple.Item2.RangePercentage(), queueHistogram[tuple.Item1].Sum(), Utils.EnumerableToString(queueHistogram[tuple.Item1])), "\n"); - output.WriteLine("\n\n*** The whole ring with {0} silos is:\n{1}\n\n", NUM_SILOS, str); + _output.WriteLine("\n\n*** The whole ring with {0} silos is:\n{1}\n\n", NUM_SILOS, str); - output.WriteLine("Total number of queues is: {0}", queueHistogram.Values.Sum(list => list.Sum())); - output.WriteLine("Expected average range per silo is: {0:0.00}%, expected #queues per silo is: {1:0.00}, expected #queues per agent is: {2:0.000}.", + _output.WriteLine("Total number of queues is: {0}", queueHistogram.Values.Sum(list => list.Sum())); + _output.WriteLine("Expected average range per silo is: {0:0.00}%, expected #queues per silo is: {1:0.00}, expected #queues per agent is: {2:0.000}.", 100.0 / NUM_SILOS, NUM_QUEUES / NUM_SILOS, NUM_QUEUES / (NUM_SILOS * NUM_AGENTS)); - output.WriteLine("Min #queues per silo is: {0}, Max #queues per silo is: {1}.", + _output.WriteLine("Min #queues per silo is: {0}, Max #queues per silo is: {1}.", queueHistogram.Values.Min(list => list.Sum()), queueHistogram.Values.Max(list => list.Sum())); } @@ -182,7 +169,7 @@ public bool TryGetSiloName(SiloAddress siloAddress, out string siloName) } public bool UnSubscribeFromSiloStatusEvents(ISiloStatusListener observer) => _subscribers.Remove(observer); - public ImmutableArray GetActiveSilos() => [.. 
GetApproximateSiloStatuses(onlyActive: true).Keys]; + public ImmutableArray GetActiveSilos() => [.. GetApproximateSiloStatuses(onlyActive: true).Keys]; } } } diff --git a/test/Transactions/Orleans.Transactions.Tests/Hosting/TransactionTestExtensions.cs b/test/Transactions/Orleans.Transactions.Tests/Hosting/TransactionTestExtensions.cs index 92cda04554..bfc89aaf89 100644 --- a/test/Transactions/Orleans.Transactions.Tests/Hosting/TransactionTestExtensions.cs +++ b/test/Transactions/Orleans.Transactions.Tests/Hosting/TransactionTestExtensions.cs @@ -7,18 +7,18 @@ public static class TransactionTestExtensions { public static ISiloBuilder ConfigureTracingForTransactionTests(this ISiloBuilder clientBuilder) { - clientBuilder.Services.ConfiguretracingForTransactionTests(); + clientBuilder.Services.ConfigureTracingForTransactionTests(); return clientBuilder; } public static IClientBuilder ConfigureTracingForTransactionTests(this IClientBuilder clientBuilder) { - clientBuilder.Services.ConfiguretracingForTransactionTests(); + clientBuilder.Services.ConfigureTracingForTransactionTests(); return clientBuilder; } // control the tracing of the various components of the transaction mechanism - public static IServiceCollection ConfiguretracingForTransactionTests(this IServiceCollection services) + public static IServiceCollection ConfigureTracingForTransactionTests(this IServiceCollection services) { return services.AddLogging(loggingBuilder => {