Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metrics with correct names for Resource Monitoring #5341

Merged
merged 39 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
f959973
Rework metrics
evgenyfedorov2 Jul 25, 2024
831e502
Use nameof()
evgenyfedorov2 Jul 30, 2024
c8367dc
update
evgenyfedorov2 Jul 31, 2024
06bd487
Merge branch 'main' into users/evgenyfedorov2/rework_metrics_rm
evgenyfedorov2 Jul 31, 2024
9d852f8
Update
evgenyfedorov2 Aug 1, 2024
a498ed3
small optimization
evgenyfedorov2 Aug 1, 2024
a644626
fix test
evgenyfedorov2 Aug 2, 2024
bd3d68c
Merge branch 'main' into users/evgenyfedorov2/rework_metrics_rm
evgenyfedorov2 Aug 5, 2024
728ac12
update comment
evgenyfedorov2 Aug 5, 2024
fd5bb31
Fix more comments
evgenyfedorov2 Aug 5, 2024
b9b51a1
Update comments
evgenyfedorov2 Aug 5, 2024
b92463a
Move memoryUsedPercentage calculation to Calculator
evgenyfedorov2 Aug 5, 2024
b346c5d
Rename variables
evgenyfedorov2 Aug 5, 2024
b9bec77
Rename internal methods
evgenyfedorov2 Aug 5, 2024
e5ad793
Rename private variables
evgenyfedorov2 Aug 5, 2024
c8b2960
Rename private variables
evgenyfedorov2 Aug 5, 2024
4ed789d
Rename private var
evgenyfedorov2 Aug 5, 2024
fa85475
revert
evgenyfedorov2 Aug 5, 2024
3c0698c
WIP
evgenyfedorov2 Aug 5, 2024
9ca2c35
Update
evgenyfedorov2 Aug 5, 2024
5d54584
.
evgenyfedorov2 Aug 5, 2024
80b0496
Revert "."
evgenyfedorov2 Aug 6, 2024
d13cbaf
working version
evgenyfedorov2 Aug 6, 2024
cf8cd73
Update
evgenyfedorov2 Aug 6, 2024
4f4ffd6
fix tests
evgenyfedorov2 Aug 6, 2024
44d935f
Improve tests
evgenyfedorov2 Aug 6, 2024
c582bb2
remove unused metric
evgenyfedorov2 Aug 6, 2024
bbe1d94
extract test code to methods
evgenyfedorov2 Aug 6, 2024
67843fb
Merge branch 'main' into users/evgenyfedorov2/rework_metrics_rm
evgenyfedorov2 Aug 6, 2024
1a1efc2
Add UseContainerMetricNames switch
evgenyfedorov2 Aug 8, 2024
e46ed8c
Linux snapshot provider to report CPU utilization relative to request…
evgenyfedorov2 Aug 8, 2024
a60aca4
Merge branch 'main' into users/evgenyfedorov2/rework_metrics_rm
evgenyfedorov2 Aug 9, 2024
5e56f01
Emit all metrics
evgenyfedorov2 Aug 9, 2024
f5423a7
.
evgenyfedorov2 Aug 9, 2024
55350e4
Update
evgenyfedorov2 Aug 9, 2024
19ace1c
Use explicit types instead of var
evgenyfedorov2 Aug 12, 2024
932727b
Sort const in the order
evgenyfedorov2 Aug 12, 2024
64465d1
Change xml comments
evgenyfedorov2 Aug 12, 2024
bc5cd17
Update xml comments
evgenyfedorov2 Aug 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public static ResourceUtilization CalculateUtilization(in Snapshot first, in Sna
long runtimeTickDelta = second.TotalTimeSinceStart.Ticks - first.TotalTimeSinceStart.Ticks;

// Compute the total number of ticks available on the machine during that interval
double totalSystemTicks = runtimeTickDelta * systemResources.GuaranteedCpuUnits;
evgenyfedorov2 marked this conversation as resolved.
Show resolved Hide resolved
double totalSystemTicks = runtimeTickDelta;

// fudge to avoid divide by zero
if (totalSystemTicks <= 0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ private static bool TryGetCpuUnitsFromCgroups(IFileSystem fileSystem, out float
/// <summary>
/// In cgroup v1, CPU shares are used to determine the CPU allocation.
/// In cgroup v2, the CPU weight is used to determine the CPU allocation.
/// To calculete CPU request in cgroup v2 we need to read the CPU weight and convert it to CPU shares.
/// To calculate CPU request in cgroup v2 we need to read the CPU weight and convert it to CPU shares.
/// But for cgroup v1 we can read the CPU shares directly from the file.
/// 1024 equals 1 CPU core.
/// In cgroup v1 on some systems the location of the CPU shares file is different.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,13 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
private readonly object _cpuLocker = new();
private readonly object _memoryLocker = new();
private readonly ILinuxUtilizationParser _parser;
private readonly ulong _totalMemoryInBytes;
private readonly ulong _memoryLimit;
private readonly TimeSpan _cpuRefreshInterval;
private readonly TimeSpan _memoryRefreshInterval;
private readonly TimeProvider _timeProvider;
private readonly double _scale;
private readonly double _scaleForTrackerApi;
private readonly double _scaleRelativeToCpuLimit;
private readonly double _scaleRelativeToCpuRequest;
private readonly double _scaleRelativeToCpuLimitForTrackerApi;

private DateTimeOffset _refreshAfterCpu;
private DateTimeOffset _refreshAfterMemory;
Expand All @@ -42,68 +43,69 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
_memoryRefreshInterval = options.Value.MemoryConsumptionRefreshInterval;
_refreshAfterCpu = now;
_refreshAfterMemory = now;
_totalMemoryInBytes = _parser.GetAvailableMemoryInBytes();
_memoryLimit = _parser.GetAvailableMemoryInBytes();
_previousHostCpuTime = _parser.GetHostCpuUsageInNanoseconds();
_previousCgroupCpuTime = _parser.GetCgroupCpuUsageInNanoseconds();

var hostMemory = _parser.GetHostAvailableMemory();
var hostCpus = _parser.GetHostCpuCount();
var availableCpus = _parser.GetCgroupLimitedCpus();
var cpuGuaranteedRequest = _parser.GetCgroupRequestCpu();
_scale = hostCpus / availableCpus;
_scaleForTrackerApi = hostCpus / availableCpus;
var cpuLimit = _parser.GetCgroupLimitedCpus();
evgenyfedorov2 marked this conversation as resolved.
Show resolved Hide resolved
var cpuRequest = _parser.GetCgroupRequestCpu();
_scaleRelativeToCpuLimit = hostCpus / cpuLimit;
_scaleRelativeToCpuRequest = hostCpus / cpuRequest;
_scaleRelativeToCpuLimitForTrackerApi = hostCpus / cpuLimit * cpuRequest;

#pragma warning disable CA2000 // Dispose objects before losing scope
// We don't dispose the meter because IMeterFactory handles that
// An issue on analyzer side: https://github.com/dotnet/roslyn-analyzers/issues/6912
// Related documentation: https://github.com/dotnet/docs/pull/37170
var meter = meterFactory.Create("Microsoft.Extensions.Diagnostics.ResourceMonitoring");
var meter = meterFactory.Create(nameof(Microsoft.Extensions.Diagnostics.ResourceMonitoring));
#pragma warning restore CA2000 // Dispose objects before losing scope

_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.CpuUtilization, observeValue: CpuUtilization, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.MemoryUtilization, observeValue: MemoryUtilization, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuLimit, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: MemoryUtilization, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");

// cpuGuaranteedRequest is a CPU request for pod, for host its 1 core
// available CPUs is a CPU limit for a pod or for a host.
// _totalMemoryInBytes - Resource Memory Limit (in k8s terms)
// _totalMemoryInBytes - To keep the contract, this parameter will get the Host available memory
Resources = new SystemResources(cpuGuaranteedRequest, availableCpus, _totalMemoryInBytes, _totalMemoryInBytes);
// Old metrics, obsolete for this class, but used by WindowsSnapshotProvider. Keeping them for backward compatibility:
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuLimit, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessMemoryUtilization, observeValue: MemoryUtilization, unit: "1");

// cpuRequest is a CPU request (aka guaranteed number of CPU units) for a pod; for a host it is 1 core.
// cpuLimit is a CPU limit (aka max CPU units available) for a pod or for a host.
// _memoryLimit - Resource Memory Limit (in k8s terms)
// _memoryLimit - To keep the contract, this parameter will get the Host available memory
Resources = new SystemResources(cpuRequest, cpuLimit, _memoryLimit, _memoryLimit);
}

public double CpuUtilization()
{
var now = _timeProvider.GetUtcNow();
bool needUpdate = false;

lock (_cpuLocker)
{
if (now >= _refreshAfterCpu)
if (now < _refreshAfterCpu)
{
needUpdate = true;
return _cpuPercentage;
}
}

if (needUpdate)
{
var hostCpuTime = _parser.GetHostCpuUsageInNanoseconds();
var cgroupCpuTime = _parser.GetCgroupCpuUsageInNanoseconds();
var hostCpuTime = _parser.GetHostCpuUsageInNanoseconds();
var cgroupCpuTime = _parser.GetCgroupCpuUsageInNanoseconds();

lock (_cpuLocker)
lock (_cpuLocker)
{
if (now >= _refreshAfterCpu)
{
if (now >= _refreshAfterCpu)
var deltaHost = hostCpuTime - _previousHostCpuTime;
var deltaCgroup = cgroupCpuTime - _previousCgroupCpuTime;

if (deltaHost > 0 && deltaCgroup > 0)
{
var deltaHost = hostCpuTime - _previousHostCpuTime;
var deltaCgroup = cgroupCpuTime - _previousCgroupCpuTime;

if (deltaHost > 0 && deltaCgroup > 0)
{
var percentage = Math.Min(One, deltaCgroup / deltaHost * _scale);

_cpuPercentage = percentage;
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
_previousCgroupCpuTime = cgroupCpuTime;
_previousHostCpuTime = hostCpuTime;
}
var percentage = Math.Min(One, deltaCgroup / deltaHost);

_cpuPercentage = percentage;
_refreshAfterCpu = now.Add(_cpuRefreshInterval);
_previousCgroupCpuTime = cgroupCpuTime;
_previousHostCpuTime = hostCpuTime;
}
}
}
Expand All @@ -114,29 +116,25 @@ public double CpuUtilization()
public double MemoryUtilization()
{
var now = _timeProvider.GetUtcNow();
bool needUpdate = false;

lock (_memoryLocker)
{
if (now >= _refreshAfterMemory)
if (now < _refreshAfterMemory)
{
needUpdate = true;
return _memoryPercentage;
}
}

if (needUpdate)
{
var memoryUsed = _parser.GetMemoryUsageInBytes();
var memoryUsed = _parser.GetMemoryUsageInBytes();

lock (_memoryLocker)
lock (_memoryLocker)
{
if (now >= _refreshAfterMemory)
{
if (now >= _refreshAfterMemory)
{
var memoryPercentage = Math.Min(One, (double)memoryUsed / _totalMemoryInBytes);
var memoryPercentage = Math.Min(One, (double)memoryUsed / _memoryLimit);

_memoryPercentage = memoryPercentage;
_refreshAfterMemory = now.Add(_memoryRefreshInterval);
}
_memoryPercentage = memoryPercentage;
_refreshAfterMemory = now.Add(_memoryRefreshInterval);
}
}

Expand All @@ -157,7 +155,7 @@ public Snapshot GetSnapshot()
return new Snapshot(
totalTimeSinceStart: TimeSpan.FromTicks(hostTime / Hundred),
kernelTimeSinceStart: TimeSpan.Zero,
userTimeSinceStart: TimeSpan.FromTicks((long)(cgroupTime / Hundred * _scaleForTrackerApi)),
userTimeSinceStart: TimeSpan.FromTicks((long)(cgroupTime / Hundred * _scaleRelativeToCpuLimitForTrackerApi)),
memoryUsageInBytes: memoryUsed);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ private static ResourceMonitorBuilder AddWindowsProvider(this ResourceMonitorBui
builder.PickWindowsSnapshotProvider();

_ = builder.Services
.AddActivatedSingleton<WindowsCounters>();
.AddActivatedSingleton<WindowsNetworkMetrics>();

_ = builder.Services
.AddActivatedSingleton<TcpTableInfo>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,16 @@ public readonly struct ResourceUtilization
/// <param name="systemResources">CPU and memory limits.</param>
public ResourceUtilization(double cpuUsedPercentage, ulong memoryUsedInBytes, SystemResources systemResources)
{
CpuUsedPercentage = Throw.IfLessThan(cpuUsedPercentage, 0.0);
var guaranteedCpuUnits = systemResources.GuaranteedCpuUnits;
if (guaranteedCpuUnits <= 0)
{
guaranteedCpuUnits = 1;
}

CpuUsedPercentage = Throw.IfLessThan(cpuUsedPercentage / guaranteedCpuUnits, 0.0);
MemoryUsedInBytes = Throw.IfLessThan(memoryUsedInBytes, 0);
SystemResources = systemResources;
MemoryUsedPercentage = Math.Min(Hundred, (double)MemoryUsedInBytes / SystemResources.GuaranteedMemoryInBytes * Hundred);
MemoryUsedPercentage = Math.Min(Hundred, (double)MemoryUsedInBytes / systemResources.GuaranteedMemoryInBytes * Hundred);
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;

namespace Microsoft.Extensions.Diagnostics.ResourceMonitoring;

/// <summary>
Expand All @@ -13,18 +15,50 @@ namespace Microsoft.Extensions.Diagnostics.ResourceMonitoring;
internal static class ResourceUtilizationInstruments
evgenyfedorov2 marked this conversation as resolved.
Show resolved Hide resolved
{
/// <summary>
/// Gets the CPU consumption of the running application in range <c>[0, 1]</c>.
/// Gets the CPU consumption share of the running process in range <c>[0, 1]</c>.
RussKie marked this conversation as resolved.
Show resolved Hide resolved
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableGauge{T}"/>.
/// </remarks>
public const string ProcessCpuUtilization = "process.cpu.utilization";

/// <summary>
/// Gets the memory consumption share of the running process in range <c>[0, 1]</c>.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableGauge{T}"/>.
/// </remarks>
public const string ProcessMemoryUtilization = "dotnet.process.memory.virtual.utilization";

/// <summary>
/// Gets the CPU limit consumption of all processes running inside a container or control group in range <c>[0, 1]</c>.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableGauge{T}"/>.
/// </remarks>
public const string ContainerCpuLimitUtilization = "container.cpu.limit.utilization";

/// <summary>
/// Gets the CPU request consumption of all processes running inside a container or control group in range <c>[0, 1]</c>.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableGauge{T}"/>.
/// </remarks>
public const string ContainerCpuRequestUtilization = "container.cpu.request.utilization";

/// <summary>
/// Gets the memory limit consumption of all processes running inside a container or control group in range <c>[0, 1]</c>.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableGauge{T}"/>.
/// </remarks>
public const string CpuUtilization = "process.cpu.utilization";
public const string ContainerMemoryLimitUtilization = "container.memory.limit.utilization";
evgenyfedorov2 marked this conversation as resolved.
Show resolved Hide resolved

/// <summary>
/// Gets the memory consumption of the running application in range <c>[0, 1]</c>.
/// Gets the memory request consumption of all processes running inside a container or control group in range <c>[0, 1]</c>.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableGauge{T}"/>.
/// </remarks>
public const string MemoryUtilization = "dotnet.process.memory.virtual.utilization";
public const string ContainerMemoryRequestUtilization = "container.memory.request.utilization";
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ internal sealed class WindowsContainerSnapshotProvider : ISnapshotProvider
private readonly object _memoryLocker = new();
private readonly TimeProvider _timeProvider;
private readonly IProcessInfo _processInfo;
private readonly double _totalMemory;
private readonly double _cpuUnits;
private readonly double _memoryLimit;
private readonly double _cpuLimit;
private readonly TimeSpan _cpuRefreshInterval;
private readonly TimeSpan _memoryRefreshInterval;

Expand Down Expand Up @@ -77,15 +77,16 @@ internal WindowsContainerSnapshotProvider(

_timeProvider = timeProvider;

// initialize system resources information
using var jobHandle = _createJobHandleObject();

_cpuUnits = GetGuaranteedCpuUnits(jobHandle, systemInfo);
var memory = GetMemoryLimits(jobHandle);
var memoryLimitLong = GetMemoryLimit(jobHandle);
_memoryLimit = memoryLimitLong;
_cpuLimit = GetCpuLimit(jobHandle, systemInfo);

Resources = new SystemResources(_cpuUnits, _cpuUnits, memory, memory);
// CPU request (aka guaranteed CPU units) is not supported on Windows, so we set it to the same value as CPU limit (aka maximum CPU units).
// Memory request (aka guaranteed memory) is not supported on Windows, so we set it to the same value as memory limit (aka maximum memory).
Resources = new SystemResources(_cpuLimit, _cpuLimit, memoryLimitLong, memoryLimitLong);

_totalMemory = memory;
var basicAccountingInfo = jobHandle.GetBasicAccountingInfo();
_oldCpuUsageTicks = basicAccountingInfo.TotalKernelTime + basicAccountingInfo.TotalUserTime;
_oldCpuTimeTicks = _timeProvider.GetUtcNow().Ticks;
Expand All @@ -98,11 +99,15 @@ internal WindowsContainerSnapshotProvider(
// We don't dispose the meter because IMeterFactory handles that
// An issue on analyzer side: https://github.com/dotnet/roslyn-analyzers/issues/6912
// Related documentation: https://github.com/dotnet/docs/pull/37170
var meter = meterFactory.Create("Microsoft.Extensions.Diagnostics.ResourceMonitoring");
var meter = meterFactory.Create(nameof(Microsoft.Extensions.Diagnostics.ResourceMonitoring));
#pragma warning restore CA2000 // Dispose objects before losing scope

_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.CpuUtilization, observeValue: CpuPercentage);
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.MemoryUtilization, observeValue: MemoryPercentage);
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: CpuPercentage);
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: MemoryPercentage);

// Old metrics, obsolete for this class, but used by WindowsSnapshotProvider. Keeping them for backward compatibility:
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: CpuPercentage);
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessMemoryUtilization, observeValue: MemoryPercentage);
}

public Snapshot GetSnapshot()
Expand All @@ -119,7 +124,7 @@ public Snapshot GetSnapshot()
GetMemoryUsage());
}

private static double GetGuaranteedCpuUnits(IJobHandle jobHandle, ISystemInfo systemInfo)
private static double GetCpuLimit(IJobHandle jobHandle, ISystemInfo systemInfo)
{
// Note: This function converts the CpuRate from CPU cycles to CPU units; it also scales
// the CPU units with the number of processors (cores) available in the system.
Expand Down Expand Up @@ -149,7 +154,7 @@ private static double GetGuaranteedCpuUnits(IJobHandle jobHandle, ISystemInfo sy
/// Gets memory limit of the system.
/// </summary>
/// <returns>Memory limit allocated to the system in bytes.</returns>
private ulong GetMemoryLimits(IJobHandle jobHandle)
private ulong GetMemoryLimit(IJobHandle jobHandle)
{
var memoryLimitInBytes = jobHandle.GetExtendedLimitInfo().JobMemoryLimit.ToUInt64();

Expand Down Expand Up @@ -188,7 +193,7 @@ private double MemoryPercentage()
{
if (now >= _refreshAfterMemory)
{
_memoryPercentage = Math.Min(Hundred, currentMemoryUsage / _totalMemory * Hundred); // Don't change calculation order, otherwise we loose some precision
_memoryPercentage = Math.Min(Hundred, currentMemoryUsage / _memoryLimit * Hundred); // Don't change calculation order, otherwise we loose some precision
_refreshAfterMemory = now.Add(_memoryRefreshInterval);
}

Expand Down Expand Up @@ -217,7 +222,7 @@ private double CpuPercentage()
if (now >= _refreshAfterCpu)
{
var usageTickDelta = currentCpuTicks - _oldCpuUsageTicks;
var timeTickDelta = (now.Ticks - _oldCpuTimeTicks) * _cpuUnits;
var timeTickDelta = (now.Ticks - _oldCpuTimeTicks) * _cpuLimit;
if (usageTickDelta > 0 && timeTickDelta > 0)
{
_oldCpuUsageTicks = currentCpuTicks;
Expand Down
Loading