Skip to content

Commit

Permalink
Add a per-core percentile utilization metric
Browse files Browse the repository at this point in the history
Summary:
We want to replace the current host average cpu utilization with a percentile that works for a variety of hardware and network RX configurations.
The idea is that RX/RPS cores may be utilized differently from the remaining cores. This change tracks per-core utilization and computes a configurable percentile across those per-core values.

Differential Revision: D49079695

fbshipit-source-id: ad97227fb9cedfd66eb487e78b7eb378aa27cac0
  • Loading branch information
meleshuk authored and facebook-github-bot committed Sep 16, 2023
1 parent be18754 commit f0500bd
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 4 deletions.
78 changes: 74 additions & 4 deletions proxygen/lib/stats/ResourceData.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#pragma once

#include <chrono>
#include <folly/logging/xlog.h>
#include <glog/stl_logging.h>
#include <stdint.h>
#include <vector>

Expand Down Expand Up @@ -49,6 +51,43 @@ struct ResourceData : public PeriodicStatsDataBase {
return getPctFromRatio(getCpuRatioUtil(normalized));
}

/**
 * Returns the CPU utilization ratio for the last update interval, aggregated
 * across cores by taking the configured percentile of the per-core ratios
 * (the value stored by setCpuStats()).
 * Utilized = non-idle and non-iowait
 *
 * With `normalized` left at true the result is capped at 1.0. Cgroup CPU
 * utilization might be significantly off during peak utilization, i.e. way
 * above 100% due to CPU throttling; pass `normalized` as false to get the raw,
 * uncapped ratio (values above 1.0), for example when aggregating values over
 * some window.
 */
[[nodiscard]] double getCpuUtilPercentileRatio(bool normalized = true) const {
  if (!normalized) {
    // Caller asked for the raw value, which may exceed 1.0.
    return cpuRatioUtilPercentile_;
  }
  return std::min(1.0, cpuRatioUtilPercentile_);
}

/**
 * Percentage form of getCpuUtilPercentileRatio(): the configured percentile
 * of the per-core CPU utilization over the last update interval, converted
 * with getPctFromRatio().
 * Utilized = non-idle and non-iowait
 *
 * `normalized` is forwarded unchanged to getCpuUtilPercentileRatio(). Cgroup
 * CPU utilization might be significantly off during peak utilization, i.e.
 * way above 100% due to CPU throttling; pass `normalized` as false if you
 * want to see values above 100, for example when aggregating values over
 * some window.
 */
[[nodiscard]] double getCpuUtilPercentile(bool normalized = true) const {
  const double ratio = getCpuUtilPercentileRatio(normalized);
  return getPctFromRatio(ratio);
}

/**
 * Returns the percentile (on the 0-100 scale used by computePercentile(),
 * e.g. 80 for p80) that was most recently supplied to setCpuStats() and is
 * used to aggregate the per-core utilization reported by
 * getCpuUtilPercentile() / getCpuUtilPercentileRatio().
 */
[[nodiscard]] double getCpuUtilPercentileConfigured() const {
return cpuUtilPercentileConfigured_;
}

/**
* Gets the average soft cpu ratio utilization (0, 1.0 over the last update
* interval).
Expand Down Expand Up @@ -289,11 +328,39 @@ struct ResourceData : public PeriodicStatsDataBase {
pressureUdpMemLimit_ != 0 && minUdpMemLimit_ != 0;
}

void setCpuStats(double cpuRatioUtil,
double cpuSoftIrqRatioUtil,
/**
 * Selects the value at the requested percentile from an ascending-sorted
 * sample.
 *
 * The element index is floor(size * percentile / 100), capped at the last
 * element. Each of the N values therefore covers a percentile band that is
 * inclusive on its lower edge (value i is returned for percentiles in
 * [100*i/N, 100*(i+1)/N)), and p100 resolves to the largest value.
 *
 * @param sortedValues  Samples already sorted in ascending order by the
 *                      caller. Not modified.
 * @param percentile    Requested percentile, expected within [0, 100]. A
 *                      value outside that range (or NaN) is logged as an
 *                      error and clamped: above 100 is treated as 100,
 *                      anything else as 0.
 * @return The selected sample, or 0.0 when sortedValues is empty.
 */
static double computePercentile(const std::vector<double>& sortedValues,
                                double percentile) {
  // Validate before doing anything else. The check is written in negated
  // form so that NaN is rejected as well. Converting a negative or NaN double
  // to size_t below would be undefined behaviour.
  if (!(percentile >= 0.0 && percentile <= 100.0)) {
    XLOG(ERR) << "Invalid percentile " << percentile;
    percentile = (percentile > 100.0) ? 100.0 : 0.0;
  }

  const size_t size = sortedValues.size();
  if (size == 0) {
    return 0.0;
  }

  // p100 would index one past the end, so cap at the largest value.
  const size_t index = std::min(
      static_cast<size_t>(static_cast<double>(size) * percentile / 100.0),
      size - 1);
  XLOG(DBG4) << "index=" << index << ", percentile=" << percentile
             << ", values=" << sortedValues;

  return sortedValues[index];
}

/**
 * Stores the CPU statistics sampled for the latest update interval.
 *
 * @param cpuUsageRatio                Overall CPU utilization ratio.
 * @param cpuCoreUsageRatios           Per-core utilization ratios. Ownership
 *                                     is taken; the values are sorted
 *                                     ascending before being stored.
 * @param cpuUtilPercentileConfigured  Percentile (0-100) used to aggregate
 *                                     the per-core ratios into the value
 *                                     reported by getCpuUtilPercentileRatio().
 * @param cpuSoftIrqUsageRatio         Overall soft-IRQ CPU utilization ratio.
 * @param softIrqCpuCoreRatioUtils     Per-core soft-IRQ utilization ratios.
 *                                     Ownership is taken.
 */
void setCpuStats(double cpuUsageRatio,
                 std::vector<double>&& cpuCoreUsageRatios,
                 double cpuUtilPercentileConfigured,
                 double cpuSoftIrqUsageRatio,
                 std::vector<double>&& softIrqCpuCoreRatioUtils) {
  cpuRatioUtil_ = cpuUsageRatio;

  // computePercentile() expects ascending order, so sort once on ingestion.
  cpuCoreUsageRatios_ = std::move(cpuCoreUsageRatios);
  std::sort(cpuCoreUsageRatios_.begin(), cpuCoreUsageRatios_.end());
  cpuUtilPercentileConfigured_ = cpuUtilPercentileConfigured;
  cpuRatioUtilPercentile_ =
      computePercentile(cpuCoreUsageRatios_, cpuUtilPercentileConfigured_);

  cpuSoftIrqRatioUtil_ = cpuSoftIrqUsageRatio;
  // A named rvalue-reference parameter is an lvalue; move explicitly so the
  // vector is not copied.
  softIrqCpuCoreRatioUtils_ = std::move(softIrqCpuCoreRatioUtils);
}

Expand Down Expand Up @@ -360,6 +427,9 @@ struct ResourceData : public PeriodicStatsDataBase {

// Resource utilization metrics
double cpuRatioUtil_{0};
std::vector<double> cpuCoreUsageRatios_;
double cpuRatioUtilPercentile_{0};
double cpuUtilPercentileConfigured_{61};
double cpuSoftIrqRatioUtil_{0};
std::vector<double> softIrqCpuCoreRatioUtils_;
uint64_t usedMemBytes_{0};
Expand Down
8 changes: 8 additions & 0 deletions proxygen/lib/stats/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ proxygen_add_test(TARGET PeriodicStatsTest
testmain
)

# Unit tests for ResourceData, built from ResourceDataTest.cpp.
proxygen_add_test(TARGET ResourceDataTest
SOURCES
ResourceDataTest.cpp
DEPENDS
proxygen
testmain
)

proxygen_add_test(TARGET ResourceStatsTest
SOURCES
ResourceStatsTest.cpp
Expand Down
2 changes: 2 additions & 0 deletions proxygen/lib/stats/test/MockResources.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class MockResources : public Resources {

data.setCpuStats(
params.cpuUtilRatio,
std::vector<double>(params.numCpuCores, params.cpuUtilRatio),
80,
params.cpuSoftIrqUtilRatio,
std::vector<double>(params.numCpuCores, params.cpuSoftIrqUtilRatio));
uint64_t totalMemBytes = 100;
Expand Down
35 changes: 35 additions & 0 deletions proxygen/lib/stats/test/ResourceDataTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "proxygen/lib/stats/ResourceData.h"
#include <folly/portability/GTest.h>

using namespace ::testing;
using namespace proxygen;

// Fixture for the ResourceData tests; it adds no state or setup of its own.
class ResourceDataTest : public ::testing::Test {};

// Verifies the percentile -> element mapping of
// ResourceData::computePercentile(). With five sorted samples each value
// covers a 20-point percentile band that starts on its lower edge, and p100
// maps to the largest sample. Also covers empty input and a percentile above
// 100.
TEST_F(ResourceDataTest, Percentiles) {
  struct Case {
    double percentile;
    double expected;
  };

  std::vector<double> values{0.1, 0.5, 0.6, 0.7, 0.8};
  const Case cases[] = {
      {0, 0.1},
      {19, 0.1},
      {20, 0.5},
      {21, 0.5},
      {39, 0.5},
      {40, 0.6},
      {41, 0.6},
      {59, 0.6},
      {60, 0.7},
      {61, 0.7},
      {79, 0.7},
      {80, 0.8},
      {81, 0.8},
      {90, 0.8},
      {99, 0.8},
      {100, 0.8},
      // Above 100 the index is capped at the last element.
      {150, 0.8},
  };
  for (const auto& c : cases) {
    EXPECT_EQ(ResourceData::computePercentile(values, c.percentile),
              c.expected)
        << "percentile=" << c.percentile;
  }

  // Empty input yields 0.0 regardless of the percentile.
  std::vector<double> empty;
  EXPECT_EQ(ResourceData::computePercentile(empty, 50), 0.0);
}

0 comments on commit f0500bd

Please sign in to comment.