From 40aeeef3630b3a44e87e3e0c5baf6b651380f561 Mon Sep 17 00:00:00 2001 From: Alex K <8418476+fearful-symmetry@users.noreply.github.com> Date: Fri, 18 Dec 2020 09:25:13 -0800 Subject: [PATCH] dynamically generate CPU counts for metrics (#23154) * dynamically generate CPU counts for metrics * dynamic-numcpu * fix exported vars * clean up CPU code * fix CPU count in system/cpu * update load * add changelog * fix tests --- CHANGELOG-developer.next.asciidoc | 1 + libbeat/cmd/instance/metrics/metrics.go | 2 +- libbeat/metric/system/cpu/cpu.go | 23 ++++++++----------- libbeat/metric/system/cpu/cpu_test.go | 23 ++++++++++++------- .../metric/system/diskio/diskstat_linux.go | 6 ++--- libbeat/metric/system/process/process.go | 5 +--- libbeat/metric/system/process/process_test.go | 14 +++++++---- metricbeat/module/system/cpu/data.go | 2 +- metricbeat/module/system/load/load.go | 4 +++- 9 files changed, 44 insertions(+), 36 deletions(-) diff --git a/CHANGELOG-developer.next.asciidoc b/CHANGELOG-developer.next.asciidoc index edec465f854..239bfa7d395 100644 --- a/CHANGELOG-developer.next.asciidoc +++ b/CHANGELOG-developer.next.asciidoc @@ -52,6 +52,7 @@ The list below covers the major changes between 7.0.0-rc2 and master only. - Replace `ACKCount`, `ACKEvents`, and `ACKLastEvent` callbacks with `ACKHandler` and interface in `beat.ClientConfig`. {pull}19632[19632] - Remove global ACK handler support via `SetACKHandler` from publisher pipeline. {pull}19632[19632] - Make implementing `Close` required for `reader.Reader` interfaces. {pull}20455[20455] +- Remove `NumCPU` as clients should update the CPU count on the fly in case of config changes in a VM. {pull}23154[23154] ==== Bugfixes diff --git a/libbeat/cmd/instance/metrics/metrics.go b/libbeat/cmd/instance/metrics/metrics.go index a1fd1f56783..425952cc7c2 100644 --- a/libbeat/cmd/instance/metrics/metrics.go +++ b/libbeat/cmd/instance/metrics/metrics.go @@ -264,7 +264,7 @@ func reportSystemCPUUsage(_ monitoring.Mode, V monitoring.Visitor) { V.OnRegistryStart() defer V.OnRegistryFinished() - monitoring.ReportInt(V, "cores", int64(process.NumCPU)) + monitoring.ReportInt(V, "cores", int64(runtime.NumCPU())) } func reportRuntime(_ monitoring.Mode, V monitoring.Visitor) { diff --git a/libbeat/metric/system/cpu/cpu.go b/libbeat/metric/system/cpu/cpu.go index abf274ff066..c5f0695c2af 100644 --- a/libbeat/metric/system/cpu/cpu.go +++ b/libbeat/metric/system/cpu/cpu.go @@ -26,12 +26,6 @@ import ( sigar "github.com/elastic/gosigar" ) -var ( - // NumCores is the number of CPU cores in the system. Changes to operating - // system CPU allocation after process startup are not reflected. - NumCores = runtime.NumCPU() -) - // CPU Monitor // Monitor is used to monitor the overall CPU usage of the system. @@ -83,16 +77,16 @@ type Metrics struct { } // NormalizedPercentages returns CPU percentage usage information that is -// normalized by the number of CPU cores (NumCores). The values will range from +// normalized by the number of CPU cores. The values will range from // 0 to 100%. func (m *Metrics) NormalizedPercentages() Percentages { return cpuPercentages(m.previousSample, m.currentSample, 1) } // Percentages returns CPU percentage usage information. The values range from -// 0 to 100% * NumCores. +// 0 to 100% * NumCPU. func (m *Metrics) Percentages() Percentages { - return cpuPercentages(m.previousSample, m.currentSample, NumCores) + return cpuPercentages(m.previousSample, m.currentSample, runtime.NumCPU()) } // cpuPercentages calculates the amount of CPU time used between the two given @@ -215,7 +209,7 @@ type LoadAverages struct { } // Averages return the CPU load averages. These values should range from -// 0 to NumCores. +// 0 to NumCPU. func (m *LoadMetrics) Averages() LoadAverages { return LoadAverages{ OneMinute: common.Round(m.sample.One, common.DefaultDecimalPlacesCount), @@ -224,12 +218,13 @@ func (m *LoadMetrics) Averages() LoadAverages { } } -// NormalizedAverages return the CPU load averages normalized by the NumCores. +// NormalizedAverages return the CPU load averages normalized by the NumCPU. // These values should range from 0 to 1. func (m *LoadMetrics) NormalizedAverages() LoadAverages { + cpus := runtime.NumCPU() return LoadAverages{ - OneMinute: common.Round(m.sample.One/float64(NumCores), common.DefaultDecimalPlacesCount), - FiveMinute: common.Round(m.sample.Five/float64(NumCores), common.DefaultDecimalPlacesCount), - FifteenMinute: common.Round(m.sample.Fifteen/float64(NumCores), common.DefaultDecimalPlacesCount), + OneMinute: common.Round(m.sample.One/float64(cpus), common.DefaultDecimalPlacesCount), + FiveMinute: common.Round(m.sample.Five/float64(cpus), common.DefaultDecimalPlacesCount), + FifteenMinute: common.Round(m.sample.Fifteen/float64(cpus), common.DefaultDecimalPlacesCount), } } diff --git a/libbeat/metric/system/cpu/cpu_test.go b/libbeat/metric/system/cpu/cpu_test.go index f45e43136dc..05e3ce035bf 100644 --- a/libbeat/metric/system/cpu/cpu_test.go +++ b/libbeat/metric/system/cpu/cpu_test.go @@ -29,6 +29,12 @@ import ( "github.com/elastic/gosigar" ) +var ( + // numCores is the number of CPU cores in the system. Changes to operating + // system CPU allocation after process startup are not reflected. + numCores = runtime.NumCPU() +) + func TestMonitorSample(t *testing.T) { cpu := &Monitor{lastSample: &gosigar.Cpu{}} s, err := cpu.Sample() @@ -55,7 +61,7 @@ func TestMonitorSample(t *testing.T) { } func TestCoresMonitorSample(t *testing.T) { - cores := &CoresMonitor{lastSample: make([]gosigar.Cpu, NumCores)} + cores := &CoresMonitor{lastSample: make([]gosigar.Cpu, numCores)} sample, err := cores.Sample() if err != nil { t.Fatal(err) @@ -102,8 +108,8 @@ func TestMetricsRounding(t *testing.T) { // TestMetricsPercentages tests that Metrics returns the correct // percentages and normalized percentages. func TestMetricsPercentages(t *testing.T) { - NumCores = 10 - defer func() { NumCores = runtime.NumCPU() }() + numCores = 10 + defer func() { numCores = runtime.NumCPU() }() // This test simulates 30% user and 70% system (normalized), or 3% and 7% // respectively when there are 10 CPUs. @@ -132,9 +138,10 @@ func TestMetricsPercentages(t *testing.T) { assert.EqualValues(t, .0, pct.Idle) assert.EqualValues(t, 1., pct.Total) - pct = sample.Percentages() - assert.EqualValues(t, .3*float64(NumCores), pct.User) - assert.EqualValues(t, .7*float64(NumCores), pct.System) - assert.EqualValues(t, .0*float64(NumCores), pct.Idle) - assert.EqualValues(t, 1.*float64(NumCores), pct.Total) + //bypass the Metrics API so we can have a constant CPU value + pct = cpuPercentages(&s0, &s1, numCores) + assert.EqualValues(t, .3*float64(numCores), pct.User) + assert.EqualValues(t, .7*float64(numCores), pct.System) + assert.EqualValues(t, .0*float64(numCores), pct.Idle) + assert.EqualValues(t, 1.*float64(numCores), pct.Total) } diff --git a/libbeat/metric/system/diskio/diskstat_linux.go b/libbeat/metric/system/diskio/diskstat_linux.go index 826aed78c27..5ab0f7e3723 100644 --- a/libbeat/metric/system/diskio/diskstat_linux.go +++ b/libbeat/metric/system/diskio/diskstat_linux.go @@ -20,10 +20,10 @@ package diskio import ( + "runtime" + "github.com/pkg/errors" "github.com/shirou/gopsutil/disk" - - "github.com/elastic/beats/v7/libbeat/metric/system/cpu" ) // GetCLKTCK emulates the _SC_CLK_TCK syscall @@ -63,7 +63,7 @@ func (stat *IOStat) CalcIOStatistics(counter disk.IOCountersStat) (IOMetric, err } // calculate the delta ms between the CloseSampling and OpenSampling - deltams := 1000.0 * float64(stat.curCPU.Total()-stat.lastCPU.Total()) / float64(cpu.NumCores) / float64(GetCLKTCK()) + deltams := 1000.0 * float64(stat.curCPU.Total()-stat.lastCPU.Total()) / float64(runtime.NumCPU()) / float64(GetCLKTCK()) if deltams <= 0 { return IOMetric{}, errors.New("The delta cpu time between close sampling and open sampling is less or equal to 0") } diff --git a/libbeat/metric/system/process/process.go b/libbeat/metric/system/process/process.go index 03ed5085c44..76098a43f19 100644 --- a/libbeat/metric/system/process/process.go +++ b/libbeat/metric/system/process/process.go @@ -36,9 +36,6 @@ import ( sigar "github.com/elastic/gosigar" ) -// NumCPU is the number of CPUs of the host -var NumCPU = runtime.NumCPU() - // ProcsMap is a map where the keys are the names of processes and the value is the Process with that name type ProcsMap map[int]*Process @@ -365,7 +362,7 @@ func GetProcCPUPercentage(s0, s1 *Process) (normalizedPct, pct, totalPct float64 totalCPUDeltaMillis := int64(s1.Cpu.Total - s0.Cpu.Total) pct := float64(totalCPUDeltaMillis) / float64(timeDeltaMillis) - normalizedPct := pct / float64(NumCPU) + normalizedPct := pct / float64(runtime.NumCPU()) return common.Round(normalizedPct, common.DefaultDecimalPlacesCount), common.Round(pct, common.DefaultDecimalPlacesCount), diff --git a/libbeat/metric/system/process/process_test.go b/libbeat/metric/system/process/process_test.go index 5e04346edbf..6bc6be447a5 100644 --- a/libbeat/metric/system/process/process_test.go +++ b/libbeat/metric/system/process/process_test.go @@ -33,6 +33,9 @@ import ( "github.com/elastic/gosigar" ) +// numCPU is the number of CPUs of the host +var numCPU = runtime.NumCPU() + func TestPids(t *testing.T) { pids, err := Pids() @@ -157,11 +160,14 @@ func TestProcCpuPercentage(t *testing.T) { SampleTime: p1.SampleTime.Add(time.Second), } - NumCPU = 48 - defer func() { NumCPU = runtime.NumCPU() }() - totalPercentNormalized, totalPercent, totalValue := GetProcCPUPercentage(p1, p2) - assert.EqualValues(t, 0.0721, totalPercentNormalized) + //GetProcCPUPercentage wil return a number that varies based on the host, due to NumCPU() + // So "un-normalize" it, then re-normalized with a constant. + cpu := float64(runtime.NumCPU()) + unNormalized := totalPercentNormalized * cpu + normalizedTest := common.Round(unNormalized/48, common.DefaultDecimalPlacesCount) + + assert.EqualValues(t, 0.0721, normalizedTest) assert.EqualValues(t, 3.459, totalPercent) assert.EqualValues(t, 14841, totalValue) } diff --git a/metricbeat/module/system/cpu/data.go b/metricbeat/module/system/cpu/data.go index 497a91a6173..555f9e62cfa 100644 --- a/metricbeat/module/system/cpu/data.go +++ b/metricbeat/module/system/cpu/data.go @@ -97,7 +97,7 @@ func getPlatformCPUMetrics(sample *cpu.Metrics, selectors []string, event common // gather CPU metrics func collectCPUMetrics(selectors []string, sample *cpu.Metrics) mb.Event { - event := common.MapStr{"cores": cpu.NumCores} + event := common.MapStr{"cores": runtime.NumCPU()} getPlatformCPUMetrics(sample, selectors, event) //generate the host fields here, since we don't want users disabling it. diff --git a/metricbeat/module/system/load/load.go b/metricbeat/module/system/load/load.go index 1e7a6fe2b3b..dd10d24cef1 100644 --- a/metricbeat/module/system/load/load.go +++ b/metricbeat/module/system/load/load.go @@ -20,6 +20,8 @@ package load import ( + "runtime" + "github.com/pkg/errors" "github.com/elastic/beats/v7/libbeat/common" @@ -58,7 +60,7 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { normAvgs := load.NormalizedAverages() event := common.MapStr{ - "cores": cpu.NumCores, + "cores": runtime.NumCPU(), "1": avgs.OneMinute, "5": avgs.FiveMinute, "15": avgs.FifteenMinute,