Skip to content

Commit

Permalink
Add Distances field to machine.Node (#3179)
Browse files Browse the repository at this point in the history
Signed-off-by: PiotrProkop <[email protected]>
  • Loading branch information
PiotrProkop authored Oct 3, 2022
1 parent 967136f commit 24dd1de
Show file tree
Hide file tree
Showing 13 changed files with 270 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/storage/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ Metric name | Type | Description | Unit (where applicable) | option parameter |
`machine_dimm_capacity_bytes` | Gauge | Total RAM DIMM capacity (all types memory modules) value labeled by dimm type,<br>information is retrieved from sysfs edac per-DIMM API (/sys/devices/system/edac/mc/) introduced in kernel 3.6 | bytes | | |
`machine_dimm_count` | Gauge | Number of RAM DIMM (all types memory modules) value labeled by dimm type,<br>information is retrieved from sysfs edac per-DIMM API (/sys/devices/system/edac/mc/) introduced in kernel 3.6 | | |
`machine_memory_bytes` | Gauge | Amount of memory installed on the machine | bytes | |
`machine_node_distance` | Gauge | Distance between NUMA node and target NUMA node | | cpu_topology |
`machine_node_hugepages_count` | Gauge | Number of hugepages assigned to NUMA node | | cpu_topology |
`machine_node_memory_capacity_bytes` | Gauge | Amount of memory assigned to NUMA node | bytes | cpu_topology |
`machine_nvm_avg_power_budget_watts` | Gauge | NVM power budget | watts | | libipmctl
Expand Down
1 change: 1 addition & 0 deletions info/v1/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type Node struct {
HugePages []HugePagesInfo `json:"hugepages"`
Cores []Core `json:"cores"`
Caches []Cache `json:"caches"`
Distances []uint64 `json:"distances"`
}

type Core struct {
Expand Down
23 changes: 22 additions & 1 deletion machine/topology_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ func TestTopology(t *testing.T) {
"/fakeSysfs/devices/system/node/node0/cpu11": "1",
}
sysFs.SetPhysicalPackageIDs(physicalPackageIDs, nil)

sysFs.SetDistances("/fakeSysfs/devices/system/node/node0", "10 11", nil)
sysFs.SetDistances("/fakeSysfs/devices/system/node/node1", "11 10", nil)

topology, numCores, err := GetTopology(sysFs)
assert.Nil(t, err)
assert.Equal(t, 12, numCores)
Expand All @@ -217,12 +221,17 @@ func TestTopology(t *testing.T) {
Type: "unified",
Level: 1,
}
distances := [][]uint64{
{10, 11},
{11, 10},
}
for i := 0; i < numNodes; i++ {
node := info.Node{Id: i}
// Copy over Memory from result. TODO(rjnagal): Use memory from fake.
node.Memory = topology[i].Memory
// Copy over HugePagesInfo from result. TODO(ohsewon): Use HugePagesInfo from fake.
node.HugePages = topology[i].HugePages
node.Distances = distances[i]
for j := 0; j < numCoresPerNode; j++ {
core := info.Core{Id: i*numCoresPerNode + j}
core.Caches = append(core.Caches, cache)
Expand Down Expand Up @@ -298,12 +307,13 @@ func TestTopologyWithoutNodes(t *testing.T) {
topologyJSON2, err := json.Marshal(topology[1])
assert.Nil(t, err)

expectedTopology1 := `{"node_id":0,"memory":0,"hugepages":null,"cores":[{"core_id":0,"thread_ids":[0,2],"caches":[{"id":0, "size":32768,"type":"unified","level":0}], "socket_id": 0, "uncore_caches":null}],"caches":null}`
expectedTopology1 := `{"node_id":0,"memory":0,"hugepages":null,"distances":null,"cores":[{"core_id":0,"thread_ids":[0,2],"caches":[{"id":0, "size":32768,"type":"unified","level":0}], "socket_id": 0, "uncore_caches":null}],"caches":null}`
expectedTopology2 := `
{
"node_id":1,
"memory":0,
"hugepages":null,
"distances": null,
"cores":[
{
"core_id":1,
Expand Down Expand Up @@ -359,6 +369,9 @@ func TestTopologyWithNodesWithoutCPU(t *testing.T) {
}
sysFs.SetHugePagesNr(hugePageNr, nil)

sysFs.SetDistances("/fakeSysfs/devices/system/node/node0", "10 11", nil)
sysFs.SetDistances("/fakeSysfs/devices/system/node/node1", "11 10", nil)

topology, numCores, err := GetTopology(sysFs)

assert.Nil(t, err)
Expand All @@ -381,6 +394,10 @@ func TestTopologyWithNodesWithoutCPU(t *testing.T) {
"page_size": 1048576
}
],
"distances": [
10,
11
],
"memory": 33604804608,
"node_id": 0
},
Expand All @@ -397,6 +414,10 @@ func TestTopologyWithNodesWithoutCPU(t *testing.T) {
"page_size": 1048576
}
],
"distances": [
11,
10
],
"memory": 33604804608,
"node_id": 1
}
Expand Down
8 changes: 8 additions & 0 deletions metrics/prometheus_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, erro
},
},
},
Distances: []uint64{
10,
12,
},
},
{
Id: 1,
Expand Down Expand Up @@ -260,6 +264,10 @@ func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, erro
Level: 3,
},
},
Distances: []uint64{
12,
10,
},
},
},
}, nil
Expand Down
40 changes: 33 additions & 7 deletions metrics/prometheus_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ import (
var baseLabelsNames = []string{"machine_id", "system_uuid", "boot_id"}

const (
prometheusModeLabelName = "mode"
prometheusTypeLabelName = "type"
prometheusLevelLabelName = "level"
prometheusNodeLabelName = "node_id"
prometheusCoreLabelName = "core_id"
prometheusThreadLabelName = "thread_id"
prometheusPageSizeLabelName = "page_size"
prometheusModeLabelName = "mode"
prometheusTypeLabelName = "type"
prometheusLevelLabelName = "level"
prometheusNodeLabelName = "node_id"
prometheusCoreLabelName = "core_id"
prometheusThreadLabelName = "thread_id"
prometheusPageSizeLabelName = "page_size"
prometheusTargetNodeLabelName = "target_node_id"

nvmMemoryMode = "memory_mode"
nvmAppDirectMode = "app_direct_mode"
Expand Down Expand Up @@ -191,6 +192,15 @@ func NewPrometheusMachineCollector(i infoProvider, includedMetrics container.Met
return getHugePagesCount(machineInfo)
},
},
{
name: "machine_node_distance",
help: "Distance between NUMA node and target NUMA node.",
valueType: prometheus.GaugeValue,
extraLabels: []string{prometheusNodeLabelName, prometheusTargetNodeLabelName},
getValues: func(machineInfo *info.MachineInfo) metricValues {
return getDistance(machineInfo)
},
},
}...)
}
return c
Expand Down Expand Up @@ -356,3 +366,19 @@ func getCaches(machineInfo *info.MachineInfo) metricValues {
}
return mValues
}

// getDistance builds one metric value for every entry of the Distances slice
// of every node in machineInfo.Topology. Each value carries two labels: the
// ID of the node owning the slice and the index of the entry within that
// slice, which is exported as the target node ID. All values share
// machineInfo.Timestamp.
func getDistance(machineInfo *info.MachineInfo) metricValues {
	// Capacity hint for a full N x N distance matrix. In Go `^` is bitwise
	// XOR, not exponentiation, so the square is spelled out as a product.
	nodeCount := len(machineInfo.Topology)
	mValues := make(metricValues, 0, nodeCount*nodeCount)
	for _, node := range machineInfo.Topology {
		nodeID := strconv.Itoa(node.Id)
		for i, target := range node.Distances {
			mValues = append(mValues,
				metricValue{
					value:     float64(target),
					labels:    []string{nodeID, strconv.Itoa(i)},
					timestamp: machineInfo.Timestamp,
				})
		}
	}
	return mValues
}
16 changes: 16 additions & 0 deletions metrics/prometheus_machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,22 @@ func TestGetHugePagesCount(t *testing.T) {
assertMetricValues(t, expectedMetricVals, metricVals, "Unexpected information about Node memory")
}

// TestGetDistance checks that getDistance turns the Distances of the fake
// machine topology into one metric value per (node, target node) pair, in
// node order, labeled with the node ID and the target node index.
func TestGetDistance(t *testing.T) {
	machineInfo, err := testSubcontainersInfoProvider{}.GetMachineInfo()
	assert.Nil(t, err)

	metricVals := getDistance(machineInfo)

	assert.Equal(t, 4, len(metricVals))
	expectedMetricVals := []metricValue{
		{value: 10, labels: []string{"0", "0"}, timestamp: time.Unix(1395066363, 0)},
		{value: 12, labels: []string{"0", "1"}, timestamp: time.Unix(1395066363, 0)},
		{value: 12, labels: []string{"1", "0"}, timestamp: time.Unix(1395066363, 0)},
		{value: 10, labels: []string{"1", "1"}, timestamp: time.Unix(1395066363, 0)},
	}
	// The failure message names the metric under test, so a broken assertion
	// is not mistaken for a node memory problem.
	assertMetricValues(t, expectedMetricVals, metricVals, "Unexpected information about Node distances")
}

func assertMetricValues(t *testing.T, expected metricValues, actual metricValues, message string) {
for i := range actual {
assert.Truef(t, reflect.DeepEqual(expected[i], actual[i]),
Expand Down
6 changes: 6 additions & 0 deletions metrics/testdata/prometheus_machine_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ machine_dimm_count{boot_id="boot-id-test",machine_id="machine-id-test",system_uu
# HELP machine_memory_bytes Amount of memory installed on the machine.
# TYPE machine_memory_bytes gauge
machine_memory_bytes{boot_id="boot-id-test",machine_id="machine-id-test",system_uuid="system-uuid-test"} 1024 1395066363000
# HELP machine_node_distance Distance between NUMA node and target NUMA node.
# TYPE machine_node_distance gauge
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="0",system_uuid="system-uuid-test",target_node_id="0"} 10 1395066363000
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="0",system_uuid="system-uuid-test",target_node_id="1"} 12 1395066363000
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="1",system_uuid="system-uuid-test",target_node_id="0"} 12 1395066363000
machine_node_distance{boot_id="boot-id-test",machine_id="machine-id-test",node_id="1",system_uuid="system-uuid-test",target_node_id="1"} 10 1395066363000
# HELP machine_node_hugepages_count Numer of hugepages assigned to NUMA node.
# TYPE machine_node_hugepages_count gauge
machine_node_hugepages_count{boot_id="boot-id-test",machine_id="machine-id-test",node_id="0",page_size="1048576",system_uuid="system-uuid-test"} 0 1395066363000
Expand Down
24 changes: 24 additions & 0 deletions utils/sysfs/fakesysfs/fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ type FakeSysFs struct {
hugePagesNr map[string]string
hugePagesNrErr error

distances map[string]string
distancesErr error

onlineCPUs map[string]interface{}
}

Expand Down Expand Up @@ -201,6 +204,27 @@ func (fs *FakeSysFs) GetSystemUUID() (string, error) {
return "1F862619-BA9F-4526-8F85-ECEAF0C97430", nil
}

// GetDistances returns the distances string previously registered for nodeDir
// through SetDistances. The configured error, if any, takes precedence; an
// unknown nodeDir yields a "distance not found" error.
func (fs *FakeSysFs) GetDistances(nodeDir string) (string, error) {
	if fs.distancesErr != nil {
		return "", fs.distancesErr
	}

	raw, found := fs.distances[nodeDir]
	if !found {
		return "", fmt.Errorf("distance not found")
	}
	return raw, nil
}

// SetDistances registers the distances string to be returned by GetDistances
// for nodeDir and replaces the error returned by GetDistances with err.
func (fs *FakeSysFs) SetDistances(nodeDir string, distances string, err error) {
	// Create the map lazily so that a zero-value FakeSysFs can be used.
	if fs.distances == nil {
		fs.distances = make(map[string]string)
	}
	fs.distances[nodeDir] = distances
	fs.distancesErr = err
}

func (fs *FakeSysFs) IsCPUOnline(dir string) bool {
if fs.onlineCPUs == nil {
return true
Expand Down
15 changes: 15 additions & 0 deletions utils/sysfs/sysfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const (

meminfoFile = "meminfo"

distanceFile = "distance"

sysFsCPUTopology = "topology"

// CPUPhysicalPackageID is a physical package id of cpu#. Typically corresponds to a physical socket number,
Expand Down Expand Up @@ -113,6 +115,10 @@ type SysFs interface {
GetCacheInfo(cpu int, cache string) (CacheInfo, error)

GetSystemUUID() (string, error)

// GetDistances returns the raw contents of the distance file found in the given node directory.
GetDistances(string) (string, error)

// IsCPUOnline determines whether the CPU is online from the kernel hotplug mechanism standpoint.
// See: https://www.kernel.org/doc/html/latest/core-api/cpu_hotplug.html
IsCPUOnline(dir string) bool
Expand Down Expand Up @@ -161,6 +167,15 @@ func (fs *realSysFs) GetMemInfo(nodePath string) (string, error) {
return strings.TrimSpace(string(meminfo)), err
}

// GetDistances reads the distance file located directly under nodePath and
// returns its contents with surrounding whitespace removed. A read failure
// is returned unchanged together with an empty string.
func (fs *realSysFs) GetDistances(nodePath string) (string, error) {
	raw, err := ioutil.ReadFile(fmt.Sprintf("%s/%s", nodePath, distanceFile))
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(raw)), nil
}

func (fs *realSysFs) GetHugePagesInfo(hugePagesDirectory string) ([]os.FileInfo, error) {
return ioutil.ReadDir(hugePagesDirectory)
}
Expand Down
14 changes: 14 additions & 0 deletions utils/sysfs/sysfs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,17 @@ func TestUniqueCPUPropertyOnSingleSocketMultipleNUMAsSystem(t *testing.T) {
count = GetUniqueCPUPropertyCount("./testdata_single_socket_many_NUMAs/", CPUCoreID)
assert.Equal(t, 16, count)
}

// TestGetDistances reads the distance file of the node0 fixture through the
// real sysfs implementation and expects its trimmed contents.
func TestGetDistances(t *testing.T) {
	got, err := NewRealSysFs().GetDistances("./testdata/node0")

	assert.Nil(t, err)
	assert.Equal(t, "10 11", got)
}

// TestGetDistancesFileIsMissing expects an error and an empty result when
// GetDistances is called for the node1 path.
func TestGetDistancesFileIsMissing(t *testing.T) {
	got, err := NewRealSysFs().GetDistances("./testdata/node1")

	assert.NotNil(t, err)
	assert.Equal(t, "", got)
}
1 change: 1 addition & 0 deletions utils/sysfs/testdata/node0/distance
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
10 11
26 changes: 26 additions & 0 deletions utils/sysinfo/sysinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@ func GetNodesInfo(sysFs sysfs.SysFs) ([]info.Node, int, error) {
return nil, 0, err
}

node.Distances, err = getDistances(sysFs, nodeDir)
if err != nil {
return nil, 0, err
}

nodes = append(nodes, node)
}
return nodes, allLogicalCoresCount, err
Expand Down Expand Up @@ -391,6 +396,27 @@ func getNodeMemInfo(sysFs sysfs.SysFs, nodeDir string) (uint64, error) {
return uint64(memory), nil
}

// getDistances returns the distances reported for the NUMA node located at
// nodeDir, in the order in which sysFs.GetDistances lists them.
//
// A failure to read the distances is deliberately not treated as an error:
// a warning is logged and (nil, nil) is returned. A value that cannot be
// parsed as an unsigned base-10 integer results in an error wrapping the
// parse failure.
func getDistances(sysFs sysfs.SysFs, nodeDir string) ([]uint64, error) {
	rawDistance, err := sysFs.GetDistances(nodeDir)
	if err != nil {
		// Ignore if per-node info is not available.
		klog.Warningf("Found node without distance information, nodeDir: %s, err: %v", nodeDir, err)
		return nil, nil
	}

	// strings.Fields splits on any run of whitespace, so repeated separators,
	// tabs or empty content do not produce empty tokens that would fail to
	// parse and abort the whole topology discovery.
	fields := strings.Fields(rawDistance)
	// Keep the slice non-nil so that it is still encoded as a JSON array.
	distances := make([]uint64, 0, len(fields))
	for _, field := range fields {
		distance, err := strconv.ParseUint(field, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("cannot convert %s to int: %w", field, err)
		}
		distances = append(distances, distance)
	}

	return distances, nil
}

// getCoresInfo returns information about physical cores
func getCoresInfo(sysFs sysfs.SysFs, cpuDirs []string) ([]info.Core, error) {
cores := make([]info.Core, 0, len(cpuDirs))
Expand Down
Loading

0 comments on commit 24dd1de

Please sign in to comment.