Skip to content

Commit

Permalink
gpu: support read usage of multiple cards on linux (#79)
Browse files Browse the repository at this point in the history
  • Loading branch information
uubulb authored Oct 24, 2024
1 parent 20db2c9 commit 0cba96b
Show file tree
Hide file tree
Showing 18 changed files with 393 additions and 307 deletions.
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ require (
github.com/refraction-networking/utls v1.6.3
github.com/shirou/gopsutil/v4 v4.24.9
github.com/spf13/viper v1.19.0
github.com/tidwall/gjson v1.18.0
github.com/urfave/cli/v2 v2.27.5
golang.org/x/net v0.29.0
golang.org/x/sys v0.25.0
Expand Down Expand Up @@ -74,6 +75,8 @@ require (
github.com/spf13/pflag v1.0.5 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/tcnksm/go-gitconfig v0.1.2 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.0 // indirect
github.com/tklauser/go-sysconf v0.3.12 // indirect
github.com/tklauser/numcpus v0.6.1 // indirect
github.com/ulikunitz/xz v0.5.11 // indirect
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,12 @@ github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
github.com/tcnksm/go-gitconfig v0.1.2 h1:iiDhRitByXAEyjgBqsKi9QU4o2TNtv9kPP3RgPgXBPw=
github.com/tcnksm/go-gitconfig v0.1.2/go.mod h1:/8EhP4H7oJZdIPyT+/UIsG87kTzrzM4UsLGSItWYCpE=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=
github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
Expand Down
2 changes: 1 addition & 1 deletion model/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ type HostState struct {
UdpConnCount uint64
ProcessCount uint64
Temperatures []SensorTemperature
GPU float64
GPU []float64
}

func (s *HostState) PB() *pb.State {
Expand Down
27 changes: 0 additions & 27 deletions pkg/gpu/gpu.go

This file was deleted.

5 changes: 3 additions & 2 deletions pkg/gpu/gpu_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,9 @@ func GetGPUModel() ([]string, error) {
return util.RemoveDuplicate(models), nil
}

func FindUtilization(key, dictKey string) (int, error) {
return findUtilization(key, dictKey)
// GetGPUStat reports GPU utilization as a single-element slice containing the
// "Device Utilization %" value looked up under "PerformanceStatistics".
//
// If the lookup fails, it returns a nil slice together with the error, so
// callers never receive a placeholder reading alongside a failure.
func GetGPUStat() ([]float64, error) {
	usage, err := findUtilization("PerformanceStatistics", "Device Utilization %")
	if err != nil {
		return nil, err
	}
	return []float64{float64(usage)}, nil
}

func findDevices(key string) ([]string, error) {
Expand Down
11 changes: 11 additions & 0 deletions pkg/gpu/gpu_fallback.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//go:build !darwin && !linux && !windows

package gpu

// GetGPUModel is the stub for platforms other than darwin, linux and windows.
// It reports no GPU models and never fails.
func GetGPUModel() ([]string, error) {
	var models []string // stays nil: nothing is detected on this platform
	return models, nil
}

// GetGPUStat is the stub for platforms other than darwin, linux and windows.
// It reports no utilization values and never fails.
func GetGPUStat() ([]float64, error) {
	var usage []float64 // stays nil: nothing is measured on this platform
	return usage, nil
}
125 changes: 125 additions & 0 deletions pkg/gpu/gpu_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
//go:build linux

package gpu

import (
"errors"

"github.com/nezhahq/agent/pkg/gpu/vendor"
)

// GPU vendor identifiers stored in vendorType. Numbering starts at 1, so the
// zero value of vendorType matches neither vendor.
const (
vendorAMD = iota + 1
vendorNVIDIA
)

// vendorType records the vendor chosen by init; GetGPUModel and GetGPUStat
// switch on it to pick the AMD or NVIDIA helpers.
var vendorType uint8

// init picks the GPU vendor once at package load. It runs getNvidiaStat as a
// probe: success selects vendorNVIDIA, any error selects vendorAMD. No AMD
// probe is performed here, so AMD is the fallback whenever the NVIDIA probe
// fails.
func init() {
	if _, err := getNvidiaStat(); err == nil {
		vendorType = vendorNVIDIA
		return
	}
	vendorType = vendorAMD
}

// getNvidiaStat starts a vendor.NvidiaSMI configured with the
// /usr/bin/nvidia-smi binary path and returns the result of GatherUsage.
// An error from Start or GatherUsage is returned with a nil slice.
func getNvidiaStat() ([]float64, error) {
	smi := &vendor.NvidiaSMI{BinPath: "/usr/bin/nvidia-smi"}
	if err := smi.Start(); err != nil {
		return nil, err
	}

	usage, err := smi.GatherUsage()
	if err != nil {
		return nil, err
	}
	return usage, nil
}

// getAMDStat starts a vendor.ROCmSMI configured with the
// /opt/rocm/bin/rocm-smi binary path and returns the result of GatherUsage.
// An error from Start or GatherUsage is returned with a nil slice.
func getAMDStat() ([]float64, error) {
	smi := &vendor.ROCmSMI{BinPath: "/opt/rocm/bin/rocm-smi"}
	if err := smi.Start(); err != nil {
		return nil, err
	}

	usage, err := smi.GatherUsage()
	if err != nil {
		return nil, err
	}
	return usage, nil
}

// getNvidiaHost starts a vendor.NvidiaSMI configured with the
// /usr/bin/nvidia-smi binary path and returns the result of GatherModel.
// An error from Start or GatherModel is returned with a nil slice.
func getNvidiaHost() ([]string, error) {
	smi := &vendor.NvidiaSMI{BinPath: "/usr/bin/nvidia-smi"}
	if err := smi.Start(); err != nil {
		return nil, err
	}

	models, err := smi.GatherModel()
	if err != nil {
		return nil, err
	}
	return models, nil
}

// getAMDHost starts a vendor.ROCmSMI configured with the
// /opt/rocm/bin/rocm-smi binary path and returns the result of GatherModel.
// An error from Start or GatherModel is returned with a nil slice.
func getAMDHost() ([]string, error) {
	smi := &vendor.ROCmSMI{BinPath: "/opt/rocm/bin/rocm-smi"}
	if err := smi.Start(); err != nil {
		return nil, err
	}

	models, err := smi.GatherModel()
	if err != nil {
		return nil, err
	}
	return models, nil
}

// GetGPUModel returns the GPU model names for the vendor recorded in
// vendorType, delegating to getAMDHost or getNvidiaHost.
//
// It returns an "invalid vendor" error when vendorType holds neither vendor
// constant, and otherwise passes through any error from the selected helper
// with a nil slice.
func GetGPUModel() ([]string, error) {
	var query func() ([]string, error)

	switch vendorType {
	case vendorAMD:
		query = getAMDHost
	case vendorNVIDIA:
		query = getNvidiaHost
	default:
		return nil, errors.New("invalid vendor")
	}

	models, err := query()
	if err != nil {
		return nil, err
	}
	return models, nil
}

// GetGPUStat returns the GPU usage values for the vendor recorded in
// vendorType, delegating to getAMDStat or getNvidiaStat.
//
// It returns an "invalid vendor" error when vendorType holds neither vendor
// constant, and otherwise passes through any error from the selected helper
// with a nil slice.
func GetGPUStat() ([]float64, error) {
	var query func() ([]float64, error)

	switch vendorType {
	case vendorAMD:
		query = getAMDStat
	case vendorNVIDIA:
		query = getNvidiaStat
	default:
		return nil, errors.New("invalid vendor")
	}

	usage, err := query()
	if err != nil {
		return nil, err
	}
	return usage, nil
}
59 changes: 37 additions & 22 deletions pkg/gpu/stat/stat_windows.go → pkg/gpu/gpu_windows.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
//go:build windows

// Modified from https://github.com/shirou/gopsutil/blob/master/internal/common/common_windows.go
// Original License: BSD-3-Clause

package stat
package gpu

import (
"errors"
"fmt"
"time"
"unsafe"

"github.com/jaypipes/ghw"
"golang.org/x/sys/windows"
)

Expand Down Expand Up @@ -43,6 +41,41 @@ type PDH_FMT_COUNTERVALUE_ITEM_DOUBLE struct {
FmtValue PDH_FMT_COUNTERVALUE_DOUBLE
}

// GetGPUModel returns the product name of every graphics card reported by
// ghw.GPU (warnings disabled).
//
// It returns an error if ghw fails, or if any card lacks device or product
// information; in that case no partial list is returned.
func GetGPUModel() ([]string, error) {
	gi, err := ghw.GPU(ghw.WithDisableWarnings())
	if err != nil {
		return nil, err
	}

	var gpuModel []string
	for _, card := range gi.GraphicsCards {
		// Product is dereferenced below, so guard it together with DeviceInfo:
		// a card without product data yields the error instead of a panic.
		if card.DeviceInfo == nil || card.DeviceInfo.Product == nil {
			return nil, errors.New("Cannot find device info")
		}
		gpuModel = append(gpuModel, card.DeviceInfo.Product.Name)
	}

	return gpuModel, nil
}

// GetGPUStat opens a performance counter on the
// "\GPU Engine(*engtype_3D)\Utilization Percentage" path, sums every value
// returned for it, and reports the sum, capped at 100, as a single-element
// slice. The underlying query is closed before returning.
//
// An error from creating the counter or reading its values is returned with a
// nil slice.
func GetGPUStat() ([]float64, error) {
	pc, err := newWin32PerformanceCounter("gpu_utilization", "\\GPU Engine(*engtype_3D)\\Utilization Percentage")
	if err != nil {
		return nil, err
	}
	defer pdhCloseQuery.Call(uintptr(pc.Query))

	// 8192 is the initial buffer size handed to getValue.
	samples, err := getValue(8192, pc)
	if err != nil {
		return nil, err
	}

	total := sumArray(samples)
	if total > 100 {
		return []float64{100}, nil
	}
	return []float64{total}, nil
}

// https://github.com/influxdata/telegraf/blob/master/plugins/inputs/win_perf_counters/performance_query.go
func getCounterArrayValue(initialBufSize uint32, counter *win32PerformanceCounter) ([]float64, error) {
for buflen := initialBufSize; buflen <= 100*1024*1024; buflen *= 2 {
Expand Down Expand Up @@ -127,24 +160,6 @@ func getValue(initialBufSize uint32, counter *win32PerformanceCounter) ([]float6
return getCounterArrayValue(initialBufSize, counter)
}

// GetGPUStat opens a performance counter on the
// "\GPU Engine(*engtype_3D)\Utilization Percentage" path, sums every value
// returned for it, and reports the sum capped at 100. The underlying query is
// closed before returning.
//
// An error from creating the counter or reading its values is returned with a
// zero result.
func GetGPUStat() (float64, error) {
	pc, err := newWin32PerformanceCounter("gpu_utilization", "\\GPU Engine(*engtype_3D)\\Utilization Percentage")
	if err != nil {
		return 0, err
	}
	defer pdhCloseQuery.Call(uintptr(pc.Query))

	// 8192 is the initial buffer size handed to getValue.
	samples, err := getValue(8192, pc)
	if err != nil {
		return 0, err
	}

	total := sumArray(samples)
	if total > 100 {
		return 100, nil
	}
	return total, nil
}

func sumArray(arr []float64) float64 {
var sum float64
for _, value := range arr {
Expand Down
67 changes: 0 additions & 67 deletions pkg/gpu/stat/amd_rocm_smi.go

This file was deleted.

12 changes: 0 additions & 12 deletions pkg/gpu/stat/stat_darwin.go

This file was deleted.

7 changes: 0 additions & 7 deletions pkg/gpu/stat/stat_freebsd.go

This file was deleted.

Loading

0 comments on commit 0cba96b

Please sign in to comment.