From 4b6d970bce757a20dc25759d0ad09a145d6b9649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Tue, 5 Nov 2024 10:39:09 +0100 Subject: [PATCH 01/10] Add event consumer to GPU monitoring --- cmd/system-probe/modules/all_linux.go | 2 +- cmd/system-probe/modules/eventmonitor.go | 12 ++++++++++ cmd/system-probe/modules/gpu.go | 29 +++++++++++++++++++++++- pkg/gpu/config/config.go | 3 +++ pkg/gpu/probe.go | 26 ++++++++------------- 5 files changed, 54 insertions(+), 18 deletions(-) diff --git a/cmd/system-probe/modules/all_linux.go b/cmd/system-probe/modules/all_linux.go index 869cda408a8d4..27602269a38f7 100644 --- a/cmd/system-probe/modules/all_linux.go +++ b/cmd/system-probe/modules/all_linux.go @@ -30,7 +30,7 @@ var All = []module.Factory{ Pinger, Traceroute, DiscoveryModule, - GPUMonitoring, + GPUMonitoring, // GPU monitoring needs to be initialized afer EventMOnit, so that we have the event consumer ready } func inactivityEventLog(_ time.Duration) { diff --git a/cmd/system-probe/modules/eventmonitor.go b/cmd/system-probe/modules/eventmonitor.go index 5949b755f2f65..1ece0458eb140 100644 --- a/cmd/system-probe/modules/eventmonitor.go +++ b/cmd/system-probe/modules/eventmonitor.go @@ -8,10 +8,14 @@ package modules import ( + "fmt" + "github.com/DataDog/datadog-agent/cmd/system-probe/api/module" sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types" "github.com/DataDog/datadog-agent/pkg/eventmonitor" emconfig "github.com/DataDog/datadog-agent/pkg/eventmonitor/config" + "github.com/DataDog/datadog-agent/pkg/gpu" + gpuconfig "github.com/DataDog/datadog-agent/pkg/gpu/config" netconfig "github.com/DataDog/datadog-agent/pkg/network/config" "github.com/DataDog/datadog-agent/pkg/network/events" procconsumer "github.com/DataDog/datadog-agent/pkg/process/events/consumer" @@ -91,5 +95,13 @@ func createEventMonitorModule(_ *sysconfigtypes.Config, deps module.FactoryDepen } } + gpucfg := gpuconfig.NewConfig() + if gpucfg.Enabled { 
+ err := gpu.CreateProcessEventConsumer(evm) + if err != nil { + return nil, fmt.Errorf("cannot create event consumer for GPU: %w", err) + } + } + return evm, err } diff --git a/cmd/system-probe/modules/gpu.go b/cmd/system-probe/modules/gpu.go index 3f1a5f35b16bb..20452762e58ff 100644 --- a/cmd/system-probe/modules/gpu.go +++ b/cmd/system-probe/modules/gpu.go @@ -19,6 +19,8 @@ import ( "github.com/DataDog/datadog-agent/cmd/system-probe/config" sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types" "github.com/DataDog/datadog-agent/cmd/system-probe/utils" + "github.com/DataDog/datadog-agent/pkg/eventmonitor" + "github.com/DataDog/datadog-agent/pkg/eventmonitor/consumers" "github.com/DataDog/datadog-agent/pkg/gpu" gpuconfig "github.com/DataDog/datadog-agent/pkg/gpu/config" "github.com/DataDog/datadog-agent/pkg/util/log" @@ -27,18 +29,32 @@ import ( var _ module.Module = &GPUMonitoringModule{} var gpuMonitoringConfigNamespaces = []string{gpuconfig.GPUNS} +// processEventConsumer is a global variable that holds the process event consumer, created in the eventmonitor module +// Note: In the future we should have a better way to handle dependencies between modules +var processEventConsumer *consumers.ProcessConsumer + +const processConsumerId = "gpu" +const processConsumerChanSize = 100 + +var processConsumerEventTypes = []consumers.ProcessConsumerEventTypes{consumers.ExecEventType, consumers.ExitEventType} + // GPUMonitoring Factory var GPUMonitoring = module.Factory{ Name: config.GPUMonitoringModule, ConfigNamespaces: gpuMonitoringConfigNamespaces, Fn: func(_ *sysconfigtypes.Config, deps module.FactoryDependencies) (module.Module, error) { + if processEventConsumer == nil { + return nil, fmt.Errorf("process event consumer not initialized") + } + c := gpuconfig.New() probeDeps := gpu.ProbeDependencies{ Telemetry: deps.Telemetry, //if the config parameter doesn't exist or is empty string, the default value is used as defined in go-nvml library 
//(https://github.com/NVIDIA/go-nvml/blob/main/pkg/nvml/lib.go#L30) - NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)), + NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)), + ProcessMonitor: processEventConsumer, } ret := probeDeps.NvmlLib.Init() @@ -95,3 +111,14 @@ func (t *GPUMonitoringModule) GetStats() map[string]interface{} { func (t *GPUMonitoringModule) Close() { t.Probe.Close() } + +// CreateProcessEventConsumer creates the process event consumer for the GPU module. Should be called from the event monitor module +func CreateProcessEventConsumer(evm *eventmonitor.EventMonitor) error { + var err error + processEventConsumer, err = consumers.NewProcessConsumer(processConsumerId, processConsumerChanSize, processConsumerEventTypes, evm) + if err != nil { + return err + } + + return nil +} diff --git a/pkg/gpu/config/config.go b/pkg/gpu/config/config.go index aa239a129d24e..947aaa68a908a 100644 --- a/pkg/gpu/config/config.go +++ b/pkg/gpu/config/config.go @@ -50,6 +50,8 @@ func CheckGPUSupported() error { // Config holds the configuration for the GPU monitoring probe. type Config struct { ebpf.Config + // Enabled indicates whether the GPU monitoring probe is enabled. + Enabled bool // ScanTerminatedProcessesInterval is the interval at which the probe scans for terminated processes. ScanTerminatedProcessesInterval time.Duration // InitialProcessSync indicates whether the probe should sync the process list on startup. 
@@ -66,5 +68,6 @@ func New() *Config { ScanTerminatedProcessesInterval: time.Duration(spCfg.GetInt(sysconfig.FullKeyPath(GPUNS, "process_scan_interval_seconds"))) * time.Second, InitialProcessSync: spCfg.GetBool(sysconfig.FullKeyPath(GPUNS, "initial_process_sync")), NVMLLibraryPath: spCfg.GetString(sysconfig.FullKeyPath(GPUNS, "nvml_lib_path")), + Enabled: spCfg.GetBool(sysconfig.FullKeyPath(GPUNS, "enabled")), } } diff --git a/pkg/gpu/probe.go b/pkg/gpu/probe.go index 42311524bdd33..3253ded42fc82 100644 --- a/pkg/gpu/probe.go +++ b/pkg/gpu/probe.go @@ -9,13 +9,14 @@ package gpu import ( "fmt" - sysconfig "github.com/DataDog/datadog-agent/cmd/system-probe/config" - "github.com/DataDog/datadog-agent/pkg/ebpf/bytecode" "io" "math" "os" "regexp" + sysconfig "github.com/DataDog/datadog-agent/cmd/system-probe/config" + "github.com/DataDog/datadog-agent/pkg/ebpf/bytecode" + manager "github.com/DataDog/ebpf-manager" "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/cilium/ebpf" @@ -25,7 +26,6 @@ import ( ddebpf "github.com/DataDog/datadog-agent/pkg/ebpf" "github.com/DataDog/datadog-agent/pkg/ebpf/uprobes" "github.com/DataDog/datadog-agent/pkg/gpu/config" - "github.com/DataDog/datadog-agent/pkg/process/monitor" "github.com/DataDog/datadog-agent/pkg/util/log" ) @@ -72,6 +72,9 @@ type ProbeDependencies struct { // NvmlLib is the NVML library interface NvmlLib nvml.Interface + + // ProcessMonitor is the process monitor interface + ProcessMonitor uprobes.ProcessMonitor } // Probe represents the GPU monitoring probe @@ -83,7 +86,6 @@ type Probe struct { statsGenerator *statsGenerator deps ProbeDependencies sysCtx *systemContext - procMon *monitor.ProcessMonitor eventHandler ddebpf.EventHandler } @@ -101,22 +103,15 @@ func NewProbe(cfg *config.Config, deps ProbeDependencies) (*Probe, error) { } attachCfg := getAttacherConfig(cfg) - // Note: this will later be replaced by a common way to enable the process monitor across system-probe - procMon := monitor.GetProcessMonitor() - if 
err := procMon.Initialize(false); err != nil { - return nil, fmt.Errorf("error initializing process monitor: %w", err) - } - sysCtx, err := getSystemContext(deps.NvmlLib, cfg.ProcRoot) if err != nil { return nil, fmt.Errorf("error getting system context: %w", err) } p := &Probe{ - cfg: cfg, - deps: deps, - procMon: procMon, - sysCtx: sysCtx, + cfg: cfg, + deps: deps, + sysCtx: sysCtx, } allowRC := cfg.EnableRuntimeCompiler && cfg.AllowRuntimeCompiledFallback @@ -143,7 +138,7 @@ func NewProbe(cfg *config.Config, deps ProbeDependencies) (*Probe, error) { } } - p.attacher, err = uprobes.NewUprobeAttacher(gpuAttacherName, attachCfg, p.m, nil, &uprobes.NativeBinaryInspector{}, procMon) + p.attacher, err = uprobes.NewUprobeAttacher(gpuAttacherName, attachCfg, p.m, nil, &uprobes.NativeBinaryInspector{}, deps.ProcessMonitor) if err != nil { return nil, fmt.Errorf("error creating uprobes attacher: %w", err) } @@ -176,7 +171,6 @@ func (p *Probe) start() error { // Close stops the probe func (p *Probe) Close() { - p.procMon.Stop() p.attacher.Stop() _ = p.m.Stop(manager.CleanAll) p.consumer.Stop() From 239a6a15a9763b57ef001a344c1ea1cbe232b57a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Wed, 6 Nov 2024 17:45:31 +0100 Subject: [PATCH 02/10] Use correct consumer in tests --- pkg/gpu/probe_test.go | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/pkg/gpu/probe_test.go b/pkg/gpu/probe_test.go index bb51ce743b9b6..5f105fa97873b 100644 --- a/pkg/gpu/probe_test.go +++ b/pkg/gpu/probe_test.go @@ -15,10 +15,10 @@ import ( "github.com/stretchr/testify/suite" "github.com/DataDog/datadog-agent/pkg/ebpf/ebpftest" + consumerstestutil "github.com/DataDog/datadog-agent/pkg/eventmonitor/consumers/testutil" "github.com/DataDog/datadog-agent/pkg/gpu/config" "github.com/DataDog/datadog-agent/pkg/gpu/testutil" "github.com/DataDog/datadog-agent/pkg/network/usm/utils" - "github.com/DataDog/datadog-agent/pkg/process/monitor" ) type 
probeTestSuite struct { @@ -44,7 +44,8 @@ func (s *probeTestSuite) getProbe() *Probe { cfg.InitialProcessSync = false deps := ProbeDependencies{ - NvmlLib: testutil.GetBasicNvmlMock(), + NvmlLib: testutil.GetBasicNvmlMock(), + ProcessMonitor: consumerstestutil.NewTestProcessConsumer(t), } probe, err := NewProbe(cfg, deps) require.NoError(t, err) @@ -66,13 +67,7 @@ func (s *probeTestSuite) TestCanLoad() { func (s *probeTestSuite) TestCanReceiveEvents() { t := s.T() - procMon := monitor.GetProcessMonitor() - require.NotNil(t, procMon) - require.NoError(t, procMon.Initialize(false)) - t.Cleanup(procMon.Stop) - probe := s.getProbe() - cmd, err := testutil.RunSample(t, testutil.CudaSample) require.NoError(t, err) @@ -109,11 +104,6 @@ func (s *probeTestSuite) TestCanReceiveEvents() { func (s *probeTestSuite) TestCanGenerateStats() { t := s.T() - procMon := monitor.GetProcessMonitor() - require.NotNil(t, procMon) - require.NoError(t, procMon.Initialize(false)) - t.Cleanup(procMon.Stop) - probe := s.getProbe() cmd, err := testutil.RunSample(t, testutil.CudaSample) From d8edbc5e168e1b1827167801ab57b9d1e1d42245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Wed, 6 Nov 2024 17:47:18 +0100 Subject: [PATCH 03/10] Fix comment --- cmd/system-probe/modules/all_linux.go | 2 +- cmd/system-probe/modules/all_linux_arm64.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/system-probe/modules/all_linux.go b/cmd/system-probe/modules/all_linux.go index 27602269a38f7..79b6f3d74afec 100644 --- a/cmd/system-probe/modules/all_linux.go +++ b/cmd/system-probe/modules/all_linux.go @@ -30,7 +30,7 @@ var All = []module.Factory{ Pinger, Traceroute, DiscoveryModule, - GPUMonitoring, // GPU monitoring needs to be initialized afer EventMOnit, so that we have the event consumer ready + GPUMonitoring, // GPU monitoring needs to be initialized after EventMonitor, so that we have the event consumer ready } func inactivityEventLog(_ time.Duration) { diff 
--git a/cmd/system-probe/modules/all_linux_arm64.go b/cmd/system-probe/modules/all_linux_arm64.go index 9115d7037b6c3..e5a81541e5896 100644 --- a/cmd/system-probe/modules/all_linux_arm64.go +++ b/cmd/system-probe/modules/all_linux_arm64.go @@ -30,7 +30,7 @@ var All = []module.Factory{ Pinger, Traceroute, DiscoveryModule, - GPUMonitoring, + GPUMonitoring, // GPU monitoring needs to be initialized after EventMonitor, so that we have the event consumer ready } func inactivityEventLog(_ time.Duration) { From 020789610cd87c761b2493fc8f33fb76305a1c89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Wed, 6 Nov 2024 18:01:49 +0100 Subject: [PATCH 04/10] Use correct import --- cmd/system-probe/modules/eventmonitor.go | 3 +-- cmd/system-probe/modules/gpu.go | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cmd/system-probe/modules/eventmonitor.go b/cmd/system-probe/modules/eventmonitor.go index 1ece0458eb140..a29148853bf64 100644 --- a/cmd/system-probe/modules/eventmonitor.go +++ b/cmd/system-probe/modules/eventmonitor.go @@ -14,7 +14,6 @@ import ( sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types" "github.com/DataDog/datadog-agent/pkg/eventmonitor" emconfig "github.com/DataDog/datadog-agent/pkg/eventmonitor/config" - "github.com/DataDog/datadog-agent/pkg/gpu" gpuconfig "github.com/DataDog/datadog-agent/pkg/gpu/config" netconfig "github.com/DataDog/datadog-agent/pkg/network/config" "github.com/DataDog/datadog-agent/pkg/network/events" @@ -97,7 +96,7 @@ func createEventMonitorModule(_ *sysconfigtypes.Config, deps module.FactoryDepen gpucfg := gpuconfig.NewConfig() if gpucfg.Enabled { - err := gpu.CreateProcessEventConsumer(evm) + err := createGPUProcessEventConsumer(evm) if err != nil { return nil, fmt.Errorf("cannot create event consumer for GPU: %w", err) } diff --git a/cmd/system-probe/modules/gpu.go b/cmd/system-probe/modules/gpu.go index 20452762e58ff..f2ff9eff88185 100644 --- 
a/cmd/system-probe/modules/gpu.go +++ b/cmd/system-probe/modules/gpu.go @@ -112,8 +112,8 @@ func (t *GPUMonitoringModule) Close() { t.Probe.Close() } -// CreateProcessEventConsumer creates the process event consumer for the GPU module. Should be called from the event monitor module -func CreateProcessEventConsumer(evm *eventmonitor.EventMonitor) error { +// createGPUProcessEventConsumer creates the process event consumer for the GPU module. Should be called from the event monitor module +func createGPUProcessEventConsumer(evm *eventmonitor.EventMonitor) error { var err error processEventConsumer, err = consumers.NewProcessConsumer(processConsumerId, processConsumerChanSize, processConsumerEventTypes, evm) if err != nil { From 5eda681d696072dc74dcdb5dcfd6dc63a79ab206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 7 Nov 2024 10:35:35 +0100 Subject: [PATCH 05/10] Fix linter --- pkg/gpu/probe_stub.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/gpu/probe_stub.go b/pkg/gpu/probe_stub.go index b781cc344c8f3..734a11919abb9 100644 --- a/pkg/gpu/probe_stub.go +++ b/pkg/gpu/probe_stub.go @@ -18,8 +18,9 @@ import ( // ProbeDependencies holds the dependencies for the probe type ProbeDependencies struct { - Telemetry telemetry.Component - NvmlLib nvml.Interface + Telemetry telemetry.Component + NvmlLib nvml.Interface + ProcessMonitor any // uprobes.ProcessMonitor is only compiled with the linux_bpf build tag, so we need to use type any here } // Probe is not implemented on non-linux systems From a24aa62f55e57797c8cd9579c0fa8f638c72f384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 7 Nov 2024 11:09:39 +0100 Subject: [PATCH 06/10] Fix linter --- cmd/system-probe/modules/gpu.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/system-probe/modules/gpu.go b/cmd/system-probe/modules/gpu.go index f2ff9eff88185..7ec273e7c2869 100644 --- 
a/cmd/system-probe/modules/gpu.go +++ b/cmd/system-probe/modules/gpu.go @@ -33,7 +33,7 @@ var gpuMonitoringConfigNamespaces = []string{gpuconfig.GPUNS} // Note: In the future we should have a better way to handle dependencies between modules var processEventConsumer *consumers.ProcessConsumer -const processConsumerId = "gpu" +const processConsumerID = "gpu" const processConsumerChanSize = 100 var processConsumerEventTypes = []consumers.ProcessConsumerEventTypes{consumers.ExecEventType, consumers.ExitEventType} @@ -115,7 +115,7 @@ func (t *GPUMonitoringModule) Close() { // createGPUProcessEventConsumer creates the process event consumer for the GPU module. Should be called from the event monitor module func createGPUProcessEventConsumer(evm *eventmonitor.EventMonitor) error { var err error - processEventConsumer, err = consumers.NewProcessConsumer(processConsumerId, processConsumerChanSize, processConsumerEventTypes, evm) + processEventConsumer, err = consumers.NewProcessConsumer(processConsumerID, processConsumerChanSize, processConsumerEventTypes, evm) if err != nil { return err } From 6f69fd53cdc16470b8943ea59897c7d6fca7466d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 7 Nov 2024 11:25:09 +0100 Subject: [PATCH 07/10] Fix rebase --- cmd/system-probe/modules/eventmonitor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/system-probe/modules/eventmonitor.go b/cmd/system-probe/modules/eventmonitor.go index a29148853bf64..1f5e0e874667a 100644 --- a/cmd/system-probe/modules/eventmonitor.go +++ b/cmd/system-probe/modules/eventmonitor.go @@ -94,7 +94,7 @@ func createEventMonitorModule(_ *sysconfigtypes.Config, deps module.FactoryDepen } } - gpucfg := gpuconfig.NewConfig() + gpucfg := gpuconfig.New() if gpucfg.Enabled { err := createGPUProcessEventConsumer(evm) if err != nil { From bdbddddc10e90abce4d84a5d93262cacb7cb99da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 7 Nov 2024 
17:19:45 +0100 Subject: [PATCH 08/10] Fix windows build --- cmd/system-probe/modules/eventmonitor_windows.go | 4 ++++ pkg/gpu/config/config.go | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cmd/system-probe/modules/eventmonitor_windows.go b/cmd/system-probe/modules/eventmonitor_windows.go index aa81d0f428f5e..870c707f8f3e7 100644 --- a/cmd/system-probe/modules/eventmonitor_windows.go +++ b/cmd/system-probe/modules/eventmonitor_windows.go @@ -24,3 +24,7 @@ var EventMonitor = module.Factory{ func createProcessMonitorConsumer(_ *eventmonitor.EventMonitor, _ *netconfig.Config) (eventmonitor.EventConsumer, error) { return nil, nil } + +func createGPUProcessEventConsumer(_ *eventmonitor.EventMonitor) error { + return nil +} diff --git a/pkg/gpu/config/config.go b/pkg/gpu/config/config.go index 947aaa68a908a..24c97054f79c9 100644 --- a/pkg/gpu/config/config.go +++ b/pkg/gpu/config/config.go @@ -3,8 +3,6 @@ // This product includes software developed at Datadog (https://www.datadoghq.com/). // Copyright 2024-present Datadog, Inc. -//go:build linux - // Package config provides the GPU monitoring config. 
package config From e3569e219ba922592c490f1965565cf77d032ad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Fri, 8 Nov 2024 14:02:42 +0100 Subject: [PATCH 09/10] Split linux-only config code --- pkg/gpu/config/config.go | 24 ----------------------- pkg/gpu/config/config_linux.go | 36 ++++++++++++++++++++++++++++++++++ pkg/gpu/config/config_other.go | 15 ++++++++++++++ 3 files changed, 51 insertions(+), 24 deletions(-) create mode 100644 pkg/gpu/config/config_linux.go create mode 100644 pkg/gpu/config/config_other.go diff --git a/pkg/gpu/config/config.go b/pkg/gpu/config/config.go index 24c97054f79c9..4323ad6630a86 100644 --- a/pkg/gpu/config/config.go +++ b/pkg/gpu/config/config.go @@ -8,13 +8,11 @@ package config import ( "errors" - "fmt" "time" sysconfig "github.com/DataDog/datadog-agent/cmd/system-probe/config" pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup" "github.com/DataDog/datadog-agent/pkg/ebpf" - "github.com/DataDog/datadog-agent/pkg/util/kernel" ) // GPUNS is the namespace for the GPU monitoring probe. 
@@ -23,28 +21,6 @@ const GPUNS = "gpu_monitoring" // ErrNotSupported is the error returned if GPU monitoring is not supported on this platform var ErrNotSupported = errors.New("GPU Monitoring is not supported") -// MinimumKernelVersion indicates the minimum kernel version required for GPU monitoring -var MinimumKernelVersion kernel.Version - -func init() { - // we rely on ring buffer support for GPU monitoring, hence the minimal kernel version is 5.8.0 - MinimumKernelVersion = kernel.VersionCode(5, 8, 0) -} - -// CheckGPUSupported checks if the host's kernel supports GPU monitoring -func CheckGPUSupported() error { - kversion, err := kernel.HostVersion() - if err != nil { - return fmt.Errorf("%w: could not determine the current kernel version: %w", ErrNotSupported, err) - } - - if kversion < MinimumKernelVersion { - return fmt.Errorf("%w: a Linux kernel version of %s or higher is required; we detected %s", ErrNotSupported, MinimumKernelVersion, kversion) - } - - return nil -} - // Config holds the configuration for the GPU monitoring probe. type Config struct { ebpf.Config diff --git a/pkg/gpu/config/config_linux.go b/pkg/gpu/config/config_linux.go new file mode 100644 index 0000000000000..55822e01600f5 --- /dev/null +++ b/pkg/gpu/config/config_linux.go @@ -0,0 +1,36 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. 
+ +//go:build linux + +package config + +import ( + "fmt" + + "github.com/DataDog/datadog-agent/pkg/util/kernel" +) + +// MinimumKernelVersion indicates the minimum kernel version required for GPU monitoring +var MinimumKernelVersion kernel.Version + +func init() { + // we rely on ring buffer support for GPU monitoring, hence the minimal kernel version is 5.8.0 + MinimumKernelVersion = kernel.VersionCode(5, 8, 0) +} + +// CheckGPUSupported checks if the host's kernel supports GPU monitoring +func CheckGPUSupported() error { + kversion, err := kernel.HostVersion() + if err != nil { + return fmt.Errorf("%w: could not determine the current kernel version: %w", ErrNotSupported, err) + } + + if kversion < MinimumKernelVersion { + return fmt.Errorf("%w: a Linux kernel version of %s or higher is required; we detected %s", ErrNotSupported, MinimumKernelVersion, kversion) + } + + return nil +} diff --git a/pkg/gpu/config/config_other.go b/pkg/gpu/config/config_other.go new file mode 100644 index 0000000000000..ddfef2ff9c6b0 --- /dev/null +++ b/pkg/gpu/config/config_other.go @@ -0,0 +1,15 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2024-present Datadog, Inc. 
+ +//go:build !linux + +package config + +import "errors" + +// CheckGPUSupported checks if the host's kernel supports GPU monitoring +func CheckGPUSupported() error { + return errors.New("GPU monitoring is not supported on this platform") +} From 7bb60acba2e2f4c37d53b8cbbeaf3077d54ed61f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Fri, 8 Nov 2024 06:56:04 -0800 Subject: [PATCH 10/10] Enable eventmonitor even if only GPU is enabled --- cmd/system-probe/config/config.go | 6 ++++-- cmd/system-probe/config/config_test.go | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cmd/system-probe/config/config.go b/cmd/system-probe/config/config.go index 60fd9b679609b..49fd3e49d290c 100644 --- a/cmd/system-probe/config/config.go +++ b/cmd/system-probe/config/config.go @@ -123,6 +123,7 @@ func load() (*types.Config, error) { usmEnabled := cfg.GetBool(smNS("enabled")) ccmEnabled := cfg.GetBool(ccmNS("enabled")) csmEnabled := cfg.GetBool(secNS("enabled")) + gpuEnabled := cfg.GetBool(gpuNS("enabled")) if npmEnabled || usmEnabled || ccmEnabled || (csmEnabled && cfg.GetBool(secNS("network_monitoring.enabled"))) { c.EnabledModules[NetworkTracerModule] = struct{}{} @@ -136,7 +137,8 @@ func load() (*types.Config, error) { if cfg.GetBool(secNS("enabled")) || cfg.GetBool(secNS("fim_enabled")) || cfg.GetBool(evNS("process.enabled")) || - (c.ModuleIsEnabled(NetworkTracerModule) && cfg.GetBool(evNS("network_process.enabled"))) { + (c.ModuleIsEnabled(NetworkTracerModule) && cfg.GetBool(evNS("network_process.enabled")) || + gpuEnabled) { c.EnabledModules[EventMonitorModule] = struct{}{} } if cfg.GetBool(secNS("enabled")) && cfg.GetBool(secNS("compliance_module.enabled")) { @@ -163,7 +165,7 @@ func load() (*types.Config, error) { if cfg.GetBool(discoveryNS("enabled")) { c.EnabledModules[DiscoveryModule] = struct{}{} } - if cfg.GetBool(gpuNS("enabled")) { + if gpuEnabled { c.EnabledModules[GPUMonitoringModule] = struct{}{} } diff --git 
a/cmd/system-probe/config/config_test.go b/cmd/system-probe/config/config_test.go index 26164bfc0cd0f..4b44204851a0b 100644 --- a/cmd/system-probe/config/config_test.go +++ b/cmd/system-probe/config/config_test.go @@ -23,8 +23,8 @@ func TestEventMonitor(t *testing.T) { mock.NewSystemProbe(t) for i, tc := range []struct { - cws, fim, processEvents, networkEvents bool - enabled bool + cws, fim, processEvents, networkEvents, gpu bool + enabled bool }{ {cws: false, fim: false, processEvents: false, networkEvents: false, enabled: false}, {cws: false, fim: false, processEvents: true, networkEvents: false, enabled: true}, @@ -42,6 +42,7 @@ func TestEventMonitor(t *testing.T) { {cws: true, fim: false, processEvents: true, networkEvents: true, enabled: true}, {cws: true, fim: true, processEvents: false, networkEvents: true, enabled: true}, {cws: true, fim: true, processEvents: true, networkEvents: true, enabled: true}, + {cws: false, fim: false, processEvents: false, networkEvents: false, gpu: true, enabled: true}, } { t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { t.Logf("%+v\n", tc) @@ -50,6 +51,7 @@ func TestEventMonitor(t *testing.T) { t.Setenv("DD_SYSTEM_PROBE_EVENT_MONITORING_PROCESS_ENABLED", strconv.FormatBool(tc.processEvents)) t.Setenv("DD_SYSTEM_PROBE_EVENT_MONITORING_NETWORK_PROCESS_ENABLED", strconv.FormatBool(tc.networkEvents)) t.Setenv("DD_SYSTEM_PROBE_NETWORK_ENABLED", strconv.FormatBool(tc.networkEvents)) + t.Setenv("DD_GPU_MONITORING_ENABLED", strconv.FormatBool(tc.gpu)) cfg, err := New("/doesnotexist", "") t.Logf("%+v\n", cfg)