diff --git a/cmd/statshouse/statshouse.go b/cmd/statshouse/statshouse.go index 9a2e76265..204772686 100644 --- a/cmd/statshouse/statshouse.go +++ b/cmd/statshouse/statshouse.go @@ -356,7 +356,7 @@ func mainAgent(aesPwd string, dc *pcache.DiskCache) int { } }() - m, err := stats.NewCollectorManager(stats.CollectorManagerOptions{ScrapeInterval: argv.hardwareMetricScrapeInterval, HostName: argv.customHostName}, w) + m, err := stats.NewCollectorManager(stats.CollectorManagerOptions{ScrapeInterval: argv.hardwareMetricScrapeInterval, HostName: argv.customHostName}, w, logErr) if err != nil { logErr.Println("failed to init hardware collector", err.Error()) } else { diff --git a/internal/format/predefined.go b/internal/format/predefined.go index 86ebe5c21..0b9cfd6e1 100644 --- a/internal/format/predefined.go +++ b/internal/format/predefined.go @@ -14,22 +14,26 @@ const ( BuiltinMetricIDNetBandwidth = -80 BuiltinMetricIDNetPacket = -81 BuiltinMetricIDNetError = -82 + BuiltinMetricIDDiskUsage = -83 + BuiltinMetricIDINodeUsage = -84 - BuiltinMetricNameCpuUsage = "host_cpu_usage" - BuiltinMetricNameMemUsage = "host_mem_usage" - BuiltinMetricNameBlockIOTime = "host_block_io_time" + BuiltinMetricNameCpuUsage = "test_host_cpu_usage" + BuiltinMetricNameMemUsage = "test_host_mem_usage" + BuiltinMetricNameBlockIOTime = "test_host_block_io_time" + BuiltinMetricNameDiskUsage = "test_host_disk_usage" + BuiltinMetricNameINodeUsage = "test_host_inode_usage" - BuiltinMetricNameSystemUptime = "host_system_uptime" - BuiltinMetricNameProcessCreated = "host_system_process_created" - BuiltinMetricNameProcessStatus = "host_system_process_status" + BuiltinMetricNameSystemUptime = "test_host_system_uptime" + BuiltinMetricNameProcessCreated = "test_host_system_process_created" + BuiltinMetricNameProcessStatus = "test_host_system_process_status" - BuiltinMetricNamePSICPU = "host_system_psi_cpu" - BuiltinMetricNamePSIMem = "host_system_psi_mem" - BuiltinMetricNamePSIIO = "host_system_psi_io" + 
BuiltinMetricNamePSICPU = "test_host_system_psi_cpu" + BuiltinMetricNamePSIMem = "test_host_system_psi_mem" + BuiltinMetricNamePSIIO = "test_host_system_psi_io" - BuiltinMetricNameNetBandwidth = "host_net_bandwidth" - BuiltinMetricNameNetPacket = "host_net_packet" - BuiltinMetricNameNetError = "host_net_error" + BuiltinMetricNameNetBandwidth = "test_host_net_bandwidth" + BuiltinMetricNameNetPacket = "test_host_net_packet" + BuiltinMetricNameNetError = "test_host_net_error" RawIDTagNice = 1 RawIDTagSystem = 2 @@ -233,21 +237,22 @@ var hostMetrics = map[int32]*MetricMetaValue{ Name: BuiltinMetricNameNetError, Kind: MetricKindCounter, Description: "Number of network errors", - Tags: []MetricMetaTag{{ - Description: "type", - Raw: true, - ValueComments: convertToValueComments(map[int32]string{ - RawIDTagInHdrError: "InHdrError", - RawIDTagInDiscard: "InDiscard", - RawIDTagOutDiscard: "OutDiscards", - RawIDTagOutNoRoute: "OutNoRoute", - RawIDTagInAddrError: "InAddrError", - RawIDTagInUnknownProto: "InUnknownProto", - RawIDTagInErr: "InErr", - RawIDTagInCsumErr: "InCsumError", - RawIDTagRetransSeg: "RetransSeg", - }), - }, + Tags: []MetricMetaTag{ + { + Description: "type", + Raw: true, + ValueComments: convertToValueComments(map[int32]string{ + RawIDTagInHdrError: "InHdrError", + RawIDTagInDiscard: "InDiscard", + RawIDTagOutDiscard: "OutDiscards", + RawIDTagOutNoRoute: "OutNoRoute", + RawIDTagInAddrError: "InAddrError", + RawIDTagInUnknownProto: "InUnknownProto", + RawIDTagInErr: "InErr", + RawIDTagInCsumErr: "InCsumError", + RawIDTagRetransSeg: "RetransSeg", + }), + }, { Description: "protocol", Raw: true, @@ -259,4 +264,38 @@ var hostMetrics = map[int32]*MetricMetaValue{ }), }}, }, + BuiltinMetricIDDiskUsage: { + Name: BuiltinMetricNameDiskUsage, + Kind: MetricKindValue, + Description: "Disk space utilization", + Tags: []MetricMetaTag{ + { + Description: "state", + Raw: true, + ValueComments: convertToValueComments(map[int32]string{ + RawIDTagFree: "free", + 
RawIDTagUsed: "used", + }), + }, + { + Description: "device", + }}, + }, + BuiltinMetricIDINodeUsage: { + Name: BuiltinMetricNameINodeUsage, + Kind: MetricKindValue, + Description: "", + Tags: []MetricMetaTag{ + { + Description: "state", + Raw: true, + ValueComments: convertToValueComments(map[int32]string{ + RawIDTagFree: "free", + RawIDTagUsed: "used", + }), + }, + { + Description: "device", + }}, + }, } diff --git a/internal/stats/collector.go b/internal/stats/collector.go index ab313bb22..ff52e84e2 100644 --- a/internal/stats/collector.go +++ b/internal/stats/collector.go @@ -27,6 +27,7 @@ type CollectorManager struct { ctx context.Context cancel func() collectors []Collector + logErr *log.Logger } type scrapeResult struct { @@ -36,7 +37,7 @@ type scrapeResult struct { const procPath = "/proc" const sysPath = "/sys" -func NewCollectorManager(opt CollectorManagerOptions, h receiver.Handler) (*CollectorManager, error) { +func NewCollectorManager(opt CollectorManagerOptions, h receiver.Handler, logErr *log.Logger) (*CollectorManager, error) { newPusher := func() Pusher { if h == nil { return &PusherRemoteImpl{HostName: opt.HostName} @@ -51,7 +52,7 @@ func NewCollectorManager(opt CollectorManagerOptions, h receiver.Handler) (*Coll if err != nil { return nil, err } - diskStats, err := NewDiskStats(newPusher()) + diskStats, err := NewDiskStats(newPusher(), logErr) if err != nil { return nil, err } @@ -74,6 +75,7 @@ func NewCollectorManager(opt CollectorManagerOptions, h receiver.Handler) (*Coll ctx: ctx, cancel: cancel, collectors: collectors, + logErr: logErr, }, nil } @@ -95,7 +97,7 @@ func (m *CollectorManager) RunCollector() error { for { err := c.PushMetrics() if err != nil { - log.Printf("failed to push metrics: %v (collector: %s)", err, c.Name()) + m.logErr.Printf("failed to push metrics: %v (collector: %s)", err, c.Name()) } // todo round interval to begin of second select { diff --git a/internal/stats/cpu_stats.go b/internal/stats/cpu_stats.go index 
7341378d2..529e15fe4 100644 --- a/internal/stats/cpu_stats.go +++ b/internal/stats/cpu_stats.go @@ -16,7 +16,6 @@ type CPUStats struct { pusher Pusher } -const bt = "test_system_uptime" const cpu = format.BuiltinMetricNameCpuUsage const irq = "" const sirq = "" @@ -87,7 +86,7 @@ func (c *CPUStats) pushCPUMetrics(stat procfs.Stat) error { func (c *CPUStats) pushSystemMetrics(stat procfs.Stat) error { uptime := uint64(time.Now().Unix()) - stat.BootTime - c.pusher.PushSystemMetricValue(bt, float64(uptime)) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameSystemUptime, float64(uptime)) c.pusher.PushSystemMetricValue(format.BuiltinMetricNameProcessStatus, float64(stat.ProcessesRunning), format.RawIDTagRunning) c.pusher.PushSystemMetricValue(format.BuiltinMetricNameProcessStatus, float64(stat.ProcessesBlocked), format.RawIDTagBlocked) c.pusher.PushSystemMetricCount(format.BuiltinMetricNameProcessCreated, float64(stat.ProcessCreated-c.stat.ProcessCreated)) diff --git a/internal/stats/disk_stats.go b/internal/stats/disk_stats.go index 755d6b39e..11e2d0832 100644 --- a/internal/stats/disk_stats.go +++ b/internal/stats/disk_stats.go @@ -1,33 +1,53 @@ package stats import ( + "bufio" + "errors" "fmt" + "log" + "os" + "regexp" + "strings" "github.com/prometheus/procfs/blockdevice" "github.com/vkcom/statshouse/internal/format" + "golang.org/x/sys/unix" ) type DiskStats struct { fs blockdevice.FS - pusher Pusher - old map[string]blockdevice.Diskstats + pusher Pusher + old map[string]blockdevice.Diskstats + excludedMountPointsPattern *regexp.Regexp + excludedFSTypesPattern *regexp.Regexp + logErr *log.Logger } -const disk = "test_block_io" +type mount struct { + device, mountPoint, fsType, options string +} + +const ( + defMountPointsExcluded = "^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+|var/lib/containers/storage/.+)($|/)" + defFSTypesExcluded = 
"^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" +) func (*DiskStats) Name() string { return "disk_stats" } -func NewDiskStats(pusher Pusher) (*DiskStats, error) { +func NewDiskStats(pusher Pusher, logErr *log.Logger) (*DiskStats, error) { fs, err := blockdevice.NewFS(procPath, sysPath) if err != nil { return nil, fmt.Errorf("failed to initialize procfs: %w", err) } return &DiskStats{ - fs: fs, - pusher: pusher, + fs: fs, + pusher: pusher, + excludedMountPointsPattern: regexp.MustCompile(defMountPointsExcluded), + excludedFSTypesPattern: regexp.MustCompile(defFSTypesExcluded), + logErr: logErr, }, nil } @@ -47,17 +67,90 @@ func (c *DiskStats) PushMetrics() error { writeIO := stat.WriteIOs - oldStat.WriteIOs discardIO := stat.DiscardIOs - oldStat.DiscardIOs - c.pusher.PushSystemMetricCount(disk, float64(readIO), format.RawIDTagRead) - c.pusher.PushSystemMetricCount(disk, float64(writeIO), format.RawIDTagWrite) - c.pusher.PushSystemMetricCount(disk, float64(discardIO), format.RawIDTagDiscard) + c.pusher.PushSystemMetricCount(format.BuiltinMetricNameBlockIOTime, float64(readIO), format.RawIDTagRead) + c.pusher.PushSystemMetricCount(format.BuiltinMetricNameBlockIOTime, float64(writeIO), format.RawIDTagWrite) + c.pusher.PushSystemMetricCount(format.BuiltinMetricNameBlockIOTime, float64(discardIO), format.RawIDTagDiscard) readIOTicks := float64(stat.ReadTicks-oldStat.ReadTicks) / 1000 writeIOTicks := float64(stat.WriteTicks-oldStat.WriteTicks) / 1000 discardIOTicks := float64(stat.DiscardTicks-oldStat.DiscardTicks) / 1000 - c.pusher.PushSystemMetricValue(disk, readIOTicks, format.RawIDTagRead) - c.pusher.PushSystemMetricValue(disk, writeIOTicks, format.RawIDTagWrite) - c.pusher.PushSystemMetricValue(disk, discardIOTicks, format.RawIDTagDiscard) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameBlockIOTime, readIOTicks, 
format.RawIDTagRead) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameBlockIOTime, writeIOTicks, format.RawIDTagWrite) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameBlockIOTime, discardIOTicks, format.RawIDTagDiscard) + } + err = c.pushFSStats() + return err +} + +func (c *DiskStats) pushFSStats() error { + stats, err := parseMounts() + if err != nil { + return err + } + seen := map[string]bool{} + for _, stat := range stats { + if c.excludedMountPointsPattern.MatchString(stat.mountPoint) { + continue + } + if c.excludedFSTypesPattern.MatchString(stat.fsType) { + continue + } + if seen[stat.device] { + continue + } + seen[stat.device] = true + s := unix.Statfs_t{} + err := unix.Statfs(stat.mountPoint, &s) + if err != nil { + c.logErr.Printf("failed to statfs of %s: %v", stat.mountPoint, err) + continue + } + free := float64(s.Bfree) * float64(s.Bsize) + used := float64(s.Blocks)*float64(s.Bsize) - free + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameDiskUsage, free, format.RawIDTagFree) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameDiskUsage, used, format.RawIDTagUsed) + + inodeFree := float64(s.Ffree) + inodeUsed := float64(s.Files) - inodeFree + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameINodeUsage, inodeFree, format.RawIDTagFree) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameINodeUsage, inodeUsed, format.RawIDTagUsed) } return nil } + +func parseMounts() ([]mount, error) { + file, err := os.Open("/proc/1/mounts") + if errors.Is(err, os.ErrNotExist) { + file, err = os.Open("/proc/mounts") + } + if err != nil { + return nil, err + } + defer file.Close() + + var mounts []mount + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + parts := strings.Fields(scanner.Text()) + + if len(parts) < 4 { + return nil, fmt.Errorf("malformed mount point information: %q", scanner.Text()) + } + + // Ensure we handle the translation of \040 and \011 + // as per fstab(5). 
+ parts[1] = strings.Replace(parts[1], "\\040", " ", -1) + parts[1] = strings.Replace(parts[1], "\\011", "\t", -1) + + mounts = append(mounts, mount{ + device: parts[0], + mountPoint: parts[1], + fsType: parts[2], + options: parts[3], + }) + } + + return mounts, scanner.Err() +} diff --git a/internal/stats/mem_stats.go b/internal/stats/mem_stats.go index 56bd83bf0..f5bfddc41 100644 --- a/internal/stats/mem_stats.go +++ b/internal/stats/mem_stats.go @@ -41,8 +41,6 @@ func (c *MemStats) PushMetrics() error { var cached uint64 var sreclaimable uint64 var shmem uint64 - // var dirty uint64 - // var writeBack uint64 if stat.MemTotal != nil { total = *stat.MemTotal @@ -62,12 +60,6 @@ func (c *MemStats) PushMetrics() error { if stat.Shmem != nil { shmem = *stat.Shmem } - //if stat.Dirty != nil { - // dirty = *stat.Dirty - //} - //if stat.Writeback != nil { - // writeBack = *stat.Writeback - //} cached = cached + sreclaimable - shmem used := total - free - buffers - cached c.pusher.PushSystemMetricValue(mem, float64(free), format.RawIDTagFree) diff --git a/internal/stats/net_stats.go b/internal/stats/net_stats.go index dbb76ed1f..3422cf8ba 100644 --- a/internal/stats/net_stats.go +++ b/internal/stats/net_stats.go @@ -24,15 +24,6 @@ type NetStats struct { pusher Pusher } -const ( - bandwidth = "test_net_bandwidth" - ipPackets = "test_net_ip_packet" - ipPacketsErrors = "test_net_ip_packet_errors" - tcpPackets = "test_net_tcp_packets" - tcpPacketsErrors = "test_net_tcp_packets_errors" - udpPackets = "test_net_udp_datagrams" -) - type netStat struct { ip ip tcp tcp @@ -162,8 +153,11 @@ func (c *NetStats) pushNetDev() error { total := dev.Total() if len(c.oldNetDev) > 0 { - c.pusher.PushSystemMetricValue(bandwidth, float64(total.RxBytes-c.oldNetDevTotal.RxBytes), format.RawIDTagReceived) - c.pusher.PushSystemMetricValue(bandwidth, float64(total.TxBytes-c.oldNetDevTotal.TxBytes), format.RawIDTagSent) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameNetBandwidth, 
float64(total.RxBytes-c.oldNetDevTotal.RxBytes), format.RawIDTagReceived) + c.pusher.PushSystemMetricValue(format.BuiltinMetricNameNetBandwidth, float64(total.TxBytes-c.oldNetDevTotal.TxBytes), format.RawIDTagSent) + + c.pusher.PushSystemMetricCount(format.BuiltinMetricNameNetBandwidth, float64(total.RxPackets-c.oldNetDevTotal.RxPackets), format.RawIDTagReceived) + c.pusher.PushSystemMetricCount(format.BuiltinMetricNameNetBandwidth, float64(total.TxPackets-c.oldNetDevTotal.TxPackets), format.RawIDTagSent) } c.oldNetDev = dev @@ -268,7 +262,6 @@ func parseNetstat(reader io.Reader) (netStat, error) { values := strings.Split(scanner.Text(), " ") protocol := strings.ToLower(strings.TrimSuffix(names[0], ":")) if len(names) != len(values) { - //todo log continue } var err error @@ -286,8 +279,8 @@ func parseNetstat(reader io.Reader) (netStat, error) { } if err != nil { + log.Printf("failed to parse: %v", err) continue - //todo log } } return result, scanner.Err()